/usr/include/c++/11/experimental/bits/simd_x86.h
$ cat -n /usr/include/c++/11/experimental/bits/simd_x86.h
    1  // Simd x86 specific implementations -*- C++ -*-
    2
    3  // Copyright (C) 2020-2021 Free Software Foundation, Inc.
    4  //
    5  // This file is part of the GNU ISO C++ Library.  This library is free
    6  // software; you can redistribute it and/or modify it under the
    7  // terms of the GNU General Public License as published by the
    8  // Free Software Foundation; either version 3, or (at your option)
    9  // any later version.
   10
   11  // This library is distributed in the hope that it will be useful,
   12  // but WITHOUT ANY WARRANTY; without even the implied warranty of
   13  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14  // GNU General Public License for more details.
   15
   16  // Under Section 7 of GPL version 3, you are granted additional
   17  // permissions described in the GCC Runtime Library Exception, version
   18  // 3.1, as published by the Free Software Foundation.
   19
   20  // You should have received a copy of the GNU General Public License and
   21  // a copy of the GCC Runtime Library Exception along with this program;
   22  // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   23  // <http://www.gnu.org/licenses/>.
   24
   25  #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
   26  #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
   27
   28  #if __cplusplus >= 201703L
   29
   30  #if !_GLIBCXX_SIMD_X86INTRIN
   31  #error \
   32    "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
   33  #endif
   34
   35  _GLIBCXX_SIMD_BEGIN_NAMESPACE
   36
   37  // __to_masktype {{{
   38  // Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
   39  // __vector_type_t.
   40  template <typename _Tp, size_t _Np>
   41    _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
   42    __to_masktype(_SimdWrapper<_Tp, _Np> __x)
   43    { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); }
   44
   45  template <typename _TV,
   46            typename _TVT
   47            = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
   48            typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
   49    _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
   50    __to_masktype(_TV __x)
   51    { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
   52
   53  // }}}
   54  // __interleave128_lo {{{
   55  template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>
, 56 typename _Trait = _VectorTraits<_Tp>> 57 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp 58 __interleave128_lo(const _Ap& __av, const _Bp& __bv) 59 { 60 const _Tp __a(__av); 61 const _Tp __b(__bv); 62 if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2) 63 return _Tp{__a[0], __b[0]}; 64 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4) 65 return _Tp{__a[0], __b[0], __a[1], __b[1]}; 66 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8) 67 return _Tp{__a[0], __b[0], __a[1], __b[1], 68 __a[2], __b[2], __a[3], __b[3]}; 69 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16) 70 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], 71 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5], 72 __a[6], __b[6], __a[7], __b[7]}; 73 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4) 74 return _Tp{__a[0], __b[0], __a[2], __b[2]}; 75 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8) 76 return _Tp{__a[0], __b[0], __a[1], __b[1], 77 __a[4], __b[4], __a[5], __b[5]}; 78 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16) 79 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], 80 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9], 81 __a[10], __b[10], __a[11], __b[11]}; 82 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32) 83 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], 84 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], 85 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18], 86 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21], 87 __a[22], __b[22], __a[23], __b[23]}; 88 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8) 89 return _Tp{__a[0], __b[0], __a[2], __b[2], 90 __a[4], __b[4], __a[6], __b[6]}; 91 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16) 92 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4], 93 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9], 94 __a[12], __b[12], __a[13], __b[13]}; 95 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32) 96 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], 97 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], 98 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18], 99 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25], 100 __a[26], __b[26], __a[27], __b[27]}; 101 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64) 102 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], 103 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], 104 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18], 105 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21], 106 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33], 107 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36], 108 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48], 109 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51], 110 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55], 111 __b[55]}; 112 else 113 __assert_unreachable<_Tp>(); 114 } 115 116 // }}} 117 // __is_zero{{{ 118 template
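// Illustration, not part of the header: __interleave128_lo zips its two
// arguments within each 128-bit lane, keeping the low half of every lane; for
// wider vectors the same pattern repeats per lane. A minimal sketch of the
// 16-byte/4-float branch, assuming GCC/Clang vector extensions (the name
// interleave_lo4 is made up for this sketch):
#include <cstdio>

typedef float v4f __attribute__((__vector_size__(16)));

static v4f interleave_lo4(v4f a, v4f b)
{ return v4f{a[0], b[0], a[1], b[1]}; } // same element order as _mm_unpacklo_ps

int main()
{
  const v4f r = interleave_lo4(v4f{0, 1, 2, 3}, v4f{10, 11, 12, 13});
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 0 10 1 11
}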
<typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  119    _GLIBCXX_SIMD_INTRINSIC constexpr bool
  120    __is_zero(_Tp __a)
  121    {
  122      if (!__builtin_is_constant_evaluated())
  123        {
  124          if constexpr (__have_avx)
  125            {
  126              if constexpr (_TVT::template _S_is<float, 8>)
  127                return _mm256_testz_ps(__a, __a);
  128              else if constexpr (_TVT::template _S_is<double, 4>)
  129                return _mm256_testz_pd(__a, __a);
  130              else if constexpr (sizeof(_Tp) == 32)
  131                return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
  132              else if constexpr (_TVT::template _S_is<float, 4>)
  133                return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
  134              else if constexpr (_TVT::template _S_is<double, 2>)
  135                return _mm_testz_pd(__a, __a);
  136              else
  137                return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
  138            }
  139          else if constexpr (__have_sse4_1)
  140            return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
  141                                   __intrin_bitcast<__m128i>(__a));
  142        }
  143      else if constexpr (sizeof(_Tp) <= 8)
  144        return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
  145      else
  146        {
  147          const auto __b = __vector_bitcast<_LLong>(__a);
  148          if constexpr (sizeof(__b) == 16)
  149            return (__b[0] | __b[1]) == 0;
  150          else if constexpr (sizeof(__b) == 32)
  151            return __is_zero(__lo128(__b) | __hi128(__b));
  152          else if constexpr (sizeof(__b) == 64)
  153            return __is_zero(__lo256(__b) | __hi256(__b));
  154          else
  155            __assert_unreachable<_Tp>();
  156        }
  157    }
  158
  159  // }}}
  160  // __movemask{{{
  161  template
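// Illustration, not part of the header: when ptest (SSE4.1) is not available,
// __is_zero reduces the vector by OR-ing 64-bit chunks, as in its
// sizeof(__b) == 16 branch. Scalar sketch, assuming GCC/Clang vector
// extensions (is_zero16 is a made-up name):
typedef long long v2ll __attribute__((__vector_size__(16)));

static bool is_zero16(v2ll a)
{ return (a[0] | a[1]) == 0; }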
> 162 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int 163 __movemask(_Tp __a) 164 { 165 if constexpr (sizeof(_Tp) == 32) 166 { 167 if constexpr (_TVT::template _S_is
) 168 return _mm256_movemask_ps(__to_intrin(__a)); 169 else if constexpr (_TVT::template _S_is
) 170 return _mm256_movemask_pd(__to_intrin(__a)); 171 else 172 return _mm256_movemask_epi8(__to_intrin(__a)); 173 } 174 else if constexpr (_TVT::template _S_is
) 175 return _mm_movemask_ps(__to_intrin(__a)); 176 else if constexpr (_TVT::template _S_is
) 177 return _mm_movemask_pd(__to_intrin(__a)); 178 else 179 return _mm_movemask_epi8(__to_intrin(__a)); 180 } 181 182 // }}} 183 // __testz{{{ 184 template
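// Illustration, not part of the header: __movemask gathers the most
// significant bit of every element into an int bitmask. The SSE2 byte variant,
// shown directly with the underlying intrinsic:
#include <emmintrin.h>
#include <cstdio>

int main()
{
  const __m128i x = _mm_setr_epi8(-1, 0, -1, 0, 0, 0, 0, 0,
                                  0, 0, 0, 0, 0, 0, 0, -1);
  std::printf("%#x\n", _mm_movemask_epi8(x)); // bytes 0, 2 and 15 set -> 0x8005
}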
> 185 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int 186 __testz(_TI __a, _TI __b) 187 { 188 static_assert(is_same_v<_TI, __intrinsic_type_t
>); 190 if (!__builtin_is_constant_evaluated()) 191 { 192 if constexpr (sizeof(_TI) == 32) 193 { 194 if constexpr (_TVT::template _S_is
) 195 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b)); 196 else if constexpr (_TVT::template _S_is
) 197 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b)); 198 else 199 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b)); 200 } 201 else if constexpr (_TVT::template _S_is
&& __have_avx) 202 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b)); 203 else if constexpr (_TVT::template _S_is
&& __have_avx) 204 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b)); 205 else if constexpr (__have_sse4_1) 206 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), 207 __intrin_bitcast<__m128i>(__to_intrin(__b))); 208 else 209 return __movemask(0 == __and(__a, __b)) != 0; 210 } 211 else 212 return __is_zero(__and(__a, __b)); 213 } 214 215 // }}} 216 // __testc{{{ 217 // requires SSE4.1 or above 218 template
> 219 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int 220 __testc(_TI __a, _TI __b) 221 { 222 static_assert(is_same_v<_TI, __intrinsic_type_t
>); 224 if (__builtin_is_constant_evaluated()) 225 return __is_zero(__andnot(__a, __b)); 226 227 if constexpr (sizeof(_TI) == 32) 228 { 229 if constexpr (_TVT::template _S_is
) 230 return _mm256_testc_ps(__a, __b); 231 else if constexpr (_TVT::template _S_is
) 232 return _mm256_testc_pd(__a, __b); 233 else 234 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b)); 235 } 236 else if constexpr (_TVT::template _S_is
&& __have_avx) 237 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b)); 238 else if constexpr (_TVT::template _S_is
&& __have_avx) 239 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b)); 240 else 241 { 242 static_assert(is_same_v<_TI, _TI> && __have_sse4_1); 243 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), 244 __intrin_bitcast<__m128i>(__to_intrin(__b))); 245 } 246 } 247 248 // }}} 249 // __testnzc{{{ 250 template
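// Illustration, not part of the header: a scalar model of what the ptest-based
// helpers compute. __testz(a, b) is "(a & b) == 0" (the ZF result),
// __testc(a, b) is "(~a & b) == 0" (the CF result), and __testnzc(a, b) is true
// only when neither holds. The *_model names below are made up:
#include <cstdint>

constexpr bool testz_model(uint64_t a, uint64_t b)  { return (a & b) == 0; }
constexpr bool testc_model(uint64_t a, uint64_t b)  { return (~a & b) == 0; }
constexpr bool testnzc_model(uint64_t a, uint64_t b)
{ return !testz_model(a, b) && !testc_model(a, b); }

static_assert(testz_model(0x0f, 0xf0), "disjoint bits");
static_assert(testc_model(0xff, 0x0f), "a covers b");
static_assert(testnzc_model(0x0f, 0x1f), "partial overlap");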
> 251 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int 252 __testnzc(_TI __a, _TI __b) 253 { 254 static_assert(is_same_v<_TI, __intrinsic_type_t
>); 256 if (!__builtin_is_constant_evaluated()) 257 { 258 if constexpr (sizeof(_TI) == 32) 259 { 260 if constexpr (_TVT::template _S_is
) 261 return _mm256_testnzc_ps(__a, __b); 262 else if constexpr (_TVT::template _S_is
) 263 return _mm256_testnzc_pd(__a, __b); 264 else 265 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b)); 266 } 267 else if constexpr (_TVT::template _S_is
&& __have_avx) 268 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b)); 269 else if constexpr (_TVT::template _S_is
&& __have_avx) 270 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b)); 271 else if constexpr (__have_sse4_1) 272 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), 273 __intrin_bitcast<__m128i>(__to_intrin(__b))); 274 else 275 return __movemask(0 == __and(__a, __b)) == 0 276 && __movemask(0 == __andnot(__a, __b)) == 0; 277 } 278 else 279 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b))); 280 } 281 282 // }}} 283 // __xzyw{{{ 284 // shuffles the complete vector, swapping the inner two quarters. Often useful 285 // for AVX for fixing up a shuffle result. 286 template
> 287 _GLIBCXX_SIMD_INTRINSIC _Tp 288 __xzyw(_Tp __a) 289 { 290 if constexpr (sizeof(_Tp) == 16) 291 { 292 const auto __x = __vector_bitcast
, float, int>>(__a); 294 return reinterpret_cast<_Tp>( 295 decltype(__x){__x[0], __x[2], __x[1], __x[3]}); 296 } 297 else if constexpr (sizeof(_Tp) == 32) 298 { 299 const auto __x = __vector_bitcast
, double, _LLong>>(__a); 301 return reinterpret_cast<_Tp>( 302 decltype(__x){__x[0], __x[2], __x[1], __x[3]}); 303 } 304 else if constexpr (sizeof(_Tp) == 64) 305 { 306 const auto __x = __vector_bitcast
, double, _LLong>>(__a); 308 return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4], 309 __x[5], __x[2], __x[3], 310 __x[6], __x[7]}); 311 } 312 else 313 __assert_unreachable<_Tp>(); 314 } 315 316 // }}} 317 // __maskload_epi32{{{ 318 template
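// Illustration, not part of the header: viewing the vector as four equal
// quarters x, z, y, w, __xzyw swaps the two inner quarters. Sketch of the
// 32-byte/4-double case, assuming GCC/Clang vector extensions (xzyw4 is a
// made-up name):
typedef double v4d __attribute__((__vector_size__(32)));

static v4d xzyw4(v4d a) // {a0, a1, a2, a3} -> {a0, a2, a1, a3}
{ return v4d{a[0], a[2], a[1], a[3]}; }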
319 _GLIBCXX_SIMD_INTRINSIC auto 320 __maskload_epi32(const int* __ptr, _Tp __k) 321 { 322 if constexpr (sizeof(__k) == 16) 323 return _mm_maskload_epi32(__ptr, __k); 324 else 325 return _mm256_maskload_epi32(__ptr, __k); 326 } 327 328 // }}} 329 // __maskload_epi64{{{ 330 template
331 _GLIBCXX_SIMD_INTRINSIC auto 332 __maskload_epi64(const _LLong* __ptr, _Tp __k) 333 { 334 if constexpr (sizeof(__k) == 16) 335 return _mm_maskload_epi64(__ptr, __k); 336 else 337 return _mm256_maskload_epi64(__ptr, __k); 338 } 339 340 // }}} 341 // __maskload_ps{{{ 342 template
343 _GLIBCXX_SIMD_INTRINSIC auto 344 __maskload_ps(const float* __ptr, _Tp __k) 345 { 346 if constexpr (sizeof(__k) == 16) 347 return _mm_maskload_ps(__ptr, __k); 348 else 349 return _mm256_maskload_ps(__ptr, __k); 350 } 351 352 // }}} 353 // __maskload_pd{{{ 354 template
355 _GLIBCXX_SIMD_INTRINSIC auto 356 __maskload_pd(const double* __ptr, _Tp __k) 357 { 358 if constexpr (sizeof(__k) == 16) 359 return _mm_maskload_pd(__ptr, __k); 360 else 361 return _mm256_maskload_pd(__ptr, __k); 362 } 363 364 // }}} 365 366 #ifdef __clang__ 367 template
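// Illustration, not part of the header: the __maskload_* helpers dispatch to
// the 128-/256-bit AVX maskload intrinsics, which load only the elements whose
// mask lane has its most significant bit set, zero the rest, and never touch
// the masked-out memory. Usage of the underlying intrinsic (requires AVX;
// load_first_three is a made-up name):
#include <immintrin.h>

static __m256 load_first_three(const float* p)
{
  const __m256i k = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
  return _mm256_maskload_ps(p, k); // {p[0], p[1], p[2], 0, 0, 0, 0, 0}
}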
368 _GLIBCXX_SIMD_INTRINSIC constexpr auto 369 __movm(_Kp __k) noexcept 370 { 371 static_assert(is_unsigned_v<_Kp>); 372 if constexpr (sizeof(_Tp) == 1 && __have_avx512bw) 373 { 374 if constexpr (_Np <= 16 && __have_avx512vl) 375 return __builtin_ia32_cvtmask2b128(__k); 376 else if constexpr (_Np <= 32 && __have_avx512vl) 377 return __builtin_ia32_cvtmask2b256(__k); 378 else 379 return __builtin_ia32_cvtmask2b512(__k); 380 } 381 else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw) 382 { 383 if constexpr (_Np <= 8 && __have_avx512vl) 384 return __builtin_ia32_cvtmask2w128(__k); 385 else if constexpr (_Np <= 16 && __have_avx512vl) 386 return __builtin_ia32_cvtmask2w256(__k); 387 else 388 return __builtin_ia32_cvtmask2w512(__k); 389 } 390 else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq) 391 { 392 if constexpr (_Np <= 4 && __have_avx512vl) 393 return __builtin_ia32_cvtmask2d128(__k); 394 else if constexpr (_Np <= 8 && __have_avx512vl) 395 return __builtin_ia32_cvtmask2d256(__k); 396 else 397 return __builtin_ia32_cvtmask2d512(__k); 398 } 399 else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq) 400 { 401 if constexpr (_Np <= 2 && __have_avx512vl) 402 return __builtin_ia32_cvtmask2q128(__k); 403 else if constexpr (_Np <= 4 && __have_avx512vl) 404 return __builtin_ia32_cvtmask2q256(__k); 405 else 406 return __builtin_ia32_cvtmask2q512(__k); 407 } 408 else 409 __assert_unreachable<_Tp>(); 410 } 411 #endif // __clang__ 412 413 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 414 #include "simd_x86_conversions.h" 415 #endif 416 417 // ISA & type detection {{{ 418 template
419 constexpr bool 420 __is_sse_ps() 421 { 422 return __have_sse 423 && is_same_v<_Tp, 424 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; 425 } 426 427 template
428 constexpr bool 429 __is_sse_pd() 430 { 431 return __have_sse2 432 && is_same_v<_Tp, 433 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; 434 } 435 436 template
437 constexpr bool 438 __is_avx_ps() 439 { 440 return __have_avx 441 && is_same_v<_Tp, 442 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; 443 } 444 445 template
446 constexpr bool 447 __is_avx_pd() 448 { 449 return __have_avx 450 && is_same_v<_Tp, 451 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; 452 } 453 454 template
455 constexpr bool 456 __is_avx512_ps() 457 { 458 return __have_avx512f 459 && is_same_v<_Tp, 460 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; 461 } 462 463 template
464 constexpr bool 465 __is_avx512_pd() 466 { 467 return __have_avx512f 468 && is_same_v<_Tp, 469 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; 470 } 471 472 // }}} 473 struct _MaskImplX86Mixin; 474 475 // _CommonImplX86 {{{ 476 struct _CommonImplX86 : _CommonImplBuiltin 477 { 478 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 479 // _S_converts_via_decomposition {{{ 480 template
481 static constexpr bool 482 _S_converts_via_decomposition() 483 { 484 if constexpr (is_integral_v< 485 _From> && is_integral_v<_To> && sizeof(_From) == 8 486 && _ToSize == 16) 487 return (sizeof(_To) == 2 && !__have_ssse3) 488 || (sizeof(_To) == 1 && !__have_avx512f); 489 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>) 490 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8 491 && !__have_avx512dq) 492 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1 493 && _ToSize == 16); 494 else if constexpr ( 495 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8 496 && !__have_avx512dq) 497 return (sizeof(_To) == 4 && _ToSize == 16) 498 || (sizeof(_To) == 8 && _ToSize < 64); 499 else 500 return false; 501 } 502 503 template
504 static inline constexpr bool __converts_via_decomposition_v 505 = _S_converts_via_decomposition<_From, _To, _ToSize>(); 506 507 // }}} 508 #endif 509 // _S_store {{{ 510 using _CommonImplBuiltin::_S_store; 511 512 template
513 _GLIBCXX_SIMD_INTRINSIC static constexpr void 514 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) 515 { 516 constexpr size_t _Bytes = _Np * sizeof(_Tp); 517 518 if (__builtin_is_constant_evaluated()) 519 _CommonImplBuiltin::_S_store(__x, __addr); 520 else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl) 521 { 522 const auto __v = __to_intrin(__x); 523 524 if constexpr (_Bytes & 1) 525 { 526 if constexpr (_Bytes < 16) 527 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes), 528 __intrin_bitcast<__m128i>(__v)); 529 else if constexpr (_Bytes < 32) 530 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes), 531 __intrin_bitcast<__m256i>(__v)); 532 else 533 _mm512_mask_storeu_epi8(__addr, 534 0xffffffffffffffffull >> (64 - _Bytes), 535 __intrin_bitcast<__m512i>(__v)); 536 } 537 else if constexpr (_Bytes & 2) 538 { 539 if constexpr (_Bytes < 16) 540 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2), 541 __intrin_bitcast<__m128i>(__v)); 542 else if constexpr (_Bytes < 32) 543 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2), 544 __intrin_bitcast<__m256i>(__v)); 545 else 546 _mm512_mask_storeu_epi16(__addr, 547 0xffffffffull >> (32 - _Bytes / 2), 548 __intrin_bitcast<__m512i>(__v)); 549 } 550 else if constexpr (_Bytes & 4) 551 { 552 if constexpr (_Bytes < 16) 553 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4), 554 __intrin_bitcast<__m128i>(__v)); 555 else if constexpr (_Bytes < 32) 556 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4), 557 __intrin_bitcast<__m256i>(__v)); 558 else 559 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4), 560 __intrin_bitcast<__m512i>(__v)); 561 } 562 else 563 { 564 static_assert( 565 _Bytes > 16, 566 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes " 567 "- 1)) != 0 is impossible"); 568 if constexpr (_Bytes < 32) 569 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8), 570 __intrin_bitcast<__m256i>(__v)); 571 else 572 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8), 573 __intrin_bitcast<__m512i>(__v)); 574 } 575 } 576 else 577 _CommonImplBuiltin::_S_store(__x, __addr); 578 } 579 580 // }}} 581 // _S_store_bool_array(_BitMask) {{{ 582 template
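// Illustration, not part of the header: for byte counts that are not a power
// of two, _S_store builds a contiguous low-bit mask so that exactly _Bytes
// bytes reach memory. The same idea with the underlying intrinsic, storing the
// low 6 bytes of a 16-byte register (requires AVX-512BW+VL; store6 is a
// made-up name):
#include <immintrin.h>

static void store6(void* dst, __m128i v)
{ _mm_mask_storeu_epi8(dst, 0xffffu >> (16 - 6), v); } // mask 0x003f -> 6 bytes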
583 _GLIBCXX_SIMD_INTRINSIC static constexpr void 584 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem) 585 { 586 if (__builtin_is_constant_evaluated()) 587 _CommonImplBuiltin::_S_store_bool_array(__x, __mem); 588 else if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL 589 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>( 590 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 591 if constexpr (_Np <= 16) 592 return _mm_movm_epi8(__x._M_to_bits()); 593 else if constexpr (_Np <= 32) 594 return _mm256_movm_epi8(__x._M_to_bits()); 595 else if constexpr (_Np <= 64) 596 return _mm512_movm_epi8(__x._M_to_bits()); 597 else 598 __assert_unreachable<_SizeConstant<_Np>>(); 599 }()), 600 __mem); 601 else if constexpr (__have_bmi2) 602 { 603 if constexpr (_Np <= 4) 604 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem); 605 else 606 __execute_n_times<__div_roundup(_Np, sizeof(size_t))>( 607 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 608 constexpr size_t __offset = __i * sizeof(size_t); 609 constexpr int __todo = std::min(sizeof(size_t), _Np - __offset); 610 if constexpr (__todo == 1) 611 __mem[__offset] = __x[__offset]; 612 else 613 { 614 const auto __bools = 615 #ifdef __x86_64__ 616 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(), 617 0x0101010101010101ULL); 618 #else // __x86_64__ 619 _pdep_u32( 620 __x.template _M_extract<__offset>()._M_to_bits(), 621 0x01010101U); 622 #endif // __x86_64__ 623 _S_store<__todo>(__bools, __mem + __offset); 624 } 625 }); 626 } 627 else if constexpr (__have_sse2 && _Np > 7) 628 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 629 constexpr int __offset = __i * 16; 630 constexpr int __todo = std::min(16, int(_Np) - __offset); 631 const int __bits = __x.template _M_extract<__offset>()._M_to_bits(); 632 __vector_type16_t<_UChar> __bools; 633 if constexpr (__have_avx512f) 634 { 635 auto __as32bits 636 = _mm512_maskz_mov_epi32(__bits, __to_intrin( 637 __vector_broadcast<16>(1))); 638 auto __as16bits 639 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 640 __todo > 8 ? __hi256(__as32bits) 641 : __m256i())); 642 __bools = __vector_bitcast<_UChar>( 643 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 644 } 645 else 646 { 647 using _V = __vector_type_t<_UChar, 16>; 648 auto __tmp = _mm_cvtsi32_si128(__bits); 649 __tmp = _mm_unpacklo_epi8(__tmp, __tmp); 650 __tmp = _mm_unpacklo_epi16(__tmp, __tmp); 651 __tmp = _mm_unpacklo_epi32(__tmp, __tmp); 652 _V __tmp2 = reinterpret_cast<_V>(__tmp); 653 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128, 654 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index 655 __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01 656 } 657 _S_store<__todo>(__bools, __mem + __offset); 658 }); 659 else 660 _CommonImplBuiltin::_S_store_bool_array(__x, __mem); 661 } 662 663 // }}} 664 // _S_blend_avx512 {{{ 665 // Returns: __k ? __b : __a 666 // TODO: reverse __a and __b to match COND_EXPR 667 // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask 668 // __k 669 template
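// Illustration, not part of the header: the BMI2 branch of _S_store_bool_array
// deposits one mask bit into every byte position, so each destination byte
// becomes 0 or 1. Scalar demonstration (requires BMI2):
#include <immintrin.h>
#include <cstdio>

int main()
{
  const unsigned bits = 0b1011u;                       // four packed bools
  const unsigned bytes = _pdep_u32(bits, 0x01010101u); // one bit per byte
  std::printf("%#010x\n", bytes);                      // prints 0x01000101
}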
670 _GLIBCXX_SIMD_INTRINSIC static _TV 671 _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept 672 { 673 static_assert(__is_vector_type_v<_TV>); 674 using _Tp = typename _VectorTraits<_TV>::value_type; 675 static_assert(sizeof(_TV) >= 16); 676 static_assert(sizeof(_Tp) <= 8); 677 #ifdef __clang__ 678 return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a; 679 #else 680 using _IntT 681 = conditional_t<(sizeof(_Tp) > 2), 682 conditional_t
<sizeof(_Tp) == 4, int, _LLong>,
  683                        conditional_t<sizeof(_Tp) == 1, char, short>
>; 684 [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a); 685 [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b); 686 if constexpr (sizeof(_TV) == 64) 687 { 688 if constexpr (sizeof(_Tp) == 1) 689 return reinterpret_cast<_TV>( 690 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k)); 691 else if constexpr (sizeof(_Tp) == 2) 692 return reinterpret_cast<_TV>( 693 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k)); 694 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 695 return __builtin_ia32_blendmps_512_mask(__a, __b, __k); 696 else if constexpr (sizeof(_Tp) == 4) 697 return reinterpret_cast<_TV>( 698 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k)); 699 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 700 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k); 701 else if constexpr (sizeof(_Tp) == 8) 702 return reinterpret_cast<_TV>( 703 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k)); 704 } 705 else if constexpr (sizeof(_TV) == 32) 706 { 707 if constexpr (sizeof(_Tp) == 1) 708 return reinterpret_cast<_TV>( 709 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k)); 710 else if constexpr (sizeof(_Tp) == 2) 711 return reinterpret_cast<_TV>( 712 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k)); 713 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 714 return __builtin_ia32_blendmps_256_mask(__a, __b, __k); 715 else if constexpr (sizeof(_Tp) == 4) 716 return reinterpret_cast<_TV>( 717 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k)); 718 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 719 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k); 720 else if constexpr (sizeof(_Tp) == 8) 721 return reinterpret_cast<_TV>( 722 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k)); 723 } 724 else if constexpr (sizeof(_TV) == 16) 725 { 726 if constexpr (sizeof(_Tp) == 1) 727 return reinterpret_cast<_TV>( 728 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k)); 729 else if constexpr (sizeof(_Tp) == 2) 730 return reinterpret_cast<_TV>( 731 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k)); 732 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 733 return __builtin_ia32_blendmps_128_mask(__a, __b, __k); 734 else if constexpr (sizeof(_Tp) == 4) 735 return reinterpret_cast<_TV>( 736 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k)); 737 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 738 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k); 739 else if constexpr (sizeof(_Tp) == 8) 740 return reinterpret_cast<_TV>( 741 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k)); 742 } 743 #endif 744 } 745 746 // }}} 747 // _S_blend_intrin {{{ 748 // Returns: __k ? __b : __a 749 // TODO: reverse __a and __b to match COND_EXPR 750 // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32 751 // Bytes wide 752 template
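// Illustration, not part of the header: _S_blend_avx512 returns __b where the
// bitmask __k has a 1 and __a elsewhere. The public intrinsic with the same
// selection rule, for 16 floats (requires AVX-512F; blend16f is a made-up
// name):
#include <immintrin.h>

static __m512 blend16f(__mmask16 k, __m512 a, __m512 b)
{ return _mm512_mask_blend_ps(k, a, b); } // per lane: k ? b : a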
753 _GLIBCXX_SIMD_INTRINSIC static _Tp 754 _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept 755 { 756 static_assert(is_same_v
); 757 constexpr struct 758 { 759 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b, 760 __m128 __k) const noexcept 761 { 762 return __builtin_ia32_blendvps(__a, __b, __k); 763 } 764 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b, 765 __m128d __k) const noexcept 766 { 767 return __builtin_ia32_blendvpd(__a, __b, __k); 768 } 769 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b, 770 __m128i __k) const noexcept 771 { 772 return reinterpret_cast<__m128i>( 773 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a), 774 reinterpret_cast<__v16qi>(__b), 775 reinterpret_cast<__v16qi>(__k))); 776 } 777 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b, 778 __m256 __k) const noexcept 779 { 780 return __builtin_ia32_blendvps256(__a, __b, __k); 781 } 782 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b, 783 __m256d __k) const noexcept 784 { 785 return __builtin_ia32_blendvpd256(__a, __b, __k); 786 } 787 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b, 788 __m256i __k) const noexcept 789 { 790 if constexpr (__have_avx2) 791 return reinterpret_cast<__m256i>( 792 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a), 793 reinterpret_cast<__v32qi>(__b), 794 reinterpret_cast<__v32qi>(__k))); 795 else 796 return reinterpret_cast<__m256i>( 797 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a), 798 reinterpret_cast<__v8sf>(__b), 799 reinterpret_cast<__v8sf>(__k))); 800 } 801 } __eval; 802 return __eval(__a, __b, __k); 803 } 804 805 // }}} 806 // _S_blend {{{ 807 // Returns: __k ? __at1 : __at0 808 // TODO: reverse __at0 and __at1 to match COND_EXPR 809 template
810 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 811 _S_blend(_SimdWrapper
__k, _SimdWrapper<_Tp, _Np> __at0, 812 _SimdWrapper<_Tp, _Np> __at1) 813 { 814 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f); 815 if (__k._M_is_constprop() && __at0._M_is_constprop() 816 && __at1._M_is_constprop()) 817 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>( 818 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 819 return __k[__i] ? __at1[__i] : __at0[__i]; 820 }); 821 else if constexpr (sizeof(__at0) == 64 822 || (__have_avx512vl && sizeof(__at0) >= 16)) 823 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data); 824 else 825 { 826 static_assert((__have_avx512vl && sizeof(__at0) < 16) 827 || !__have_avx512vl); 828 constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp); 829 return __vector_bitcast<_Tp, _Np>( 830 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0), 831 __vector_bitcast<_Tp, __size>(__at1))); 832 } 833 } 834 835 template
836 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 837 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, 838 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) 839 { 840 const auto __kk = __wrapper_bitcast<_Tp>(__k); 841 if (__builtin_is_constant_evaluated() 842 || (__kk._M_is_constprop() && __at0._M_is_constprop() 843 && __at1._M_is_constprop())) 844 { 845 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1)); 846 if (__r._M_is_constprop()) 847 return __r; 848 } 849 if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl) 850 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 851 // convert to bitmask and call overload above 852 return _S_blend( 853 _SimdWrapper
( 854 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k) 855 ._M_to_bits()), 856 __at0, __at1); 857 else 858 { 859 // Since GCC does not assume __k to be a mask, using the builtin 860 // conditional operator introduces an extra compare against 0 before 861 // blending. So we rather call the intrinsic here. 862 if constexpr (__have_sse4_1) 863 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0), 864 __to_intrin(__at1)); 865 else 866 return __or(__andnot(__kk, __at0), __and(__kk, __at1)); 867 } 868 } 869 870 // }}} 871 }; 872 873 // }}} 874 // _SimdImplX86 {{{ 875 template
876 struct _SimdImplX86 : _SimdImplBuiltin<_Abi> 877 { 878 using _Base = _SimdImplBuiltin<_Abi>; 879 880 template
881 using _MaskMember = typename _Base::template _MaskMember<_Tp>; 882 883 template
884 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; 885 886 template
887 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; 888 889 template
890 static constexpr size_t _S_max_store_size 891 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64 892 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32 893 : 16; 894 895 using _MaskImpl = typename _Abi::_MaskImpl; 896 897 // _S_masked_load {{{ 898 template
899 static inline _SimdWrapper<_Tp, _Np> 900 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 901 const _Up* __mem) noexcept 902 { 903 static_assert(_Np == _S_size<_Tp>); 904 if constexpr (is_same_v<_Tp, _Up> || // no conversion 905 (sizeof(_Tp) == sizeof(_Up) 906 && is_integral_v< 907 _Tp> == is_integral_v<_Up>) // conversion via bit 908 // reinterpretation 909 ) 910 { 911 [[maybe_unused]] const auto __intrin = __to_intrin(__merge); 912 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) 913 && sizeof(_Tp) == 1) 914 { 915 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 916 if constexpr (sizeof(__intrin) == 16) 917 __merge = __vector_bitcast<_Tp, _Np>( 918 _mm_mask_loadu_epi8(__intrin, __kk, __mem)); 919 else if constexpr (sizeof(__merge) == 32) 920 __merge = __vector_bitcast<_Tp, _Np>( 921 _mm256_mask_loadu_epi8(__intrin, __kk, __mem)); 922 else if constexpr (sizeof(__merge) == 64) 923 __merge = __vector_bitcast<_Tp, _Np>( 924 _mm512_mask_loadu_epi8(__intrin, __kk, __mem)); 925 else 926 __assert_unreachable<_Tp>(); 927 } 928 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) 929 && sizeof(_Tp) == 2) 930 { 931 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 932 if constexpr (sizeof(__intrin) == 16) 933 __merge = __vector_bitcast<_Tp, _Np>( 934 _mm_mask_loadu_epi16(__intrin, __kk, __mem)); 935 else if constexpr (sizeof(__intrin) == 32) 936 __merge = __vector_bitcast<_Tp, _Np>( 937 _mm256_mask_loadu_epi16(__intrin, __kk, __mem)); 938 else if constexpr (sizeof(__intrin) == 64) 939 __merge = __vector_bitcast<_Tp, _Np>( 940 _mm512_mask_loadu_epi16(__intrin, __kk, __mem)); 941 else 942 __assert_unreachable<_Tp>(); 943 } 944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 945 && sizeof(_Tp) == 4 && is_integral_v<_Up>) 946 { 947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 948 if constexpr (sizeof(__intrin) == 16) 949 __merge = __vector_bitcast<_Tp, _Np>( 950 _mm_mask_loadu_epi32(__intrin, __kk, __mem)); 951 else if constexpr (sizeof(__intrin) == 32) 952 __merge = __vector_bitcast<_Tp, _Np>( 953 _mm256_mask_loadu_epi32(__intrin, __kk, __mem)); 954 else if constexpr (sizeof(__intrin) == 64) 955 __merge = __vector_bitcast<_Tp, _Np>( 956 _mm512_mask_loadu_epi32(__intrin, __kk, __mem)); 957 else 958 __assert_unreachable<_Tp>(); 959 } 960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 961 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>) 962 { 963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 964 if constexpr (sizeof(__intrin) == 16) 965 __merge = __vector_bitcast<_Tp, _Np>( 966 _mm_mask_loadu_ps(__intrin, __kk, __mem)); 967 else if constexpr (sizeof(__intrin) == 32) 968 __merge = __vector_bitcast<_Tp, _Np>( 969 _mm256_mask_loadu_ps(__intrin, __kk, __mem)); 970 else if constexpr (sizeof(__intrin) == 64) 971 __merge = __vector_bitcast<_Tp, _Np>( 972 _mm512_mask_loadu_ps(__intrin, __kk, __mem)); 973 else 974 __assert_unreachable<_Tp>(); 975 } 976 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 977 && is_integral_v<_Up>) 978 { 979 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 980 __merge 981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 982 __vector_bitcast<_Tp, _Np>( 983 __maskload_epi32(reinterpret_cast
(__mem), 984 __to_intrin(__k)))); 985 } 986 else if constexpr (__have_avx && sizeof(_Tp) == 4) 987 { 988 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 989 __merge 990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 991 __vector_bitcast<_Tp, _Np>( 992 __maskload_ps(reinterpret_cast
(__mem), 993 __to_intrin(__k)))); 994 } 995 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 996 && sizeof(_Tp) == 8 && is_integral_v<_Up>) 997 { 998 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 999 if constexpr (sizeof(__intrin) == 16) 1000 __merge = __vector_bitcast<_Tp, _Np>( 1001 _mm_mask_loadu_epi64(__intrin, __kk, __mem)); 1002 else if constexpr (sizeof(__intrin) == 32) 1003 __merge = __vector_bitcast<_Tp, _Np>( 1004 _mm256_mask_loadu_epi64(__intrin, __kk, __mem)); 1005 else if constexpr (sizeof(__intrin) == 64) 1006 __merge = __vector_bitcast<_Tp, _Np>( 1007 _mm512_mask_loadu_epi64(__intrin, __kk, __mem)); 1008 else 1009 __assert_unreachable<_Tp>(); 1010 } 1011 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 1012 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>) 1013 { 1014 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 1015 if constexpr (sizeof(__intrin) == 16) 1016 __merge = __vector_bitcast<_Tp, _Np>( 1017 _mm_mask_loadu_pd(__intrin, __kk, __mem)); 1018 else if constexpr (sizeof(__intrin) == 32) 1019 __merge = __vector_bitcast<_Tp, _Np>( 1020 _mm256_mask_loadu_pd(__intrin, __kk, __mem)); 1021 else if constexpr (sizeof(__intrin) == 64) 1022 __merge = __vector_bitcast<_Tp, _Np>( 1023 _mm512_mask_loadu_pd(__intrin, __kk, __mem)); 1024 else 1025 __assert_unreachable<_Tp>(); 1026 } 1027 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1028 && is_integral_v<_Up>) 1029 { 1030 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 1031 __merge 1032 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 1033 __vector_bitcast<_Tp, _Np>(__maskload_epi64( 1034 reinterpret_cast
(__mem), 1035 __to_intrin(__k)))); 1036 } 1037 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1038 { 1039 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 1040 __merge 1041 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 1042 __vector_bitcast<_Tp, _Np>( 1043 __maskload_pd(reinterpret_cast
(__mem), 1044 __to_intrin(__k)))); 1045 } 1046 else 1047 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1048 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1049 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1050 }); 1051 } 1052 /* Very uncertain, that the following improves anything. Needs 1053 benchmarking 1054 * before it's activated. 1055 else if constexpr (sizeof(_Up) <= 8 && // no long double 1056 !__converts_via_decomposition_v< 1057 _Up, _Tp, 1058 sizeof(__merge)> // conversion via decomposition 1059 // is better handled via the 1060 // bit_iteration fallback below 1061 ) 1062 { 1063 // TODO: copy pattern from _S_masked_store, which doesn't resort to 1064 // fixed_size 1065 using _Ap = simd_abi::deduce_t<_Up, _Np>; 1066 using _ATraits = _SimdTraits<_Up, _Ap>; 1067 using _AImpl = typename _ATraits::_SimdImpl; 1068 typename _ATraits::_SimdMember __uncvted{}; 1069 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template 1070 _S_convert<_Up>(__k); 1071 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem); 1072 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter; 1073 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted)); 1074 } 1075 */ 1076 else 1077 __merge = _Base::_S_masked_load(__merge, __k, __mem); 1078 return __merge; 1079 } 1080 1081 // }}} 1082 // _S_masked_store_nocvt {{{ 1083 template
1084 _GLIBCXX_SIMD_INTRINSIC static void 1085 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper
__k) 1086 { 1087 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1088 if constexpr (sizeof(__vi) == 64) 1089 { 1090 static_assert(sizeof(__v) == 64 && __have_avx512f); 1091 if constexpr (__have_avx512bw && sizeof(_Tp) == 1) 1092 _mm512_mask_storeu_epi8(__mem, __k, __vi); 1093 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2) 1094 _mm512_mask_storeu_epi16(__mem, __k, __vi); 1095 else if constexpr (__have_avx512f && sizeof(_Tp) == 4) 1096 { 1097 if constexpr (is_integral_v<_Tp>) 1098 _mm512_mask_storeu_epi32(__mem, __k, __vi); 1099 else 1100 _mm512_mask_storeu_ps(__mem, __k, __vi); 1101 } 1102 else if constexpr (__have_avx512f && sizeof(_Tp) == 8) 1103 { 1104 if constexpr (is_integral_v<_Tp>) 1105 _mm512_mask_storeu_epi64(__mem, __k, __vi); 1106 else 1107 _mm512_mask_storeu_pd(__mem, __k, __vi); 1108 } 1109 #if 0 // with KNL either sizeof(_Tp) >= 4 or sizeof(_vi) <= 32 1110 // with Skylake-AVX512, __have_avx512bw is true 1111 else if constexpr (__have_sse2) 1112 { 1113 using _M = __vector_type_t<_Tp, _Np>; 1114 using _MVT = _VectorTraits<_M>; 1115 _mm_maskmoveu_si128(__auto_bitcast(__extract<0, 4>(__v._M_data)), 1116 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>(__k._M_data)), 1117 reinterpret_cast
(__mem)); 1118 _mm_maskmoveu_si128(__auto_bitcast(__extract<1, 4>(__v._M_data)), 1119 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>( 1120 __k._M_data >> 1 * _MVT::_S_full_size)), 1121 reinterpret_cast
(__mem) + 1 * 16); 1122 _mm_maskmoveu_si128(__auto_bitcast(__extract<2, 4>(__v._M_data)), 1123 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>( 1124 __k._M_data >> 2 * _MVT::_S_full_size)), 1125 reinterpret_cast
(__mem) + 2 * 16); 1126 if constexpr (_Np > 48 / sizeof(_Tp)) 1127 _mm_maskmoveu_si128( 1128 __auto_bitcast(__extract<3, 4>(__v._M_data)), 1129 __auto_bitcast(_MaskImpl::template _S_convert<_Tp, _Np>( 1130 __k._M_data >> 3 * _MVT::_S_full_size)), 1131 reinterpret_cast
(__mem) + 3 * 16); 1132 } 1133 #endif 1134 else 1135 __assert_unreachable<_Tp>(); 1136 } 1137 else if constexpr (sizeof(__vi) == 32) 1138 { 1139 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1140 _mm256_mask_storeu_epi8(__mem, __k, __vi); 1141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1142 _mm256_mask_storeu_epi16(__mem, __k, __vi); 1143 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) 1144 { 1145 if constexpr (is_integral_v<_Tp>) 1146 _mm256_mask_storeu_epi32(__mem, __k, __vi); 1147 else 1148 _mm256_mask_storeu_ps(__mem, __k, __vi); 1149 } 1150 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) 1151 { 1152 if constexpr (is_integral_v<_Tp>) 1153 _mm256_mask_storeu_epi64(__mem, __k, __vi); 1154 else 1155 _mm256_mask_storeu_pd(__mem, __k, __vi); 1156 } 1157 else if constexpr (__have_avx512f 1158 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 1159 { 1160 // use a 512-bit maskstore, using zero-extension of the bitmask 1161 _S_masked_store_nocvt( 1162 _SimdWrapper64<_Tp>( 1163 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)), 1164 __mem, _SimdWrapper
(__k._M_data)); 1165 } 1166 else 1167 _S_masked_store_nocvt(__v, __mem, 1168 _MaskImpl::template _S_to_maskvector< 1169 __int_for_sizeof_t<_Tp>, _Np>(__k)); 1170 } 1171 else if constexpr (sizeof(__vi) == 16) 1172 { 1173 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1174 _mm_mask_storeu_epi8(__mem, __k, __vi); 1175 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1176 _mm_mask_storeu_epi16(__mem, __k, __vi); 1177 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) 1178 { 1179 if constexpr (is_integral_v<_Tp>) 1180 _mm_mask_storeu_epi32(__mem, __k, __vi); 1181 else 1182 _mm_mask_storeu_ps(__mem, __k, __vi); 1183 } 1184 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) 1185 { 1186 if constexpr (is_integral_v<_Tp>) 1187 _mm_mask_storeu_epi64(__mem, __k, __vi); 1188 else 1189 _mm_mask_storeu_pd(__mem, __k, __vi); 1190 } 1191 else if constexpr (__have_avx512f 1192 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 1193 { 1194 // use a 512-bit maskstore, using zero-extension of the bitmask 1195 _S_masked_store_nocvt( 1196 _SimdWrapper64<_Tp>( 1197 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)), 1198 __mem, _SimdWrapper
(__k._M_data)); 1199 } 1200 else 1201 _S_masked_store_nocvt(__v, __mem, 1202 _MaskImpl::template _S_to_maskvector< 1203 __int_for_sizeof_t<_Tp>, _Np>(__k)); 1204 } 1205 else 1206 __assert_unreachable<_Tp>(); 1207 } 1208 1209 template
1210 _GLIBCXX_SIMD_INTRINSIC static void 1211 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, 1212 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k) 1213 { 1214 if constexpr (sizeof(__v) <= 16) 1215 { 1216 [[maybe_unused]] const auto __vi 1217 = __intrin_bitcast<__m128i>(__as_vector(__v)); 1218 [[maybe_unused]] const auto __ki 1219 = __intrin_bitcast<__m128i>(__as_vector(__k)); 1220 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1221 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi); 1222 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1223 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi); 1224 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 1225 && is_integral_v<_Tp>) 1226 _mm_maskstore_epi32(reinterpret_cast
(__mem), __ki, __vi); 1227 else if constexpr (__have_avx && sizeof(_Tp) == 4) 1228 _mm_maskstore_ps(reinterpret_cast
(__mem), __ki, 1229 __vector_bitcast
(__vi)); 1230 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1231 && is_integral_v<_Tp>) 1232 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi); 1233 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1234 _mm_maskstore_pd(reinterpret_cast
(__mem), __ki, 1235 __vector_bitcast
(__vi)); 1236 else if constexpr (__have_sse2) 1237 _mm_maskmoveu_si128(__vi, __ki, reinterpret_cast
(__mem)); 1238 } 1239 else if constexpr (sizeof(__v) == 32) 1240 { 1241 [[maybe_unused]] const auto __vi 1242 = __intrin_bitcast<__m256i>(__as_vector(__v)); 1243 [[maybe_unused]] const auto __ki 1244 = __intrin_bitcast<__m256i>(__as_vector(__k)); 1245 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1246 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi); 1247 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1248 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi); 1249 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 1250 && is_integral_v<_Tp>) 1251 _mm256_maskstore_epi32(reinterpret_cast
(__mem), __ki, __vi); 1252 else if constexpr (sizeof(_Tp) == 4) 1253 _mm256_maskstore_ps(reinterpret_cast
(__mem), __ki, 1254 __vector_bitcast
(__v)); 1255 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1256 && is_integral_v<_Tp>) 1257 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, 1258 __vi); 1259 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1260 _mm256_maskstore_pd(reinterpret_cast
(__mem), __ki, 1261 __vector_bitcast
(__v)); 1262 else if constexpr (__have_sse2) 1263 { 1264 _mm_maskmoveu_si128(__lo128(__vi), __lo128(__ki), 1265 reinterpret_cast
(__mem)); 1266 _mm_maskmoveu_si128(__hi128(__vi), __hi128(__ki), 1267 reinterpret_cast
(__mem) + 16); 1268 } 1269 } 1270 else 1271 __assert_unreachable<_Tp>(); 1272 } 1273 1274 // }}} 1275 // _S_masked_store {{{ 1276 template
1277 _GLIBCXX_SIMD_INTRINSIC static void 1278 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem, 1279 const _MaskMember<_Tp> __k) noexcept 1280 { 1281 if constexpr (is_integral_v< 1282 _Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up) 1283 && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw) 1284 && (sizeof(__v) == 64 || __have_avx512vl)) 1285 { // truncating store 1286 const auto __vi = __to_intrin(__v); 1287 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 1288 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1289 && sizeof(__vi) == 64) 1290 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1291 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1292 && sizeof(__vi) == 32) 1293 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1294 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1295 && sizeof(__vi) == 16) 1296 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1297 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1298 && sizeof(__vi) == 64) 1299 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1300 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1301 && sizeof(__vi) == 32) 1302 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1303 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1304 && sizeof(__vi) == 16) 1305 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1306 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1307 && sizeof(__vi) == 64) 1308 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1309 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1310 && sizeof(__vi) == 32) 1311 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1312 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1313 && sizeof(__vi) == 16) 1314 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1315 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1316 && sizeof(__vi) == 64) 1317 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1318 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1319 && sizeof(__vi) == 32) 1320 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1321 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1322 && sizeof(__vi) == 16) 1323 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1324 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1325 && sizeof(__vi) == 64) 1326 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1327 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1328 && sizeof(__vi) == 32) 1329 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1330 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1331 && sizeof(__vi) == 16) 1332 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1333 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1334 && sizeof(__vi) == 64) 1335 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1336 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1337 && sizeof(__vi) == 32) 1338 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1339 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1340 && sizeof(__vi) == 16) 1341 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1342 else 1343 __assert_unreachable<_Tp>(); 1344 } 1345 else 1346 _Base::_S_masked_store(__v, __mem, __k); 1347 } 1348 1349 // }}} 1350 // _S_multiplies {{{ 1351 template
> 1352 _GLIBCXX_SIMD_INTRINSIC static constexpr _V 1353 _S_multiplies(_V __x, _V __y) 1354 { 1355 using _Tp = typename _VVT::value_type; 1356 if (__builtin_is_constant_evaluated() || __x._M_is_constprop() 1357 || __y._M_is_constprop()) 1358 return __as_vector(__x) * __as_vector(__y); 1359 else if constexpr (sizeof(_Tp) == 1) 1360 { 1361 if constexpr (sizeof(_V) == 2) 1362 { 1363 const auto __xs = reinterpret_cast
(__x._M_data); 1364 const auto __ys = reinterpret_cast
(__y._M_data); 1365 return reinterpret_cast<__vector_type_t<_Tp, 2>>(short( 1366 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00)))); 1367 } 1368 else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3) 1369 { 1370 const auto __xi = reinterpret_cast
(__x._M_data); 1371 const auto __yi = reinterpret_cast
(__y._M_data); 1372 return reinterpret_cast<__vector_type_t<_Tp, 3>>( 1373 ((__xi * __yi) & 0xff) 1374 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) 1375 | ((__xi >> 16) * (__yi & 0xff0000))); 1376 } 1377 else if constexpr (sizeof(_V) == 4) 1378 { 1379 const auto __xi = reinterpret_cast
(__x._M_data); 1380 const auto __yi = reinterpret_cast
(__y._M_data); 1381 return reinterpret_cast<__vector_type_t<_Tp, 4>>( 1382 ((__xi * __yi) & 0xff) 1383 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) 1384 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000) 1385 | ((__xi >> 24) * (__yi & 0xff000000u))); 1386 } 1387 else if constexpr (sizeof(_V) == 8 && __have_avx2 1388 && is_signed_v<_Tp>) 1389 return __convert
( 1390 __vector_bitcast
(_mm_cvtepi8_epi16(__to_intrin(__x))) 1391 * __vector_bitcast
(_mm_cvtepi8_epi16(__to_intrin(__y)))); 1392 else if constexpr (sizeof(_V) == 8 && __have_avx2 1393 && is_unsigned_v<_Tp>) 1394 return __convert
( 1395 __vector_bitcast
(_mm_cvtepu8_epi16(__to_intrin(__x))) 1396 * __vector_bitcast
(_mm_cvtepu8_epi16(__to_intrin(__y)))); 1397 else 1398 { 1399 // codegen of `x*y` is suboptimal (as of GCC 9.0.1) 1400 constexpr size_t __full_size = _VVT::_S_full_size; 1401 constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8; 1402 using _ShortW = _SimdWrapper
; 1403 const _ShortW __even = __vector_bitcast
(__x) 1404 * __vector_bitcast
(__y); 1405 _ShortW __high_byte = _ShortW()._M_data - 256; 1406 //[&]() { asm("" : "+x"(__high_byte._M_data)); }(); 1407 const _ShortW __odd 1408 = (__vector_bitcast
(__x) >> 8) 1409 * (__vector_bitcast
(__y) & __high_byte._M_data); 1410 if constexpr (__have_avx512bw && sizeof(_V) > 2) 1411 return _CommonImplX86::_S_blend_avx512( 1412 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even), 1413 __vector_bitcast<_Tp>(__odd)); 1414 else if constexpr (__have_sse4_1 && sizeof(_V) > 2) 1415 return _CommonImplX86::_S_blend_intrin(__to_intrin( 1416 __high_byte), 1417 __to_intrin(__even), 1418 __to_intrin(__odd)); 1419 else 1420 return __to_intrin( 1421 __or(__andnot(__high_byte, __even), __odd)); 1422 } 1423 } 1424 else 1425 return _Base::_S_multiplies(__x, __y); 1426 } 1427 1428 // }}} 1429 // _S_divides {{{ 1430 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993 1431 template
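// Illustration, not part of the header: there is no 8-bit SSE/AVX multiply, so
// _S_multiplies multiplies even and odd byte lanes inside 16-bit elements and
// recombines them. Scalar model of one 16-bit element holding two bytes,
// mirroring the sizeof(_V) == 2 branch (mul_two_bytes is a made-up name):
#include <cstdint>

constexpr uint16_t mul_two_bytes(uint16_t x, uint16_t y)
{
  const uint16_t even = uint16_t(x * y) & 0x00ff;          // product of the low bytes
  const uint16_t odd  = uint16_t((x >> 8) * (y & 0xff00)); // product of the high bytes
  return even | odd;
}

static_assert(mul_two_bytes(0x0302, 0x0504) == 0x0f08, "3*5 = 0x0f, 2*4 = 0x08");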
1432 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1433 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1434 { 1435 if (!__builtin_is_constant_evaluated() 1436 && !__builtin_constant_p(__y._M_data)) 1437 if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4) 1438 { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1) 1439 // Note that using floating-point division is likely to raise the 1440 // *Inexact* exception flag and thus appears like an invalid 1441 // "as-if" transformation. However, C++ doesn't specify how the 1442 // fpenv can be observed and points to C. C says that function 1443 // calls are assumed to potentially raise fp exceptions, unless 1444 // documented otherwise. Consequently, operator/, which is a 1445 // function call, may raise fp exceptions. 1446 /*const struct _CsrGuard 1447 { 1448 const unsigned _M_data = _mm_getcsr(); 1449 _CsrGuard() 1450 { 1451 _mm_setcsr(0x9f80); // turn off FP exceptions and 1452 flush-to-zero 1453 } 1454 ~_CsrGuard() { _mm_setcsr(_M_data); } 1455 } __csr;*/ 1456 using _Float = conditional_t
; 1457 constexpr size_t __n_intermediate 1458 = std::min(_Np, (__have_avx512f ? 64 1459 : __have_avx ? 32 1460 : 16) 1461 / sizeof(_Float)); 1462 using _FloatV = __vector_type_t<_Float, __n_intermediate>; 1463 constexpr size_t __n_floatv 1464 = __div_roundup(_Np, __n_intermediate); 1465 using _R = __vector_type_t<_Tp, _Np>; 1466 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x); 1467 const auto __yf = __convert_all<_FloatV, __n_floatv>( 1468 _Abi::__make_padding_nonzero(__as_vector(__y))); 1469 return __call_with_n_evaluations<__n_floatv>( 1470 [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1471 return __vector_convert<_R>(__quotients...); 1472 }, 1473 [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 1474 -> _SimdWrapper<_Float, __n_intermediate> 1475 { 1476 #if __RECIPROCAL_MATH__ 1477 // If -freciprocal-math is active, using the `/` operator is 1478 // incorrect because it may be translated to an imprecise 1479 // multiplication with reciprocal. We need to use inline 1480 // assembly to force a real division. 1481 _FloatV __r; 1482 if constexpr (__have_avx) // -mno-sse2avx is irrelevant 1483 // because once -mavx is given, GCC 1484 // emits VEX encoded vdivp[sd] 1485 { 1486 if constexpr (sizeof(_Tp) == 4) 1487 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}" 1488 : "=x"(__r) 1489 : "x"(__xf[__i]), "x"(__yf[__i])); 1490 else 1491 asm("vdivps\t{%2, %1, %0|%0, %1, %2}" 1492 : "=x"(__r) 1493 : "x"(__xf[__i]), "x"(__yf[__i])); 1494 } 1495 else 1496 { 1497 __r = __xf[__i]; 1498 if constexpr (sizeof(_Tp) == 4) 1499 asm("divpd\t{%1, %0|%0, %1}" 1500 : "=x"(__r) 1501 : "x"(__yf[__i])); 1502 else 1503 asm("divps\t{%1, %0|%0, %1}" 1504 : "=x"(__r) 1505 : "x"(__yf[__i])); 1506 } 1507 return __r; 1508 #else 1509 return __xf[__i] / __yf[__i]; 1510 #endif 1511 }); 1512 } 1513 /* 64-bit int division is potentially optimizable via double division if 1514 * the value in __x is small enough and the conversion between 1515 * int<->double is efficient enough: 1516 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && 1517 sizeof(_Tp) == 8) 1518 { 1519 if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1520 { 1521 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, 1522 0xffe0'0000'0000'0000ull})) 1523 { 1524 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) 1525 } 1526 } 1527 } 1528 */ 1529 return _Base::_S_divides(__x, __y); 1530 } 1531 #else 1532 using _Base::_S_divides; 1533 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 1534 1535 // }}} 1536 // _S_modulus {{{ 1537 template
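// Illustration, not part of the header: the PR90993 workaround above divides
// 8-/16-/32-bit integers by converting them to a floating-point type wide
// enough to represent every such value exactly, dividing, and converting back,
// so the truncated quotient matches integer division. Scalar sketch
// (div_via_double is a made-up name):
#include <cstdint>

constexpr int32_t div_via_double(int32_t x, int32_t y)
{ return static_cast<int32_t>(static_cast<double>(x) / static_cast<double>(y)); }

static_assert(div_via_double(-7, 2) == -7 / 2);               // both truncate toward zero
static_assert(div_via_double(INT32_MAX, 3) == INT32_MAX / 3);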
1538 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1539 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1540 { 1541 if (__builtin_is_constant_evaluated() 1542 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) 1543 return _Base::_S_modulus(__x, __y); 1544 else 1545 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); 1546 } 1547 1548 // }}} 1549 // _S_bit_shift_left {{{ 1550 // Notes on UB. C++2a [expr.shift] says: 1551 // -1- [...] The operands shall be of integral or unscoped enumeration type 1552 // and integral promotions are performed. The type of the result is that 1553 // of the promoted left operand. The behavior is undefined if the right 1554 // operand is negative, or greater than or equal to the width of the 1555 // promoted left operand. 1556 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo 1557 // 2^N, where N is the width of the type of the result. 1558 // 1559 // C++17 [expr.shift] says: 1560 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated 1561 // bits are zero-filled. If E1 has an unsigned type, the value of the 1562 // result is E1 × 2^E2 , reduced modulo one more than the maximum value 1563 // representable in the result type. Otherwise, if E1 has a signed type 1564 // and non-negative value, and E1 × 2^E2 is representable in the 1565 // corresponding unsigned type of the result type, then that value, 1566 // converted to the result type, is the resulting value; otherwise, the 1567 // behavior is undefined. 1568 // 1569 // Consequences: 1570 // With C++2a signed and unsigned types have the same UB 1571 // characteristics: 1572 // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) 1573 // 1574 // With C++17 there's little room for optimizations because the standard 1575 // requires all shifts to happen on promoted integrals (i.e. int). Thus, 1576 // short and char shifts must assume shifts affect bits of neighboring 1577 // values. 1578 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1579 template
> 1580 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1581 _S_bit_shift_left(_Tp __xx, int __y) 1582 { 1583 using _V = typename _TVT::type; 1584 using _Up = typename _TVT::value_type; 1585 _V __x = __xx; 1586 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1587 if (__builtin_is_constant_evaluated()) 1588 return __x << __y; 1589 #if __cplusplus > 201703 1590 // after C++17, signed shifts have no UB, and behave just like unsigned 1591 // shifts 1592 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1593 return __vector_bitcast<_Up>( 1594 _S_bit_shift_left(__vector_bitcast
>(__x), 1595 __y)); 1596 #endif 1597 else if constexpr (sizeof(_Up) == 1) 1598 { 1599 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1600 if (__builtin_constant_p(__y)) 1601 { 1602 if (__y == 0) 1603 return __x; 1604 else if (__y == 1) 1605 return __x + __x; 1606 else if (__y == 2) 1607 { 1608 __x = __x + __x; 1609 return __x + __x; 1610 } 1611 else if (__y > 2 && __y < 8) 1612 { 1613 if constexpr (sizeof(__x) > sizeof(unsigned)) 1614 { 1615 const _UChar __mask = 0xff << __y; // precomputed vector 1616 return __vector_bitcast<_Up>( 1617 __vector_bitcast<_UChar>( 1618 __vector_bitcast
(__x) << __y) 1619 & __mask); 1620 } 1621 else 1622 { 1623 const unsigned __mask 1624 = (0xff & (0xff << __y)) * 0x01010101u; 1625 return reinterpret_cast<_V>( 1626 static_cast<__int_for_sizeof_t<_V>>( 1627 unsigned( 1628 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1629 << __y) 1630 & __mask)); 1631 } 1632 } 1633 else if (__y >= 8 && __y < 32) 1634 return _V(); 1635 else 1636 __builtin_unreachable(); 1637 } 1638 // general strategy in the following: use an sllv instead of sll 1639 // instruction, because it's 2 to 4 times faster: 1640 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1641 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1642 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1643 _mm256_set1_epi16(__y)))); 1644 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1645 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1646 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1647 _mm512_set1_epi16(__y)))); 1648 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1649 { 1650 const auto __shift = _mm512_set1_epi16(__y); 1651 return __vector_bitcast<_Up>( 1652 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1653 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1654 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1655 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1656 } 1657 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1658 { 1659 #if 1 1660 const auto __shift = _mm_cvtsi32_si128(__y); 1661 auto __k 1662 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1663 __k |= _mm256_srli_epi16(__k, 8); 1664 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1665 & __k); 1666 #else 1667 const _Up __k = 0xff << __y; 1668 return __vector_bitcast<_Up>(__vector_bitcast
(__x) << __y) 1669 & __k; 1670 #endif 1671 } 1672 else 1673 { 1674 const auto __shift = _mm_cvtsi32_si128(__y); 1675 auto __k 1676 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1677 __k |= _mm_srli_epi16(__k, 8); 1678 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1679 } 1680 } 1681 return __x << __y; 1682 } 1683 1684 template
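Aside: x86 has no 8-bit vector shift, so the constant-shift path above shifts the data as 16-bit lanes and masks off the bits that crossed into the neighbouring byte. A sketch of that idiom with GCC vector extensions (demo names only, not from the header):

typedef unsigned char  v16u8_demo __attribute__((vector_size(16)));
typedef unsigned short v8u16_demo __attribute__((vector_size(16)));

static inline v16u8_demo demo_shl_bytes(v16u8_demo x, int y) // assumes 0 <= y < 8
{
  const unsigned char keep = static_cast<unsigned char>(0xff << y);
  // shift as 16-bit lanes, then drop the bits that leaked in from the neighbouring byte
  return reinterpret_cast<v16u8_demo>(reinterpret_cast<v8u16_demo>(x) << y) & keep;
}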
> 1685 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1686 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) 1687 { 1688 using _V = typename _TVT::type; 1689 using _Up = typename _TVT::value_type; 1690 _V __x = __xx; 1691 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1692 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1693 if (__builtin_is_constant_evaluated()) 1694 return __x << __y; 1695 #if __cplusplus > 201703 1696 // after C++17, signed shifts have no UB, and behave just like unsigned 1697 // shifts 1698 else if constexpr (is_signed_v<_Up>) 1699 return __vector_bitcast<_Up>( 1700 _S_bit_shift_left(__vector_bitcast
<make_unsigned_t<_Up>>(__x), 1701 __vector_bitcast
>(__y))); 1702 #endif 1703 else if constexpr (sizeof(_Up) == 1) 1704 { 1705 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1706 return __vector_bitcast<_Up>(__concat( 1707 _mm512_cvtepi16_epi8( 1708 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1709 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1710 _mm512_cvtepi16_epi8( 1711 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1712 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1713 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1714 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1715 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1716 _mm512_cvtepu8_epi16(__iy)))); 1717 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1718 return __intrin_bitcast<_V>( 1719 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1720 _mm_cvtepu8_epi16(__iy)))); 1721 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1722 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1723 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1724 _mm256_cvtepu8_epi16(__iy)))); 1725 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1726 return __intrin_bitcast<_V>( 1727 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1728 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1729 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1730 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1731 { 1732 auto __mask 1733 = __vector_bitcast<_Up>(__vector_bitcast
<_UShort>(__y) << 5); 1734 auto __x4 1735 = __vector_bitcast<_Up>(__vector_bitcast
<_UShort>(__x) << 4); 1736 __x4 &= char(0xf0); 1737 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1738 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1739 __mask += __mask; 1740 auto __x2 1741 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 2); 1742 __x2 &= char(0xfc); 1743 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1744 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1745 __mask += __mask; 1746 auto __x1 = __x + __x; 1747 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1748 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1749 return __x 1750 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1751 } 1752 else if constexpr (sizeof(__x) == 16) 1753 { 1754 auto __mask 1755 = __vector_bitcast<_UChar>(__vector_bitcast
<_UShort>(__y) << 5); 1756 auto __x4 1757 = __vector_bitcast<_Up>(__vector_bitcast
<_UShort>(__x) << 4); 1758 __x4 &= char(0xf0); 1759 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1760 __mask += __mask; 1761 auto __x2 1762 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 2); 1763 __x2 &= char(0xfc); 1764 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1765 __mask += __mask; 1766 auto __x1 = __x + __x; 1767 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x; 1768 return __x 1769 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1770 } 1771 else 1772 return __x << __y; 1773 } 1774 else if constexpr (sizeof(_Up) == 2) 1775 { 1776 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1777 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1778 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1779 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1780 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1781 return __vector_bitcast<_Up>( 1782 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1783 _mm512_castsi256_si512(__iy)))); 1784 else if constexpr (sizeof __ix == 32 && __have_avx2) 1785 { 1786 const auto __ux = __vector_bitcast
<unsigned>(__x); 1787 const auto __uy = __vector_bitcast
(__y); 1788 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1789 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1790 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1791 } 1792 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1793 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1794 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1795 return __intrin_bitcast<_V>( 1796 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1797 _mm512_castsi128_si512(__iy)))); 1798 else if constexpr (sizeof __ix == 16 && __have_avx2) 1799 { 1800 const auto __ux = __vector_bitcast
<unsigned>(__ix); 1801 const auto __uy = __vector_bitcast
(__iy); 1802 return __intrin_bitcast<_V>(_mm_blend_epi16( 1803 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1804 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1805 } 1806 else if constexpr (sizeof __ix == 16) 1807 { 1808 using _Float4 = __vector_type_t
<float, 4>; 1809 using _Int4 = __vector_type_t
<int, 4>; 1810 using _UInt4 = __vector_type_t
<unsigned, 4>; 1811 const _UInt4 __yu 1812 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3))); 1813 return __x 1814 * __intrin_bitcast<_V>( 1815 __vector_convert<_Int4>(_SimdWrapper
<float, 4>( 1816 reinterpret_cast<_Float4>(__yu << 23))) 1817 | (__vector_convert<_Int4>(_SimdWrapper
<float, 4>( 1818 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1819 << 16)); 1820 } 1821 else 1822 __assert_unreachable<_Tp>(); 1823 } 1824 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1825 && !__have_avx2) 1826 // latency is suboptimal, but throughput is at full speedup 1827 return __intrin_bitcast<_V>( 1828 __vector_bitcast
<unsigned>(__ix) 1829 * __vector_convert<__vector_type16_t
<int>>( 1830 _SimdWrapper
<float, 4>(__vector_bitcast
<float>( 1831 (__vector_bitcast
(__y) << 23) + 0x3f80'0000)))); 1832 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1833 && !__have_avx2) 1834 { 1835 const auto __lo = _mm_sll_epi64(__ix, __iy); 1836 const auto __hi 1837 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1838 if constexpr (__have_sse4_1) 1839 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1840 else 1841 return __vector_bitcast<_Up>( 1842 _mm_move_sd(__vector_bitcast
<double>(__hi), 1843 __vector_bitcast
(__lo))); 1844 } 1845 else 1846 return __x << __y; 1847 } 1848 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1849 1850 // }}} 1851 // _S_bit_shift_right {{{ 1852 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1853 template
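Aside: without AVX2, the 16- and 32-bit variable-shift paths above synthesize 2^y by writing y into the exponent field of an IEEE-754 float and then multiplying, since x << y == x * 2^y. A scalar sketch of the trick (illustrative only):

static inline unsigned demo_shl_via_float(unsigned x, unsigned y) // assumes y < 32
{
  const unsigned bits = (y + 127u) << 23;   // biased exponent, zero mantissa -> the float 2^y
  float p;
  __builtin_memcpy(&p, &bits, sizeof(p));
  return x * static_cast<unsigned>(p);      // wraps modulo 2^32, exactly like x << y
}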
> 1854 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1855 _S_bit_shift_right(_Tp __xx, int __y) 1856 { 1857 using _V = typename _TVT::type; 1858 using _Up = typename _TVT::value_type; 1859 _V __x = __xx; 1860 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1861 if (__builtin_is_constant_evaluated()) 1862 return __x >> __y; 1863 else if (__builtin_constant_p(__y) 1864 && is_unsigned_v< 1865 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1866 return _V(); 1867 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1868 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1869 & _Up(0xff >> __y); 1870 //}}} 1871 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1872 return __intrin_bitcast<_V>( 1873 (__vector_bitcast<_UShort>(__vector_bitcast
(__ix) 1874 >> (__y + 8)) 1875 << 8) 1876 | (__vector_bitcast<_UShort>( 1877 __vector_bitcast
(__vector_bitcast<_UShort>(__ix) << 8) 1878 >> __y) 1879 >> 8)); 1880 //}}} 1881 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1882 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1883 { 1884 if (__y > 32) 1885 return (__intrin_bitcast<_V>(__vector_bitcast
(__ix) >> 32) 1886 & _Up(0xffff'ffff'0000'0000ull)) 1887 | __vector_bitcast<_Up>( 1888 __vector_bitcast
(__vector_bitcast<_ULLong>(__ix) 1889 >> 32) 1890 >> (__y - 32)); 1891 else 1892 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1893 >> __y) 1894 | __vector_bitcast<_Up>( 1895 __vector_bitcast
(__ix & -0x8000'0000'0000'0000ll) 1896 >> __y); 1897 } 1898 //}}} 1899 else 1900 return __x >> __y; 1901 } 1902 1903 template
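Aside: SSE2/AVX2 provide no 64-bit arithmetic right shift (vpsraq is AVX-512 only), so the sizeof(_Up) == 8 branch above combines a logical shift with separately shifted-in sign bits, roughly the scalar decomposition below (illustrative only):

static inline long long demo_sra64(long long x, int y) // assumes 0 < y < 64
{
  const unsigned long long u    = static_cast<unsigned long long>(x);
  const unsigned long long sign = x < 0 ? ~0ull << (64 - y) : 0; // bits shifted in from the sign
  return static_cast<long long>((u >> y) | sign);
}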
> 1904 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1905 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1906 { 1907 using _V = typename _TVT::type; 1908 using _Up = typename _TVT::value_type; 1909 _V __x = __xx; 1910 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1911 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1912 if (__builtin_is_constant_evaluated() 1913 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1914 return __x >> __y; 1915 else if constexpr (sizeof(_Up) == 1) //{{{ 1916 { 1917 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1918 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1919 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1920 _mm_cvtepi8_epi16(__iy)) 1921 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1922 _mm_cvtepu8_epi16(__iy)))); 1923 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1924 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1925 is_signed_v<_Up> 1926 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1927 _mm256_cvtepi8_epi16(__iy)) 1928 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1929 _mm256_cvtepu8_epi16(__iy)))); 1930 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1931 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1932 is_signed_v<_Up> 1933 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1934 _mm512_cvtepi8_epi16(__iy)) 1935 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1936 _mm512_cvtepu8_epi16(__iy)))); 1937 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1938 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1939 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1940 0x5555'5555'5555'5555ull, 1941 _mm512_srav_epi16( 1942 _mm512_slli_epi16(__ix, 8), 1943 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1944 _mm512_set1_epi16(8))))); 1945 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1946 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1947 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1948 0x5555'5555'5555'5555ull, 1949 _mm512_srlv_epi16( 1950 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1951 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1952 /* This has better throughput but higher latency than the impl below 1953 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1954 is_unsigned_v<_Up>) 1955 { 1956 const auto __shorts = __to_intrin(_S_bit_shift_right( 1957 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1958 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1959 return __vector_bitcast<_Up>( 1960 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1961 } 1962 */ 1963 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1964 // the following uses vpsr[al]vd, which requires AVX2 1965 if constexpr (is_signed_v<_Up>) 1966 { 1967 const auto r3 = __vector_bitcast<_UInt>( 1968 (__vector_bitcast
(__x) 1969 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1970 & 0xff000000u; 1971 const auto r2 1972 = __vector_bitcast<_UInt>( 1973 ((__vector_bitcast
(__x) << 8) 1974 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1975 & 0xff000000u; 1976 const auto r1 1977 = __vector_bitcast<_UInt>( 1978 ((__vector_bitcast
(__x) << 16) 1979 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1980 & 0xff000000u; 1981 const auto r0 = __vector_bitcast<_UInt>( 1982 (__vector_bitcast
(__x) << 24) 1983 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1984 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1985 | (r0 >> 24)); 1986 } 1987 else 1988 { 1989 const auto r3 = (__vector_bitcast<_UInt>(__x) 1990 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1991 & 0xff000000u; 1992 const auto r2 1993 = ((__vector_bitcast<_UInt>(__x) << 8) 1994 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1995 & 0xff000000u; 1996 const auto r1 1997 = ((__vector_bitcast<_UInt>(__x) << 16) 1998 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 1999 & 0xff000000u; 2000 const auto r0 2001 = (__vector_bitcast<_UInt>(__x) << 24) 2002 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 2003 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 2004 | (r0 >> 24)); 2005 } 2006 else if constexpr (__have_sse4_1 2007 && is_unsigned_v<_Up> && sizeof(__x) > 2) 2008 { 2009 auto __x128 = __vector_bitcast<_Up>(__ix); 2010 auto __mask 2011 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 2012 auto __x4 = __vector_bitcast<_Up>( 2013 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 2014 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2015 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 2016 __mask += __mask; 2017 auto __x2 = __vector_bitcast<_Up>( 2018 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 2019 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2020 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 2021 __mask += __mask; 2022 auto __x1 = __vector_bitcast<_Up>( 2023 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 2024 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 2025 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 2026 return __intrin_bitcast<_V>( 2027 __x128 2028 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2029 == 0)); // y > 7 nulls the result 2030 } 2031 else if constexpr (__have_sse4_1 2032 && is_signed_v<_Up> && sizeof(__x) > 2) 2033 { 2034 auto __mask = __vector_bitcast<_UChar>( 2035 __vector_bitcast<_UShort>(__iy) << 5); 2036 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2037 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 2038 }; 2039 auto __xh = __vector_bitcast
<short>(__ix); 2040 auto __xl = __vector_bitcast
<short>(__ix) << 8; 2041 auto __xh4 = __xh >> 4; 2042 auto __xl4 = __xl >> 4; 2043 __xh = __vector_bitcast
<short>(_CommonImplX86::_S_blend_intrin( 2044 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 2045 __xl = __vector_bitcast
<short>( 2046 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2047 __to_intrin(__xl4))); 2048 __mask += __mask; 2049 auto __xh2 = __xh >> 2; 2050 auto __xl2 = __xl >> 2; 2051 __xh = __vector_bitcast
<short>(_CommonImplX86::_S_blend_intrin( 2052 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2053 __xl = __vector_bitcast
<short>( 2054 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2055 __to_intrin(__xl2))); 2056 __mask += __mask; 2057 auto __xh1 = __xh >> 1; 2058 auto __xl1 = __xl >> 1; 2059 __xh = __vector_bitcast
<short>(_CommonImplX86::_S_blend_intrin( 2060 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2061 __xl = __vector_bitcast
( 2062 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2063 __to_intrin(__xl1))); 2064 return __intrin_bitcast<_V>( 2065 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2066 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2067 >> 8)) 2068 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2069 == 0)); // y > 7 nulls the result 2070 } 2071 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2072 { 2073 auto __mask 2074 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2075 auto __x4 = __vector_bitcast<_Up>( 2076 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2077 __x = __mask > 0x7f ? __x4 : __x; 2078 __mask += __mask; 2079 auto __x2 = __vector_bitcast<_Up>( 2080 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2081 __x = __mask > 0x7f ? __x2 : __x; 2082 __mask += __mask; 2083 auto __x1 = __vector_bitcast<_Up>( 2084 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2085 __x = __mask > 0x7f ? __x1 : __x; 2086 return __x 2087 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2088 } 2089 else if constexpr (sizeof(__x) > 2) // signed SSE2 2090 { 2091 static_assert(is_signed_v<_Up>); 2092 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2093 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2094 auto __xh = __vector_bitcast
<short>(__x); 2095 auto __xl = __vector_bitcast
(__x) << 8; 2096 auto __xh4 = __xh >> 4; 2097 auto __xl4 = __xl >> 4; 2098 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2099 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2100 __maskh += __maskh; 2101 __maskl += __maskl; 2102 auto __xh2 = __xh >> 2; 2103 auto __xl2 = __xl >> 2; 2104 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2105 __xl = __maskl > 0x7fff ? __xl2 : __xl; 2106 __maskh += __maskh; 2107 __maskl += __maskl; 2108 auto __xh1 = __xh >> 1; 2109 auto __xl1 = __xl >> 1; 2110 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2111 __xl = __maskl > 0x7fff ? __xl1 : __xl; 2112 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2113 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2114 >> 8); 2115 return __x 2116 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2117 } 2118 else 2119 return __x >> __y; 2120 } //}}} 2121 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2122 { 2123 [[maybe_unused]] auto __blend_0xaa 2124 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2125 if constexpr (sizeof(__a) == 16) 2126 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2127 0xaa); 2128 else if constexpr (sizeof(__a) == 32) 2129 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2130 0xaa); 2131 else if constexpr (sizeof(__a) == 64) 2132 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2133 __to_intrin(__b)); 2134 else 2135 __assert_unreachable
(); 2136 }; 2137 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2138 return __intrin_bitcast<_V>(is_signed_v<_Up> 2139 ? _mm_srav_epi16(__ix, __iy) 2140 : _mm_srlv_epi16(__ix, __iy)); 2141 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2142 return __vector_bitcast<_Up>(is_signed_v<_Up> 2143 ? _mm256_srav_epi16(__ix, __iy) 2144 : _mm256_srlv_epi16(__ix, __iy)); 2145 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2146 return __vector_bitcast<_Up>(is_signed_v<_Up> 2147 ? _mm512_srav_epi16(__ix, __iy) 2148 : _mm512_srlv_epi16(__ix, __iy)); 2149 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2150 return __intrin_bitcast<_V>( 2151 __blend_0xaa(((__vector_bitcast
<int>(__ix) << 16) 2152 >> (__vector_bitcast
<int>(__iy) & 0xffffu)) 2153 >> 16, 2154 __vector_bitcast
<int>(__ix) 2155 >> (__vector_bitcast
(__iy) >> 16))); 2156 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2157 return __intrin_bitcast<_V>( 2158 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2159 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2160 __vector_bitcast<_UInt>(__ix) 2161 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2162 else if constexpr (__have_sse4_1) 2163 { 2164 auto __mask = __vector_bitcast<_UShort>(__iy); 2165 auto __x128 = __vector_bitcast<_Up>(__ix); 2166 //__mask *= 0x0808; 2167 __mask = (__mask << 3) | (__mask << 11); 2168 // do __x128 = 0 where __y[4] is set 2169 __x128 = __vector_bitcast<_Up>( 2170 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2171 __to_intrin(__mask))); 2172 // do __x128 =>> 8 where __y[3] is set 2173 __x128 = __vector_bitcast<_Up>( 2174 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2175 __to_intrin(__mask += __mask))); 2176 // do __x128 =>> 4 where __y[2] is set 2177 __x128 = __vector_bitcast<_Up>( 2178 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2179 __to_intrin(__mask += __mask))); 2180 // do __x128 =>> 2 where __y[1] is set 2181 __x128 = __vector_bitcast<_Up>( 2182 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2183 __to_intrin(__mask += __mask))); 2184 // do __x128 =>> 1 where __y[0] is set 2185 return __intrin_bitcast<_V>( 2186 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2187 __to_intrin(__mask + __mask))); 2188 } 2189 else 2190 { 2191 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2192 auto __x128 = __vector_bitcast<_Up>(__ix); 2193 auto __mask 2194 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2195 return __vector_bitcast
(__kk) < 0; 2196 }; 2197 // do __x128 = 0 where __y[4] is set 2198 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2199 // do __x128 =>> 8 where __y[3] is set 2200 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2201 // do __x128 =>> 4 where __y[2] is set 2202 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128; 2203 // do __x128 =>> 2 where __y[1] is set 2204 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2205 // do __x128 =>> 1 where __y[0] is set 2206 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2207 : __x128); 2208 } 2209 } //}}} 2210 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2211 { 2212 if constexpr (is_unsigned_v<_Up>) 2213 { 2214 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2215 const __m128 __factor_f = reinterpret_cast<__m128>( 2216 0x4f00'0000u - (__vector_bitcast
<unsigned, 4>(__y) << 23)); 2217 const __m128i __factor 2218 = __builtin_constant_p(__factor_f) 2219 ? __to_intrin( 2220 __make_vector
(__factor_f[0], __factor_f[1], 2221 __factor_f[2], __factor_f[3])) 2222 : _mm_cvttps_epi32(__factor_f); 2223 const auto __r02 2224 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2225 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2226 _mm_srli_si128(__factor, 4)); 2227 if constexpr (__have_sse4_1) 2228 return __intrin_bitcast<_V>( 2229 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2230 else 2231 return __intrin_bitcast<_V>( 2232 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2233 } 2234 else 2235 { 2236 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2237 if constexpr (is_signed_v<_Up>) 2238 return _mm_sra_epi32(__a, __b); 2239 else 2240 return _mm_srl_epi32(__a, __b); 2241 }; 2242 const auto __r0 2243 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2244 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2245 const auto __r2 2246 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2247 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2248 if constexpr (__have_sse4_1) 2249 return __intrin_bitcast<_V>( 2250 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2251 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2252 else 2253 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2254 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2255 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2256 } 2257 } //}}} 2258 else 2259 return __x >> __y; 2260 } 2261 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2262 2263 // }}} 2264 // compares {{{ 2265 // _S_equal_to {{{ 2266 template
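Aside: the pre-AVX2 path for unsigned 32-bit right shifts above uses the identity stated in its comment, x >> y == (x * 2^(31-y)) >> 31, evaluated with the widening 32x32->64-bit multiply (pmuludq). The same computation in scalar form (illustrative only):

static inline unsigned demo_shr_via_mul(unsigned x, unsigned y) // assumes y < 32
{
  const unsigned long long factor = 1ull << (31 - y); // 2^(31-y)
  return static_cast<unsigned>((static_cast<unsigned long long>(x) * factor) >> 31);
}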
2267 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2268 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2269 { 2270 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2271 { 2272 if (__builtin_is_constant_evaluated() 2273 || (__x._M_is_constprop() && __y._M_is_constprop())) 2274 return _MaskImpl::_S_to_bits( 2275 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2276 2277 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2278 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2279 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2280 if constexpr (is_floating_point_v<_Tp>) 2281 { 2282 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2283 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2284 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2285 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2286 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2287 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2288 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2289 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2290 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2291 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2292 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2293 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2294 else 2295 __assert_unreachable<_Tp>(); 2296 } 2297 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2298 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2299 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2300 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2301 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2302 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2303 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2304 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2305 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2306 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2307 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2308 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2309 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2310 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2311 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2312 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2313 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2314 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2315 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2316 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2317 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2318 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2319 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2320 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2321 else 2322 __assert_unreachable<_Tp>(); 2323 } // }}} 2324 else if (__builtin_is_constant_evaluated()) 2325 return _Base::_S_equal_to(__x, __y); 2326 else if constexpr (sizeof(__x) == 8) 2327 { 2328 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2329 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2330 _MaskMember<_Tp> __r64{}; 2331 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2332 return __r64; 2333 } 2334 else 2335 return _Base::_S_equal_to(__x, __y); 2336 } 2337 2338 // }}} 2339 // _S_not_equal_to {{{ 2340 template
2341 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2342 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2343 { 2344 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2345 { 2346 if (__builtin_is_constant_evaluated() 2347 || (__x._M_is_constprop() && __y._M_is_constprop())) 2348 return _MaskImpl::_S_to_bits( 2349 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2350 2351 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2352 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2353 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2354 if constexpr (is_floating_point_v<_Tp>) 2355 { 2356 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2357 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2358 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2359 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2360 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2361 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2362 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2363 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2364 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2365 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2366 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2367 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2368 else 2369 __assert_unreachable<_Tp>(); 2370 } 2371 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2372 return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2373 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2374 return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2375 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2376 return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2377 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2378 return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2379 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2380 return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2381 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2382 return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2383 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2384 return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2385 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2386 return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2387 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2388 return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2389 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2390 return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2391 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2392 return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2393 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2394 return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2395 else 2396 __assert_unreachable<_Tp>(); 2397 } // }}} 2398 else if (__builtin_is_constant_evaluated()) 2399 return _Base::_S_not_equal_to(__x, __y); 2400 else if constexpr (sizeof(__x) == 8) 2401 { 2402 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2403 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2404 _MaskMember<_Tp> __r64{}; 2405 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2406 return __r64; 2407 } 2408 else 2409 return _Base::_S_not_equal_to(__x, __y); 2410 } 2411 2412 // }}} 2413 // _S_less {{{ 2414 template
2415 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2416 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2417 { 2418 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2419 { 2420 if (__builtin_is_constant_evaluated() 2421 || (__x._M_is_constprop() && __y._M_is_constprop())) 2422 return _MaskImpl::_S_to_bits( 2423 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2424 2425 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2426 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2427 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2428 if constexpr (sizeof(__xi) == 64) 2429 { 2430 if constexpr (is_same_v<_Tp, float>) 2431 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2432 else if constexpr (is_same_v<_Tp, double>) 2433 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2434 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2435 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2436 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2437 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2438 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2439 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2440 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2441 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2442 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2443 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2444 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2445 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2446 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2447 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2448 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2449 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2450 else 2451 __assert_unreachable<_Tp>(); 2452 } 2453 else if constexpr (sizeof(__xi) == 32) 2454 { 2455 if constexpr (is_same_v<_Tp, float>) 2456 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2457 else if constexpr (is_same_v<_Tp, double>) 2458 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2459 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2460 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2461 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2462 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2463 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2464 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2465 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2466 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2467 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2468 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2469 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2470 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2471 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2472 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2473 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2474 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2475 else 2476 __assert_unreachable<_Tp>(); 2477 } 2478 else if constexpr (sizeof(__xi) == 16) 2479 { 2480 if constexpr (is_same_v<_Tp, float>) 2481 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2482 else if constexpr (is_same_v<_Tp, double>) 2483 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2484 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2485 return 
_mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2486 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2487 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2488 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2489 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2490 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2491 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2492 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2493 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2494 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2495 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2496 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2497 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2498 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2499 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2500 else 2501 __assert_unreachable<_Tp>(); 2502 } 2503 else 2504 __assert_unreachable<_Tp>(); 2505 } // }}} 2506 else if (__builtin_is_constant_evaluated()) 2507 return _Base::_S_less(__x, __y); 2508 else if constexpr (sizeof(__x) == 8) 2509 { 2510 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2511 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2512 _MaskMember<_Tp> __r64{}; 2513 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2514 return __r64; 2515 } 2516 else 2517 return _Base::_S_less(__x, __y); 2518 } 2519 2520 // }}} 2521 // _S_less_equal {{{ 2522 template
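Aside: every AVX-512 branch in these comparison functions follows the same pattern: the compare writes a bitmask (__mmask8/16/32/64) and the implicit ABI mask __k1 zeroes lanes beyond the simd's size. A reduced, stand-alone example of that pattern (assumes -mavx512f; demo name, not taken from the header):

#include <immintrin.h>

static inline __mmask16 demo_less_ps(__m512 a, __m512 b, __mmask16 k1)
{ return _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_LT_OS); } // bit i set iff k1[i] && a[i] < b[i]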
2523 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2524 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2525 { 2526 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2527 { 2528 if (__builtin_is_constant_evaluated() 2529 || (__x._M_is_constprop() && __y._M_is_constprop())) 2530 return _MaskImpl::_S_to_bits( 2531 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2532 2533 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2534 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2535 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2536 if constexpr (sizeof(__xi) == 64) 2537 { 2538 if constexpr (is_same_v<_Tp, float>) 2539 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2540 else if constexpr (is_same_v<_Tp, double>) 2541 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2542 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2543 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2544 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2545 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2546 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2547 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2548 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2549 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2550 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2551 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2552 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2553 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2554 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2555 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2556 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2557 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2558 else 2559 __assert_unreachable<_Tp>(); 2560 } 2561 else if constexpr (sizeof(__xi) == 32) 2562 { 2563 if constexpr (is_same_v<_Tp, float>) 2564 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2565 else if constexpr (is_same_v<_Tp, double>) 2566 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2567 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2568 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2569 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2570 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2571 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2572 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2573 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2574 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2575 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2576 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2577 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2578 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2579 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2580 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2581 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2582 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2583 else 2584 __assert_unreachable<_Tp>(); 2585 } 2586 else if constexpr (sizeof(__xi) == 16) 2587 { 2588 if constexpr (is_same_v<_Tp, float>) 2589 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2590 else if constexpr (is_same_v<_Tp, double>) 2591 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2592 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2593 return 
_mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2594 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2595 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2596 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2597 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2598 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2599 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2600 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2601 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2602 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2603 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2604 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2605 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); 2606 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2607 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); 2608 else 2609 __assert_unreachable<_Tp>(); 2610 } 2611 else 2612 __assert_unreachable<_Tp>(); 2613 } // }}} 2614 else if (__builtin_is_constant_evaluated()) 2615 return _Base::_S_less_equal(__x, __y); 2616 else if constexpr (sizeof(__x) == 8) 2617 { 2618 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2619 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2620 _MaskMember<_Tp> __r64{}; 2621 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2622 return __r64; 2623 } 2624 else 2625 return _Base::_S_less_equal(__x, __y); 2626 } 2627 2628 // }}} }}} 2629 // negation {{{ 2630 template
2631 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2632 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 2633 { 2634 if constexpr (__is_avx512_abi<_Abi>()) 2635 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); 2636 else 2637 return _Base::_S_negate(__x); 2638 } 2639 2640 // }}} 2641 // math {{{ 2642 using _Base::_S_abs; 2643 2644 // _S_sqrt {{{ 2645 template
2646 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2647 _S_sqrt(_SimdWrapper<_Tp, _Np> __x) 2648 { 2649 if constexpr (__is_sse_ps<_Tp, _Np>()) 2650 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); 2651 else if constexpr (__is_sse_pd<_Tp, _Np>()) 2652 return _mm_sqrt_pd(__x); 2653 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2654 return _mm256_sqrt_ps(__x); 2655 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2656 return _mm256_sqrt_pd(__x); 2657 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 2658 return _mm512_sqrt_ps(__x); 2659 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2660 return _mm512_sqrt_pd(__x); 2661 else 2662 __assert_unreachable<_Tp>(); 2663 } 2664 2665 // }}} 2666 // _S_ldexp {{{ 2667 template
<typename _Tp, size_t _Np> 2668 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2669 _S_ldexp(_SimdWrapper<_Tp, _Np> __x, 2670 __fixed_size_storage_t
<int, _Np> __exp) 2671 { 2672 if constexpr (__is_avx512_abi<_Abi>()) 2673 { 2674 const auto __xi = __to_intrin(__x); 2675 constexpr _SimdConverter
, _Tp, _Abi> 2676 __cvt; 2677 const auto __expi = __to_intrin(__cvt(__exp)); 2678 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2679 if constexpr (sizeof(__xi) == 16) 2680 { 2681 if constexpr (sizeof(_Tp) == 8) 2682 return _mm_maskz_scalef_pd(__k1, __xi, __expi); 2683 else 2684 return _mm_maskz_scalef_ps(__k1, __xi, __expi); 2685 } 2686 else if constexpr (sizeof(__xi) == 32) 2687 { 2688 if constexpr (sizeof(_Tp) == 8) 2689 return _mm256_maskz_scalef_pd(__k1, __xi, __expi); 2690 else 2691 return _mm256_maskz_scalef_ps(__k1, __xi, __expi); 2692 } 2693 else 2694 { 2695 static_assert(sizeof(__xi) == 64); 2696 if constexpr (sizeof(_Tp) == 8) 2697 return _mm512_maskz_scalef_pd(__k1, __xi, __expi); 2698 else 2699 return _mm512_maskz_scalef_ps(__k1, __xi, __expi); 2700 } 2701 } 2702 else 2703 return _Base::_S_ldexp(__x, __exp); 2704 } 2705 2706 // }}} 2707 // _S_trunc {{{ 2708 template
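Aside: on AVX-512, _S_ldexp above maps ldexp(x, e) == x * 2^e directly onto vscalefps/vscalefpd; the integer exponents are converted to the floating-point type first because scalef takes both operands in that format. Reduced example (assumes -mavx512f; demo name only):

#include <immintrin.h>

static inline __m512 demo_ldexp_ps(__m512 x, __m512i e)
{ return _mm512_scalef_ps(x, _mm512_cvtepi32_ps(e)); } // x[i] * 2^e[i]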
2709 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2710 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2711 { 2712 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2713 return _mm512_roundscale_ps(__x, 0x0b); 2714 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2715 return _mm512_roundscale_pd(__x, 0x0b); 2716 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2717 return _mm256_round_ps(__x, 0x3); 2718 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2719 return _mm256_round_pd(__x, 0x3); 2720 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2721 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x3)); 2722 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2723 return _mm_round_pd(__x, 0x3); 2724 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2725 { 2726 auto __truncated 2727 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))); 2728 const auto __no_fractional_values 2729 = __vector_bitcast
<int>(__vector_bitcast<_UInt>(__to_intrin(__x)) 2730 & 0x7f800000u) 2731 < 0x4b000000; // the exponent is so large that no mantissa bits 2732 // signify fractional values (0x3f8 + 23*8 = 2733 // 0x4b0) 2734 return __no_fractional_values ? __truncated : __to_intrin(__x); 2735 } 2736 else 2737 return _Base::_S_trunc(__x); 2738 } 2739 2740 // }}} 2741 // _S_round {{{ 2742 template
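Aside: the pre-SSE4.1 _S_trunc fallback above may only use cvttps for values below 2^23 in magnitude; anything with a larger biased exponent (>= 0x4b000000) has no fractional bits and is returned unchanged, which also preserves inf and NaN. Scalar sketch (illustrative only):

static inline float demo_trunc_sse2_style(float x)
{
  unsigned bits;
  __builtin_memcpy(&bits, &x, sizeof(bits));
  if ((bits & 0x7f800000u) < 0x4b000000u)              // |x| < 2^23: may carry fraction bits
    return static_cast<float>(static_cast<int>(x));    // truncate via the int conversion
  return x;                                            // already integral, or inf/NaN
}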
2743 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2744 _S_round(_SimdWrapper<_Tp, _Np> __x) 2745 { 2746 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away 2747 // from zero as required by std::round. Therefore this function is more 2748 // complicated. 2749 using _V = __vector_type_t<_Tp, _Np>; 2750 _V __truncated; 2751 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2752 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b); 2753 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2754 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b); 2755 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2756 __truncated = _mm256_round_ps(__x._M_data, 2757 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2758 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2759 __truncated = _mm256_round_pd(__x._M_data, 2760 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2761 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2762 __truncated = __auto_bitcast( 2763 _mm_round_ps(__to_intrin(__x), 2764 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); 2765 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2766 __truncated 2767 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2768 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2769 __truncated = __auto_bitcast( 2770 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)))); 2771 else 2772 return _Base::_S_round(__x); 2773 2774 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0 2775 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0 2776 2777 const _V __rounded 2778 = __truncated 2779 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5) 2780 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) 2781 : _V()); 2782 if constexpr (__have_sse4_1) 2783 return __rounded; 2784 else // adjust for missing range in cvttps_epi32 2785 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded 2786 : __x._M_data; 2787 } 2788 2789 // }}} 2790 // _S_nearbyint {{{ 2791 template
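Aside: _S_round above cannot use the hardware's nearest-even rounding because std::round ties away from zero, so it truncates and then adds +-1 whenever the discarded fraction is at least one half. The same rule in scalar form (illustrative only):

#include <cmath>

static inline float demo_round_away(float x)
{
  float t = std::trunc(x);
  if (std::fabs(x - t) >= 0.5f)
    t += std::copysign(1.0f, x);  // ties (and anything larger) go away from zero
  return t;
}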
<typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2792 _GLIBCXX_SIMD_INTRINSIC static _Tp 2793 _S_nearbyint(_Tp __x) noexcept 2794 { 2795 if constexpr (_TVT::template _S_is
<float, 16>) 2796 return _mm512_roundscale_ps(__x, 0x0c); 2797 else if constexpr (_TVT::template _S_is
<double, 8>) 2798 return _mm512_roundscale_pd(__x, 0x0c); 2799 else if constexpr (_TVT::template _S_is
<float, 8>) 2800 return _mm256_round_ps(__x, 2801 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2802 else if constexpr (_TVT::template _S_is
<double, 4>) 2803 return _mm256_round_pd(__x, 2804 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2805 else if constexpr (__have_sse4_1 && _TVT::template _S_is
<float, 4>) 2806 return _mm_round_ps(__x, 2807 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2808 else if constexpr (__have_sse4_1 && _TVT::template _S_is
<double, 2>) 2809 return _mm_round_pd(__x, 2810 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2811 else 2812 return _Base::_S_nearbyint(__x); 2813 } 2814 2815 // }}} 2816 // _S_rint {{{ 2817 template
<typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2818 _GLIBCXX_SIMD_INTRINSIC static _Tp 2819 _S_rint(_Tp __x) noexcept 2820 { 2821 if constexpr (_TVT::template _S_is
<float, 16>) 2822 return _mm512_roundscale_ps(__x, 0x04); 2823 else if constexpr (_TVT::template _S_is
<double, 8>) 2824 return _mm512_roundscale_pd(__x, 0x04); 2825 else if constexpr (_TVT::template _S_is
<float, 8>) 2826 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2827 else if constexpr (_TVT::template _S_is
<double, 4>) 2828 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2829 else if constexpr (__have_sse4_1 && _TVT::template _S_is
<float, 4>) 2830 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2831 else if constexpr (__have_sse4_1 && _TVT::template _S_is
<double, 2>) 2832 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2833 else 2834 return _Base::_S_rint(__x); 2835 } 2836 2837 // }}} 2838 // _S_floor {{{ 2839 template
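Aside: the round/roundscale immediates used by these rounding helpers encode the rounding control: bits 1:0 select the mode (0 nearest-even, 1 down, 2 up, 3 toward zero), bit 2 selects the current MXCSR mode instead, and bit 3 suppresses the inexact exception; hence roundscale 0x0b for trunc, 0x09 for floor, 0x0a for ceil, 0x0c for nearbyint and 0x04 for rint. For example (assumes -mavx512f; demo name only):

#include <immintrin.h>

static inline __m512 demo_nearbyint_ps(__m512 x)
{ return _mm512_roundscale_ps(x, 0x0c); } // MXCSR rounding mode, inexact exception suppressed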
2840 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2841 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2842 { 2843 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2844 return _mm512_roundscale_ps(__x, 0x09); 2845 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2846 return _mm512_roundscale_pd(__x, 0x09); 2847 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2848 return _mm256_round_ps(__x, 0x1); 2849 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2850 return _mm256_round_pd(__x, 0x1); 2851 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2852 return __auto_bitcast(_mm_floor_ps(__to_intrin(__x))); 2853 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2854 return _mm_floor_pd(__x); 2855 else 2856 return _Base::_S_floor(__x); 2857 } 2858 2859 // }}} 2860 // _S_ceil {{{ 2861 template
2862 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2863 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2864 { 2865 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2866 return _mm512_roundscale_ps(__x, 0x0a); 2867 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2868 return _mm512_roundscale_pd(__x, 0x0a); 2869 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2870 return _mm256_round_ps(__x, 0x2); 2871 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2872 return _mm256_round_pd(__x, 0x2); 2873 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2874 return __auto_bitcast(_mm_ceil_ps(__to_intrin(__x))); 2875 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2876 return _mm_ceil_pd(__x); 2877 else 2878 return _Base::_S_ceil(__x); 2879 } 2880 2881 // }}} 2882 // _S_signbit {{{ 2883 template
2884 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2885 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2886 { 2887 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2888 { 2889 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2890 return _mm512_movepi32_mask( 2891 __intrin_bitcast<__m512i>(__x._M_data)); 2892 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2893 return _mm512_movepi64_mask( 2894 __intrin_bitcast<__m512i>(__x._M_data)); 2895 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2896 return _mm256_movepi32_mask( 2897 __intrin_bitcast<__m256i>(__x._M_data)); 2898 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2899 return _mm256_movepi64_mask( 2900 __intrin_bitcast<__m256i>(__x._M_data)); 2901 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2902 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2903 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2904 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2905 } 2906 else if constexpr (__is_avx512_abi<_Abi>()) 2907 { 2908 const auto __xi = __to_intrin(__x); 2909 [[maybe_unused]] constexpr auto __k1 2910 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2911 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2912 return _mm_movemask_ps(__xi); 2913 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2914 return _mm_movemask_pd(__xi); 2915 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2916 return _mm256_movemask_ps(__xi); 2917 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2918 return _mm256_movemask_pd(__xi); 2919 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2920 return _mm512_mask_cmplt_epi32_mask( 2921 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2922 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2923 return _mm512_mask_cmplt_epi64_mask( 2924 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2925 else 2926 __assert_unreachable<_Tp>(); 2927 } 2928 else 2929 return _Base::_S_signbit(__x); 2930 /*{ 2931 using _I = __int_for_sizeof_t<_Tp>; 2932 if constexpr (sizeof(__x) == 64) 2933 return _S_less(__vector_bitcast<_I>(__x), _I()); 2934 else 2935 { 2936 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2937 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2938 if constexpr ((sizeof(_Tp) == 4 && 2939 (__have_avx2 || sizeof(__x) == 16)) || 2940 __have_avx512vl) 2941 { 2942 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2943 } 2944 else if constexpr ((__have_avx2 || 2945 (__have_ssse3 && sizeof(__x) == 16))) 2946 { 2947 return __vector_bitcast<_Tp>((__xx & __signmask) == 2948 __signmask); 2949 } 2950 else 2951 { // SSE2/3 or AVX (w/o AVX2) 2952 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); 2953 return __vector_bitcast<_Tp>( 2954 __vector_bitcast<_Tp>( 2955 (__xx & __signmask) | 2956 __vector_bitcast<_I>(__one)) // -1 or 1 2957 != __one); 2958 } 2959 } 2960 }*/ 2961 } 2962 2963 // }}} 2964 // _S_isnonzerovalue_mask {{{ 2965 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2966 template
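Aside: when AVX512DQ is unavailable, the AVX-512 path of _S_signbit above falls back to movmskps/movmskpd for 16- and 32-byte vectors; those instructions copy exactly the sign bit of each lane into an integer bitmask. Reduced example (assumes SSE; demo name only):

#include <immintrin.h>

static inline int demo_signbits_ps(__m128 x)
{ return _mm_movemask_ps(x); } // bit i == std::signbit(x[i])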
<typename _Tp> 2967 _GLIBCXX_SIMD_INTRINSIC static auto 2968 _S_isnonzerovalue_mask(_Tp __x) 2969 { 2970 using _Traits = _VectorTraits<_Tp>; 2971 if constexpr (__have_avx512dq_vl) 2972 { 2973 if constexpr (_Traits::template _S_is< 2974 float, 2> || _Traits::template _S_is
<float, 4>) 2975 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2976 else if constexpr (_Traits::template _S_is
<float, 8>) 2977 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2978 else if constexpr (_Traits::template _S_is
<float, 16>) 2979 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2980 else if constexpr (_Traits::template _S_is
<double, 2>) 2981 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2982 else if constexpr (_Traits::template _S_is
<double, 4>) 2983 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2984 else if constexpr (_Traits::template _S_is
) 2985 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2986 else 2987 __assert_unreachable<_Tp>(); 2988 } 2989 else 2990 { 2991 using _Up = typename _Traits::value_type; 2992 constexpr size_t _Np = _Traits::_S_full_size; 2993 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2994 const auto __b = __x * _Up(); // NaN if __x == inf 2995 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2996 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2997 _CMP_ORD_Q); 2998 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 2999 return __mmask8(0xf 3000 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 3001 __auto_bitcast(__b), 3002 _CMP_ORD_Q)); 3003 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 3004 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3005 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 3006 return __mmask8(0x3 3007 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 3008 __auto_bitcast(__b), 3009 _CMP_ORD_Q)); 3010 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 3011 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 3012 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 3013 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 3014 __auto_bitcast(__b), 3015 _CMP_ORD_Q)); 3016 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 3017 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3018 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 3019 return __mmask8(0xf 3020 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 3021 __auto_bitcast(__b), 3022 _CMP_ORD_Q)); 3023 else if constexpr (__is_avx512_ps<_Up, _Np>()) 3024 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 3025 else if constexpr (__is_avx512_pd<_Up, _Np>()) 3026 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 3027 else 3028 __assert_unreachable<_Tp>(); 3029 } 3030 } 3031 3032 // }}} 3033 // _S_isfinite {{{ 3034 template
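Aside: when no fpclass instruction is available, _S_isnonzerovalue_mask above classifies x with two multiplications: x * inf is NaN exactly when x is zero or NaN, and x * 0 is NaN exactly when x is infinite or NaN, so an ordered comparison of the two products is true precisely for finite, non-zero x. Scalar sketch (assumes the compiler is not run with -ffinite-math-only; demo name only):

#include <cmath>
#include <limits>

static inline bool demo_is_nonzero_value(float x)
{
  const float a = x * std::numeric_limits<float>::infinity(); // NaN iff x is 0 or NaN
  const float b = x * 0.0f;                                   // NaN iff x is inf or NaN
  return !std::isnan(a) && !std::isnan(b);                    // the "ordered" comparison
}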
3035 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3036 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 3037 { 3038 static_assert(is_floating_point_v<_Tp>); 3039 #if !__FINITE_MATH_ONLY__ 3040 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3041 { 3042 const auto __xi = __to_intrin(__x); 3043 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3044 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3045 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3046 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3047 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3048 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3049 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3050 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3051 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3052 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3053 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3054 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3055 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3056 } 3057 else if constexpr (__is_avx512_abi<_Abi>()) 3058 { 3059 // if all exponent bits are set, __x is either inf or NaN 3060 using _I = __int_for_sizeof_t<_Tp>; 3061 const auto __inf = __vector_bitcast<_I>( 3062 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3063 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3064 } 3065 else 3066 #endif 3067 return _Base::_S_isfinite(__x); 3068 } 3069 3070 // }}} 3071 // _S_isinf {{{ 3072 template
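/* Illustrative sketch, not part of the header: 0x99 selects QNaN|SNaN|+inf|-inf
   in VFPCLASS, so "__k1 ^ fpclass(0x99)" is "finite" restricted to the implicit
   mask.  The AVX512F-only branch uses the integer trick shown in this scalar
   model (assumes <cstdint> and <cstring>):

     inline bool scalar_isfinite(float x)
     {
       std::uint32_t bits;
       std::memcpy(&bits, &x, sizeof(bits));
       const std::uint32_t inf = 0x7f800000u;  // bit pattern of +infinity
       return (bits & inf) < inf;              // exponent field is not all ones
     }
*/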
3073 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3074 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 3075 { 3076 #if !__FINITE_MATH_ONLY__ 3077 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3078 { 3079 const auto __xi = __to_intrin(__x); 3080 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3081 return _mm512_fpclass_ps_mask(__xi, 0x18); 3082 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3083 return _mm512_fpclass_pd_mask(__xi, 0x18); 3084 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3085 return _mm256_fpclass_ps_mask(__xi, 0x18); 3086 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3087 return _mm256_fpclass_pd_mask(__xi, 0x18); 3088 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3089 return _mm_fpclass_ps_mask(__xi, 0x18); 3090 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3091 return _mm_fpclass_pd_mask(__xi, 0x18); 3092 else 3093 __assert_unreachable<_Tp>(); 3094 } 3095 else if constexpr (__have_avx512dq_vl) 3096 { 3097 if constexpr (__is_sse_pd<_Tp, _Np>()) 3098 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3099 else if constexpr (__is_avx_pd<_Tp, _Np>()) 3100 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3101 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3102 return _mm_movm_epi32( 3103 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3104 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3105 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3106 else 3107 __assert_unreachable<_Tp>(); 3108 } 3109 else 3110 #endif 3111 return _Base::_S_isinf(__x); 3112 } 3113 3114 // }}} 3115 // _S_isnormal {{{ 3116 template
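/* Illustrative sketch, not part of the header: the 0x18 immediate is the
   +infinity (0x08) | -infinity (0x10) category pair of VFPCLASS, so _S_isinf
   needs no mask inversion.  Scalar model of the same predicate (assumes
   <cstdint> and <cstring>):

     inline bool scalar_isinf(float x)
     {
       std::uint32_t bits;
       std::memcpy(&bits, &x, sizeof(bits));
       return (bits & 0x7fffffffu) == 0x7f800000u;  // |x| has exactly the bit pattern of +inf
     }
*/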
3117 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3118 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3119 { 3120 #if __FINITE_MATH_ONLY__ 3121 [[maybe_unused]] constexpr int __mode = 0x26; 3122 #else 3123 [[maybe_unused]] constexpr int __mode = 0xbf; 3124 #endif 3125 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3126 { 3127 const auto __xi = __to_intrin(__x); 3128 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3129 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3130 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3131 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3132 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3133 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3134 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3135 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3136 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3137 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3138 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3139 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3140 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3141 else 3142 __assert_unreachable<_Tp>(); 3143 } 3144 else if constexpr (__have_avx512dq) 3145 { 3146 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3147 return _mm_movm_epi32( 3148 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3149 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3150 return _mm256_movm_epi32( 3151 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3152 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3153 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3154 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3155 return _mm_movm_epi64( 3156 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3157 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3158 return _mm256_movm_epi64( 3159 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3160 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3161 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 3162 else 3163 __assert_unreachable<_Tp>(); 3164 } 3165 else if constexpr (__is_avx512_abi<_Abi>()) 3166 { 3167 using _I = __int_for_sizeof_t<_Tp>; 3168 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3169 const auto minn = __vector_bitcast<_I>( 3170 __vector_broadcast<_Np>(__norm_min_v<_Tp>)); 3171 #if __FINITE_MATH_ONLY__ 3172 return _S_less_equal<_I, _Np>(minn, absn); 3173 #else 3174 const auto infn 3175 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 3176 return __and(_S_less_equal<_I, _Np>(minn, absn), 3177 _S_less<_I, _Np>(absn, infn)); 3178 #endif 3179 } 3180 else 3181 return _Base::_S_isnormal(__x); 3182 } 3183 3184 // }}} 3185 // _S_isnan {{{ 3186 template
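/* Illustrative sketch, not part of the header: under __FINITE_MATH_ONLY__ the
   0x26 immediate selects zero|denormal, otherwise 0xbf selects every non-normal
   category (zero, denormal, +/-inf, NaN), so XOR-ing the fpclass result with
   the implicit mask yields "is normal".  The AVX512F-only branch tests the same
   range as this scalar model (assumes <cmath> and <limits>):

     inline bool scalar_isnormal(float x)
     {
       const float a = std::fabs(x);
       return a >= std::numeric_limits<float>::min()           // not zero, not subnormal
                && a < std::numeric_limits<float>::infinity(); // not inf, not NaN
     }
*/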
<typename _Tp, size_t _Np> 3187 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3188 _S_isnan(_SimdWrapper<_Tp, _Np> __x) 3189 { return _S_isunordered(__x, __x); } 3190 3191 // }}} 3192 // _S_isunordered {{{ 3193 template <typename _Tp, size_t _Np>
3194 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3195 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x, 3196 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y) 3197 { 3198 #if __FINITE_MATH_ONLY__ 3199 return {}; // false 3200 #else 3201 const auto __xi = __to_intrin(__x); 3202 const auto __yi = __to_intrin(__y); 3203 if constexpr (__is_avx512_abi<_Abi>()) 3204 { 3205 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3206 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3207 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3208 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3209 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3210 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3211 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3212 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3213 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3214 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3215 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3216 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3217 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3218 } 3219 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3220 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q)); 3221 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3222 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q)); 3223 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3224 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi)); 3225 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3226 return __to_masktype(_mm_cmpunord_pd(__xi, __yi)); 3227 else 3228 __assert_unreachable<_Tp>(); 3229 #endif 3230 } 3231 3232 // }}} 3233 // _S_isgreater {{{ 3234 template
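/* Illustrative sketch, not part of the header: _CMP_UNORD_Q is true exactly
   when at least one operand is NaN, which is why _S_isnan simply forwards to
   _S_isunordered(__x, __x).  Scalar model (helper names are illustrative):

     inline bool scalar_isunordered(float x, float y)
     { return x != x || y != y; }  // NaN is the only value that compares unequal to itself

     inline bool scalar_isnan(float x)
     { return scalar_isunordered(x, x); }
*/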
3235 static constexpr _MaskMember<_Tp> 3236 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3237 { 3238 const auto __xi = __to_intrin(__x); 3239 const auto __yi = __to_intrin(__y); 3240 if constexpr (__is_avx512_abi<_Abi>()) 3241 { 3242 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3243 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3244 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3245 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3246 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3247 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3248 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3249 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3250 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3251 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3252 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3253 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3254 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3255 else 3256 __assert_unreachable<_Tp>(); 3257 } 3258 else if constexpr (__have_avx) 3259 { 3260 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3261 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3262 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3263 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3264 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3265 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3266 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3267 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3268 else 3269 __assert_unreachable<_Tp>(); 3270 } 3271 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3272 && sizeof(_Tp) == 4) 3273 { 3274 const auto __xn = __vector_bitcast
<int>(__xi); 3275 const auto __yn = __vector_bitcast<int>
(__yi); 3276 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3277 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3278 return __auto_bitcast( 3279 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); 3280 } 3281 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3282 && sizeof(_Tp) == 8) 3283 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3284 -_mm_ucomigt_sd(__xi, __yi), 3285 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), 3286 _mm_unpackhi_pd(__yi, __yi))}; 3287 else 3288 return _Base::_S_isgreater(__x, __y); 3289 } 3290 3291 // }}} 3292 // _S_isgreaterequal {{{ 3293 template
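/* Illustrative sketch, not part of the header: the SSE2 branch of _S_isgreater
   maps each float's sign-magnitude bit pattern to a signed integer that is
   monotonic in the float's value (negative values become the negated
   magnitude), so a plain integer compare reproduces the ordered float compare;
   AND-ing with cmpord then drops NaN lanes, giving the quiet "isgreater"
   semantics.  Scalar model (helper names are illustrative; assumes <cstdint>
   and <cstring>):

     inline std::int32_t order_key(float x)
     {
       std::int32_t n;
       std::memcpy(&n, &x, sizeof(n));
       return n < 0 ? -(n & 0x7fffffff) : n;  // -0.0f and +0.0f both map to 0
     }

     inline bool scalar_isgreater(float x, float y)
     { return x == x && y == y && order_key(x) > order_key(y); }
*/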
3294 static constexpr _MaskMember<_Tp> 3295 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3296 { 3297 const auto __xi = __to_intrin(__x); 3298 const auto __yi = __to_intrin(__y); 3299 if constexpr (__is_avx512_abi<_Abi>()) 3300 { 3301 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3302 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3303 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3304 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3305 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3306 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3307 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3308 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3309 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3310 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3311 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3312 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3313 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3314 else 3315 __assert_unreachable<_Tp>(); 3316 } 3317 else if constexpr (__have_avx) 3318 { 3319 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3320 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3321 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3322 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3323 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3324 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3325 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3326 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3327 else 3328 __assert_unreachable<_Tp>(); 3329 } 3330 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3331 && sizeof(_Tp) == 4) 3332 { 3333 const auto __xn = __vector_bitcast
<int>(__xi); 3334 const auto __yn = __vector_bitcast<int>
(__yi); 3335 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3336 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3337 return __auto_bitcast( 3338 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3339 } 3340 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3341 && sizeof(_Tp) == 8) 3342 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3343 -_mm_ucomige_sd(__xi, __yi), 3344 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3345 _mm_unpackhi_pd(__yi, __yi))}; 3346 else 3347 return _Base::_S_isgreaterequal(__x, __y); 3348 } 3349 3350 // }}} 3351 // _S_isless {{{ 3352 template
3353 static constexpr _MaskMember<_Tp> 3354 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3355 { 3356 const auto __xi = __to_intrin(__x); 3357 const auto __yi = __to_intrin(__y); 3358 if constexpr (__is_avx512_abi<_Abi>()) 3359 { 3360 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3361 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3362 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3363 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3364 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3365 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3366 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3367 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3368 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3369 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3370 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3371 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3372 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3373 else 3374 __assert_unreachable<_Tp>(); 3375 } 3376 else if constexpr (__have_avx) 3377 { 3378 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3379 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3380 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3381 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3382 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3383 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3384 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3385 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3386 else 3387 __assert_unreachable<_Tp>(); 3388 } 3389 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3390 && sizeof(_Tp) == 4) 3391 { 3392 const auto __xn = __vector_bitcast
<int>(__xi); 3393 const auto __yn = __vector_bitcast<int>
(__yi); 3394 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3395 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3396 return __auto_bitcast( 3397 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3398 } 3399 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3400 && sizeof(_Tp) == 8) 3401 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3402 -_mm_ucomigt_sd(__yi, __xi), 3403 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3404 _mm_unpackhi_pd(__xi, __xi))}; 3405 else 3406 return _Base::_S_isless(__x, __y); 3407 } 3408 3409 // }}} 3410 // _S_islessequal {{{ 3411 template
3412 static constexpr _MaskMember<_Tp> 3413 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3414 { 3415 const auto __xi = __to_intrin(__x); 3416 const auto __yi = __to_intrin(__y); 3417 if constexpr (__is_avx512_abi<_Abi>()) 3418 { 3419 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3420 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3421 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3422 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3423 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3424 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3425 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3426 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3427 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3428 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3429 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3430 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3431 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3432 else 3433 __assert_unreachable<_Tp>(); 3434 } 3435 else if constexpr (__have_avx) 3436 { 3437 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3438 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3439 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3440 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3441 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3442 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3443 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3444 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3445 else 3446 __assert_unreachable<_Tp>(); 3447 } 3448 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3449 && sizeof(_Tp) == 4) 3450 { 3451 const auto __xn = __vector_bitcast
<int>(__xi); 3452 const auto __yn = __vector_bitcast<int>
(__yi); 3453 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3454 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3455 return __auto_bitcast( 3456 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3457 } 3458 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3459 && sizeof(_Tp) == 8) 3460 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3461 -_mm_ucomige_sd(__yi, __xi), 3462 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3463 _mm_unpackhi_pd(__xi, __xi))}; 3464 else 3465 return _Base::_S_islessequal(__x, __y); 3466 } 3467 3468 // }}} 3469 // _S_islessgreater {{{ 3470 template
3471 static constexpr _MaskMember<_Tp> 3472 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3473 { 3474 const auto __xi = __to_intrin(__x); 3475 const auto __yi = __to_intrin(__y); 3476 if constexpr (__is_avx512_abi<_Abi>()) 3477 { 3478 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3479 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3480 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3481 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3482 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3483 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3484 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3485 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3486 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3487 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3488 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3489 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3490 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3491 else 3492 __assert_unreachable<_Tp>(); 3493 } 3494 else if constexpr (__have_avx) 3495 { 3496 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3497 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3498 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3499 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3500 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3501 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3502 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3503 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3504 else 3505 __assert_unreachable<_Tp>(); 3506 } 3507 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3508 return __auto_bitcast( 3509 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3510 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3511 return __to_masktype( 3512 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3513 else 3514 __assert_unreachable<_Tp>(); 3515 } 3516 3517 //}}} }}} 3518 template
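/* Illustrative sketch, not part of the header: _S_isgreater, _S_isgreaterequal,
   _S_isless, _S_islessequal and _S_islessgreater differ only in the ordered,
   quiet comparison predicate used above (_CMP_GT_OQ, _CMP_GE_OQ, _CMP_LT_OQ,
   _CMP_LE_OQ, _CMP_NEQ_OQ): each is false whenever either operand is NaN and
   does not raise FE_INVALID for quiet NaNs.  Scalar model of the last one:

     inline bool scalar_islessgreater(float x, float y)
     { return x == x && y == y && x != y; }  // ordered and not equal
*/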
<template <typename> class _Op, typename _Tp, typename _K, size_t _Np> 3519 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 3520 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v) 3521 { 3522 if (__k._M_is_constprop_none_of()) 3523 return __v; 3524 else if (__k._M_is_constprop_all_of()) 3525 { 3526 auto __vv = _Base::_M_make_simd(__v); 3527 _Op<decltype(__vv)>
__op; 3528 return __data(__op(__vv)); 3529 } 3530 else if constexpr (__is_bitmask_v<decltype(__k)>
3531 && (is_same_v<_Op<void>
, __increment<void>
> 3532 || is_same_v<_Op<void>
, __decrement<void>
>)) 3533 { 3534 // optimize masked unary increment and decrement as masked sub +/-1 3535 constexpr int __pm_one 3536 = is_same_v<_Op<void>
, __increment<void>
> ? -1 : 1; 3537 #ifdef __clang__ 3538 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data; 3539 #else // __clang__ 3540 if constexpr (is_integral_v<_Tp>) 3541 { 3542 constexpr bool __lp64 = sizeof(long) == sizeof(long long); 3543 using _Ip = std::make_signed_t<_Tp>; 3544 using _Up = std::conditional_t< 3545 std::is_same_v<_Ip, long>, 3546 std::conditional_t<__lp64, long long, int>, 3547 std::conditional_t< 3548 std::is_same_v<_Ip, signed char>, char, _Ip>>; 3549 const auto __value = __vector_bitcast<_Up>(__v._M_data); 3550 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3551 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ 3552 return __vector_bitcast<_Tp>(__builtin_ia32_##_Instr##_mask(__value, \ 3553 __vector_broadcast<_Np>(_Up(__pm_one)), __value, __k._M_data)) 3554 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512); 3555 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256); 3556 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128); 3557 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512); 3558 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256); 3559 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128); 3560 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512); 3561 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256); 3562 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128); 3563 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512); 3564 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256); 3565 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128); 3566 #undef _GLIBCXX_SIMD_MASK_SUB 3567 } 3568 else 3569 { 3570 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3571 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__v) == _Width) \ 3572 return __builtin_ia32_##_Instr##_mask( \ 3573 __v._M_data, __vector_broadcast<_Np>(_Tp(__pm_one)), __v._M_data, \ 3574 __k._M_data, _MM_FROUND_CUR_DIRECTION) 3575 _GLIBCXX_SIMD_MASK_SUB(4, 64, subps512); 3576 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256); 3577 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128); 3578 _GLIBCXX_SIMD_MASK_SUB(8, 64, subpd512); 3579 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256); 3580 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128); 3581 #undef _GLIBCXX_SIMD_MASK_SUB 3582 } 3583 #endif // __clang__ 3584 } 3585 else 3586 return _Base::template _S_masked_unary<_Op>(__k, __v); 3587 } 3588 }; 3589 3590 // }}} 3591 // _MaskImplX86Mixin {{{ 3592 struct _MaskImplX86Mixin 3593 { 3594 template
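/* Illustrative sketch, not part of the header: for AVX-512 bit masks,
   _S_masked_unary folds a masked increment/decrement into one masked subtract
   of -1/+1 instead of "compute, then blend".  Roughly what the psubd512 branch
   above selects, written with intrinsics (a sketch assuming AVX-512F and
   <immintrin.h>; the function name is illustrative):

     __m512i masked_increment_epi32(__mmask16 k, __m512i v)
     {
       // lanes with k set become v - (-1) == v + 1, the rest pass through unchanged
       return _mm512_mask_sub_epi32(v, k, v, _mm512_set1_epi32(-1));
     }
*/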
<typename _Tp> 3595 using _TypeTag = _Tp*; 3596 3597 using _Base = _MaskImplBuiltinMixin; 3598 3599 // _S_to_maskvector(bool) {{{ 3600 template <typename _Up, size_t _ToN = 1, typename _Tp>
3601 _GLIBCXX_SIMD_INTRINSIC static constexpr 3602 enable_if_t<is_same_v<_Tp, bool>
, _SimdWrapper<_Up, _ToN>> 3603 _S_to_maskvector(_Tp __x) 3604 { 3605 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3606 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 3607 : __vector_type_t<_Up, _ToN>(); 3608 } 3609 3610 // }}} 3611 // _S_to_maskvector(_SanitizedBitMask) {{{ 3612 template
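/* Illustrative sketch, not part of the header: a vector mask stores "true" as
   an all-bits-set element, so converting a single bool in _S_to_maskvector
   only has to place ~_Up() (i.e. -1) into lane 0.  Scalar model of one lane
   (the helper name is illustrative):

     inline std::int32_t bool_to_mask_lane(bool b)
     { return b ? -1 : 0; }     // all bits set <=> true
*/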
3613 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3614 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3615 { 3616 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3617 using _UV = __vector_type_t<_Up, _ToN>; 3618 using _UI = __intrinsic_type_t<_Up, _ToN>; 3619 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3620 if constexpr (_Np == 1) 3621 return _S_to_maskvector<_Up, _ToN>(__k); 3622 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3623 return __generate_from_n_evaluations
( 3624 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); 3625 else if constexpr (sizeof(_Up) == 1) 3626 { 3627 if constexpr (sizeof(_UI) == 16) 3628 { 3629 if constexpr (__have_avx512bw_vl) 3630 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3631 else if constexpr (__have_avx512bw) 3632 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3633 else if constexpr (__have_avx512f) 3634 { 3635 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3636 auto __as16bits 3637 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3638 __hi256(__as32bits))); 3639 return __intrin_bitcast<_UV>( 3640 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3641 } 3642 else if constexpr (__have_ssse3) 3643 { 3644 const auto __bitmask = __to_intrin( 3645 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3646 8, 16, 32, 64, 128)); 3647 return __intrin_bitcast<_UV>( 3648 __vector_bitcast<_Up>( 3649 _mm_shuffle_epi8(__to_intrin( 3650 __vector_type_t<_ULLong, 2>{__k}), 3651 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3652 1, 1, 1, 1, 1, 1, 1)) 3653 & __bitmask) 3654 != 0); 3655 } 3656 // else fall through 3657 } 3658 else if constexpr (sizeof(_UI) == 32) 3659 { 3660 if constexpr (__have_avx512bw_vl) 3661 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3662 else if constexpr (__have_avx512bw) 3663 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3664 else if constexpr (__have_avx512f) 3665 { 3666 auto __as16bits = // 0 16 1 17 ... 15 31 3667 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3668 16) 3669 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3670 ~__m512i()), 3671 16); 3672 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3673 __lo256(__as16bits), 3674 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3675 ); 3676 // deinterleave: 3677 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3678 __0_16_1_17, // 0 16 1 17 2 ... 
3679 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3680 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3681 3, 5, 7, 9, 11, 13, 3682 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3683 // 0-3 8-11 16-19 24-27 3684 // 4-7 12-15 20-23 28-31 3685 } 3686 else if constexpr (__have_avx2) 3687 { 3688 const auto __bitmask 3689 = _mm256_broadcastsi128_si256(__to_intrin( 3690 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3691 4, 8, 16, 32, 64, 128))); 3692 return __vector_bitcast<_Up>( 3693 __vector_bitcast<_Up>( 3694 _mm256_shuffle_epi8( 3695 _mm256_broadcastsi128_si256( 3696 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3697 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3698 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3699 3, 3, 3, 3, 3, 3)) 3700 & __bitmask) 3701 != 0); 3702 } 3703 // else fall through 3704 } 3705 else if constexpr (sizeof(_UI) == 64) 3706 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3707 if constexpr (std::min(_ToN, _Np) <= 4) 3708 { 3709 if constexpr (_Np > 7) // avoid overflow 3710 __x &= _SanitizedBitMask<_Np>(0x0f); 3711 const _UInt __char_mask 3712 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3713 * 0xff; 3714 _UV __r = {}; 3715 __builtin_memcpy(&__r, &__char_mask, 3716 std::min(sizeof(__r), sizeof(__char_mask))); 3717 return __r; 3718 } 3719 else if constexpr (std::min(_ToN, _Np) <= 7) 3720 { 3721 if constexpr (_Np > 7) // avoid overflow 3722 __x &= _SanitizedBitMask<_Np>(0x7f); 3723 const _ULLong __char_mask 3724 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3725 * 0xff; 3726 _UV __r = {}; 3727 __builtin_memcpy(&__r, &__char_mask, 3728 std::min(sizeof(__r), sizeof(__char_mask))); 3729 return __r; 3730 } 3731 } 3732 else if constexpr (sizeof(_Up) == 2) 3733 { 3734 if constexpr (sizeof(_UI) == 16) 3735 { 3736 if constexpr (__have_avx512bw_vl) 3737 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3738 else if constexpr (__have_avx512bw) 3739 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3740 else if constexpr (__have_avx512f) 3741 { 3742 __m256i __as32bits = {}; 3743 if constexpr (__have_avx512vl) 3744 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3745 else 3746 __as32bits 3747 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3748 return __intrin_bitcast<_UV>( 3749 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3750 } 3751 // else fall through 3752 } 3753 else if constexpr (sizeof(_UI) == 32) 3754 { 3755 if constexpr (__have_avx512bw_vl) 3756 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3757 else if constexpr (__have_avx512bw) 3758 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3759 else if constexpr (__have_avx512f) 3760 { 3761 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3762 return __vector_bitcast<_Up>( 3763 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3764 __hi256(__as32bits)))); 3765 } 3766 // else fall through 3767 } 3768 else if constexpr (sizeof(_UI) == 64) 3769 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3770 } 3771 else if constexpr (sizeof(_Up) == 4) 3772 { 3773 if constexpr (sizeof(_UI) == 16) 3774 { 3775 if constexpr (__have_avx512dq_vl) 3776 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3777 else if constexpr (__have_avx512dq) 3778 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3779 else if constexpr (__have_avx512vl) 3780 return __intrin_bitcast<_UV>( 3781 _mm_maskz_mov_epi32(__k, ~__m128i())); 3782 else if constexpr (__have_avx512f) 3783 return __intrin_bitcast<_UV>( 3784 
__lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3785 // else fall through 3786 } 3787 else if constexpr (sizeof(_UI) == 32) 3788 { 3789 if constexpr (__have_avx512dq_vl) 3790 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3791 else if constexpr (__have_avx512dq) 3792 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3793 else if constexpr (__have_avx512vl) 3794 return __vector_bitcast<_Up>( 3795 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3796 else if constexpr (__have_avx512f) 3797 return __vector_bitcast<_Up>( 3798 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3799 // else fall through 3800 } 3801 else if constexpr (sizeof(_UI) == 64) 3802 return __vector_bitcast<_Up>( 3803 __have_avx512dq ? _mm512_movm_epi32(__k) 3804 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3805 } 3806 else if constexpr (sizeof(_Up) == 8) 3807 { 3808 if constexpr (sizeof(_UI) == 16) 3809 { 3810 if constexpr (__have_avx512dq_vl) 3811 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3812 else if constexpr (__have_avx512dq) 3813 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3814 else if constexpr (__have_avx512vl) 3815 return __vector_bitcast<_Up>( 3816 _mm_maskz_mov_epi64(__k, ~__m128i())); 3817 else if constexpr (__have_avx512f) 3818 return __vector_bitcast<_Up>( 3819 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3820 // else fall through 3821 } 3822 else if constexpr (sizeof(_UI) == 32) 3823 { 3824 if constexpr (__have_avx512dq_vl) 3825 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3826 else if constexpr (__have_avx512dq) 3827 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3828 else if constexpr (__have_avx512vl) 3829 return __vector_bitcast<_Up>( 3830 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3831 else if constexpr (__have_avx512f) 3832 return __vector_bitcast<_Up>( 3833 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3834 // else fall through 3835 } 3836 else if constexpr (sizeof(_UI) == 64) 3837 return __vector_bitcast<_Up>( 3838 __have_avx512dq ? _mm512_movm_epi64(__k) 3839 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3840 } 3841 3842 using _UpUInt = make_unsigned_t<_Up>; 3843 using _V = __vector_type_t<_UpUInt, _ToN>; 3844 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3845 if constexpr (_ToN == 2) 3846 { 3847 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3848 } 3849 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3850 { 3851 if constexpr (sizeof(_Up) == 4) 3852 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3853 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3854 _mm256_castsi256_ps(_mm256_setr_epi32( 3855 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3856 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3857 else if constexpr (sizeof(_Up) == 8) 3858 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3859 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3860 _mm256_castsi256_pd( 3861 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3862 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3863 else 3864 __assert_unreachable<_Up>(); 3865 } 3866 else if constexpr (__bits_per_element >= _ToN) 3867 { 3868 constexpr auto __bitmask 3869 = __generate_vector<_V>([](auto __i) 3870 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt 3871 { return __i < _ToN ? 
1ull << __i : 0; }); 3872 const auto __bits 3873 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3874 if constexpr (__bits_per_element > _ToN) 3875 return __vector_bitcast<_Up>(__bits) > 0; 3876 else 3877 return __vector_bitcast<_Up>(__bits != 0); 3878 } 3879 else 3880 { 3881 const _V __tmp 3882 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3883 return static_cast<_UpUInt>( 3884 __k >> (__bits_per_element * (__i / __bits_per_element))); 3885 }) 3886 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3887 return static_cast<_UpUInt>(1ull 3888 << (__i % __bits_per_element)); 3889 }); // mask bit index 3890 return __intrin_bitcast<_UV>(__tmp != _V()); 3891 } 3892 } 3893 3894 // }}} 3895 // _S_to_maskvector(_SimdWrapper) {{{ 3896 template
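/* Illustrative sketch, not part of the header: every branch of the bit-mask
   overload above expands one bit per lane into an all-ones/all-zeros element;
   the portable tail does it by broadcasting the bits, AND-ing each lane with
   its own bit and comparing against zero.  Scalar model (assumes <cstdint>;
   the helper name is illustrative):

     // writes n lanes, each 0 or -1, from the low n bits of k
     inline void bits_to_maskvector(std::uint64_t k, std::int8_t* out, int n)
     {
       for (int i = 0; i < n; ++i)
         out[i] = ((k >> i) & 1) ? -1 : 0;
     }
*/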
3898 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3899 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3900 { 3901 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3902 using _TW = _SimdWrapper<_Tp, _Np>; 3903 using _UW = _SimdWrapper<_Up, _ToN>; 3904 using _UI = __intrinsic_type_t<_Up, _ToN>; 3905 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3906 return _S_to_maskvector<_Up, _ToN>( 3907 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3908 // vector -> vector bitcast 3909 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3910 && sizeof(_TW) == sizeof(_UW)) 3911 return __wrapper_bitcast<_Up, _ToN>( 3912 _ToN <= _Np 3913 ? __x 3914 : simd_abi::_VecBuiltin
::_S_masked(__x)); 3915 else // vector -> vector {{{ 3916 { 3917 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3918 { 3919 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3920 return __generate_from_n_evaluations
>( 3922 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); 3923 } 3924 using _To = __vector_type_t<_Up, _ToN>; 3925 [[maybe_unused]] constexpr size_t _FromN = _Np; 3926 constexpr int _FromBytes = sizeof(_Tp); 3927 constexpr int _ToBytes = sizeof(_Up); 3928 const auto __k = __x._M_data; 3929 3930 if constexpr (_FromBytes == _ToBytes) 3931 return __intrin_bitcast<_To>(__k); 3932 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3933 { // SSE -> SSE {{{ 3934 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3935 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3936 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3937 { 3938 const auto __y 3939 = __vector_bitcast
<int>(__interleave128_lo(__k, __k)); 3940 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3941 } 3942 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3943 { 3944 auto __y 3945 = __vector_bitcast<short>
(__interleave128_lo(__k, __k)); 3946 auto __z 3947 = __vector_bitcast<int>
(__interleave128_lo(__y, __y)); 3948 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3949 } 3950 else if constexpr (_FromBytes == 8 && _ToBytes == 4 3951 && __have_sse2) 3952 return __intrin_bitcast<_To>( 3953 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3954 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3955 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3956 _UI()); 3957 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3958 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3959 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3960 { 3961 const auto __y 3962 = __vector_bitcast
(__interleave128_lo(__k, __k)); 3963 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3964 } 3965 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3966 { 3967 if constexpr (__have_sse2 && !__have_ssse3) 3968 return __intrin_bitcast<_To>(_mm_packs_epi32( 3969 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3970 __m128i())); 3971 else 3972 return __intrin_bitcast<_To>( 3973 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3974 __vector_bitcast<_Up>(__k))); 3975 } 3976 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3977 return __intrin_bitcast<_To>( 3978 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3979 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3980 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3981 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3982 && __have_ssse3) 3983 return __intrin_bitcast<_To>( 3984 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3985 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3986 -1, -1, -1, -1, -1, -1, -1, 3987 -1))); 3988 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3989 { 3990 auto __y 3991 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3992 __y = _mm_packs_epi32(__y, __m128i()); 3993 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3994 } 3995 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3996 && __have_ssse3) 3997 return __intrin_bitcast<_To>( 3998 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3999 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4000 -1, -1, -1, -1, -1, -1, -1, 4001 -1))); 4002 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4003 { 4004 const auto __y 4005 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 4006 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 4007 } 4008 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 4009 return __intrin_bitcast<_To>( 4010 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 4011 else 4012 __assert_unreachable<_Tp>(); 4013 } // }}} 4014 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 4015 { // AVX -> AVX {{{ 4016 if constexpr (_FromBytes == _ToBytes) 4017 __assert_unreachable<_Tp>(); 4018 else if constexpr (_FromBytes == _ToBytes * 2) 4019 { 4020 const auto __y = __vector_bitcast<_LLong>(__k); 4021 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4022 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 4023 } 4024 else if constexpr (_FromBytes == _ToBytes * 4) 4025 { 4026 const auto __y = __vector_bitcast<_LLong>(__k); 4027 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4028 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4029 __m128i()))); 4030 } 4031 else if constexpr (_FromBytes == _ToBytes * 8) 4032 { 4033 const auto __y = __vector_bitcast<_LLong>(__k); 4034 return __intrin_bitcast<_To>( 4035 _mm256_castsi128_si256(_mm_shuffle_epi8( 4036 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4037 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 4038 -1, -1, -1, -1, -1)))); 4039 } 4040 else if constexpr (_FromBytes * 2 == _ToBytes) 4041 { 4042 auto __y = __xzyw(__to_intrin(__k)); 4043 if constexpr (is_floating_point_v< 4044 _Tp> || (!__have_avx2 && _FromBytes == 4)) 4045 { 4046 const auto __yy = __vector_bitcast
(__y); 4047 return __intrin_bitcast<_To>( 4048 _mm256_unpacklo_ps(__yy, __yy)); 4049 } 4050 else 4051 return __intrin_bitcast<_To>( 4052 _mm256_unpacklo_epi8(__y, __y)); 4053 } 4054 else if constexpr (_FromBytes * 4 == _ToBytes) 4055 { 4056 auto __y 4057 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4058 __lo128(__vector_bitcast<_LLong>( 4059 __k))); // drops 3/4 of input 4060 return __intrin_bitcast<_To>( 4061 __concat(_mm_unpacklo_epi16(__y, __y), 4062 _mm_unpackhi_epi16(__y, __y))); 4063 } 4064 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 4065 { 4066 auto __y 4067 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4068 __lo128(__vector_bitcast<_LLong>( 4069 __k))); // drops 3/4 of input 4070 __y 4071 = _mm_unpacklo_epi16(__y, 4072 __y); // drops another 1/2 => 7/8 total 4073 return __intrin_bitcast<_To>( 4074 __concat(_mm_unpacklo_epi32(__y, __y), 4075 _mm_unpackhi_epi32(__y, __y))); 4076 } 4077 else 4078 __assert_unreachable<_Tp>(); 4079 } // }}} 4080 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 4081 { // SSE -> AVX {{{ 4082 if constexpr (_FromBytes == _ToBytes) 4083 return __intrin_bitcast<_To>( 4084 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 4085 __zero_extend(__to_intrin(__k)))); 4086 else if constexpr (_FromBytes * 2 == _ToBytes) 4087 { // keep all 4088 return __intrin_bitcast<_To>( 4089 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 4090 __vector_bitcast<_LLong>(__k)), 4091 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 4092 __vector_bitcast<_LLong>(__k)))); 4093 } 4094 else if constexpr (_FromBytes * 4 == _ToBytes) 4095 { 4096 if constexpr (__have_avx2) 4097 { 4098 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4099 __concat(__vector_bitcast<_LLong>(__k), 4100 __vector_bitcast<_LLong>(__k)), 4101 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4102 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4103 6, 6, 7, 7, 7, 7))); 4104 } 4105 else 4106 { 4107 return __intrin_bitcast<_To>(__concat( 4108 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4109 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 4110 2, 2, 2, 2, 3, 3, 3, 3)), 4111 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4112 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 4113 6, 6, 6, 6, 7, 7, 7, 4114 7)))); 4115 } 4116 } 4117 else if constexpr (_FromBytes * 8 == _ToBytes) 4118 { 4119 if constexpr (__have_avx2) 4120 { 4121 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4122 __concat(__vector_bitcast<_LLong>(__k), 4123 __vector_bitcast<_LLong>(__k)), 4124 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 4125 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4126 3, 3, 3, 3, 3, 3))); 4127 } 4128 else 4129 { 4130 return __intrin_bitcast<_To>(__concat( 4131 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4132 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4133 1, 1, 1, 1, 1, 1, 1, 1)), 4134 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4135 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4136 3, 3, 3, 3, 3, 3, 3, 4137 3)))); 4138 } 4139 } 4140 else if constexpr (_FromBytes == _ToBytes * 2) 4141 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4142 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4143 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4144 { 4145 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4146 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4147 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4148 -1, -1, -1, -1, -1, -1, -1, 4149 -1))))); 4150 } 4151 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4152 { 4153 return __intrin_bitcast<_To>(__m256i(__zero_extend( 
4154 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4155 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4156 -1, -1, -1, -1, -1, -1, -1, 4157 -1))))); 4158 } 4159 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4160 { 4161 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4162 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4163 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4164 -1, -1, -1, -1, -1, -1, -1, 4165 -1, -1))))); 4166 } 4167 else 4168 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4169 } // }}} 4170 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4171 { // AVX -> SSE {{{ 4172 if constexpr (_FromBytes == _ToBytes) 4173 { // keep low 1/2 4174 return __intrin_bitcast<_To>(__lo128(__k)); 4175 } 4176 else if constexpr (_FromBytes == _ToBytes * 2) 4177 { // keep all 4178 auto __y = __vector_bitcast<_LLong>(__k); 4179 return __intrin_bitcast<_To>( 4180 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4181 } 4182 else if constexpr (_FromBytes == _ToBytes * 4) 4183 { // add 1/2 undef 4184 auto __y = __vector_bitcast<_LLong>(__k); 4185 return __intrin_bitcast<_To>( 4186 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4187 __m128i())); 4188 } 4189 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4190 { // add 3/4 undef 4191 auto __y = __vector_bitcast<_LLong>(__k); 4192 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4193 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4194 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4195 -1, -1, -1, -1))); 4196 } 4197 else if constexpr (_FromBytes * 2 == _ToBytes) 4198 { // keep low 1/4 4199 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4200 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4201 } 4202 else if constexpr (_FromBytes * 4 == _ToBytes) 4203 { // keep low 1/8 4204 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4205 __y = _mm_unpacklo_epi8(__y, __y); 4206 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4207 } 4208 else if constexpr (_FromBytes * 8 == _ToBytes) 4209 { // keep low 1/16 4210 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4211 __y = _mm_unpacklo_epi8(__y, __y); 4212 __y = _mm_unpacklo_epi8(__y, __y); 4213 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4214 } 4215 else 4216 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4217 } // }}} 4218 else 4219 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4220 /* 4221 if constexpr (_FromBytes > _ToBytes) { 4222 const _To __y = __vector_bitcast<_Up>(__k); 4223 return [&]
<size_t... _Is> (index_sequence<_Is...>) { 4224 constexpr int _Stride = _FromBytes / _ToBytes; 4225 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4226 }(make_index_sequence
()); 4227 } else { 4228 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4229 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4230 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4231 // ... 4232 return [&] <size_t... _Is>
(index_sequence<_Is...>) { 4233 constexpr int __dup = _ToBytes / _FromBytes; 4234 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4235 }(make_index_sequence<_FromN>()); 4236 } 4237 */ 4238 } // }}} 4239 } 4240 4241 // }}} 4242 // _S_to_bits {{{ 4243 template <typename _Tp, size_t _Np>
4244 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 4245 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4246 { 4247 if constexpr (is_same_v<_Tp, bool>) 4248 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4249 else 4250 { 4251 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4252 if (__builtin_is_constant_evaluated() 4253 || __builtin_constant_p(__x._M_data)) 4254 { 4255 const auto __bools = -__x._M_data; 4256 const _ULLong __k = __call_with_n_evaluations<_Np>( 4257 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4258 return (__bits | ...); 4259 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4260 return _ULLong(__bools[+__i]) << __i; 4261 }); 4262 if (__builtin_is_constant_evaluated() 4263 || __builtin_constant_p(__k)) 4264 return __k; 4265 } 4266 const auto __xi = __to_intrin(__x); 4267 if constexpr (sizeof(_Tp) == 1) 4268 if constexpr (sizeof(__xi) == 16) 4269 if constexpr (__have_avx512bw_vl) 4270 return _BitMask<_Np>(_mm_movepi8_mask(__xi)); 4271 else // implies SSE2 4272 return _BitMask<_Np>(_mm_movemask_epi8(__xi)); 4273 else if constexpr (sizeof(__xi) == 32) 4274 if constexpr (__have_avx512bw_vl) 4275 return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); 4276 else // implies AVX2 4277 return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); 4278 else // implies AVX512BW 4279 return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); 4280 4281 else if constexpr (sizeof(_Tp) == 2) 4282 if constexpr (sizeof(__xi) == 16) 4283 if constexpr (__have_avx512bw_vl) 4284 return _BitMask<_Np>(_mm_movepi16_mask(__xi)); 4285 else if constexpr (__have_avx512bw) 4286 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4287 else // implies SSE2 4288 return _BitMask<_Np>( 4289 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); 4290 else if constexpr (sizeof(__xi) == 32) 4291 if constexpr (__have_avx512bw_vl) 4292 return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); 4293 else if constexpr (__have_avx512bw) 4294 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4295 else // implies SSE2 4296 return _BitMask<_Np>(_mm_movemask_epi8( 4297 _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); 4298 else // implies AVX512BW 4299 return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); 4300 4301 else if constexpr (sizeof(_Tp) == 4) 4302 if constexpr (sizeof(__xi) == 16) 4303 if constexpr (__have_avx512dq_vl) 4304 return _BitMask<_Np>(_mm_movepi32_mask(__xi)); 4305 else if constexpr (__have_avx512vl) 4306 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); 4307 else if constexpr (__have_avx512dq) 4308 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4309 else if constexpr (__have_avx512f) 4310 return _BitMask<_Np>( 4311 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4312 else // implies SSE 4313 return _BitMask<_Np>( 4314 _mm_movemask_ps(reinterpret_cast<__m128>(__xi))); 4315 else if constexpr (sizeof(__xi) == 32) 4316 if constexpr (__have_avx512dq_vl) 4317 return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); 4318 else if constexpr (__have_avx512dq) 4319 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4320 else if constexpr (__have_avx512vl) 4321 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); 4322 else if constexpr (__have_avx512f) 4323 return _BitMask<_Np>( 4324 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4325 else // implies AVX 4326 return _BitMask<_Np>( 4327 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi))); 4328 else // implies AVX512?? 
4329 if constexpr (__have_avx512dq) 4330 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4331 else // implies AVX512F 4332 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4333 4334 else if constexpr (sizeof(_Tp) == 8) 4335 if constexpr (sizeof(__xi) == 16) 4336 if constexpr (__have_avx512dq_vl) 4337 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4338 else if constexpr (__have_avx512dq) 4339 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4340 else if constexpr (__have_avx512vl) 4341 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4342 else if constexpr (__have_avx512f) 4343 return _BitMask<_Np>( 4344 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4345 else // implies SSE2 4346 return _BitMask<_Np>( 4347 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4348 else if constexpr (sizeof(__xi) == 32) 4349 if constexpr (__have_avx512dq_vl) 4350 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4351 else if constexpr (__have_avx512dq) 4352 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4353 else if constexpr (__have_avx512vl) 4354 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4355 else if constexpr (__have_avx512f) 4356 return _BitMask<_Np>( 4357 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4358 else // implies AVX 4359 return _BitMask<_Np>( 4360 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4361 else // implies AVX512?? 4362 if constexpr (__have_avx512dq) 4363 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4364 else // implies AVX512F 4365 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4366 4367 else 4368 __assert_unreachable<_Tp>(); 4369 } 4370 } 4371 // }}} 4372 }; 4373 4374 // }}} 4375 // _MaskImplX86 {{{ 4376 template
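/* Illustrative sketch, not part of the header: _S_to_bits above is the inverse
   of _S_to_maskvector, compressing the all-ones/all-zeros lanes of a vector
   mask into one bit per lane (a single vpmov*2m with AVX-512BW/DQ, otherwise a
   movemask or compare).  Scalar model (assumes <cstdint>; the helper name is
   illustrative):

     inline std::uint64_t maskvector_to_bits(const std::int8_t* lanes, int n)
     {
       std::uint64_t k = 0;
       for (int i = 0; i < n; ++i)
         k |= std::uint64_t(lanes[i] != 0) << i;
       return k;
     }
*/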