The C and C++ Include Header Files
/usr/include/c++/13/experimental/bits/simd_x86.h
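This file is the x86-specific implementation layer of libstdc++'s <experimental/simd> (Parallelism TS v2). User code never includes simd_x86.h directly; it is pulled in through <experimental/simd> when SSE/AVX intrinsics are available. As a minimal sketch (not part of the header below) of the public API that these internals ultimately implement; lane counts depend on the ISA the program is compiled for:

#include <experimental/simd>
#include <cstdio>

namespace stdx = std::experimental;

int main()
{
  // native_simd<float> uses the widest vector registers enabled at build
  // time: 4 lanes with SSE, 8 with AVX, 16 with AVX-512.
  stdx::native_simd<float> x = 3.0f;               // broadcast constructor
  float buf[stdx::native_simd<float>::size()];
  for (std::size_t i = 0; i < x.size(); ++i)
    buf[i] = static_cast<float>(i);
  stdx::native_simd<float> y;
  y.copy_from(buf, stdx::element_aligned);         // vector load
  const auto z = x * y + 1.0f;                     // element-wise arithmetic
  z.copy_to(buf, stdx::element_aligned);           // vector store
  std::printf("%g %g\n", buf[0], buf[1]);          // prints "1 4"
}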
$ cat /usr/include/c++/13/experimental/bits/simd_x86.h

// Simd x86 specific implementations -*- C++ -*-

// Copyright (C) 2020-2023 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_X86INTRIN
#error \
  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// __to_masktype {{{
// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
// __vector_type_t.
template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
  { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); }

template <typename _TV,
          typename _TVT = enable_if_t<sizeof(_TV) <= 16, _VectorTraits<_TV>>,
          typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
  __to_masktype(_TV __x)
  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }

// }}}
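// [Illustration, not part of the original header] __to_masktype only
// reinterprets the element type as the equally sized signed integer while
// keeping the bit pattern, so mask vectors produced by comparisons (lanes of
// all-zero or all-one bits) can be processed with integer operations.
// A hypothetical use:
//
//   __vector_type_t<float, 4> __fmask = /* result of a comparison */;
//   auto __imask = __to_masktype(__fmask);   // __vector_type_t<int, 4>,
//                                            // same bits, integer lanes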
// __interleave128_lo {{{
template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>,
          typename _Trait = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr _Tp
  __interleave128_lo(const _Ap& __av, const _Bp& __bv)
  {
    const _Tp __a(__av);
    const _Tp __b(__bv);
    if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2)
      return _Tp{__a[0], __b[0]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[1], __b[1]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[2], __b[2], __a[3], __b[3]};
    else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5],
                 __a[6], __b[6], __a[7], __b[7]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4)
      return _Tp{__a[0], __b[0], __a[2], __b[2]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[1], __b[1],
                 __a[4], __b[4], __a[5], __b[5]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2],
                 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9],
                 __a[10], __b[10], __a[11], __b[11]};
    else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8)
      return _Tp{__a[0], __b[0], __a[2], __b[2],
                 __a[4], __b[4], __a[6], __b[6]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4],
                 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9],
                 __a[12], __b[12], __a[13], __b[13]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10],
                 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25],
                 __a[26], __b[26], __a[27], __b[27]};
    else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64)
      return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3],
                 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6],
                 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18],
                 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21],
                 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33],
                 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36],
                 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48],
                 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51],
                 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55],
                 __b[55]};
    else
      __assert_unreachable<_Tp>();
  }

// }}}
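// [Illustration, not part of the original header] __interleave128_lo zips the
// low half of every 128-bit lane of __a with the corresponding low half of
// __b, i.e. it generalizes the punpckl* idea across AVX/AVX-512 lanes. For
// two SSE vectors of four ints,
//
//   __a = {a0, a1, a2, a3},  __b = {b0, b1, b2, b3}
//   __interleave128_lo(__a, __b) == {a0, b0, a1, b1}
//
// and for 32-byte vectors of eight ints the same pattern is applied to each
// 128-bit half: {a0, b0, a1, b1, a4, b4, a5, b5}.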
// __is_zero{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC constexpr bool
  __is_zero(_Tp __a)
  {
    if (!__builtin_is_constant_evaluated())
      {
        if constexpr (__have_avx)
          {
            if constexpr (_TVT::template _S_is<float, 8>)
              return _mm256_testz_ps(__a, __a);
            else if constexpr (_TVT::template _S_is<double, 4>)
              return _mm256_testz_pd(__a, __a);
            else if constexpr (sizeof(_Tp) == 32)
              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<float>)
              return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
            else if constexpr (_TVT::template _S_is<double, 2>)
              return _mm_testz_pd(__a, __a);
            else
              return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a));
          }
        else if constexpr (__have_sse4_1)
          return _mm_testz_si128(__intrin_bitcast<__m128i>(__a),
                                 __intrin_bitcast<__m128i>(__a));
      }
    else if constexpr (sizeof(_Tp) <= 8)
      return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0;
    else
      {
        const auto __b = __vector_bitcast<_LLong>(__a);
        if constexpr (sizeof(__b) == 16)
          return (__b[0] | __b[1]) == 0;
        else if constexpr (sizeof(__b) == 32)
          return __is_zero(__lo128(__b) | __hi128(__b));
        else if constexpr (sizeof(__b) == 64)
          return __is_zero(__lo256(__b) | __hi256(__b));
        else
          __assert_unreachable<_Tp>();
      }
  }

// }}}
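// [Illustration, not part of the original header] __is_zero answers "are all
// bits of the vector zero?". At runtime it maps to a single PTEST/VPTEST:
// _mm*_testz_* returns 1 iff (__a & __a) has no bit set, which is exactly
// __a == 0. In constant evaluation it falls back to OR-folding the 64-bit
// halves, e.g. for a 16-byte vector __b it checks (__b[0] | __b[1]) == 0.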
// __movemask{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
  __movemask(_Tp __a)
  {
    if constexpr (sizeof(_Tp) == 32)
      {
        if constexpr (_TVT::template _S_is<float>)
          return _mm256_movemask_ps(__to_intrin(__a));
        else if constexpr (_TVT::template _S_is<double>)
          return _mm256_movemask_pd(__to_intrin(__a));
        else
          return _mm256_movemask_epi8(__to_intrin(__a));
      }
    else if constexpr (_TVT::template _S_is<float>)
      return _mm_movemask_ps(__to_intrin(__a));
    else if constexpr (_TVT::template _S_is<double>)
      return _mm_movemask_pd(__to_intrin(__a));
    else
      return _mm_movemask_epi8(__to_intrin(__a));
  }

// }}}
// __testz{{{
template <typename _TI, typename _TVT = _VectorTraits<_TI>
> 185 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int 186 __testz(_TI __a, _TI __b) 187 { 188 static_assert(is_same_v<_TI, __intrinsic_type_t
>); 190 if (!__builtin_is_constant_evaluated()) 191 { 192 if constexpr (sizeof(_TI) == 32) 193 { 194 if constexpr (_TVT::template _S_is
) 195 return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b)); 196 else if constexpr (_TVT::template _S_is
) 197 return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b)); 198 else 199 return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b)); 200 } 201 else if constexpr (_TVT::template _S_is
&& __have_avx) 202 return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b)); 203 else if constexpr (_TVT::template _S_is
&& __have_avx) 204 return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b)); 205 else if constexpr (__have_sse4_1) 206 return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), 207 __intrin_bitcast<__m128i>(__to_intrin(__b))); 208 else 209 return __movemask(0 == __and(__a, __b)) != 0; 210 } 211 else 212 return __is_zero(__and(__a, __b)); 213 } 214 215 // }}} 216 // __testc{{{ 217 // requires SSE4.1 or above 218 template
> 219 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int 220 __testc(_TI __a, _TI __b) 221 { 222 static_assert(is_same_v<_TI, __intrinsic_type_t
>); 224 if (__builtin_is_constant_evaluated()) 225 return __is_zero(__andnot(__a, __b)); 226 227 if constexpr (sizeof(_TI) == 32) 228 { 229 if constexpr (_TVT::template _S_is
) 230 return _mm256_testc_ps(__a, __b); 231 else if constexpr (_TVT::template _S_is
) 232 return _mm256_testc_pd(__a, __b); 233 else 234 return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b)); 235 } 236 else if constexpr (_TVT::template _S_is
&& __have_avx) 237 return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b)); 238 else if constexpr (_TVT::template _S_is
&& __have_avx) 239 return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b)); 240 else 241 { 242 static_assert(is_same_v<_TI, _TI> && __have_sse4_1); 243 return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), 244 __intrin_bitcast<__m128i>(__to_intrin(__b))); 245 } 246 } 247 248 // }}} 249 // __testnzc{{{ 250 template
> 251 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int 252 __testnzc(_TI __a, _TI __b) 253 { 254 static_assert(is_same_v<_TI, __intrinsic_type_t
>); 256 if (!__builtin_is_constant_evaluated()) 257 { 258 if constexpr (sizeof(_TI) == 32) 259 { 260 if constexpr (_TVT::template _S_is
) 261 return _mm256_testnzc_ps(__a, __b); 262 else if constexpr (_TVT::template _S_is
) 263 return _mm256_testnzc_pd(__a, __b); 264 else 265 return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b)); 266 } 267 else if constexpr (_TVT::template _S_is
&& __have_avx) 268 return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b)); 269 else if constexpr (_TVT::template _S_is
&& __have_avx) 270 return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b)); 271 else if constexpr (__have_sse4_1) 272 return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)), 273 __intrin_bitcast<__m128i>(__to_intrin(__b))); 274 else 275 return __movemask(0 == __and(__a, __b)) == 0 276 && __movemask(0 == __andnot(__a, __b)) == 0; 277 } 278 else 279 return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b))); 280 } 281 282 // }}} 283 // __xzyw{{{ 284 // shuffles the complete vector, swapping the inner two quarters. Often useful 285 // for AVX for fixing up a shuffle result. 286 template
> 287 _GLIBCXX_SIMD_INTRINSIC _Tp 288 __xzyw(_Tp __a) 289 { 290 if constexpr (sizeof(_Tp) == 16) 291 { 292 const auto __x = __vector_bitcast
, float, int>>(__a); 294 return reinterpret_cast<_Tp>( 295 decltype(__x){__x[0], __x[2], __x[1], __x[3]}); 296 } 297 else if constexpr (sizeof(_Tp) == 32) 298 { 299 const auto __x = __vector_bitcast
, double, _LLong>>(__a); 301 return reinterpret_cast<_Tp>( 302 decltype(__x){__x[0], __x[2], __x[1], __x[3]}); 303 } 304 else if constexpr (sizeof(_Tp) == 64) 305 { 306 const auto __x = __vector_bitcast
, double, _LLong>>(__a); 308 return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4], 309 __x[5], __x[2], __x[3], 310 __x[6], __x[7]}); 311 } 312 else 313 __assert_unreachable<_Tp>(); 314 } 315 316 // }}} 317 // __maskload_epi32{{{ 318 template
319 _GLIBCXX_SIMD_INTRINSIC auto 320 __maskload_epi32(const int* __ptr, _Tp __k) 321 { 322 if constexpr (sizeof(__k) == 16) 323 return _mm_maskload_epi32(__ptr, __k); 324 else 325 return _mm256_maskload_epi32(__ptr, __k); 326 } 327 328 // }}} 329 // __maskload_epi64{{{ 330 template
331 _GLIBCXX_SIMD_INTRINSIC auto 332 __maskload_epi64(const _LLong* __ptr, _Tp __k) 333 { 334 if constexpr (sizeof(__k) == 16) 335 return _mm_maskload_epi64(__ptr, __k); 336 else 337 return _mm256_maskload_epi64(__ptr, __k); 338 } 339 340 // }}} 341 // __maskload_ps{{{ 342 template
343 _GLIBCXX_SIMD_INTRINSIC auto 344 __maskload_ps(const float* __ptr, _Tp __k) 345 { 346 if constexpr (sizeof(__k) == 16) 347 return _mm_maskload_ps(__ptr, __k); 348 else 349 return _mm256_maskload_ps(__ptr, __k); 350 } 351 352 // }}} 353 // __maskload_pd{{{ 354 template
355 _GLIBCXX_SIMD_INTRINSIC auto 356 __maskload_pd(const double* __ptr, _Tp __k) 357 { 358 if constexpr (sizeof(__k) == 16) 359 return _mm_maskload_pd(__ptr, __k); 360 else 361 return _mm256_maskload_pd(__ptr, __k); 362 } 363 364 // }}} 365 366 #ifdef __clang__ 367 template
368 _GLIBCXX_SIMD_INTRINSIC constexpr auto 369 __movm(_Kp __k) noexcept 370 { 371 static_assert(is_unsigned_v<_Kp>); 372 if constexpr (sizeof(_Tp) == 1 && __have_avx512bw) 373 { 374 if constexpr (_Np <= 16 && __have_avx512vl) 375 return __builtin_ia32_cvtmask2b128(__k); 376 else if constexpr (_Np <= 32 && __have_avx512vl) 377 return __builtin_ia32_cvtmask2b256(__k); 378 else 379 return __builtin_ia32_cvtmask2b512(__k); 380 } 381 else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw) 382 { 383 if constexpr (_Np <= 8 && __have_avx512vl) 384 return __builtin_ia32_cvtmask2w128(__k); 385 else if constexpr (_Np <= 16 && __have_avx512vl) 386 return __builtin_ia32_cvtmask2w256(__k); 387 else 388 return __builtin_ia32_cvtmask2w512(__k); 389 } 390 else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq) 391 { 392 if constexpr (_Np <= 4 && __have_avx512vl) 393 return __builtin_ia32_cvtmask2d128(__k); 394 else if constexpr (_Np <= 8 && __have_avx512vl) 395 return __builtin_ia32_cvtmask2d256(__k); 396 else 397 return __builtin_ia32_cvtmask2d512(__k); 398 } 399 else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq) 400 { 401 if constexpr (_Np <= 2 && __have_avx512vl) 402 return __builtin_ia32_cvtmask2q128(__k); 403 else if constexpr (_Np <= 4 && __have_avx512vl) 404 return __builtin_ia32_cvtmask2q256(__k); 405 else 406 return __builtin_ia32_cvtmask2q512(__k); 407 } 408 else 409 __assert_unreachable<_Tp>(); 410 } 411 #endif // __clang__ 412 413 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 414 #include "simd_x86_conversions.h" 415 #endif 416 417 // ISA & type detection {{{ 418 template
419 constexpr bool 420 __is_sse_ps() 421 { 422 return __have_sse 423 && is_same_v<_Tp, 424 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; 425 } 426 427 template
428 constexpr bool 429 __is_sse_pd() 430 { 431 return __have_sse2 432 && is_same_v<_Tp, 433 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; 434 } 435 436 template
437 constexpr bool 438 __is_avx_ps() 439 { 440 return __have_avx 441 && is_same_v<_Tp, 442 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; 443 } 444 445 template
446 constexpr bool 447 __is_avx_pd() 448 { 449 return __have_avx 450 && is_same_v<_Tp, 451 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; 452 } 453 454 template
455 constexpr bool 456 __is_avx512_ps() 457 { 458 return __have_avx512f 459 && is_same_v<_Tp, 460 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; 461 } 462 463 template
464 constexpr bool 465 __is_avx512_pd() 466 { 467 return __have_avx512f 468 && is_same_v<_Tp, 469 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; 470 } 471 472 // }}} 473 struct _MaskImplX86Mixin; 474 475 // _CommonImplX86 {{{ 476 struct _CommonImplX86 : _CommonImplBuiltin 477 { 478 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 479 // _S_converts_via_decomposition {{{ 480 template
481 static constexpr bool 482 _S_converts_via_decomposition() 483 { 484 if constexpr (is_integral_v< 485 _From> && is_integral_v<_To> && sizeof(_From) == 8 486 && _ToSize == 16) 487 return (sizeof(_To) == 2 && !__have_ssse3) 488 || (sizeof(_To) == 1 && !__have_avx512f); 489 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>) 490 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8 491 && !__have_avx512dq) 492 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1 493 && _ToSize == 16); 494 else if constexpr ( 495 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8 496 && !__have_avx512dq) 497 return (sizeof(_To) == 4 && _ToSize == 16) 498 || (sizeof(_To) == 8 && _ToSize < 64); 499 else 500 return false; 501 } 502 503 template
504 static inline constexpr bool __converts_via_decomposition_v 505 = _S_converts_via_decomposition<_From, _To, _ToSize>(); 506 507 // }}} 508 #endif 509 // _S_store {{{ 510 using _CommonImplBuiltin::_S_store; 511 512 template
513 _GLIBCXX_SIMD_INTRINSIC static constexpr void 514 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) 515 { 516 constexpr size_t _Bytes = _Np * sizeof(_Tp); 517 518 if (__builtin_is_constant_evaluated()) 519 _CommonImplBuiltin::_S_store(__x, __addr); 520 else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl) 521 { 522 const auto __v = __to_intrin(__x); 523 524 if constexpr (_Bytes & 1) 525 { 526 if constexpr (_Bytes < 16) 527 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes), 528 __intrin_bitcast<__m128i>(__v)); 529 else if constexpr (_Bytes < 32) 530 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes), 531 __intrin_bitcast<__m256i>(__v)); 532 else 533 _mm512_mask_storeu_epi8(__addr, 534 0xffffffffffffffffull >> (64 - _Bytes), 535 __intrin_bitcast<__m512i>(__v)); 536 } 537 else if constexpr (_Bytes & 2) 538 { 539 if constexpr (_Bytes < 16) 540 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2), 541 __intrin_bitcast<__m128i>(__v)); 542 else if constexpr (_Bytes < 32) 543 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2), 544 __intrin_bitcast<__m256i>(__v)); 545 else 546 _mm512_mask_storeu_epi16(__addr, 547 0xffffffffull >> (32 - _Bytes / 2), 548 __intrin_bitcast<__m512i>(__v)); 549 } 550 else if constexpr (_Bytes & 4) 551 { 552 if constexpr (_Bytes < 16) 553 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4), 554 __intrin_bitcast<__m128i>(__v)); 555 else if constexpr (_Bytes < 32) 556 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4), 557 __intrin_bitcast<__m256i>(__v)); 558 else 559 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4), 560 __intrin_bitcast<__m512i>(__v)); 561 } 562 else 563 { 564 static_assert( 565 _Bytes > 16, 566 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes " 567 "- 1)) != 0 is impossible"); 568 if constexpr (_Bytes < 32) 569 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8), 570 __intrin_bitcast<__m256i>(__v)); 571 else 572 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8), 573 __intrin_bitcast<__m512i>(__v)); 574 } 575 } 576 else 577 _CommonImplBuiltin::_S_store(__x, __addr); 578 } 579 580 // }}} 581 // _S_store_bool_array(_BitMask) {{{ 582 template
583 _GLIBCXX_SIMD_INTRINSIC static constexpr void 584 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem) 585 { 586 if (__builtin_is_constant_evaluated()) 587 _CommonImplBuiltin::_S_store_bool_array(__x, __mem); 588 else if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL 589 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>( 590 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 591 if constexpr (_Np <= 16) 592 return _mm_movm_epi8(__x._M_to_bits()); 593 else if constexpr (_Np <= 32) 594 return _mm256_movm_epi8(__x._M_to_bits()); 595 else if constexpr (_Np <= 64) 596 return _mm512_movm_epi8(__x._M_to_bits()); 597 else 598 __assert_unreachable<_SizeConstant<_Np>>(); 599 }()), 600 __mem); 601 else if constexpr (__have_bmi2) 602 { 603 if constexpr (_Np <= 4) 604 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem); 605 else 606 __execute_n_times<__div_roundup(_Np, sizeof(size_t))>( 607 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 608 constexpr size_t __offset = __i * sizeof(size_t); 609 constexpr int __todo = std::min(sizeof(size_t), _Np - __offset); 610 if constexpr (__todo == 1) 611 __mem[__offset] = __x[__offset]; 612 else 613 { 614 const auto __bools = 615 #ifdef __x86_64__ 616 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(), 617 0x0101010101010101ULL); 618 #else // __x86_64__ 619 _pdep_u32( 620 __x.template _M_extract<__offset>()._M_to_bits(), 621 0x01010101U); 622 #endif // __x86_64__ 623 _S_store<__todo>(__bools, __mem + __offset); 624 } 625 }); 626 } 627 else if constexpr (__have_sse2 && _Np > 7) 628 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 629 constexpr int __offset = __i * 16; 630 constexpr int __todo = std::min(16, int(_Np) - __offset); 631 const int __bits = __x.template _M_extract<__offset>()._M_to_bits(); 632 __vector_type16_t<_UChar> __bools; 633 if constexpr (__have_avx512f) 634 { 635 auto __as32bits 636 = _mm512_maskz_mov_epi32(__bits, __to_intrin( 637 __vector_broadcast<16>(1))); 638 auto __as16bits 639 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 640 __todo > 8 ? __hi256(__as32bits) 641 : __m256i())); 642 __bools = __vector_bitcast<_UChar>( 643 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 644 } 645 else 646 { 647 using _V = __vector_type_t<_UChar, 16>; 648 auto __tmp = _mm_cvtsi32_si128(__bits); 649 __tmp = _mm_unpacklo_epi8(__tmp, __tmp); 650 __tmp = _mm_unpacklo_epi16(__tmp, __tmp); 651 __tmp = _mm_unpacklo_epi32(__tmp, __tmp); 652 _V __tmp2 = reinterpret_cast<_V>(__tmp); 653 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128, 654 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index 655 __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01 656 } 657 _S_store<__todo>(__bools, __mem + __offset); 658 }); 659 else 660 _CommonImplBuiltin::_S_store_bool_array(__x, __mem); 661 } 662 663 // }}} 664 // _S_blend_avx512 {{{ 665 // Returns: __k ? __b : __a 666 // TODO: reverse __a and __b to match COND_EXPR 667 // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask 668 // __k 669 template
670 _GLIBCXX_SIMD_INTRINSIC static _TV 671 _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept 672 { 673 static_assert(__is_vector_type_v<_TV>); 674 using _Tp = typename _VectorTraits<_TV>::value_type; 675 static_assert(sizeof(_TV) >= 16); 676 static_assert(sizeof(_Tp) <= 8); 677 #ifdef __clang__ 678 return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a; 679 #else 680 using _IntT 681 = conditional_t<(sizeof(_Tp) > 2), 682 conditional_t
, 683 conditional_t
>; 684 [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a); 685 [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b); 686 if constexpr (sizeof(_TV) == 64) 687 { 688 if constexpr (sizeof(_Tp) == 1) 689 return reinterpret_cast<_TV>( 690 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k)); 691 else if constexpr (sizeof(_Tp) == 2) 692 return reinterpret_cast<_TV>( 693 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k)); 694 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 695 return __builtin_ia32_blendmps_512_mask(__a, __b, __k); 696 else if constexpr (sizeof(_Tp) == 4) 697 return reinterpret_cast<_TV>( 698 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k)); 699 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 700 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k); 701 else if constexpr (sizeof(_Tp) == 8) 702 return reinterpret_cast<_TV>( 703 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k)); 704 } 705 else if constexpr (sizeof(_TV) == 32) 706 { 707 if constexpr (sizeof(_Tp) == 1) 708 return reinterpret_cast<_TV>( 709 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k)); 710 else if constexpr (sizeof(_Tp) == 2) 711 return reinterpret_cast<_TV>( 712 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k)); 713 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 714 return __builtin_ia32_blendmps_256_mask(__a, __b, __k); 715 else if constexpr (sizeof(_Tp) == 4) 716 return reinterpret_cast<_TV>( 717 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k)); 718 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 719 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k); 720 else if constexpr (sizeof(_Tp) == 8) 721 return reinterpret_cast<_TV>( 722 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k)); 723 } 724 else if constexpr (sizeof(_TV) == 16) 725 { 726 if constexpr (sizeof(_Tp) == 1) 727 return reinterpret_cast<_TV>( 728 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k)); 729 else if constexpr (sizeof(_Tp) == 2) 730 return reinterpret_cast<_TV>( 731 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k)); 732 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 733 return __builtin_ia32_blendmps_128_mask(__a, __b, __k); 734 else if constexpr (sizeof(_Tp) == 4) 735 return reinterpret_cast<_TV>( 736 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k)); 737 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 738 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k); 739 else if constexpr (sizeof(_Tp) == 8) 740 return reinterpret_cast<_TV>( 741 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k)); 742 } 743 #endif 744 } 745 746 // }}} 747 // _S_blend_intrin {{{ 748 // Returns: __k ? __b : __a 749 // TODO: reverse __a and __b to match COND_EXPR 750 // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32 751 // Bytes wide 752 template
753 _GLIBCXX_SIMD_INTRINSIC static _Tp 754 _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept 755 { 756 static_assert(is_same_v
); 757 constexpr struct 758 { 759 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b, 760 __m128 __k) const noexcept 761 { 762 return __builtin_ia32_blendvps(__a, __b, __k); 763 } 764 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b, 765 __m128d __k) const noexcept 766 { 767 return __builtin_ia32_blendvpd(__a, __b, __k); 768 } 769 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b, 770 __m128i __k) const noexcept 771 { 772 return reinterpret_cast<__m128i>( 773 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a), 774 reinterpret_cast<__v16qi>(__b), 775 reinterpret_cast<__v16qi>(__k))); 776 } 777 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b, 778 __m256 __k) const noexcept 779 { 780 return __builtin_ia32_blendvps256(__a, __b, __k); 781 } 782 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b, 783 __m256d __k) const noexcept 784 { 785 return __builtin_ia32_blendvpd256(__a, __b, __k); 786 } 787 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b, 788 __m256i __k) const noexcept 789 { 790 if constexpr (__have_avx2) 791 return reinterpret_cast<__m256i>( 792 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a), 793 reinterpret_cast<__v32qi>(__b), 794 reinterpret_cast<__v32qi>(__k))); 795 else 796 return reinterpret_cast<__m256i>( 797 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a), 798 reinterpret_cast<__v8sf>(__b), 799 reinterpret_cast<__v8sf>(__k))); 800 } 801 } __eval; 802 return __eval(__a, __b, __k); 803 } 804 805 // }}} 806 // _S_blend {{{ 807 // Returns: __k ? __at1 : __at0 808 // TODO: reverse __at0 and __at1 to match COND_EXPR 809 template
810 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 811 _S_blend(_SimdWrapper
__k, _SimdWrapper<_Tp, _Np> __at0, 812 _SimdWrapper<_Tp, _Np> __at1) 813 { 814 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f); 815 if (__k._M_is_constprop() && __at0._M_is_constprop() 816 && __at1._M_is_constprop()) 817 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>( 818 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 819 return __k[__i] ? __at1[__i] : __at0[__i]; 820 }); 821 else if constexpr (sizeof(__at0) == 64 822 || (__have_avx512vl && sizeof(__at0) >= 16)) 823 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data); 824 else 825 { 826 static_assert((__have_avx512vl && sizeof(__at0) < 16) 827 || !__have_avx512vl); 828 constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp); 829 return __vector_bitcast<_Tp, _Np>( 830 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0), 831 __vector_bitcast<_Tp, __size>(__at1))); 832 } 833 } 834 835 template
836 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 837 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, 838 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) 839 { 840 const auto __kk = __wrapper_bitcast<_Tp>(__k); 841 if (__builtin_is_constant_evaluated() 842 || (__kk._M_is_constprop() && __at0._M_is_constprop() 843 && __at1._M_is_constprop())) 844 { 845 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1)); 846 if (__r._M_is_constprop()) 847 return __r; 848 } 849 if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl) 850 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 851 // convert to bitmask and call overload above 852 return _S_blend( 853 _SimdWrapper
( 854 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k) 855 ._M_to_bits()), 856 __at0, __at1); 857 else 858 { 859 // Since GCC does not assume __k to be a mask, using the builtin 860 // conditional operator introduces an extra compare against 0 before 861 // blending. So we rather call the intrinsic here. 862 if constexpr (__have_sse4_1) 863 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0), 864 __to_intrin(__at1)); 865 else 866 return __or(__andnot(__kk, __at0), __and(__kk, __at1)); 867 } 868 } 869 870 // }}} 871 }; 872 873 // }}} 874 // _SimdImplX86 {{{ 875 template
876 struct _SimdImplX86 : _SimdImplBuiltin<_Abi> 877 { 878 using _Base = _SimdImplBuiltin<_Abi>; 879 880 template
881 using _MaskMember = typename _Base::template _MaskMember<_Tp>; 882 883 template
884 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; 885 886 template
887 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; 888 889 template
890 static constexpr size_t _S_max_store_size 891 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64 892 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32 893 : 16; 894 895 using _MaskImpl = typename _Abi::_MaskImpl; 896 897 // _S_masked_load {{{ 898 template
899 static inline _SimdWrapper<_Tp, _Np> 900 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 901 const _Up* __mem) noexcept 902 { 903 static_assert(_Np == _S_size<_Tp>); 904 if constexpr (is_same_v<_Tp, _Up> || // no conversion 905 (sizeof(_Tp) == sizeof(_Up) 906 && is_integral_v< 907 _Tp> == is_integral_v<_Up>) // conversion via bit 908 // reinterpretation 909 ) 910 { 911 [[maybe_unused]] const auto __intrin = __to_intrin(__merge); 912 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) 913 && sizeof(_Tp) == 1) 914 { 915 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 916 if constexpr (sizeof(__intrin) == 16) 917 __merge = __vector_bitcast<_Tp, _Np>( 918 _mm_mask_loadu_epi8(__intrin, __kk, __mem)); 919 else if constexpr (sizeof(__merge) == 32) 920 __merge = __vector_bitcast<_Tp, _Np>( 921 _mm256_mask_loadu_epi8(__intrin, __kk, __mem)); 922 else if constexpr (sizeof(__merge) == 64) 923 __merge = __vector_bitcast<_Tp, _Np>( 924 _mm512_mask_loadu_epi8(__intrin, __kk, __mem)); 925 else 926 __assert_unreachable<_Tp>(); 927 } 928 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) 929 && sizeof(_Tp) == 2) 930 { 931 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 932 if constexpr (sizeof(__intrin) == 16) 933 __merge = __vector_bitcast<_Tp, _Np>( 934 _mm_mask_loadu_epi16(__intrin, __kk, __mem)); 935 else if constexpr (sizeof(__intrin) == 32) 936 __merge = __vector_bitcast<_Tp, _Np>( 937 _mm256_mask_loadu_epi16(__intrin, __kk, __mem)); 938 else if constexpr (sizeof(__intrin) == 64) 939 __merge = __vector_bitcast<_Tp, _Np>( 940 _mm512_mask_loadu_epi16(__intrin, __kk, __mem)); 941 else 942 __assert_unreachable<_Tp>(); 943 } 944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 945 && sizeof(_Tp) == 4 && is_integral_v<_Up>) 946 { 947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 948 if constexpr (sizeof(__intrin) == 16) 949 __merge = __vector_bitcast<_Tp, _Np>( 950 _mm_mask_loadu_epi32(__intrin, __kk, __mem)); 951 else if constexpr (sizeof(__intrin) == 32) 952 __merge = __vector_bitcast<_Tp, _Np>( 953 _mm256_mask_loadu_epi32(__intrin, __kk, __mem)); 954 else if constexpr (sizeof(__intrin) == 64) 955 __merge = __vector_bitcast<_Tp, _Np>( 956 _mm512_mask_loadu_epi32(__intrin, __kk, __mem)); 957 else 958 __assert_unreachable<_Tp>(); 959 } 960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 961 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>) 962 { 963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 964 if constexpr (sizeof(__intrin) == 16) 965 __merge = __vector_bitcast<_Tp, _Np>( 966 _mm_mask_loadu_ps(__intrin, __kk, __mem)); 967 else if constexpr (sizeof(__intrin) == 32) 968 __merge = __vector_bitcast<_Tp, _Np>( 969 _mm256_mask_loadu_ps(__intrin, __kk, __mem)); 970 else if constexpr (sizeof(__intrin) == 64) 971 __merge = __vector_bitcast<_Tp, _Np>( 972 _mm512_mask_loadu_ps(__intrin, __kk, __mem)); 973 else 974 __assert_unreachable<_Tp>(); 975 } 976 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 977 && is_integral_v<_Up>) 978 { 979 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 980 __merge 981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 982 __vector_bitcast<_Tp, _Np>( 983 __maskload_epi32(reinterpret_cast
(__mem), 984 __to_intrin(__k)))); 985 } 986 else if constexpr (__have_avx && sizeof(_Tp) == 4) 987 { 988 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 989 __merge 990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 991 __vector_bitcast<_Tp, _Np>( 992 __maskload_ps(reinterpret_cast
(__mem), 993 __to_intrin(__k)))); 994 } 995 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 996 && sizeof(_Tp) == 8 && is_integral_v<_Up>) 997 { 998 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 999 if constexpr (sizeof(__intrin) == 16) 1000 __merge = __vector_bitcast<_Tp, _Np>( 1001 _mm_mask_loadu_epi64(__intrin, __kk, __mem)); 1002 else if constexpr (sizeof(__intrin) == 32) 1003 __merge = __vector_bitcast<_Tp, _Np>( 1004 _mm256_mask_loadu_epi64(__intrin, __kk, __mem)); 1005 else if constexpr (sizeof(__intrin) == 64) 1006 __merge = __vector_bitcast<_Tp, _Np>( 1007 _mm512_mask_loadu_epi64(__intrin, __kk, __mem)); 1008 else 1009 __assert_unreachable<_Tp>(); 1010 } 1011 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 1012 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>) 1013 { 1014 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 1015 if constexpr (sizeof(__intrin) == 16) 1016 __merge = __vector_bitcast<_Tp, _Np>( 1017 _mm_mask_loadu_pd(__intrin, __kk, __mem)); 1018 else if constexpr (sizeof(__intrin) == 32) 1019 __merge = __vector_bitcast<_Tp, _Np>( 1020 _mm256_mask_loadu_pd(__intrin, __kk, __mem)); 1021 else if constexpr (sizeof(__intrin) == 64) 1022 __merge = __vector_bitcast<_Tp, _Np>( 1023 _mm512_mask_loadu_pd(__intrin, __kk, __mem)); 1024 else 1025 __assert_unreachable<_Tp>(); 1026 } 1027 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1028 && is_integral_v<_Up>) 1029 { 1030 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 1031 __merge 1032 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 1033 __vector_bitcast<_Tp, _Np>(__maskload_epi64( 1034 reinterpret_cast
(__mem), 1035 __to_intrin(__k)))); 1036 } 1037 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1038 { 1039 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 1040 __merge 1041 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 1042 __vector_bitcast<_Tp, _Np>( 1043 __maskload_pd(reinterpret_cast
(__mem), 1044 __to_intrin(__k)))); 1045 } 1046 else 1047 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1048 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1049 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1050 }); 1051 } 1052 /* Very uncertain, that the following improves anything. Needs 1053 benchmarking 1054 * before it's activated. 1055 else if constexpr (sizeof(_Up) <= 8 && // no long double 1056 !__converts_via_decomposition_v< 1057 _Up, _Tp, 1058 sizeof(__merge)> // conversion via decomposition 1059 // is better handled via the 1060 // bit_iteration fallback below 1061 ) 1062 { 1063 // TODO: copy pattern from _S_masked_store, which doesn't resort to 1064 // fixed_size 1065 using _Ap = simd_abi::deduce_t<_Up, _Np>; 1066 using _ATraits = _SimdTraits<_Up, _Ap>; 1067 using _AImpl = typename _ATraits::_SimdImpl; 1068 typename _ATraits::_SimdMember __uncvted{}; 1069 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template 1070 _S_convert<_Up>(__k); 1071 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem); 1072 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter; 1073 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted)); 1074 } 1075 */ 1076 else 1077 __merge = _Base::_S_masked_load(__merge, __k, __mem); 1078 return __merge; 1079 } 1080 1081 // }}} 1082 // _S_masked_store_nocvt {{{ 1083 template
1084 _GLIBCXX_SIMD_INTRINSIC static void 1085 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper
__k) 1086 { 1087 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1088 if constexpr (sizeof(__vi) == 64) 1089 { 1090 static_assert(sizeof(__v) == 64 && __have_avx512f); 1091 if constexpr (__have_avx512bw && sizeof(_Tp) == 1) 1092 _mm512_mask_storeu_epi8(__mem, __k, __vi); 1093 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2) 1094 _mm512_mask_storeu_epi16(__mem, __k, __vi); 1095 else if constexpr (__have_avx512f && sizeof(_Tp) == 4) 1096 { 1097 if constexpr (is_integral_v<_Tp>) 1098 _mm512_mask_storeu_epi32(__mem, __k, __vi); 1099 else 1100 _mm512_mask_storeu_ps(__mem, __k, __vi); 1101 } 1102 else if constexpr (__have_avx512f && sizeof(_Tp) == 8) 1103 { 1104 if constexpr (is_integral_v<_Tp>) 1105 _mm512_mask_storeu_epi64(__mem, __k, __vi); 1106 else 1107 _mm512_mask_storeu_pd(__mem, __k, __vi); 1108 } 1109 else 1110 __assert_unreachable<_Tp>(); 1111 } 1112 else if constexpr (sizeof(__vi) == 32) 1113 { 1114 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1115 _mm256_mask_storeu_epi8(__mem, __k, __vi); 1116 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1117 _mm256_mask_storeu_epi16(__mem, __k, __vi); 1118 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) 1119 { 1120 if constexpr (is_integral_v<_Tp>) 1121 _mm256_mask_storeu_epi32(__mem, __k, __vi); 1122 else 1123 _mm256_mask_storeu_ps(__mem, __k, __vi); 1124 } 1125 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) 1126 { 1127 if constexpr (is_integral_v<_Tp>) 1128 _mm256_mask_storeu_epi64(__mem, __k, __vi); 1129 else 1130 _mm256_mask_storeu_pd(__mem, __k, __vi); 1131 } 1132 else if constexpr (__have_avx512f 1133 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 1134 { 1135 // use a 512-bit maskstore, using zero-extension of the bitmask 1136 _S_masked_store_nocvt( 1137 _SimdWrapper64<_Tp>( 1138 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)), 1139 __mem, _SimdWrapper
(__k._M_data)); 1140 } 1141 else 1142 _S_masked_store_nocvt(__v, __mem, 1143 _MaskImpl::template _S_to_maskvector< 1144 __int_for_sizeof_t<_Tp>, _Np>(__k)); 1145 } 1146 else if constexpr (sizeof(__vi) == 16) 1147 { 1148 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1149 _mm_mask_storeu_epi8(__mem, __k, __vi); 1150 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1151 _mm_mask_storeu_epi16(__mem, __k, __vi); 1152 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) 1153 { 1154 if constexpr (is_integral_v<_Tp>) 1155 _mm_mask_storeu_epi32(__mem, __k, __vi); 1156 else 1157 _mm_mask_storeu_ps(__mem, __k, __vi); 1158 } 1159 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) 1160 { 1161 if constexpr (is_integral_v<_Tp>) 1162 _mm_mask_storeu_epi64(__mem, __k, __vi); 1163 else 1164 _mm_mask_storeu_pd(__mem, __k, __vi); 1165 } 1166 else if constexpr (__have_avx512f 1167 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 1168 { 1169 // use a 512-bit maskstore, using zero-extension of the bitmask 1170 _S_masked_store_nocvt( 1171 _SimdWrapper64<_Tp>( 1172 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)), 1173 __mem, _SimdWrapper
(__k._M_data)); 1174 } 1175 else 1176 _S_masked_store_nocvt(__v, __mem, 1177 _MaskImpl::template _S_to_maskvector< 1178 __int_for_sizeof_t<_Tp>, _Np>(__k)); 1179 } 1180 else 1181 __assert_unreachable<_Tp>(); 1182 } 1183 1184 template
1185 _GLIBCXX_SIMD_INTRINSIC static void 1186 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, 1187 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k) 1188 { 1189 if constexpr (sizeof(__v) <= 16) 1190 { 1191 [[maybe_unused]] const auto __vi 1192 = __intrin_bitcast<__m128i>(__as_vector(__v)); 1193 [[maybe_unused]] const auto __ki 1194 = __intrin_bitcast<__m128i>(__as_vector(__k)); 1195 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1196 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi); 1197 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1198 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi); 1199 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 1200 && is_integral_v<_Tp>) 1201 _mm_maskstore_epi32(reinterpret_cast
(__mem), __ki, __vi); 1202 else if constexpr (__have_avx && sizeof(_Tp) == 4) 1203 _mm_maskstore_ps(reinterpret_cast
(__mem), __ki, 1204 __vector_bitcast
(__vi)); 1205 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1206 && is_integral_v<_Tp>) 1207 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi); 1208 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1209 _mm_maskstore_pd(reinterpret_cast
(__mem), __ki, 1210 __vector_bitcast
(__vi)); 1211 else 1212 _Base::_S_masked_store_nocvt(__v, __mem, __k); 1213 } 1214 else if constexpr (sizeof(__v) == 32) 1215 { 1216 [[maybe_unused]] const auto __vi 1217 = __intrin_bitcast<__m256i>(__as_vector(__v)); 1218 [[maybe_unused]] const auto __ki 1219 = __intrin_bitcast<__m256i>(__as_vector(__k)); 1220 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1221 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi); 1222 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1223 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi); 1224 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 1225 && is_integral_v<_Tp>) 1226 _mm256_maskstore_epi32(reinterpret_cast
(__mem), __ki, __vi); 1227 else if constexpr (sizeof(_Tp) == 4) 1228 _mm256_maskstore_ps(reinterpret_cast
(__mem), __ki, 1229 __vector_bitcast
(__v)); 1230 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1231 && is_integral_v<_Tp>) 1232 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, 1233 __vi); 1234 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1235 _mm256_maskstore_pd(reinterpret_cast
(__mem), __ki, 1236 __vector_bitcast
(__v)); 1237 else 1238 _Base::_S_masked_store_nocvt(__v, __mem, __k); 1239 } 1240 else 1241 __assert_unreachable<_Tp>(); 1242 } 1243 1244 // }}} 1245 // _S_masked_store {{{ 1246 template
1247 _GLIBCXX_SIMD_INTRINSIC static void 1248 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem, 1249 const _MaskMember<_Tp> __k) noexcept 1250 { 1251 if constexpr (is_integral_v< 1252 _Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up) 1253 && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw) 1254 && (sizeof(__v) == 64 || __have_avx512vl)) 1255 { // truncating store 1256 const auto __vi = __to_intrin(__v); 1257 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 1258 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1259 && sizeof(__vi) == 64) 1260 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1261 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1262 && sizeof(__vi) == 32) 1263 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1264 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1265 && sizeof(__vi) == 16) 1266 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1267 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1268 && sizeof(__vi) == 64) 1269 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1270 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1271 && sizeof(__vi) == 32) 1272 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1273 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1274 && sizeof(__vi) == 16) 1275 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1276 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1277 && sizeof(__vi) == 64) 1278 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1279 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1280 && sizeof(__vi) == 32) 1281 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1282 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1283 && sizeof(__vi) == 16) 1284 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1285 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1286 && sizeof(__vi) == 64) 1287 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1288 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1289 && sizeof(__vi) == 32) 1290 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1291 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1292 && sizeof(__vi) == 16) 1293 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1294 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1295 && sizeof(__vi) == 64) 1296 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1297 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1298 && sizeof(__vi) == 32) 1299 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1300 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1301 && sizeof(__vi) == 16) 1302 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1303 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1304 && sizeof(__vi) == 64) 1305 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1306 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1307 && sizeof(__vi) == 32) 1308 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1309 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1310 && sizeof(__vi) == 16) 1311 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1312 else 1313 __assert_unreachable<_Tp>(); 1314 } 1315 else 1316 _Base::_S_masked_store(__v, __mem, __k); 1317 } 1318 1319 // }}} 1320 // _S_multiplies {{{ 1321 template
> 1322 _GLIBCXX_SIMD_INTRINSIC static constexpr _V 1323 _S_multiplies(_V __x, _V __y) 1324 { 1325 using _Tp = typename _VVT::value_type; 1326 if (__builtin_is_constant_evaluated() || __x._M_is_constprop() 1327 || __y._M_is_constprop()) 1328 return __as_vector(__x) * __as_vector(__y); 1329 else if constexpr (sizeof(_Tp) == 1) 1330 { 1331 if constexpr (sizeof(_V) == 2) 1332 { 1333 const auto __xs = reinterpret_cast
(__x._M_data); 1334 const auto __ys = reinterpret_cast
(__y._M_data); 1335 return reinterpret_cast<__vector_type_t<_Tp, 2>>(short( 1336 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00)))); 1337 } 1338 else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3) 1339 { 1340 const auto __xi = reinterpret_cast
(__x._M_data); 1341 const auto __yi = reinterpret_cast
(__y._M_data); 1342 return reinterpret_cast<__vector_type_t<_Tp, 3>>( 1343 ((__xi * __yi) & 0xff) 1344 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) 1345 | ((__xi >> 16) * (__yi & 0xff0000))); 1346 } 1347 else if constexpr (sizeof(_V) == 4) 1348 { 1349 const auto __xi = reinterpret_cast
(__x._M_data); 1350 const auto __yi = reinterpret_cast
(__y._M_data); 1351 return reinterpret_cast<__vector_type_t<_Tp, 4>>( 1352 ((__xi * __yi) & 0xff) 1353 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) 1354 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000) 1355 | ((__xi >> 24) * (__yi & 0xff000000u))); 1356 } 1357 else if constexpr (sizeof(_V) == 8 && __have_avx2 1358 && is_signed_v<_Tp>) 1359 return __convert
( 1360 __vector_bitcast
(_mm_cvtepi8_epi16(__to_intrin(__x))) 1361 * __vector_bitcast
(_mm_cvtepi8_epi16(__to_intrin(__y)))); 1362 else if constexpr (sizeof(_V) == 8 && __have_avx2 1363 && is_unsigned_v<_Tp>) 1364 return __convert
( 1365 __vector_bitcast
(_mm_cvtepu8_epi16(__to_intrin(__x))) 1366 * __vector_bitcast
(_mm_cvtepu8_epi16(__to_intrin(__y)))); 1367 else 1368 { 1369 // codegen of `x*y` is suboptimal (as of GCC 9.0.1) 1370 constexpr size_t __full_size = _VVT::_S_full_size; 1371 constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8; 1372 using _ShortW = _SimdWrapper
; 1373 const _ShortW __even = __vector_bitcast
(__x) 1374 * __vector_bitcast
(__y); 1375 _ShortW __high_byte = _ShortW()._M_data - 256; 1376 //[&]() { asm("" : "+x"(__high_byte._M_data)); }(); 1377 const _ShortW __odd 1378 = (__vector_bitcast
(__x) >> 8) 1379 * (__vector_bitcast
(__y) & __high_byte._M_data); 1380 if constexpr (__have_avx512bw && sizeof(_V) > 2) 1381 return _CommonImplX86::_S_blend_avx512( 1382 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even), 1383 __vector_bitcast<_Tp>(__odd)); 1384 else if constexpr (__have_sse4_1 && sizeof(_V) > 2) 1385 return _CommonImplX86::_S_blend_intrin(__to_intrin( 1386 __high_byte), 1387 __to_intrin(__even), 1388 __to_intrin(__odd)); 1389 else 1390 return __to_intrin( 1391 __or(__andnot(__high_byte, __even), __odd)); 1392 } 1393 } 1394 else 1395 return _Base::_S_multiplies(__x, __y); 1396 } 1397 1398 // }}} 1399 // _S_divides {{{ 1400 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993 1401 template
1402 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1403 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1404 { 1405 if (!__builtin_is_constant_evaluated() 1406 && !__builtin_constant_p(__y._M_data)) 1407 if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4) 1408 { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1) 1409 // Note that using floating-point division is likely to raise the 1410 // *Inexact* exception flag and thus appears like an invalid 1411 // "as-if" transformation. However, C++ doesn't specify how the 1412 // fpenv can be observed and points to C. C says that function 1413 // calls are assumed to potentially raise fp exceptions, unless 1414 // documented otherwise. Consequently, operator/, which is a 1415 // function call, may raise fp exceptions. 1416 /*const struct _CsrGuard 1417 { 1418 const unsigned _M_data = _mm_getcsr(); 1419 _CsrGuard() 1420 { 1421 _mm_setcsr(0x9f80); // turn off FP exceptions and 1422 flush-to-zero 1423 } 1424 ~_CsrGuard() { _mm_setcsr(_M_data); } 1425 } __csr;*/ 1426 using _Float = conditional_t
; 1427 constexpr size_t __n_intermediate 1428 = std::min(_Np, (__have_avx512f ? 64 1429 : __have_avx ? 32 1430 : 16) 1431 / sizeof(_Float)); 1432 using _FloatV = __vector_type_t<_Float, __n_intermediate>; 1433 constexpr size_t __n_floatv 1434 = __div_roundup(_Np, __n_intermediate); 1435 using _R = __vector_type_t<_Tp, _Np>; 1436 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x); 1437 const auto __yf = __convert_all<_FloatV, __n_floatv>( 1438 _Abi::__make_padding_nonzero(__as_vector(__y))); 1439 return __call_with_n_evaluations<__n_floatv>( 1440 [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1441 return __vector_convert<_R>(__quotients...); 1442 }, 1443 [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 1444 -> _SimdWrapper<_Float, __n_intermediate> 1445 { 1446 #if __RECIPROCAL_MATH__ 1447 // If -freciprocal-math is active, using the `/` operator is 1448 // incorrect because it may be translated to an imprecise 1449 // multiplication with reciprocal. We need to use inline 1450 // assembly to force a real division. 1451 _FloatV __r; 1452 if constexpr (__have_avx) // -mno-sse2avx is irrelevant 1453 // because once -mavx is given, GCC 1454 // emits VEX encoded vdivp[sd] 1455 { 1456 if constexpr (sizeof(_Tp) == 4) 1457 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}" 1458 : "=x"(__r) 1459 : "x"(__xf[__i]), "x"(__yf[__i])); 1460 else 1461 asm("vdivps\t{%2, %1, %0|%0, %1, %2}" 1462 : "=x"(__r) 1463 : "x"(__xf[__i]), "x"(__yf[__i])); 1464 } 1465 else 1466 { 1467 __r = __xf[__i]; 1468 if constexpr (sizeof(_Tp) == 4) 1469 asm("divpd\t{%1, %0|%0, %1}" 1470 : "=x"(__r) 1471 : "x"(__yf[__i])); 1472 else 1473 asm("divps\t{%1, %0|%0, %1}" 1474 : "=x"(__r) 1475 : "x"(__yf[__i])); 1476 } 1477 return __r; 1478 #else 1479 return __xf[__i] / __yf[__i]; 1480 #endif 1481 }); 1482 } 1483 /* 64-bit int division is potentially optimizable via double division if 1484 * the value in __x is small enough and the conversion between 1485 * int<->double is efficient enough: 1486 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && 1487 sizeof(_Tp) == 8) 1488 { 1489 if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1490 { 1491 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, 1492 0xffe0'0000'0000'0000ull})) 1493 { 1494 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) 1495 } 1496 } 1497 } 1498 */ 1499 return _Base::_S_divides(__x, __y); 1500 } 1501 #else 1502 using _Base::_S_divides; 1503 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 1504 1505 // }}} 1506 // _S_modulus {{{ 1507 template
1508 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1509 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1510 { 1511 if (__builtin_is_constant_evaluated() 1512 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) 1513 return _Base::_S_modulus(__x, __y); 1514 else 1515 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); 1516 } 1517 1518 // }}} 1519 // _S_bit_shift_left {{{ 1520 // Notes on UB. C++2a [expr.shift] says: 1521 // -1- [...] The operands shall be of integral or unscoped enumeration type 1522 // and integral promotions are performed. The type of the result is that 1523 // of the promoted left operand. The behavior is undefined if the right 1524 // operand is negative, or greater than or equal to the width of the 1525 // promoted left operand. 1526 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo 1527 // 2^N, where N is the width of the type of the result. 1528 // 1529 // C++17 [expr.shift] says: 1530 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated 1531 // bits are zero-filled. If E1 has an unsigned type, the value of the 1532 // result is E1 × 2^E2 , reduced modulo one more than the maximum value 1533 // representable in the result type. Otherwise, if E1 has a signed type 1534 // and non-negative value, and E1 × 2^E2 is representable in the 1535 // corresponding unsigned type of the result type, then that value, 1536 // converted to the result type, is the resulting value; otherwise, the 1537 // behavior is undefined. 1538 // 1539 // Consequences: 1540 // With C++2a signed and unsigned types have the same UB 1541 // characteristics: 1542 // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) 1543 // 1544 // With C++17 there's little room for optimizations because the standard 1545 // requires all shifts to happen on promoted integrals (i.e. int). Thus, 1546 // short and char shifts must assume shifts affect bits of neighboring 1547 // values. 1548 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1549 template
> 1550 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1551 _S_bit_shift_left(_Tp __xx, int __y) 1552 { 1553 using _V = typename _TVT::type; 1554 using _Up = typename _TVT::value_type; 1555 _V __x = __xx; 1556 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1557 if (__builtin_is_constant_evaluated()) 1558 return __x << __y; 1559 #if __cplusplus > 201703 1560 // after C++17, signed shifts have no UB, and behave just like unsigned 1561 // shifts 1562 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1563 return __vector_bitcast<_Up>( 1564 _S_bit_shift_left(__vector_bitcast
>(__x), 1565 __y)); 1566 #endif 1567 else if constexpr (sizeof(_Up) == 1) 1568 { 1569 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1570 if (__builtin_constant_p(__y)) 1571 { 1572 if (__y == 0) 1573 return __x; 1574 else if (__y == 1) 1575 return __x + __x; 1576 else if (__y == 2) 1577 { 1578 __x = __x + __x; 1579 return __x + __x; 1580 } 1581 else if (__y > 2 && __y < 8) 1582 { 1583 if constexpr (sizeof(__x) > sizeof(unsigned)) 1584 { 1585 const _UChar __mask = 0xff << __y; // precomputed vector 1586 return __vector_bitcast<_Up>( 1587 __vector_bitcast<_UChar>( 1588 __vector_bitcast
(__x) << __y) 1589 & __mask); 1590 } 1591 else 1592 { 1593 const unsigned __mask 1594 = (0xff & (0xff << __y)) * 0x01010101u; 1595 return reinterpret_cast<_V>( 1596 static_cast<__int_for_sizeof_t<_V>>( 1597 unsigned( 1598 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1599 << __y) 1600 & __mask)); 1601 } 1602 } 1603 else if (__y >= 8 && __y < 32) 1604 return _V(); 1605 else 1606 __builtin_unreachable(); 1607 } 1608 // general strategy in the following: use an sllv instead of sll 1609 // instruction, because it's 2 to 4 times faster: 1610 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1611 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1612 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1613 _mm256_set1_epi16(__y)))); 1614 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1615 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1616 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1617 _mm512_set1_epi16(__y)))); 1618 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1619 { 1620 const auto __shift = _mm512_set1_epi16(__y); 1621 return __vector_bitcast<_Up>( 1622 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1623 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1624 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1625 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1626 } 1627 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1628 { 1629 #if 1 1630 const auto __shift = _mm_cvtsi32_si128(__y); 1631 auto __k 1632 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1633 __k |= _mm256_srli_epi16(__k, 8); 1634 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1635 & __k); 1636 #else 1637 const _Up __k = 0xff << __y; 1638 return __vector_bitcast<_Up>(__vector_bitcast
(__x) << __y) 1639 & __k; 1640 #endif 1641 } 1642 else 1643 { 1644 const auto __shift = _mm_cvtsi32_si128(__y); 1645 auto __k 1646 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1647 __k |= _mm_srli_epi16(__k, 8); 1648 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1649 } 1650 } 1651 return __x << __y; 1652 } 1653 1654 template
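// [editorial note: not part of the upstream header] The sizeof(_Up) == 1 branch above
// works around the lack of an 8-bit shift instruction on x86: it shifts in wider lanes
// and masks off the bits that crossed a byte boundary (0xff << y), or widens to 16-bit
// lanes and uses vpsllvw where AVX-512BW is available. A minimal standalone sketch of
// the masking idea, assuming only SSE2 (illustrative, not the library's API):
//
//   #include <immintrin.h>
//   // Shift every unsigned 8-bit lane of x left by the same count y, 0 <= y < 8.
//   inline __m128i sll_epu8(__m128i x, int y)
//   {
//     const __m128i cnt  = _mm_cvtsi32_si128(y);
//     const __m128i wide = _mm_sll_epi16(x, cnt);              // shift 16-bit lanes
//     const __m128i mask = _mm_set1_epi8((char) (0xff << y));  // clear bits pulled in
//     return _mm_and_si128(wide, mask);                        //   from the low neighbour
//   }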
<typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1655 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1656 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) 1657 { 1658 using _V = typename _TVT::type; 1659 using _Up = typename _TVT::value_type; 1660 _V __x = __xx; 1661 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1662 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1663 if (__builtin_is_constant_evaluated()) 1664 return __x << __y; 1665 #if __cplusplus > 201703 1666 // after C++17, signed shifts have no UB, and behave just like unsigned 1667 // shifts 1668 else if constexpr (is_signed_v<_Up>) 1669 return __vector_bitcast<_Up>( 1670 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1671 __vector_bitcast<make_unsigned_t<_Up>
>(__y))); 1672 #endif 1673 else if constexpr (sizeof(_Up) == 1) 1674 { 1675 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1676 return __vector_bitcast<_Up>(__concat( 1677 _mm512_cvtepi16_epi8( 1678 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1679 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1680 _mm512_cvtepi16_epi8( 1681 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1682 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1683 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1684 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1685 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1686 _mm512_cvtepu8_epi16(__iy)))); 1687 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1688 return __intrin_bitcast<_V>( 1689 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1690 _mm_cvtepu8_epi16(__iy)))); 1691 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1692 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1693 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1694 _mm256_cvtepu8_epi16(__iy)))); 1695 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1696 return __intrin_bitcast<_V>( 1697 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1698 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1699 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1700 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1701 { 1702 auto __mask 1703 = __vector_bitcast<_Up>(__vector_bitcast
(__y) << 5); 1704 auto __x4 1705 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 4); 1706 __x4 &= char(0xf0); 1707 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1708 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1709 __mask += __mask; 1710 auto __x2 1711 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 2); 1712 __x2 &= char(0xfc); 1713 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1714 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1715 __mask += __mask; 1716 auto __x1 = __x + __x; 1717 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1718 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1719 return __x 1720 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1721 } 1722 else if constexpr (sizeof(__x) == 16) 1723 { 1724 auto __mask 1725 = __vector_bitcast<_UChar>(__vector_bitcast
(__y) << 5); 1726 auto __x4 1727 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 4); 1728 __x4 &= char(0xf0); 1729 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1730 __mask += __mask; 1731 auto __x2 1732 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 2); 1733 __x2 &= char(0xfc); 1734 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1735 __mask += __mask; 1736 auto __x1 = __x + __x; 1737 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x; 1738 return __x 1739 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1740 } 1741 else 1742 return __x << __y; 1743 } 1744 else if constexpr (sizeof(_Up) == 2) 1745 { 1746 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1747 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1748 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1749 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1750 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1751 return __vector_bitcast<_Up>( 1752 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1753 _mm512_castsi256_si512(__iy)))); 1754 else if constexpr (sizeof __ix == 32 && __have_avx2) 1755 { 1756 const auto __ux = __vector_bitcast
(__x); 1757 const auto __uy = __vector_bitcast
(__y); 1758 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1759 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1760 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1761 } 1762 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1763 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1764 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1765 return __intrin_bitcast<_V>( 1766 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1767 _mm512_castsi128_si512(__iy)))); 1768 else if constexpr (sizeof __ix == 16 && __have_avx2) 1769 { 1770 const auto __ux = __vector_bitcast
(__ix); 1771 const auto __uy = __vector_bitcast
(__iy); 1772 return __intrin_bitcast<_V>(_mm_blend_epi16( 1773 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1774 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1775 } 1776 else if constexpr (sizeof __ix == 16) 1777 { 1778 using _Float4 = __vector_type_t
<float, 4>; 1779 using _Int4 = __vector_type_t<int, 4>; 1780 using _UInt4 = __vector_type_t<unsigned, 4>; 1781 const _UInt4 __yu 1782 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3))); 1783 return __x 1784 * __intrin_bitcast<_V>( 1785 __vector_convert<_Int4>(_SimdWrapper<float, 4>( 1786 reinterpret_cast<_Float4>(__yu << 23))) 1787 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>
( 1788 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1789 << 16)); 1790 } 1791 else 1792 __assert_unreachable<_Tp>(); 1793 } 1794 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1795 && !__have_avx2) 1796 // latency is suboptimal, but throughput is at full speedup 1797 return __intrin_bitcast<_V>( 1798 __vector_bitcast
(__ix) 1799 * __vector_convert<__vector_type16_t
>( 1800 _SimdWrapper
(__vector_bitcast
( 1801 (__vector_bitcast
(__y) << 23) + 0x3f80'0000)))); 1802 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1803 && !__have_avx2) 1804 { 1805 const auto __lo = _mm_sll_epi64(__ix, __iy); 1806 const auto __hi 1807 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1808 if constexpr (__have_sse4_1) 1809 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1810 else 1811 return __vector_bitcast<_Up>( 1812 _mm_move_sd(__vector_bitcast
(__hi), 1813 __vector_bitcast
(__lo))); 1814 } 1815 else 1816 return __x << __y; 1817 } 1818 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1819 1820 // }}} 1821 // _S_bit_shift_right {{{ 1822 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1823 template
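// [editorial note: not part of the upstream header] The sizeof(_Up) == 2 fallback above
// (the __yu << 23 expression) has no per-lane 16-bit variable shift to fall back on, so
// it builds 2^y per lane by writing y plus the bias 127 (0x3f8 >> 3) into a float's
// exponent field, converts that float back to an integer and multiplies. A scalar
// illustration of the exponent trick, assuming the usual IEEE-754 binary32 layout:
//
//   #include <cstdint>
//   #include <cstring>
//   inline std::uint32_t pow2(unsigned y)              // 2^y for 0 <= y <= 30
//   {
//     const std::uint32_t bits = (y + 127u) << 23;     // biased exponent, zero mantissa
//     float f;
//     std::memcpy(&f, &bits, sizeof f);                // f == 2^y exactly
//     return (std::uint32_t) f;
//   }
//   // so, per 16-bit lane, x << y == (std::uint16_t) (x * pow2(y)).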
<typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1824 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1825 _S_bit_shift_right(_Tp __xx, int __y) 1826 { 1827 using _V = typename _TVT::type; 1828 using _Up = typename _TVT::value_type; 1829 _V __x = __xx; 1830 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1831 if (__builtin_is_constant_evaluated()) 1832 return __x >> __y; 1833 else if (__builtin_constant_p(__y) 1834 && is_unsigned_v< 1835 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1836 return _V(); 1837 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1838 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1839 & _Up(0xff >> __y); 1840 //}}} 1841 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1842 return __intrin_bitcast<_V>( 1843 (__vector_bitcast<_UShort>(__vector_bitcast<short>
(__ix) 1844 >> (__y + 8)) 1845 << 8) 1846 | (__vector_bitcast<_UShort>( 1847 __vector_bitcast
(__vector_bitcast<_UShort>(__ix) << 8) 1848 >> __y) 1849 >> 8)); 1850 //}}} 1851 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1852 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1853 { 1854 if (__y > 32) 1855 return (__intrin_bitcast<_V>(__vector_bitcast
(__ix) >> 32) 1856 & _Up(0xffff'ffff'0000'0000ull)) 1857 | __vector_bitcast<_Up>( 1858 __vector_bitcast
(__vector_bitcast<_ULLong>(__ix) 1859 >> 32) 1860 >> (__y - 32)); 1861 else 1862 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1863 >> __y) 1864 | __vector_bitcast<_Up>( 1865 __vector_bitcast
(__ix & -0x8000'0000'0000'0000ll) 1866 >> __y); 1867 } 1868 //}}} 1869 else 1870 return __x >> __y; 1871 } 1872 1873 template
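// [editorial note: not part of the upstream header] The signed sizeof(_Up) == 1 branch
// above emulates an arithmetic byte shift with 16-bit shifts: the high byte of each
// pair is shifted down by __y + 8 so its sign extends, while the low byte is first
// moved into the high half of the lane, shifted, and moved back. A scalar sketch of
// the low-byte half, assuming two's complement and arithmetic >> on signed values
// (true for GCC on x86):
//
//   #include <cstdint>
//   inline std::int8_t sar_int8(std::int8_t b, int y)            // 0 <= y < 8
//   {
//     const std::int16_t hi = (std::int16_t) (b * 256);          // move byte to the high half
//     return (std::int8_t) ((std::uint16_t) (hi >> y) >> 8);     // shift with sign, move back
//   }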
> 1874 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1875 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1876 { 1877 using _V = typename _TVT::type; 1878 using _Up = typename _TVT::value_type; 1879 _V __x = __xx; 1880 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1881 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1882 if (__builtin_is_constant_evaluated() 1883 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1884 return __x >> __y; 1885 else if constexpr (sizeof(_Up) == 1) //{{{ 1886 { 1887 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1888 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1889 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1890 _mm_cvtepi8_epi16(__iy)) 1891 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1892 _mm_cvtepu8_epi16(__iy)))); 1893 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1894 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1895 is_signed_v<_Up> 1896 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1897 _mm256_cvtepi8_epi16(__iy)) 1898 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1899 _mm256_cvtepu8_epi16(__iy)))); 1900 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1901 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1902 is_signed_v<_Up> 1903 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1904 _mm512_cvtepi8_epi16(__iy)) 1905 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1906 _mm512_cvtepu8_epi16(__iy)))); 1907 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1908 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1909 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1910 0x5555'5555'5555'5555ull, 1911 _mm512_srav_epi16( 1912 _mm512_slli_epi16(__ix, 8), 1913 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1914 _mm512_set1_epi16(8))))); 1915 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1916 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1917 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1918 0x5555'5555'5555'5555ull, 1919 _mm512_srlv_epi16( 1920 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1921 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1922 /* This has better throughput but higher latency than the impl below 1923 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1924 is_unsigned_v<_Up>) 1925 { 1926 const auto __shorts = __to_intrin(_S_bit_shift_right( 1927 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1928 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1929 return __vector_bitcast<_Up>( 1930 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1931 } 1932 */ 1933 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1934 // the following uses vpsr[al]vd, which requires AVX2 1935 if constexpr (is_signed_v<_Up>) 1936 { 1937 const auto r3 = __vector_bitcast<_UInt>( 1938 (__vector_bitcast
<int>(__x) 1939 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1940 & 0xff000000u; 1941 const auto r2 1942 = __vector_bitcast<_UInt>( 1943 ((__vector_bitcast<int>(__x) << 8) 1944 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1945 & 0xff000000u; 1946 const auto r1 1947 = __vector_bitcast<_UInt>( 1948 ((__vector_bitcast<int>(__x) << 16) 1949 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1950 & 0xff000000u; 1951 const auto r0 = __vector_bitcast<_UInt>( 1952 (__vector_bitcast<int>
(__x) << 24) 1953 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1954 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1955 | (r0 >> 24)); 1956 } 1957 else 1958 { 1959 const auto r3 = (__vector_bitcast<_UInt>(__x) 1960 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1961 & 0xff000000u; 1962 const auto r2 1963 = ((__vector_bitcast<_UInt>(__x) << 8) 1964 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1965 & 0xff000000u; 1966 const auto r1 1967 = ((__vector_bitcast<_UInt>(__x) << 16) 1968 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 1969 & 0xff000000u; 1970 const auto r0 1971 = (__vector_bitcast<_UInt>(__x) << 24) 1972 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 1973 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1974 | (r0 >> 24)); 1975 } 1976 else if constexpr (__have_sse4_1 1977 && is_unsigned_v<_Up> && sizeof(__x) > 2) 1978 { 1979 auto __x128 = __vector_bitcast<_Up>(__ix); 1980 auto __mask 1981 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 1982 auto __x4 = __vector_bitcast<_Up>( 1983 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 1984 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1985 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 1986 __mask += __mask; 1987 auto __x2 = __vector_bitcast<_Up>( 1988 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 1989 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1990 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 1991 __mask += __mask; 1992 auto __x1 = __vector_bitcast<_Up>( 1993 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 1994 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1995 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 1996 return __intrin_bitcast<_V>( 1997 __x128 1998 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 1999 == 0)); // y > 7 nulls the result 2000 } 2001 else if constexpr (__have_sse4_1 2002 && is_signed_v<_Up> && sizeof(__x) > 2) 2003 { 2004 auto __mask = __vector_bitcast<_UChar>( 2005 __vector_bitcast<_UShort>(__iy) << 5); 2006 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2007 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 2008 }; 2009 auto __xh = __vector_bitcast
(__ix); 2010 auto __xl = __vector_bitcast
(__ix) << 8; 2011 auto __xh4 = __xh >> 4; 2012 auto __xl4 = __xl >> 4; 2013 __xh = __vector_bitcast
(_CommonImplX86::_S_blend_intrin( 2014 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 2015 __xl = __vector_bitcast
( 2016 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2017 __to_intrin(__xl4))); 2018 __mask += __mask; 2019 auto __xh2 = __xh >> 2; 2020 auto __xl2 = __xl >> 2; 2021 __xh = __vector_bitcast
(_CommonImplX86::_S_blend_intrin( 2022 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2023 __xl = __vector_bitcast
( 2024 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2025 __to_intrin(__xl2))); 2026 __mask += __mask; 2027 auto __xh1 = __xh >> 1; 2028 auto __xl1 = __xl >> 1; 2029 __xh = __vector_bitcast
(_CommonImplX86::_S_blend_intrin( 2030 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2031 __xl = __vector_bitcast
( 2032 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2033 __to_intrin(__xl1))); 2034 return __intrin_bitcast<_V>( 2035 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2036 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2037 >> 8)) 2038 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2039 == 0)); // y > 7 nulls the result 2040 } 2041 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2042 { 2043 auto __mask 2044 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2045 auto __x4 = __vector_bitcast<_Up>( 2046 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2047 __x = __mask > 0x7f ? __x4 : __x; 2048 __mask += __mask; 2049 auto __x2 = __vector_bitcast<_Up>( 2050 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2051 __x = __mask > 0x7f ? __x2 : __x; 2052 __mask += __mask; 2053 auto __x1 = __vector_bitcast<_Up>( 2054 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2055 __x = __mask > 0x7f ? __x1 : __x; 2056 return __x 2057 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2058 } 2059 else if constexpr (sizeof(__x) > 2) // signed SSE2 2060 { 2061 static_assert(is_signed_v<_Up>); 2062 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2063 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2064 auto __xh = __vector_bitcast
<short>(__x); 2065 auto __xl = __vector_bitcast<short>
(__x) << 8; 2066 auto __xh4 = __xh >> 4; 2067 auto __xl4 = __xl >> 4; 2068 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2069 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2070 __maskh += __maskh; 2071 __maskl += __maskl; 2072 auto __xh2 = __xh >> 2; 2073 auto __xl2 = __xl >> 2; 2074 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2075 __xl = __maskl > 0x7fff ? __xl2 : __xl; 2076 __maskh += __maskh; 2077 __maskl += __maskl; 2078 auto __xh1 = __xh >> 1; 2079 auto __xl1 = __xl >> 1; 2080 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2081 __xl = __maskl > 0x7fff ? __xl1 : __xl; 2082 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2083 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2084 >> 8); 2085 return __x 2086 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2087 } 2088 else 2089 return __x >> __y; 2090 } //}}} 2091 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2092 { 2093 [[maybe_unused]] auto __blend_0xaa 2094 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2095 if constexpr (sizeof(__a) == 16) 2096 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2097 0xaa); 2098 else if constexpr (sizeof(__a) == 32) 2099 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2100 0xaa); 2101 else if constexpr (sizeof(__a) == 64) 2102 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2103 __to_intrin(__b)); 2104 else 2105 __assert_unreachable
(); 2106 }; 2107 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2108 return __intrin_bitcast<_V>(is_signed_v<_Up> 2109 ? _mm_srav_epi16(__ix, __iy) 2110 : _mm_srlv_epi16(__ix, __iy)); 2111 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2112 return __vector_bitcast<_Up>(is_signed_v<_Up> 2113 ? _mm256_srav_epi16(__ix, __iy) 2114 : _mm256_srlv_epi16(__ix, __iy)); 2115 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2116 return __vector_bitcast<_Up>(is_signed_v<_Up> 2117 ? _mm512_srav_epi16(__ix, __iy) 2118 : _mm512_srlv_epi16(__ix, __iy)); 2119 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2120 return __intrin_bitcast<_V>( 2121 __blend_0xaa(((__vector_bitcast
<int>(__ix) << 16) 2122 >> (__vector_bitcast<int>(__iy) & 0xffffu)) 2123 >> 16, 2124 __vector_bitcast<int>(__ix) 2125 >> (__vector_bitcast<int>
(__iy) >> 16))); 2126 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2127 return __intrin_bitcast<_V>( 2128 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2129 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2130 __vector_bitcast<_UInt>(__ix) 2131 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2132 else if constexpr (__have_sse4_1) 2133 { 2134 auto __mask = __vector_bitcast<_UShort>(__iy); 2135 auto __x128 = __vector_bitcast<_Up>(__ix); 2136 //__mask *= 0x0808; 2137 __mask = (__mask << 3) | (__mask << 11); 2138 // do __x128 = 0 where __y[4] is set 2139 __x128 = __vector_bitcast<_Up>( 2140 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2141 __to_intrin(__mask))); 2142 // do __x128 =>> 8 where __y[3] is set 2143 __x128 = __vector_bitcast<_Up>( 2144 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2145 __to_intrin(__mask += __mask))); 2146 // do __x128 =>> 4 where __y[2] is set 2147 __x128 = __vector_bitcast<_Up>( 2148 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2149 __to_intrin(__mask += __mask))); 2150 // do __x128 =>> 2 where __y[1] is set 2151 __x128 = __vector_bitcast<_Up>( 2152 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2153 __to_intrin(__mask += __mask))); 2154 // do __x128 =>> 1 where __y[0] is set 2155 return __intrin_bitcast<_V>( 2156 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2157 __to_intrin(__mask + __mask))); 2158 } 2159 else 2160 { 2161 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2162 auto __x128 = __vector_bitcast<_Up>(__ix); 2163 auto __mask 2164 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2165 return __vector_bitcast
(__kk) < 0; 2166 }; 2167 // do __x128 = 0 where __y[4] is set 2168 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2169 // do __x128 =>> 8 where __y[3] is set 2170 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2171 // do __x128 =>> 4 where __y[2] is set 2172 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128; 2173 // do __x128 =>> 2 where __y[1] is set 2174 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2175 // do __x128 =>> 1 where __y[0] is set 2176 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2177 : __x128); 2178 } 2179 } //}}} 2180 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2181 { 2182 if constexpr (is_unsigned_v<_Up>) 2183 { 2184 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2185 const __m128 __factor_f = reinterpret_cast<__m128>( 2186 0x4f00'0000u - (__vector_bitcast
(__y) << 23)); 2187 const __m128i __factor 2188 = __builtin_constant_p(__factor_f) 2189 ? __to_intrin( 2190 __make_vector
(__factor_f[0], __factor_f[1], 2191 __factor_f[2], __factor_f[3])) 2192 : _mm_cvttps_epi32(__factor_f); 2193 const auto __r02 2194 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2195 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2196 _mm_srli_si128(__factor, 4)); 2197 if constexpr (__have_sse4_1) 2198 return __intrin_bitcast<_V>( 2199 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2200 else 2201 return __intrin_bitcast<_V>( 2202 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2203 } 2204 else 2205 { 2206 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2207 if constexpr (is_signed_v<_Up>) 2208 return _mm_sra_epi32(__a, __b); 2209 else 2210 return _mm_srl_epi32(__a, __b); 2211 }; 2212 const auto __r0 2213 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2214 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2215 const auto __r2 2216 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2217 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2218 if constexpr (__have_sse4_1) 2219 return __intrin_bitcast<_V>( 2220 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2221 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2222 else 2223 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2224 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2225 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2226 } 2227 } //}}} 2228 else 2229 return __x >> __y; 2230 } 2231 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2232 2233 // }}} 2234 // compares {{{ 2235 // _S_equal_to {{{ 2236 template
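// [editorial note: not part of the upstream header] The sizeof(_Up) == 4, pre-AVX2
// unsigned branch above rewrites x >> y as (x * 2^(31-y)) >> 31: the factor is obtained
// by subtracting y << 23 from 0x4f00'0000 (the bit pattern of 2^31 as a float, so the
// exponent drops by y), and the widening 32x32->64 multiplies are done with pmuludq
// (_mm_mul_epu32). Scalar equivalent, assuming 0 <= y < 32:
//
//   #include <cstdint>
//   inline std::uint32_t srl(std::uint32_t x, unsigned y)
//   {
//     const std::uint64_t factor = 1ull << (31 - y);              // 2^(31-y)
//     return (std::uint32_t) ((std::uint64_t) x * factor >> 31);  // == x >> y
//   }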
2237 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2238 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2239 { 2240 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2241 { 2242 if (__builtin_is_constant_evaluated() 2243 || (__x._M_is_constprop() && __y._M_is_constprop())) 2244 return _MaskImpl::_S_to_bits( 2245 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2246 2247 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2248 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2249 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2250 if constexpr (is_floating_point_v<_Tp>) 2251 { 2252 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2253 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2254 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2255 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2256 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2257 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2258 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2259 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2260 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2261 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2263 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2264 else 2265 __assert_unreachable<_Tp>(); 2266 } 2267 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2268 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2269 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2270 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2271 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2272 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2273 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2274 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2275 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2276 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2277 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2278 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2279 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2280 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2281 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2282 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2283 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2284 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2285 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2286 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2287 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2288 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2289 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2290 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2291 else 2292 __assert_unreachable<_Tp>(); 2293 } // }}} 2294 else if (__builtin_is_constant_evaluated()) 2295 return _Base::_S_equal_to(__x, __y); 2296 else if constexpr (sizeof(__x) == 8) 2297 { 2298 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2299 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2300 _MaskMember<_Tp> __r64{}; 2301 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2302 return __r64; 2303 } 2304 else 2305 return _Base::_S_equal_to(__x, __y); 2306 } 2307 2308 // }}} 2309 // _S_not_equal_to {{{ 2310 template
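// [editorial note: not part of the upstream header] For AVX-512 ABIs the compare above
// produces a bit mask (__mmask*) rather than a vector mask, and __k1 restricts the
// comparison to the lanes the ABI actually uses, so padding lanes never set mask bits.
// A minimal sketch of the masked-compare pattern, assuming AVX-512F:
//
//   #include <immintrin.h>
//   inline __mmask16 eq_active(__mmask16 k1, __m512i a, __m512i b)
//   { return _mm512_mask_cmpeq_epi32_mask(k1, a, b); }  // bit i set iff k1 bit i and a[i] == b[i]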
2311 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2312 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2313 { 2314 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2315 { 2316 if (__builtin_is_constant_evaluated() 2317 || (__x._M_is_constprop() && __y._M_is_constprop())) 2318 return _MaskImpl::_S_to_bits( 2319 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2320 2321 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2322 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2323 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2324 if constexpr (is_floating_point_v<_Tp>) 2325 { 2326 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2327 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2328 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2329 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2330 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2331 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2332 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2333 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2334 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2335 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2336 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2337 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2338 else 2339 __assert_unreachable<_Tp>(); 2340 } 2341 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2342 return _mm512_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2343 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2344 return _mm512_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2345 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2346 return _mm512_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2347 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2348 return _mm512_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2349 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2350 return _mm256_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2351 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2352 return _mm256_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2353 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2354 return _mm256_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2355 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2356 return _mm256_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2357 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2358 return _mm_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2359 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2360 return _mm_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2361 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2362 return _mm_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2363 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2364 return _mm_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2365 else 2366 __assert_unreachable<_Tp>(); 2367 } // }}} 2368 else if (__builtin_is_constant_evaluated()) 2369 return _Base::_S_not_equal_to(__x, __y); 2370 else if constexpr (sizeof(__x) == 8) 2371 { 2372 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2373 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2374 _MaskMember<_Tp> __r64{}; 2375 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2376 return __r64; 2377 } 2378 else 2379 return _Base::_S_not_equal_to(__x, __y); 2380 } 2381 2382 // }}} 2383 // _S_less {{{ 2384 template
2385 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2386 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2387 { 2388 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2389 { 2390 if (__builtin_is_constant_evaluated() 2391 || (__x._M_is_constprop() && __y._M_is_constprop())) 2392 return _MaskImpl::_S_to_bits( 2393 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2394 2395 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2396 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2397 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2398 if constexpr (sizeof(__xi) == 64) 2399 { 2400 if constexpr (is_same_v<_Tp, float>) 2401 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2402 else if constexpr (is_same_v<_Tp, double>) 2403 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2404 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2405 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2406 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2407 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2408 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2409 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2410 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2411 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2412 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2413 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2414 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2415 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2416 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2417 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2418 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2419 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2420 else 2421 __assert_unreachable<_Tp>(); 2422 } 2423 else if constexpr (sizeof(__xi) == 32) 2424 { 2425 if constexpr (is_same_v<_Tp, float>) 2426 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2427 else if constexpr (is_same_v<_Tp, double>) 2428 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2429 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2430 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2431 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2432 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2433 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2434 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2435 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2436 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2437 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2438 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2439 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2440 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2441 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2442 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2443 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2444 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2445 else 2446 __assert_unreachable<_Tp>(); 2447 } 2448 else if constexpr (sizeof(__xi) == 16) 2449 { 2450 if constexpr (is_same_v<_Tp, float>) 2451 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2452 else if constexpr (is_same_v<_Tp, double>) 2453 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2454 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2455 return 
_mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2456 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2457 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2458 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2459 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2460 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2461 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2462 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2463 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2464 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2465 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2466 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2467 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2468 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2469 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2470 else 2471 __assert_unreachable<_Tp>(); 2472 } 2473 else 2474 __assert_unreachable<_Tp>(); 2475 } // }}} 2476 else if (__builtin_is_constant_evaluated()) 2477 return _Base::_S_less(__x, __y); 2478 else if constexpr (sizeof(__x) == 8) 2479 { 2480 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2481 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2482 _MaskMember<_Tp> __r64{}; 2483 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2484 return __r64; 2485 } 2486 else 2487 return _Base::_S_less(__x, __y); 2488 } 2489 2490 // }}} 2491 // _S_less_equal {{{ 2492 template
2493 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2494 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2495 { 2496 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2497 { 2498 if (__builtin_is_constant_evaluated() 2499 || (__x._M_is_constprop() && __y._M_is_constprop())) 2500 return _MaskImpl::_S_to_bits( 2501 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2502 2503 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2504 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2505 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2506 if constexpr (sizeof(__xi) == 64) 2507 { 2508 if constexpr (is_same_v<_Tp, float>) 2509 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2510 else if constexpr (is_same_v<_Tp, double>) 2511 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2512 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2513 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2514 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2515 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2516 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2517 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2518 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2519 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2520 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2521 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2522 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2523 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2524 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2525 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2526 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2527 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2528 else 2529 __assert_unreachable<_Tp>(); 2530 } 2531 else if constexpr (sizeof(__xi) == 32) 2532 { 2533 if constexpr (is_same_v<_Tp, float>) 2534 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2535 else if constexpr (is_same_v<_Tp, double>) 2536 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2537 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2538 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2539 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2540 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2541 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2542 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2543 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2544 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2545 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2546 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2547 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2548 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2549 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2550 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2551 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2552 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2553 else 2554 __assert_unreachable<_Tp>(); 2555 } 2556 else if constexpr (sizeof(__xi) == 16) 2557 { 2558 if constexpr (is_same_v<_Tp, float>) 2559 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2560 else if constexpr (is_same_v<_Tp, double>) 2561 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2562 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2563 return 
_mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2564 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2565 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2566 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2567 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2568 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2569 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2570 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2571 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2572 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2573 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2574 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2575 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); 2576 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2577 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); 2578 else 2579 __assert_unreachable<_Tp>(); 2580 } 2581 else 2582 __assert_unreachable<_Tp>(); 2583 } // }}} 2584 else if (__builtin_is_constant_evaluated()) 2585 return _Base::_S_less_equal(__x, __y); 2586 else if constexpr (sizeof(__x) == 8) 2587 { 2588 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2589 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2590 _MaskMember<_Tp> __r64{}; 2591 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2592 return __r64; 2593 } 2594 else 2595 return _Base::_S_less_equal(__x, __y); 2596 } 2597 2598 // }}} }}} 2599 // negation {{{ 2600 template
<typename _Tp, size_t _Np> 2601 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2602 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 2603 { 2604 if constexpr (__is_avx512_abi<_Abi>()) 2605 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); 2606 else 2607 return _Base::_S_negate(__x); 2608 } 2609 2610 // }}} 2611 // math {{{ 2612 using _Base::_S_abs; 2613 2614 // _S_sqrt {{{ 2615 template <typename _Tp, size_t _Np>
2616 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2617 _S_sqrt(_SimdWrapper<_Tp, _Np> __x) 2618 { 2619 if constexpr (__is_sse_ps<_Tp, _Np>()) 2620 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); 2621 else if constexpr (__is_sse_pd<_Tp, _Np>()) 2622 return _mm_sqrt_pd(__x); 2623 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2624 return _mm256_sqrt_ps(__x); 2625 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2626 return _mm256_sqrt_pd(__x); 2627 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 2628 return _mm512_sqrt_ps(__x); 2629 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2630 return _mm512_sqrt_pd(__x); 2631 else 2632 __assert_unreachable<_Tp>(); 2633 } 2634 2635 // }}} 2636 // _S_ldexp {{{ 2637 template
<typename _Tp, size_t _Np> 2638 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2639 _S_ldexp(_SimdWrapper<_Tp, _Np> __x, 2640 __fixed_size_storage_t<int, _Np> __exp) 2641 { 2642 if constexpr (sizeof(__x) == 64 || __have_avx512vl) 2643 { 2644 const auto __xi = __to_intrin(__x); 2645 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>
, _Tp, _Abi> 2646 __cvt; 2647 const auto __expi = __to_intrin(__cvt(__exp)); 2648 using _Up = __bool_storage_member_type_t<_Np>; 2649 constexpr _Up __k1 = _Np < sizeof(_Up) * __CHAR_BIT__ ? _Up((1ULL << _Np) - 1) : ~_Up(); 2650 if constexpr (sizeof(__xi) == 16) 2651 { 2652 if constexpr (sizeof(_Tp) == 8) 2653 return _mm_maskz_scalef_pd(__k1, __xi, __expi); 2654 else 2655 return _mm_maskz_scalef_ps(__k1, __xi, __expi); 2656 } 2657 else if constexpr (sizeof(__xi) == 32) 2658 { 2659 if constexpr (sizeof(_Tp) == 8) 2660 return _mm256_maskz_scalef_pd(__k1, __xi, __expi); 2661 else 2662 return _mm256_maskz_scalef_ps(__k1, __xi, __expi); 2663 } 2664 else 2665 { 2666 static_assert(sizeof(__xi) == 64); 2667 if constexpr (sizeof(_Tp) == 8) 2668 return _mm512_maskz_scalef_pd(__k1, __xi, __expi); 2669 else 2670 return _mm512_maskz_scalef_ps(__k1, __xi, __expi); 2671 } 2672 } 2673 else 2674 return _Base::_S_ldexp(__x, __exp); 2675 } 2676 2677 // }}} 2678 // _S_trunc {{{ 2679 template
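// [editorial note: not part of the upstream header] _S_ldexp above maps onto
// vscalefps/vscalefpd, which compute x * 2^floor(y) per lane; the integer exponents are
// first converted from the fixed-size int storage to the value type, and the zero-mask
// __k1 keeps the padding lanes of partial registers at zero. Scalar reference of what a
// scalef lane does for integral exponents:
//
//   #include <cmath>
//   inline double scalef_ref(double x, double e)
//   { return x * std::exp2(std::floor(e)); }   // same value as std::ldexp(x, (int) e)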
2680 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2681 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2682 { 2683 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2684 return _mm512_roundscale_ps(__x, 0x0b); 2685 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2686 return _mm512_roundscale_pd(__x, 0x0b); 2687 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2688 return _mm256_round_ps(__x, 0xb); 2689 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2690 return _mm256_round_pd(__x, 0xb); 2691 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2692 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xb)); 2693 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2694 return _mm_round_pd(__x, 0xb); 2695 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2696 { 2697 auto __truncated 2698 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))); 2699 const auto __no_fractional_values 2700 = __vector_bitcast
<int>(__vector_bitcast<_UInt>(__to_intrin(__x)) 2701 & 0x7f800000u) 2702 < 0x4b000000; // the exponent is so large that no mantissa bits 2703 // signify fractional values (0x3f8 + 23*8 = 2704 // 0x4b0) 2705 return __no_fractional_values ? __truncated : __to_intrin(__x); 2706 } 2707 else 2708 return _Base::_S_trunc(__x); 2709 } 2710 2711 // }}} 2712 // _S_round {{{ 2713 template <typename _Tp, size_t _Np>
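// [editorial note: not part of the upstream header] The SSE2 fallback of _S_trunc above
// truncates through a float -> int -> float round trip and keeps the original value
// wherever the biased exponent is at least 0x96, i.e. |x| >= 2^23: such values carry no
// fractional bits and would overflow cvttps. A scalar sketch of the same test:
//
//   #include <cstdint>
//   #include <cstring>
//   inline float trunc_ref(float x)
//   {
//     std::uint32_t bits;
//     std::memcpy(&bits, &x, sizeof bits);
//     const bool integral = (bits & 0x7f800000u) >= 0x4b000000u;  // |x| >= 2^23, inf, NaN
//     return integral ? x : (float) (int) x;                      // the int round trip truncates
//   }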
2714 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2715 _S_round(_SimdWrapper<_Tp, _Np> __x) 2716 { 2717 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away 2718 // from zero as required by std::round. Therefore this function is more 2719 // complicated. 2720 using _V = __vector_type_t<_Tp, _Np>; 2721 _V __truncated; 2722 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2723 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b); 2724 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2725 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b); 2726 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2727 __truncated = _mm256_round_ps(__x._M_data, 2728 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2729 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2730 __truncated = _mm256_round_pd(__x._M_data, 2731 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2732 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2733 __truncated = __auto_bitcast( 2734 _mm_round_ps(__to_intrin(__x), 2735 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); 2736 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2737 __truncated 2738 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2739 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2740 __truncated = __auto_bitcast( 2741 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)))); 2742 else 2743 return _Base::_S_round(__x); 2744 2745 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0 2746 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0 2747 2748 const _V __rounded 2749 = __truncated 2750 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5) 2751 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) 2752 : _V()); 2753 if constexpr (__have_sse4_1) 2754 return __rounded; 2755 else // adjust for missing range in cvttps_epi32 2756 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded 2757 : __x._M_data; 2758 } 2759 2760 // }}} 2761 // _S_nearbyint {{{ 2762 template
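// [editorial note: not part of the upstream header] As the comment in _S_round notes,
// the hardware rounding modes round ties to even, while std::round must round ties away
// from zero; hence the truncate-then-adjust sequence above (add +/-1, taken from the
// sign of __x, whenever the discarded fraction is at least one half). Scalar equivalent:
//
//   #include <cmath>
//   inline float round_ref(float x)
//   {
//     const float t = std::trunc(x);
//     return std::fabs(x - t) >= 0.5f ? t + std::copysign(1.0f, x) : t;
//   }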
<typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2763 _GLIBCXX_SIMD_INTRINSIC static _Tp 2764 _S_nearbyint(_Tp __x) noexcept 2765 { 2766 if constexpr (_TVT::template _S_is<float, 16>) 2767 return _mm512_roundscale_ps(__x, 0x0c); 2768 else if constexpr (_TVT::template _S_is<double, 8>) 2769 return _mm512_roundscale_pd(__x, 0x0c); 2770 else if constexpr (_TVT::template _S_is<float, 8>) 2771 return _mm256_round_ps(__x, 2772 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2773 else if constexpr (_TVT::template _S_is<double, 4>) 2774 return _mm256_round_pd(__x, 2775 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2776 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2777 return _mm_round_ps(__x, 2778 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2779 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2780 return _mm_round_pd(__x, 2781 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2782 else 2783 return _Base::_S_nearbyint(__x); 2784 } 2785 2786 // }}} 2787 // _S_rint {{{ 2788 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>> 2789 _GLIBCXX_SIMD_INTRINSIC static _Tp 2790 _S_rint(_Tp __x) noexcept 2791 { 2792 if constexpr (_TVT::template _S_is<float, 16>) 2793 return _mm512_roundscale_ps(__x, 0x04); 2794 else if constexpr (_TVT::template _S_is<double, 8>) 2795 return _mm512_roundscale_pd(__x, 0x04); 2796 else if constexpr (_TVT::template _S_is<float, 8>) 2797 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2798 else if constexpr (_TVT::template _S_is<double, 4>) 2799 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2800 else if constexpr (__have_sse4_1 && _TVT::template _S_is<float, 4>) 2801 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2802 else if constexpr (__have_sse4_1 && _TVT::template _S_is<double, 2>) 2803 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2804 else 2805 return _Base::_S_rint(__x); 2806 } 2807 2808 // }}} 2809 // _S_floor {{{ 2810 template <typename _Tp, size_t _Np>
2811 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2812 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2813 { 2814 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2815 return _mm512_roundscale_ps(__x, 0x09); 2816 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2817 return _mm512_roundscale_pd(__x, 0x09); 2818 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2819 return _mm256_round_ps(__x, 0x9); 2820 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2821 return _mm256_round_pd(__x, 0x9); 2822 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2823 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x9)); 2824 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2825 return _mm_round_pd(__x, 0x9); 2826 else 2827 return _Base::_S_floor(__x); 2828 } 2829 2830 // }}} 2831 // _S_ceil {{{ 2832 template
<typename _Tp, size_t _Np> 2833 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2834 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2835 { 2836 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2837 return _mm512_roundscale_ps(__x, 0x0a); 2838 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2839 return _mm512_roundscale_pd(__x, 0x0a); 2840 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2841 return _mm256_round_ps(__x, 0xa); 2842 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2843 return _mm256_round_pd(__x, 0xa); 2844 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2845 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0xa)); 2846 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2847 return _mm_round_pd(__x, 0xa); 2848 else 2849 return _Base::_S_ceil(__x); 2850 } 2851 2852 // }}} 2853 // _S_signbit {{{ 2854 template <typename _Tp, size_t _Np>
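// [editorial note: not part of the upstream header] The rounding immediates used by
// _S_trunc/_S_round/_S_nearbyint/_S_rint/_S_floor/_S_ceil above combine a rounding
// mode with _MM_FROUND_NO_EXC (0x08):
//   0x09 = NO_EXC | TO_NEG_INF   (floor)      0x0b = NO_EXC | TO_ZERO       (trunc)
//   0x0a = NO_EXC | TO_POS_INF   (ceil)       0x0c = NO_EXC | CUR_DIRECTION (nearbyint)
//   0x04 = CUR_DIRECTION, exceptions allowed  (rint)
// The AVX-512 roundscale forms use the same low nibble with a scale of zero. Example:
//
//   #include <immintrin.h>
//   inline __m128 floor4(__m128 x)
//   { return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }  // imm == 0x09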
2855 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2856 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2857 { 2858 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2859 { 2860 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2861 return _mm512_movepi32_mask( 2862 __intrin_bitcast<__m512i>(__x._M_data)); 2863 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2864 return _mm512_movepi64_mask( 2865 __intrin_bitcast<__m512i>(__x._M_data)); 2866 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2867 return _mm256_movepi32_mask( 2868 __intrin_bitcast<__m256i>(__x._M_data)); 2869 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2870 return _mm256_movepi64_mask( 2871 __intrin_bitcast<__m256i>(__x._M_data)); 2872 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2873 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2874 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2875 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2876 } 2877 else if constexpr (__is_avx512_abi<_Abi>()) 2878 { 2879 const auto __xi = __to_intrin(__x); 2880 [[maybe_unused]] constexpr auto __k1 2881 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2882 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2883 return _mm_movemask_ps(__xi); 2884 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2885 return _mm_movemask_pd(__xi); 2886 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2887 return _mm256_movemask_ps(__xi); 2888 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2889 return _mm256_movemask_pd(__xi); 2890 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2891 return _mm512_mask_cmplt_epi32_mask( 2892 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2893 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2894 return _mm512_mask_cmplt_epi64_mask( 2895 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2896 else 2897 __assert_unreachable<_Tp>(); 2898 } 2899 else 2900 return _Base::_S_signbit(__x); 2901 /*{ 2902 using _I = __int_for_sizeof_t<_Tp>; 2903 if constexpr (sizeof(__x) == 64) 2904 return _S_less(__vector_bitcast<_I>(__x), _I()); 2905 else 2906 { 2907 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2908 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2909 if constexpr ((sizeof(_Tp) == 4 && 2910 (__have_avx2 || sizeof(__x) == 16)) || 2911 __have_avx512vl) 2912 { 2913 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2914 } 2915 else if constexpr ((__have_avx2 || 2916 (__have_ssse3 && sizeof(__x) == 16))) 2917 { 2918 return __vector_bitcast<_Tp>((__xx & __signmask) == 2919 __signmask); 2920 } 2921 else 2922 { // SSE2/3 or AVX (w/o AVX2) 2923 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); 2924 return __vector_bitcast<_Tp>( 2925 __vector_bitcast<_Tp>( 2926 (__xx & __signmask) | 2927 __vector_bitcast<_I>(__one)) // -1 or 1 2928 != __one); 2929 } 2930 } 2931 }*/ 2932 } 2933 2934 // }}} 2935 // _S_isnonzerovalue_mask {{{ 2936 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2937 template
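// [editorial note: not part of the upstream header] _S_signbit above only needs the
// sign bits: with AVX-512DQ, vpmovd2m/vpmovq2m copy them straight into a mask register;
// otherwise movmskps/movmskpd (or a signed compare against zero, as in the commented-out
// fallback) extracts them. Minimal SSE sketch:
//
//   #include <immintrin.h>
//   inline int signbits4(__m128 x) { return _mm_movemask_ps(x); }  // bit i = sign of lane i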
<typename _Tp> 2938 _GLIBCXX_SIMD_INTRINSIC static auto 2939 _S_isnonzerovalue_mask(_Tp __x) 2940 { 2941 using _Traits = _VectorTraits<_Tp>; 2942 if constexpr (__have_avx512dq_vl) 2943 { 2944 if constexpr (_Traits::template _S_is< 2945 float, 2> || _Traits::template _S_is<float, 4>) 2946 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2947 else if constexpr (_Traits::template _S_is<float, 8>) 2948 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2949 else if constexpr (_Traits::template _S_is<float, 16>) 2950 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2951 else if constexpr (_Traits::template _S_is<double, 2>) 2952 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2953 else if constexpr (_Traits::template _S_is<double, 4>) 2954 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2955 else if constexpr (_Traits::template _S_is<double, 8>
) 2956 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2957 else 2958 __assert_unreachable<_Tp>(); 2959 } 2960 else 2961 { 2962 using _Up = typename _Traits::value_type; 2963 constexpr size_t _Np = _Traits::_S_full_size; 2964 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2965 const auto __b = __x * _Up(); // NaN if __x == inf 2966 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2967 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2968 _CMP_ORD_Q); 2969 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 2970 return __mmask8(0xf 2971 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 2972 __auto_bitcast(__b), 2973 _CMP_ORD_Q)); 2974 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 2975 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2976 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 2977 return __mmask8(0x3 2978 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2979 __auto_bitcast(__b), 2980 _CMP_ORD_Q)); 2981 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 2982 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2983 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 2984 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 2985 __auto_bitcast(__b), 2986 _CMP_ORD_Q)); 2987 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 2988 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2989 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 2990 return __mmask8(0xf 2991 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2992 __auto_bitcast(__b), 2993 _CMP_ORD_Q)); 2994 else if constexpr (__is_avx512_ps<_Up, _Np>()) 2995 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2996 else if constexpr (__is_avx512_pd<_Up, _Np>()) 2997 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2998 else 2999 __assert_unreachable<_Tp>(); 3000 } 3001 } 3002 3003 // }}} 3004 // _S_isfinite {{{ 3005 template
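// [editorial note: not part of the upstream header] The non-AVX512DQ fallback above
// classifies with two multiplies: __a = x * inf is NaN exactly when x is 0 or NaN, and
// __b = x * 0 is NaN exactly when x is +/-inf or NaN, so an ordered compare of the two
// (neither operand NaN) is true precisely for nonzero finite x, normal or subnormal:
//
//   //   x            x * inf     x * 0      ordered(__a, __b)
//   //   +/-0         NaN         +/-0       false
//   //   +/-inf       +/-inf      NaN        false
//   //   NaN          NaN         NaN        false
//   //   other        +/-inf      +/-0       true
//   // scalar equivalent: !std::isnan(x * INFINITY) && !std::isnan(x * 0.0f)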
3006 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3007 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 3008 { 3009 static_assert(is_floating_point_v<_Tp>); 3010 #if !__FINITE_MATH_ONLY__ 3011 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3012 { 3013 const auto __xi = __to_intrin(__x); 3014 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3015 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3016 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3017 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3018 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3019 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3020 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3021 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3022 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3023 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3024 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3025 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3026 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3027 } 3028 else if constexpr (__is_avx512_abi<_Abi>()) 3029 { 3030 // if all exponent bits are set, __x is either inf or NaN 3031 using _I = __int_for_sizeof_t<_Tp>; 3032 const auto __inf = __vector_bitcast<_I>( 3033 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3034 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3035 } 3036 else 3037 #endif 3038 return _Base::_S_isfinite(__x); 3039 } 3040 3041 // }}} 3042 // _S_isinf {{{ 3043 template
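// [editorial note: not part of the upstream header] The vfpclassps/pd immediates used in
// _S_isfinite/_S_isinf/_S_isnormal select categories by bit: 0x01 QNaN, 0x02 +0,
// 0x04 -0, 0x08 +inf, 0x10 -inf, 0x20 denormal, 0x40 finite negative, 0x80 SNaN.
// So 0x99 matches NaN or +/-inf (and __k1 ^ ... yields "finite"), 0x18 matches +/-inf
// only, 0x9f additionally matches +/-0, and 0xbf (in _S_isnormal below) adds denormals.
// Example, assuming AVX-512DQ:
//
//   #include <immintrin.h>
//   inline __mmask16 not_finite(__m512 v)
//   { return _mm512_fpclass_ps_mask(v, 0x99); }   // QNaN | SNaN | +inf | -inf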
3044 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3045 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 3046 { 3047 #if !__FINITE_MATH_ONLY__ 3048 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3049 { 3050 const auto __xi = __to_intrin(__x); 3051 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3052 return _mm512_fpclass_ps_mask(__xi, 0x18); 3053 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3054 return _mm512_fpclass_pd_mask(__xi, 0x18); 3055 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3056 return _mm256_fpclass_ps_mask(__xi, 0x18); 3057 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3058 return _mm256_fpclass_pd_mask(__xi, 0x18); 3059 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3060 return _mm_fpclass_ps_mask(__xi, 0x18); 3061 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3062 return _mm_fpclass_pd_mask(__xi, 0x18); 3063 else 3064 __assert_unreachable<_Tp>(); 3065 } 3066 else if constexpr (__have_avx512dq_vl) 3067 { 3068 if constexpr (__is_sse_pd<_Tp, _Np>()) 3069 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3070 else if constexpr (__is_avx_pd<_Tp, _Np>()) 3071 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3072 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3073 return _mm_movm_epi32( 3074 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3075 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3076 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3077 else 3078 __assert_unreachable<_Tp>(); 3079 } 3080 else 3081 #endif 3082 return _Base::_S_isinf(__x); 3083 } 3084 3085 // }}} 3086 // _S_isnormal {{{ 3087 template
3088 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3089 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3090 { 3091 #if __FINITE_MATH_ONLY__ 3092 [[maybe_unused]] constexpr int __mode = 0x26; 3093 #else 3094 [[maybe_unused]] constexpr int __mode = 0xbf; 3095 #endif 3096 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3097 { 3098 const auto __xi = __to_intrin(__x); 3099 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3100 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3101 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3102 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3103 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3104 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3105 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3106 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3107 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3108 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3109 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3110 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3111 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3112 else 3113 __assert_unreachable<_Tp>(); 3114 } 3115 else if constexpr (__have_avx512dq) 3116 { 3117 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3118 return _mm_movm_epi32( 3119 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3120 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3121 return _mm256_movm_epi32( 3122 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3123 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3124 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3125 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3126 return _mm_movm_epi64( 3127 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3128 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3129 return _mm256_movm_epi64( 3130 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3131 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3132 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 3133 else 3134 __assert_unreachable<_Tp>(); 3135 } 3136 else if constexpr (__is_avx512_abi<_Abi>()) 3137 { 3138 using _I = __int_for_sizeof_t<_Tp>; 3139 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3140 const auto minn = __vector_bitcast<_I>( 3141 __vector_broadcast<_Np>(__norm_min_v<_Tp>)); 3142 #if __FINITE_MATH_ONLY__ 3143 return _S_less_equal<_I, _Np>(minn, absn); 3144 #else 3145 const auto infn 3146 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 3147 return __and(_S_less_equal<_I, _Np>(minn, absn), 3148 _S_less<_I, _Np>(absn, infn)); 3149 #endif 3150 } 3151 else 3152 return _Base::_S_isnormal(__x); 3153 } 3154 3155 // }}} 3156 // _S_isnan {{{ 3157 template
<typename _Tp, size_t _Np>
3158 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3159 _S_isnan(_SimdWrapper<_Tp, _Np> __x) 3160 { return _S_isunordered(__x, __x); } 3161 3162 // }}} 3163 // _S_isunordered {{{ 3164 template
<typename _Tp, size_t _Np>
3165 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3166 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x, 3167 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y) 3168 { 3169 #if __FINITE_MATH_ONLY__ 3170 return {}; // false 3171 #else 3172 const auto __xi = __to_intrin(__x); 3173 const auto __yi = __to_intrin(__y); 3174 if constexpr (__is_avx512_abi<_Abi>()) 3175 { 3176 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3177 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3178 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3179 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3180 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3181 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3182 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3183 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3184 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3185 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3186 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3187 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3188 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3189 } 3190 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3191 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q)); 3192 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3193 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q)); 3194 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3195 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi)); 3196 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3197 return __to_masktype(_mm_cmpunord_pd(__xi, __yi)); 3198 else 3199 __assert_unreachable<_Tp>(); 3200 #endif 3201 } 3202 3203 // }}} 3204 // _S_isgreater {{{ 3205 template
<typename _Tp, size_t _Np>
3206 static constexpr _MaskMember<_Tp> 3207 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3208 { 3209 const auto __xi = __to_intrin(__x); 3210 const auto __yi = __to_intrin(__y); 3211 if constexpr (__is_avx512_abi<_Abi>()) 3212 { 3213 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3214 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3215 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3216 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3217 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3218 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3219 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3220 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3221 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3222 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3223 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3224 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3225 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3226 else 3227 __assert_unreachable<_Tp>(); 3228 } 3229 else if constexpr (__have_avx) 3230 { 3231 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3232 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3233 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3234 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3235 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3236 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3237 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3238 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3239 else 3240 __assert_unreachable<_Tp>(); 3241 } 3242 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3243 && sizeof(_Tp) == 4) 3244 { 3245 const auto __xn = __vector_bitcast
<int>(__xi); 3246 const auto __yn = __vector_bitcast<int>
(__yi); 3247 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3248 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3249 return __auto_bitcast( 3250 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); 3251 } 3252 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3253 && sizeof(_Tp) == 8) 3254 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3255 -_mm_ucomigt_sd(__xi, __yi), 3256 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), 3257 _mm_unpackhi_pd(__yi, __yi))}; 3258 else 3259 return _Base::_S_isgreater(__x, __y); 3260 } 3261 3262 // }}} 3263 // _S_isgreaterequal {{{ 3264 template
<typename _Tp, size_t _Np>
3265 static constexpr _MaskMember<_Tp> 3266 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3267 { 3268 const auto __xi = __to_intrin(__x); 3269 const auto __yi = __to_intrin(__y); 3270 if constexpr (__is_avx512_abi<_Abi>()) 3271 { 3272 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3273 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3274 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3275 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3276 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3277 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3278 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3279 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3280 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3281 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3282 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3283 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3284 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3285 else 3286 __assert_unreachable<_Tp>(); 3287 } 3288 else if constexpr (__have_avx) 3289 { 3290 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3291 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3292 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3293 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3294 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3295 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3296 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3297 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3298 else 3299 __assert_unreachable<_Tp>(); 3300 } 3301 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3302 && sizeof(_Tp) == 4) 3303 { 3304 const auto __xn = __vector_bitcast
<int>(__xi); 3305 const auto __yn = __vector_bitcast<int>
(__yi); 3306 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3307 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3308 return __auto_bitcast( 3309 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3310 } 3311 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3312 && sizeof(_Tp) == 8) 3313 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3314 -_mm_ucomige_sd(__xi, __yi), 3315 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3316 _mm_unpackhi_pd(__yi, __yi))}; 3317 else 3318 return _Base::_S_isgreaterequal(__x, __y); 3319 } 3320 3321 // }}} 3322 // _S_isless {{{ 3323 template
<typename _Tp, size_t _Np>
3324 static constexpr _MaskMember<_Tp> 3325 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3326 { 3327 const auto __xi = __to_intrin(__x); 3328 const auto __yi = __to_intrin(__y); 3329 if constexpr (__is_avx512_abi<_Abi>()) 3330 { 3331 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3332 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3333 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3334 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3335 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3336 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3337 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3338 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3339 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3340 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3341 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3342 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3343 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3344 else 3345 __assert_unreachable<_Tp>(); 3346 } 3347 else if constexpr (__have_avx) 3348 { 3349 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3350 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3351 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3352 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3353 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3354 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3355 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3356 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3357 else 3358 __assert_unreachable<_Tp>(); 3359 } 3360 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3361 && sizeof(_Tp) == 4) 3362 { 3363 const auto __xn = __vector_bitcast
<int>(__xi); 3364 const auto __yn = __vector_bitcast<int>
(__yi); 3365 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3366 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3367 return __auto_bitcast( 3368 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3369 } 3370 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3371 && sizeof(_Tp) == 8) 3372 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3373 -_mm_ucomigt_sd(__yi, __xi), 3374 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3375 _mm_unpackhi_pd(__xi, __xi))}; 3376 else 3377 return _Base::_S_isless(__x, __y); 3378 } 3379 3380 // }}} 3381 // _S_islessequal {{{ 3382 template
<typename _Tp, size_t _Np>
3383 static constexpr _MaskMember<_Tp> 3384 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3385 { 3386 const auto __xi = __to_intrin(__x); 3387 const auto __yi = __to_intrin(__y); 3388 if constexpr (__is_avx512_abi<_Abi>()) 3389 { 3390 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3391 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3392 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3393 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3394 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3395 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3396 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3397 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3398 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3399 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3400 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3401 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3402 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3403 else 3404 __assert_unreachable<_Tp>(); 3405 } 3406 else if constexpr (__have_avx) 3407 { 3408 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3409 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3410 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3411 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3412 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3413 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3414 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3415 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3416 else 3417 __assert_unreachable<_Tp>(); 3418 } 3419 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3420 && sizeof(_Tp) == 4) 3421 { 3422 const auto __xn = __vector_bitcast
<int>(__xi); 3423 const auto __yn = __vector_bitcast<int>
(__yi); 3424 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3425 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3426 return __auto_bitcast( 3427 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3428 } 3429 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3430 && sizeof(_Tp) == 8) 3431 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3432 -_mm_ucomige_sd(__yi, __xi), 3433 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3434 _mm_unpackhi_pd(__xi, __xi))}; 3435 else 3436 return _Base::_S_islessequal(__x, __y); 3437 } 3438 3439 // }}} 3440 // _S_islessgreater {{{ 3441 template
<typename _Tp, size_t _Np> 3442
3442 static constexpr _MaskMember<_Tp> 3443 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3444 { 3445 const auto __xi = __to_intrin(__x); 3446 const auto __yi = __to_intrin(__y); 3447 if constexpr (__is_avx512_abi<_Abi>()) 3448 { 3449 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3450 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3451 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3452 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3453 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3454 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3455 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3456 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3457 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3458 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3459 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3460 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3461 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3462 else 3463 __assert_unreachable<_Tp>(); 3464 } 3465 else if constexpr (__have_avx) 3466 { 3467 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3468 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3469 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3470 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3471 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3472 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3473 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3474 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3475 else 3476 __assert_unreachable<_Tp>(); 3477 } 3478 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3479 return __auto_bitcast( 3480 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3481 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3482 return __to_masktype( 3483 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3484 else 3485 __assert_unreachable<_Tp>(); 3486 } 3487 3488 //}}} }}} 3489 template
<template <typename>
class _Op, typename _Tp, typename _K, size_t _Np> 3490 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 3491 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v) 3492 { 3493 if (__k._M_is_constprop_none_of()) 3494 return __v; 3495 else if (__k._M_is_constprop_all_of()) 3496 { 3497 auto __vv = _Base::_M_make_simd(__v); 3498 _Op
<decltype(__vv)> __op; 3499 return __data(__op(__vv)); 3500 } 3501 else if constexpr (__is_bitmask_v<decltype(__k)>
3502 && (is_same_v<_Op<void>
, __increment<void>
> 3503 || is_same_v<_Op<void>
, __decrement<void>
>)) 3504 { 3505 // optimize masked unary increment and decrement as masked sub +/-1 3506 constexpr int __pm_one 3507 = is_same_v<_Op
<void>, __increment<void>
> ? -1 : 1; 3508 #ifdef __clang__ 3509 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data; 3510 #else // __clang__ 3511 using _TV = __vector_type_t<_Tp, _Np>; 3512 constexpr size_t __bytes = sizeof(__v) < 16 ? 16 : sizeof(__v); 3513 constexpr size_t __width = __bytes / sizeof(_Tp); 3514 if constexpr (is_integral_v<_Tp>) 3515 { 3516 constexpr bool __lp64 = sizeof(long) == sizeof(long long); 3517 using _Ip = std::make_signed_t<_Tp>; 3518 using _Up = std::conditional_t< 3519 std::is_same_v<_Ip, long>, 3520 std::conditional_t<__lp64, long long, int>, 3521 std::conditional_t< 3522 std::is_same_v<_Ip, signed char>, char, _Ip>>; 3523 const auto __value = __intrin_bitcast<__vector_type_t<_Up, __width>>(__v._M_data); 3524 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3525 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3526 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask(__value, \ 3527 __vector_broadcast<__width>(_Up(__pm_one)), __value, __k._M_data)) 3528 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512); 3529 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256); 3530 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128); 3531 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512); 3532 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256); 3533 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128); 3534 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512); 3535 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256); 3536 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128); 3537 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512); 3538 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256); 3539 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128); 3540 #undef _GLIBCXX_SIMD_MASK_SUB 3541 } 3542 else 3543 { 3544 const auto __value = __intrin_bitcast<__vector_type_t<_Tp, __width>>(__v._M_data); 3545 #define _GLIBCXX_SIMD_MASK_SUB_512(_Sizeof, _Width, _Instr) \ 3546 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3547 return __builtin_ia32_##_Instr##_mask( \ 3548 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \ 3549 __k._M_data, _MM_FROUND_CUR_DIRECTION) 3550 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3551 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3552 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask( \ 3553 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \ 3554 __k._M_data)) 3555 _GLIBCXX_SIMD_MASK_SUB_512(4, 64, subps512); 3556 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256); 3557 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128); 3558 _GLIBCXX_SIMD_MASK_SUB_512(8, 64, subpd512); 3559 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256); 3560 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128); 3561 #undef _GLIBCXX_SIMD_MASK_SUB_512 3562 #undef _GLIBCXX_SIMD_MASK_SUB 3563 } 3564 #endif // __clang__ 3565 } 3566 else 3567 return _Base::template _S_masked_unary<_Op>(__k, __v); 3568 } 3569 }; 3570 3571 // }}} 3572 // _MaskImplX86Mixin {{{ 3573 struct _MaskImplX86Mixin 3574 { 3575 template
<typename _Tp> 3576
3576 using _TypeTag = _Tp*; 3577 3578 using _Base = _MaskImplBuiltinMixin; 3579 3580 // _S_to_maskvector(bool) {{{ 3581 template
<typename _Up, size_t _ToN = 1, typename _Tp> 3582 _GLIBCXX_SIMD_INTRINSIC static constexpr 3583 enable_if_t<is_same_v<_Tp, bool>
, _SimdWrapper<_Up, _ToN>> 3584 _S_to_maskvector(_Tp __x) 3585 { 3586 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3587 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 3588 : __vector_type_t<_Up, _ToN>(); 3589 } 3590 3591 // }}} 3592 // _S_to_maskvector(_SanitizedBitMask) {{{ 3593 template
<typename _Up, size_t _UpN = 0, size_t _Np, size_t _ToN = _UpN == 0 ? _Np : _UpN>
3594 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3595 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3596 { 3597 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3598 using _UV = __vector_type_t<_Up, _ToN>; 3599 using _UI = __intrinsic_type_t<_Up, _ToN>; 3600 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3601 if constexpr (_Np == 1) 3602 return _S_to_maskvector<_Up, _ToN>(__k); 3603 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3604 return __generate_from_n_evaluations
<std::min(_ToN, _Np), _UV>
( 3605 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); 3606 else if constexpr (sizeof(_Up) == 1) 3607 { 3608 if constexpr (sizeof(_UI) == 16) 3609 { 3610 if constexpr (__have_avx512bw_vl) 3611 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3612 else if constexpr (__have_avx512bw) 3613 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3614 else if constexpr (__have_avx512f) 3615 { 3616 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3617 auto __as16bits 3618 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3619 __hi256(__as32bits))); 3620 return __intrin_bitcast<_UV>( 3621 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3622 } 3623 else if constexpr (__have_ssse3) 3624 { 3625 const auto __bitmask = __to_intrin( 3626 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3627 8, 16, 32, 64, 128)); 3628 return __intrin_bitcast<_UV>( 3629 __vector_bitcast<_Up>( 3630 _mm_shuffle_epi8(__to_intrin( 3631 __vector_type_t<_ULLong, 2>{__k}), 3632 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3633 1, 1, 1, 1, 1, 1, 1)) 3634 & __bitmask) 3635 != 0); 3636 } 3637 // else fall through 3638 } 3639 else if constexpr (sizeof(_UI) == 32) 3640 { 3641 if constexpr (__have_avx512bw_vl) 3642 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3643 else if constexpr (__have_avx512bw) 3644 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3645 else if constexpr (__have_avx512f) 3646 { 3647 auto __as16bits = // 0 16 1 17 ... 15 31 3648 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3649 16) 3650 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3651 ~__m512i()), 3652 16); 3653 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3654 __lo256(__as16bits), 3655 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3656 ); 3657 // deinterleave: 3658 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3659 __0_16_1_17, // 0 16 1 17 2 ... 
3660 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3661 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3662 3, 5, 7, 9, 11, 13, 3663 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3664 // 0-3 8-11 16-19 24-27 3665 // 4-7 12-15 20-23 28-31 3666 } 3667 else if constexpr (__have_avx2) 3668 { 3669 const auto __bitmask 3670 = _mm256_broadcastsi128_si256(__to_intrin( 3671 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3672 4, 8, 16, 32, 64, 128))); 3673 return __vector_bitcast<_Up>( 3674 __vector_bitcast<_Up>( 3675 _mm256_shuffle_epi8( 3676 _mm256_broadcastsi128_si256( 3677 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3678 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3679 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3680 3, 3, 3, 3, 3, 3)) 3681 & __bitmask) 3682 != 0); 3683 } 3684 // else fall through 3685 } 3686 else if constexpr (sizeof(_UI) == 64) 3687 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3688 if constexpr (std::min(_ToN, _Np) <= 4) 3689 { 3690 if constexpr (_Np > 7) // avoid overflow 3691 __x &= _SanitizedBitMask<_Np>(0x0f); 3692 const _UInt __char_mask 3693 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3694 * 0xff; 3695 _UV __r = {}; 3696 __builtin_memcpy(&__r, &__char_mask, 3697 std::min(sizeof(__r), sizeof(__char_mask))); 3698 return __r; 3699 } 3700 else if constexpr (std::min(_ToN, _Np) <= 7) 3701 { 3702 if constexpr (_Np > 7) // avoid overflow 3703 __x &= _SanitizedBitMask<_Np>(0x7f); 3704 const _ULLong __char_mask 3705 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3706 * 0xff; 3707 _UV __r = {}; 3708 __builtin_memcpy(&__r, &__char_mask, 3709 std::min(sizeof(__r), sizeof(__char_mask))); 3710 return __r; 3711 } 3712 } 3713 else if constexpr (sizeof(_Up) == 2) 3714 { 3715 if constexpr (sizeof(_UI) == 16) 3716 { 3717 if constexpr (__have_avx512bw_vl) 3718 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3719 else if constexpr (__have_avx512bw) 3720 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3721 else if constexpr (__have_avx512f) 3722 { 3723 __m256i __as32bits = {}; 3724 if constexpr (__have_avx512vl) 3725 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3726 else 3727 __as32bits 3728 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3729 return __intrin_bitcast<_UV>( 3730 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3731 } 3732 // else fall through 3733 } 3734 else if constexpr (sizeof(_UI) == 32) 3735 { 3736 if constexpr (__have_avx512bw_vl) 3737 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3738 else if constexpr (__have_avx512bw) 3739 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3740 else if constexpr (__have_avx512f) 3741 { 3742 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3743 return __vector_bitcast<_Up>( 3744 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3745 __hi256(__as32bits)))); 3746 } 3747 // else fall through 3748 } 3749 else if constexpr (sizeof(_UI) == 64) 3750 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3751 } 3752 else if constexpr (sizeof(_Up) == 4) 3753 { 3754 if constexpr (sizeof(_UI) == 16) 3755 { 3756 if constexpr (__have_avx512dq_vl) 3757 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3758 else if constexpr (__have_avx512dq) 3759 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3760 else if constexpr (__have_avx512vl) 3761 return __intrin_bitcast<_UV>( 3762 _mm_maskz_mov_epi32(__k, ~__m128i())); 3763 else if constexpr (__have_avx512f) 3764 return __intrin_bitcast<_UV>( 3765 
__lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3766 // else fall through 3767 } 3768 else if constexpr (sizeof(_UI) == 32) 3769 { 3770 if constexpr (__have_avx512dq_vl) 3771 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3772 else if constexpr (__have_avx512dq) 3773 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3774 else if constexpr (__have_avx512vl) 3775 return __vector_bitcast<_Up>( 3776 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3777 else if constexpr (__have_avx512f) 3778 return __vector_bitcast<_Up>( 3779 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3780 // else fall through 3781 } 3782 else if constexpr (sizeof(_UI) == 64) 3783 return __vector_bitcast<_Up>( 3784 __have_avx512dq ? _mm512_movm_epi32(__k) 3785 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3786 } 3787 else if constexpr (sizeof(_Up) == 8) 3788 { 3789 if constexpr (sizeof(_UI) == 16) 3790 { 3791 if constexpr (__have_avx512dq_vl) 3792 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3793 else if constexpr (__have_avx512dq) 3794 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3795 else if constexpr (__have_avx512vl) 3796 return __vector_bitcast<_Up>( 3797 _mm_maskz_mov_epi64(__k, ~__m128i())); 3798 else if constexpr (__have_avx512f) 3799 return __vector_bitcast<_Up>( 3800 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3801 // else fall through 3802 } 3803 else if constexpr (sizeof(_UI) == 32) 3804 { 3805 if constexpr (__have_avx512dq_vl) 3806 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3807 else if constexpr (__have_avx512dq) 3808 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3809 else if constexpr (__have_avx512vl) 3810 return __vector_bitcast<_Up>( 3811 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3812 else if constexpr (__have_avx512f) 3813 return __vector_bitcast<_Up>( 3814 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3815 // else fall through 3816 } 3817 else if constexpr (sizeof(_UI) == 64) 3818 return __vector_bitcast<_Up>( 3819 __have_avx512dq ? _mm512_movm_epi64(__k) 3820 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3821 } 3822 3823 using _UpUInt = make_unsigned_t<_Up>; 3824 using _V = __vector_type_t<_UpUInt, _ToN>; 3825 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3826 if constexpr (_ToN == 2) 3827 { 3828 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3829 } 3830 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3831 { 3832 if constexpr (sizeof(_Up) == 4) 3833 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3834 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3835 _mm256_castsi256_ps(_mm256_setr_epi32( 3836 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3837 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3838 else if constexpr (sizeof(_Up) == 8) 3839 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3840 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3841 _mm256_castsi256_pd( 3842 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3843 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3844 else 3845 __assert_unreachable<_Up>(); 3846 } 3847 else if constexpr (__bits_per_element >= _ToN) 3848 { 3849 constexpr auto __bitmask 3850 = __generate_vector<_V>([](auto __i) 3851 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt 3852 { return __i < _ToN ? 
1ull << __i : 0; }); 3853 const auto __bits 3854 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3855 if constexpr (__bits_per_element > _ToN) 3856 return __vector_bitcast<_Up>(__bits) > 0; 3857 else 3858 return __vector_bitcast<_Up>(__bits != 0); 3859 } 3860 else 3861 { 3862 const _V __tmp 3863 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3864 return static_cast<_UpUInt>( 3865 __k >> (__bits_per_element * (__i / __bits_per_element))); 3866 }) 3867 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3868 return static_cast<_UpUInt>(1ull 3869 << (__i % __bits_per_element)); 3870 }); // mask bit index 3871 return __intrin_bitcast<_UV>(__tmp != _V()); 3872 } 3873 } 3874 3875 // }}} 3876 // _S_to_maskvector(_SimdWrapper) {{{ 3877 template
<typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 3878 size_t _ToN = _UpN == 0 ? _Np : _UpN>
3879 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3880 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3881 { 3882 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3883 using _TW = _SimdWrapper<_Tp, _Np>; 3884 using _UW = _SimdWrapper<_Up, _ToN>; 3885 using _UI = __intrinsic_type_t<_Up, _ToN>; 3886 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3887 return _S_to_maskvector<_Up, _ToN>( 3888 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3889 // vector -> vector bitcast 3890 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3891 && sizeof(_TW) == sizeof(_UW)) 3892 return __wrapper_bitcast<_Up, _ToN>( 3893 _ToN <= _Np 3894 ? __x 3895 : simd_abi::_VecBuiltin
<sizeof(_Tp) * _Np>
::_S_masked(__x)); 3896 else // vector -> vector {{{ 3897 { 3898 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3899 { 3900 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3901 return __generate_from_n_evaluations
<std::min(_ToN, _Np), 3902 __vector_type_t<_Up, _ToN>
>( 3903 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); 3904 } 3905 using _To = __vector_type_t<_Up, _ToN>; 3906 [[maybe_unused]] constexpr size_t _FromN = _Np; 3907 constexpr int _FromBytes = sizeof(_Tp); 3908 constexpr int _ToBytes = sizeof(_Up); 3909 const auto __k = __x._M_data; 3910 3911 if constexpr (_FromBytes == _ToBytes) 3912 return __intrin_bitcast<_To>(__k); 3913 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3914 { // SSE -> SSE {{{ 3915 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3916 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3917 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3918 { 3919 const auto __y 3920 = __vector_bitcast
<int>(__interleave128_lo(__k, __k)); 3921 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3922 } 3923 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3924 { 3925 auto __y 3926 = __vector_bitcast<short>
(__interleave128_lo(__k, __k)); 3927 auto __z 3928 = __vector_bitcast<int>
(__interleave128_lo(__y, __y)); 3929 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3930 } 3931 else if constexpr (_FromBytes == 8 && _ToBytes == 4 3932 && __have_sse2) 3933 return __intrin_bitcast<_To>( 3934 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3935 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3936 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3937 _UI()); 3938 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3939 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3940 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3941 { 3942 const auto __y 3943 = __vector_bitcast
<short>
(__interleave128_lo(__k, __k)); 3944 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3945 } 3946 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3947 { 3948 if constexpr (__have_sse2 && !__have_ssse3) 3949 return __intrin_bitcast<_To>(_mm_packs_epi32( 3950 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3951 __m128i())); 3952 else 3953 return __intrin_bitcast<_To>( 3954 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3955 __vector_bitcast<_Up>(__k))); 3956 } 3957 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3958 return __intrin_bitcast<_To>( 3959 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3960 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3961 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3962 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3963 && __have_ssse3) 3964 return __intrin_bitcast<_To>( 3965 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3966 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3967 -1, -1, -1, -1, -1, -1, -1, 3968 -1))); 3969 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3970 { 3971 auto __y 3972 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3973 __y = _mm_packs_epi32(__y, __m128i()); 3974 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3975 } 3976 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3977 && __have_ssse3) 3978 return __intrin_bitcast<_To>( 3979 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3980 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 3981 -1, -1, -1, -1, -1, -1, -1, 3982 -1))); 3983 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 3984 { 3985 const auto __y 3986 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3987 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3988 } 3989 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 3990 return __intrin_bitcast<_To>( 3991 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 3992 else 3993 __assert_unreachable<_Tp>(); 3994 } // }}} 3995 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 3996 { // AVX -> AVX {{{ 3997 if constexpr (_FromBytes == _ToBytes) 3998 __assert_unreachable<_Tp>(); 3999 else if constexpr (_FromBytes == _ToBytes * 2) 4000 { 4001 const auto __y = __vector_bitcast<_LLong>(__k); 4002 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4003 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 4004 } 4005 else if constexpr (_FromBytes == _ToBytes * 4) 4006 { 4007 const auto __y = __vector_bitcast<_LLong>(__k); 4008 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4009 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4010 __m128i()))); 4011 } 4012 else if constexpr (_FromBytes == _ToBytes * 8) 4013 { 4014 const auto __y = __vector_bitcast<_LLong>(__k); 4015 return __intrin_bitcast<_To>( 4016 _mm256_castsi128_si256(_mm_shuffle_epi8( 4017 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4018 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 4019 -1, -1, -1, -1, -1)))); 4020 } 4021 else if constexpr (_FromBytes * 2 == _ToBytes) 4022 { 4023 auto __y = __xzyw(__to_intrin(__k)); 4024 if constexpr (is_floating_point_v< 4025 _Tp> || (!__have_avx2 && _FromBytes == 4)) 4026 { 4027 const auto __yy = __vector_bitcast
<float>
(__y); 4028 return __intrin_bitcast<_To>( 4029 _mm256_unpacklo_ps(__yy, __yy)); 4030 } 4031 else 4032 return __intrin_bitcast<_To>( 4033 _mm256_unpacklo_epi8(__y, __y)); 4034 } 4035 else if constexpr (_FromBytes * 4 == _ToBytes) 4036 { 4037 auto __y 4038 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4039 __lo128(__vector_bitcast<_LLong>( 4040 __k))); // drops 3/4 of input 4041 return __intrin_bitcast<_To>( 4042 __concat(_mm_unpacklo_epi16(__y, __y), 4043 _mm_unpackhi_epi16(__y, __y))); 4044 } 4045 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 4046 { 4047 auto __y 4048 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4049 __lo128(__vector_bitcast<_LLong>( 4050 __k))); // drops 3/4 of input 4051 __y 4052 = _mm_unpacklo_epi16(__y, 4053 __y); // drops another 1/2 => 7/8 total 4054 return __intrin_bitcast<_To>( 4055 __concat(_mm_unpacklo_epi32(__y, __y), 4056 _mm_unpackhi_epi32(__y, __y))); 4057 } 4058 else 4059 __assert_unreachable<_Tp>(); 4060 } // }}} 4061 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 4062 { // SSE -> AVX {{{ 4063 if constexpr (_FromBytes == _ToBytes) 4064 return __intrin_bitcast<_To>( 4065 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 4066 __zero_extend(__to_intrin(__k)))); 4067 else if constexpr (_FromBytes * 2 == _ToBytes) 4068 { // keep all 4069 return __intrin_bitcast<_To>( 4070 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 4071 __vector_bitcast<_LLong>(__k)), 4072 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 4073 __vector_bitcast<_LLong>(__k)))); 4074 } 4075 else if constexpr (_FromBytes * 4 == _ToBytes) 4076 { 4077 if constexpr (__have_avx2) 4078 { 4079 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4080 __concat(__vector_bitcast<_LLong>(__k), 4081 __vector_bitcast<_LLong>(__k)), 4082 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4083 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4084 6, 6, 7, 7, 7, 7))); 4085 } 4086 else 4087 { 4088 return __intrin_bitcast<_To>(__concat( 4089 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4090 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 4091 2, 2, 2, 2, 3, 3, 3, 3)), 4092 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4093 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 4094 6, 6, 6, 6, 7, 7, 7, 4095 7)))); 4096 } 4097 } 4098 else if constexpr (_FromBytes * 8 == _ToBytes) 4099 { 4100 if constexpr (__have_avx2) 4101 { 4102 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4103 __concat(__vector_bitcast<_LLong>(__k), 4104 __vector_bitcast<_LLong>(__k)), 4105 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 4106 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4107 3, 3, 3, 3, 3, 3))); 4108 } 4109 else 4110 { 4111 return __intrin_bitcast<_To>(__concat( 4112 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4113 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4114 1, 1, 1, 1, 1, 1, 1, 1)), 4115 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4116 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4117 3, 3, 3, 3, 3, 3, 3, 4118 3)))); 4119 } 4120 } 4121 else if constexpr (_FromBytes == _ToBytes * 2) 4122 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4123 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4124 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4125 { 4126 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4127 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4128 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4129 -1, -1, -1, -1, -1, -1, -1, 4130 -1))))); 4131 } 4132 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4133 { 4134 return __intrin_bitcast<_To>(__m256i(__zero_extend( 
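// _FromBytes == 4, _ToBytes == 1: each 32-bit mask element is either all-ones
// or all-zero, so the shuffle below keeps one byte per element (offsets 3, 7,
// 11, 15); the -1 control bytes zero the remaining lanes and __zero_extend
// widens the 16-byte result to the 32-byte destination.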
4135 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4136 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4137 -1, -1, -1, -1, -1, -1, -1, 4138 -1))))); 4139 } 4140 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4141 { 4142 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4143 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4144 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4145 -1, -1, -1, -1, -1, -1, -1, 4146 -1, -1))))); 4147 } 4148 else 4149 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4150 } // }}} 4151 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4152 { // AVX -> SSE {{{ 4153 if constexpr (_FromBytes == _ToBytes) 4154 { // keep low 1/2 4155 return __intrin_bitcast<_To>(__lo128(__k)); 4156 } 4157 else if constexpr (_FromBytes == _ToBytes * 2) 4158 { // keep all 4159 auto __y = __vector_bitcast<_LLong>(__k); 4160 return __intrin_bitcast<_To>( 4161 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4162 } 4163 else if constexpr (_FromBytes == _ToBytes * 4) 4164 { // add 1/2 undef 4165 auto __y = __vector_bitcast<_LLong>(__k); 4166 return __intrin_bitcast<_To>( 4167 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4168 __m128i())); 4169 } 4170 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4171 { // add 3/4 undef 4172 auto __y = __vector_bitcast<_LLong>(__k); 4173 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4174 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4175 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4176 -1, -1, -1, -1))); 4177 } 4178 else if constexpr (_FromBytes * 2 == _ToBytes) 4179 { // keep low 1/4 4180 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4181 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4182 } 4183 else if constexpr (_FromBytes * 4 == _ToBytes) 4184 { // keep low 1/8 4185 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4186 __y = _mm_unpacklo_epi8(__y, __y); 4187 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4188 } 4189 else if constexpr (_FromBytes * 8 == _ToBytes) 4190 { // keep low 1/16 4191 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4192 __y = _mm_unpacklo_epi8(__y, __y); 4193 __y = _mm_unpacklo_epi8(__y, __y); 4194 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4195 } 4196 else 4197 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4198 } // }}} 4199 else 4200 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4201 /* 4202 if constexpr (_FromBytes > _ToBytes) { 4203 const _To __y = __vector_bitcast<_Up>(__k); 4204 return [&]
<size_t... _Is> (index_sequence<_Is...>) { 4205 constexpr int _Stride = _FromBytes / _ToBytes; 4206 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4207 }(make_index_sequence
()); 4208 } else { 4209 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4210 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4211 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4212 // ... 4213 return [&]
<size_t... _Is> (index_sequence<_Is...>) { 4214 constexpr int __dup = _ToBytes / _FromBytes; 4215 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4216 }(make_index_sequence<_FromN>()); 4217 } 4218 */ 4219 } // }}} 4220 } 4221 4222 // }}} 4223 // _S_to_bits {{{ 4224 template <typename _Tp, size_t _Np>
4225 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 4226 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4227 { 4228 if constexpr (is_same_v<_Tp, bool>) 4229 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4230 else 4231 { 4232 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4233 if (__builtin_is_constant_evaluated() 4234 || __builtin_constant_p(__x._M_data)) 4235 { 4236 const auto __bools = -__x._M_data; 4237 const _ULLong __k = __call_with_n_evaluations<_Np>( 4238 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4239 return (__bits | ...); 4240 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4241 return _ULLong(__bools[+__i]) << __i; 4242 }); 4243 if (__builtin_is_constant_evaluated() 4244 || __builtin_constant_p(__k)) 4245 return __k; 4246 } 4247 const auto __xi = __to_intrin(__x); 4248 if constexpr (sizeof(_Tp) == 1) 4249 if constexpr (sizeof(__xi) == 16) 4250 if constexpr (__have_avx512bw_vl) 4251 return _BitMask<_Np>(_mm_movepi8_mask(__xi)); 4252 else // implies SSE2 4253 return _BitMask<_Np>(_mm_movemask_epi8(__xi)); 4254 else if constexpr (sizeof(__xi) == 32) 4255 if constexpr (__have_avx512bw_vl) 4256 return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); 4257 else // implies AVX2 4258 return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); 4259 else // implies AVX512BW 4260 return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); 4261 4262 else if constexpr (sizeof(_Tp) == 2) 4263 if constexpr (sizeof(__xi) == 16) 4264 if constexpr (__have_avx512bw_vl) 4265 return _BitMask<_Np>(_mm_movepi16_mask(__xi)); 4266 else if constexpr (__have_avx512bw) 4267 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4268 else // implies SSE2 4269 return _BitMask<_Np>( 4270 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); 4271 else if constexpr (sizeof(__xi) == 32) 4272 if constexpr (__have_avx512bw_vl) 4273 return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); 4274 else if constexpr (__have_avx512bw) 4275 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4276 else // implies SSE2 4277 return _BitMask<_Np>(_mm_movemask_epi8( 4278 _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); 4279 else // implies AVX512BW 4280 return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); 4281 4282 else if constexpr (sizeof(_Tp) == 4) 4283 if constexpr (sizeof(__xi) == 16) 4284 if constexpr (__have_avx512dq_vl) 4285 return _BitMask<_Np>(_mm_movepi32_mask(__xi)); 4286 else if constexpr (__have_avx512vl) 4287 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); 4288 else if constexpr (__have_avx512dq) 4289 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4290 else if constexpr (__have_avx512f) 4291 return _BitMask<_Np>( 4292 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4293 else // implies SSE 4294 return _BitMask<_Np>( 4295 _mm_movemask_ps(reinterpret_cast<__m128>(__xi))); 4296 else if constexpr (sizeof(__xi) == 32) 4297 if constexpr (__have_avx512dq_vl) 4298 return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); 4299 else if constexpr (__have_avx512dq) 4300 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4301 else if constexpr (__have_avx512vl) 4302 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); 4303 else if constexpr (__have_avx512f) 4304 return _BitMask<_Np>( 4305 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4306 else // implies AVX 4307 return _BitMask<_Np>( 4308 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi))); 4309 else // implies AVX512?? 
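// 64-byte input: AVX512DQ moves the sign bit of every 32-bit element straight
// into a __mmask16 (vpmovd2m); with plain AVX512F the same bits are obtained by
// a signed less-than comparison against zero, since a set mask element is
// all-ones and therefore negative.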
4310 if constexpr (__have_avx512dq) 4311 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4312 else // implies AVX512F 4313 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4314 4315 else if constexpr (sizeof(_Tp) == 8) 4316 if constexpr (sizeof(__xi) == 16) 4317 if constexpr (__have_avx512dq_vl) 4318 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4319 else if constexpr (__have_avx512dq) 4320 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4321 else if constexpr (__have_avx512vl) 4322 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4323 else if constexpr (__have_avx512f) 4324 return _BitMask<_Np>( 4325 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4326 else // implies SSE2 4327 return _BitMask<_Np>( 4328 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4329 else if constexpr (sizeof(__xi) == 32) 4330 if constexpr (__have_avx512dq_vl) 4331 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4332 else if constexpr (__have_avx512dq) 4333 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4334 else if constexpr (__have_avx512vl) 4335 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4336 else if constexpr (__have_avx512f) 4337 return _BitMask<_Np>( 4338 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4339 else // implies AVX 4340 return _BitMask<_Np>( 4341 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4342 else // implies AVX512?? 4343 if constexpr (__have_avx512dq) 4344 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4345 else // implies AVX512F 4346 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4347 4348 else 4349 __assert_unreachable<_Tp>(); 4350 } 4351 } 4352 // }}} 4353 }; 4354 4355 // }}} 4356 // _MaskImplX86 {{{ 4357 template
<typename _Abi, typename> 4358
4358 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> 4359 { 4360 using _MaskImplX86Mixin::_S_to_bits; 4361 using _MaskImplX86Mixin::_S_to_maskvector; 4362 using _MaskImplBuiltin<_Abi>::_S_convert; 4363 4364 // member types {{{ 4365 template
<typename _Tp> 4366 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 4367 4368 template <typename _Tp> 4369
using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 4370 4371 template <typename _Tp> 4372
static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 4373 4374 using _Base = _MaskImplBuiltin<_Abi>; 4375 4376 // }}} 4377 // _S_broadcast {{{ 4378 template <typename _Tp>
4379 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4380 _S_broadcast(bool __x) 4381 { 4382 if constexpr (__is_avx512_abi<_Abi>()) 4383 return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1)) 4384 : _MaskMember<_Tp>(); 4385 else 4386 return _Base::template _S_broadcast<_Tp>(__x); 4387 } 4388 4389 // }}} 4390 // _S_load {{{ 4391 template
<typename _Tp>
4392 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4393 _S_load(const bool* __mem) 4394 { 4395 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4396 if (__builtin_is_constant_evaluated()) 4397 { 4398 if constexpr (__is_avx512_abi<_Abi>()) 4399 { 4400 _MaskMember<_Tp> __r{}; 4401 for (size_t __i = 0; __i < _S_size<_Tp>; ++__i) 4402 __r._M_data |= _ULLong(__mem[__i]) << __i; 4403 return __r; 4404 } 4405 else 4406 return _Base::template _S_load<_Tp>(__mem); 4407 } 4408 else if constexpr (__have_avx512bw) 4409 { 4410 const auto __to_vec_or_bits 4411 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { 4412 if constexpr (__is_avx512_abi<_Abi>()) 4413 return __bits; 4414 else 4415 return _S_to_maskvector<_Tp>( 4416 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); 4417 }; 4418 4419 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) 4420 { 4421 __m128i __a = {}; 4422 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4423 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a)); 4424 } 4425 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) 4426 { 4427 __m256i __a = {}; 4428 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4429 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a)); 4430 } 4431 else if constexpr (_S_size<_Tp> <= 64) 4432 { 4433 __m512i __a = {}; 4434 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4435 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a)); 4436 } 4437 } 4438 else if constexpr (__is_avx512_abi<_Abi>()) 4439 { 4440 if constexpr (_S_size<_Tp> <= 8) 4441 { 4442 __m128i __a = {}; 4443 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4444 const auto __b = _mm512_cvtepi8_epi64(__a); 4445 return _mm512_test_epi64_mask(__b, __b); 4446 } 4447 else if constexpr (_S_size<_Tp> <= 16) 4448 { 4449 __m128i __a = {}; 4450 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4451 const auto __b = _mm512_cvtepi8_epi32(__a); 4452 return _mm512_test_epi32_mask(__b, __b); 4453 } 4454 else if constexpr (_S_size<_Tp> <= 32) 4455 { 4456 __m128i __a = {}; 4457 __builtin_memcpy(&__a, __mem, 16); 4458 const auto __b = _mm512_cvtepi8_epi32(__a); 4459 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); 4460 const auto __c = _mm512_cvtepi8_epi32(__a); 4461 return _mm512_test_epi32_mask(__b, __b) 4462 | (_mm512_test_epi32_mask(__c, __c) << 16); 4463 } 4464 else if constexpr (_S_size<_Tp> <= 64) 4465 { 4466 __m128i __a = {}; 4467 __builtin_memcpy(&__a, __mem, 16); 4468 const auto __b = _mm512_cvtepi8_epi32(__a); 4469 __builtin_memcpy(&__a, __mem + 16, 16); 4470 const auto __c = _mm512_cvtepi8_epi32(__a); 4471 if constexpr (_S_size<_Tp> <= 48) 4472 { 4473 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); 4474 const auto __d = _mm512_cvtepi8_epi32(__a); 4475 return _mm512_test_epi32_mask(__b, __b) 4476 | (_mm512_test_epi32_mask(__c, __c) << 16) 4477 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32); 4478 } 4479 else 4480 { 4481 __builtin_memcpy(&__a, __mem + 16, 16); 4482 const auto __d = _mm512_cvtepi8_epi32(__a); 4483 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48); 4484 const auto __e = _mm512_cvtepi8_epi32(__a); 4485 return _mm512_test_epi32_mask(__b, __b) 4486 | (_mm512_test_epi32_mask(__c, __c) << 16) 4487 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32) 4488 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48); 4489 } 4490 } 4491 else 4492 __assert_unreachable<_Tp>(); 4493 } 4494 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) 4495 return __vector_bitcast<_Tp>( 4496 __vector_type16_t
<int>
{-int(__mem[0]), -int(__mem[0]), 4497 -int(__mem[1]), -int(__mem[1])}); 4498 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) 4499 { 4500 int __bool4 = 0; 4501 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); 4502 const auto __k = __to_intrin( 4503 (__vector_broadcast<4>(__bool4) 4504 & __make_vector
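None of the _S_* hooks above are called directly by user code; they sit behind the public <experimental/simd> interface. The following is a minimal, illustrative sketch (assuming GCC on an x86-64 target; the file name and flags are placeholders) of entry points that end up in the classification and masked-unary code shown in this file:

// Compile with e.g.: g++ -std=c++17 -O2 -march=native simd_classify.cpp
#include <experimental/simd>
#include <iostream>
#include <limits>

namespace stdx = std::experimental;

int main()
{
  using V = stdx::native_simd<float>;

  // Lane 2 is NaN, the other lanes hold their index.
  V v([](int i) { return i == 2 ? std::numeric_limits<float>::quiet_NaN()
                                : float(i); });

  // isnan() on a simd dispatches to _SimdImplX86::_S_isnan/_S_isunordered above.
  const auto nan_mask = stdx::isnan(v);
  std::cout << "NaN lanes: " << stdx::popcount(nan_mask) << '\n';

  // A masked increment is routed through _SimdImplX86::_S_masked_unary
  // (the masked-subtract optimization above).
  where(!nan_mask, v)++;
  std::cout << "lane 0 after masked ++: " << float(v[0]) << '\n';
}

With AVX-512 enabled this should lower to a vcmpps with _CMP_UNORD_Q for the isnan() call and a single masked vsubps for the masked increment, following the branches shown earlier in the listing.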