The C and C++ Include Header Files
/usr/include/c++/11/experimental/bits/simd_x86.h
$ cat -n /usr/include/c++/11/experimental/bits/simd_x86.h 1 // Simd x86 specific implementations -*- C++ -*- 2 3 // Copyright (C) 2020-2021 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 //
<http://www.gnu.org/licenses/>.
    24	
    25	#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
    26	#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_H_
    27	
    28	#if __cplusplus >= 201703L
    29	
    30	#if !_GLIBCXX_SIMD_X86INTRIN
    31	#error \
    32	  "simd_x86.h may only be included when MMX or SSE on x86(_64) are available"
    33	#endif
    34	
    35	_GLIBCXX_SIMD_BEGIN_NAMESPACE
    36	
    37	// __to_masktype {{{
    38	// Given <T, N> return <__int_for_sizeof_t<T>, N>. For _SimdWrapper and
    39	// __vector_type_t.
    40	template <typename _Tp, size_t _Np>
    41	  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np>
    42	  __to_masktype(_SimdWrapper<_Tp, _Np> __x)
    43	  { return reinterpret_cast<__vector_type_t<__int_for_sizeof_t<_Tp>, _Np>>(__x._M_data); }
    44	
    45	template <typename _TV,
    46	          typename _TVT
    47	          = enable_if_t<__is_vector_type_v<_TV>, _VectorTraits<_TV>>,
    48	          typename _Up = __int_for_sizeof_t<typename _TVT::value_type>>
    49	  _GLIBCXX_SIMD_INTRINSIC constexpr __vector_type_t<_Up, _TVT::_S_full_size>
    50	  __to_masktype(_TV __x)
    51	  { return reinterpret_cast<__vector_type_t<_Up, _TVT::_S_full_size>>(__x); }
    52	
    53	// }}}
    54	// __interleave128_lo {{{
    55	template <typename _Ap, typename _Bp, typename _Tp = common_type_t<_Ap, _Bp>
, 56 typename _Trait = _VectorTraits<_Tp>> 57 _GLIBCXX_SIMD_INTRINSIC constexpr _Tp 58 __interleave128_lo(const _Ap& __av, const _Bp& __bv) 59 { 60 const _Tp __a(__av); 61 const _Tp __b(__bv); 62 if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 2) 63 return _Tp{__a[0], __b[0]}; 64 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 4) 65 return _Tp{__a[0], __b[0], __a[1], __b[1]}; 66 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 8) 67 return _Tp{__a[0], __b[0], __a[1], __b[1], 68 __a[2], __b[2], __a[3], __b[3]}; 69 else if constexpr (sizeof(_Tp) == 16 && _Trait::_S_full_size == 16) 70 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], 71 __a[3], __b[3], __a[4], __b[4], __a[5], __b[5], 72 __a[6], __b[6], __a[7], __b[7]}; 73 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 4) 74 return _Tp{__a[0], __b[0], __a[2], __b[2]}; 75 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 8) 76 return _Tp{__a[0], __b[0], __a[1], __b[1], 77 __a[4], __b[4], __a[5], __b[5]}; 78 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 16) 79 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], 80 __a[3], __b[3], __a[8], __b[8], __a[9], __b[9], 81 __a[10], __b[10], __a[11], __b[11]}; 82 else if constexpr (sizeof(_Tp) == 32 && _Trait::_S_full_size == 32) 83 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], 84 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], 85 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18], 86 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21], 87 __a[22], __b[22], __a[23], __b[23]}; 88 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 8) 89 return _Tp{__a[0], __b[0], __a[2], __b[2], 90 __a[4], __b[4], __a[6], __b[6]}; 91 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 16) 92 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[4], __b[4], 93 __a[5], __b[5], __a[8], __b[8], __a[9], __b[9], 94 __a[12], __b[12], __a[13], __b[13]}; 95 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 32) 96 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], 97 __b[3], __a[8], __b[8], __a[9], __b[9], __a[10], __b[10], 98 __a[11], __b[11], __a[16], __b[16], __a[17], __b[17], __a[18], 99 __b[18], __a[19], __b[19], __a[24], __b[24], __a[25], __b[25], 100 __a[26], __b[26], __a[27], __b[27]}; 101 else if constexpr (sizeof(_Tp) == 64 && _Trait::_S_full_size == 64) 102 return _Tp{__a[0], __b[0], __a[1], __b[1], __a[2], __b[2], __a[3], 103 __b[3], __a[4], __b[4], __a[5], __b[5], __a[6], __b[6], 104 __a[7], __b[7], __a[16], __b[16], __a[17], __b[17], __a[18], 105 __b[18], __a[19], __b[19], __a[20], __b[20], __a[21], __b[21], 106 __a[22], __b[22], __a[23], __b[23], __a[32], __b[32], __a[33], 107 __b[33], __a[34], __b[34], __a[35], __b[35], __a[36], __b[36], 108 __a[37], __b[37], __a[38], __b[38], __a[39], __b[39], __a[48], 109 __b[48], __a[49], __b[49], __a[50], __b[50], __a[51], __b[51], 110 __a[52], __b[52], __a[53], __b[53], __a[54], __b[54], __a[55], 111 __b[55]}; 112 else 113 __assert_unreachable<_Tp>(); 114 } 115 116 // }}} 117 // __is_zero{{{ 118 template
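
As a quick illustration of what __interleave128_lo computes: for a 16-byte, 4-element vector it interleaves the corresponding leading elements of its two inputs; for wider vectors the same interleave is done with the low half of each 128-bit lane. A minimal standalone sketch using GNU vector extensions — the names floatv4 and interleave128_lo_demo are illustrative, not part of the header:

    #include <cstdio>

    typedef float floatv4 __attribute__((vector_size(16)));

    // Same result as the sizeof(_Tp) == 16 / _S_full_size == 4 branch above.
    static floatv4 interleave128_lo_demo(floatv4 a, floatv4 b)
    { return floatv4{a[0], b[0], a[1], b[1]}; }

    int main()
    {
      floatv4 a = {1, 2, 3, 4}, b = {10, 20, 30, 40};
      floatv4 r = interleave128_lo_demo(a, b);
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 1 10 2 20
    }
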
<typename _Tp, typename _TVT = _VectorTraits<_Tp>>
   119	  _GLIBCXX_SIMD_INTRINSIC constexpr bool
   120	  __is_zero(_Tp __a)
   121	  {
   122	    if (!__builtin_is_constant_evaluated())
   123	      {
   124	        if constexpr (__have_avx)
   125	          {
   126	            if constexpr (_TVT::template _S_is<float, 8>)
   127	              return _mm256_testz_ps(__a, __a);
   128	            else if constexpr (_TVT::template _S_is<double, 4>)
   129	              return _mm256_testz_pd(__a, __a);
   130	            else if constexpr (sizeof(_Tp) == 32)
   131	              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__a));
   132	            else if constexpr (_TVT::template _S_is<float>)
   133	              return _mm_testz_ps(__to_intrin(__a), __to_intrin(__a));
   134	            else if constexpr (_TVT::template _S_is<double, 2>
) 135 return _mm_testz_pd(__a, __a); 136 else 137 return _mm_testz_si128(__to_intrin(__a), __to_intrin(__a)); 138 } 139 else if constexpr (__have_sse4_1) 140 return _mm_testz_si128(__intrin_bitcast<__m128i>(__a), 141 __intrin_bitcast<__m128i>(__a)); 142 } 143 else if constexpr (sizeof(_Tp) <= 8) 144 return reinterpret_cast<__int_for_sizeof_t<_Tp>>(__a) == 0; 145 else 146 { 147 const auto __b = __vector_bitcast<_LLong>(__a); 148 if constexpr (sizeof(__b) == 16) 149 return (__b[0] | __b[1]) == 0; 150 else if constexpr (sizeof(__b) == 32) 151 return __is_zero(__lo128(__b) | __hi128(__b)); 152 else if constexpr (sizeof(__b) == 64) 153 return __is_zero(__lo256(__b) | __hi256(__b)); 154 else 155 __assert_unreachable<_Tp>(); 156 } 157 } 158 159 // }}} 160 // __movemask{{{ 161 template
<typename _Tp, typename _TVT = _VectorTraits<_Tp>>
   162	  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST int
   163	  __movemask(_Tp __a)
   164	  {
   165	    if constexpr (sizeof(_Tp) == 32)
   166	      {
   167	        if constexpr (_TVT::template _S_is<float>)
   168	          return _mm256_movemask_ps(__to_intrin(__a));
   169	        else if constexpr (_TVT::template _S_is<double>)
   170	          return _mm256_movemask_pd(__to_intrin(__a));
   171	        else
   172	          return _mm256_movemask_epi8(__to_intrin(__a));
   173	      }
   174	    else if constexpr (_TVT::template _S_is<float>)
   175	      return _mm_movemask_ps(__to_intrin(__a));
   176	    else if constexpr (_TVT::template _S_is<double>)
   177	      return _mm_movemask_pd(__to_intrin(__a));
   178	    else
   179	      return _mm_movemask_epi8(__to_intrin(__a));
   180	  }
   181	
   182	// }}}
   183	// __testz{{{
   184	template
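
The __testz/__testc/__testnzc helpers defined next wrap the SSE4.1/AVX ptest family; as their non-SSE4.1 fallbacks spell out, __testz(a, b) asks whether a & b is all zeros and __testc(a, b) whether ~a & b is all zeros. A small self-contained check of the underlying intrinsics (compile with -msse4.1; the values are arbitrary and illustrative only):

    #include <immintrin.h>
    #include <cassert>

    int main()
    {
      const __m128i a = _mm_set_epi32(0, 0, int(0xff00ff00), 0);
      const __m128i b = _mm_set_epi32(0, 0, int(0x00ff00ff), -1);
      assert(_mm_testz_si128(a, b) == 1); // a & b == 0, so ZF is set
      assert(_mm_testc_si128(a, b) == 0); // ~a & b != 0 (low lane of b), so CF is clear
    }
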
<typename _TI, typename _TVT = _VectorTraits<_TI>>
   185	  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
   186	  __testz(_TI __a, _TI __b)
   187	  {
   188	    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
   189	                                                    _TVT::_S_full_size>>);
   190	    if (!__builtin_is_constant_evaluated())
   191	      {
   192	        if constexpr (sizeof(_TI) == 32)
   193	          {
   194	            if constexpr (_TVT::template _S_is<float>)
   195	              return _mm256_testz_ps(__to_intrin(__a), __to_intrin(__b));
   196	            else if constexpr (_TVT::template _S_is<double>)
   197	              return _mm256_testz_pd(__to_intrin(__a), __to_intrin(__b));
   198	            else
   199	              return _mm256_testz_si256(__to_intrin(__a), __to_intrin(__b));
   200	          }
   201	        else if constexpr (_TVT::template _S_is<float> && __have_avx)
   202	          return _mm_testz_ps(__to_intrin(__a), __to_intrin(__b));
   203	        else if constexpr (_TVT::template _S_is<double> && __have_avx)
   204	          return _mm_testz_pd(__to_intrin(__a), __to_intrin(__b));
   205	        else if constexpr (__have_sse4_1)
   206	          return _mm_testz_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
   207	                                 __intrin_bitcast<__m128i>(__to_intrin(__b)));
   208	        else
   209	          return __movemask(0 == __and(__a, __b)) != 0;
   210	      }
   211	    else
   212	      return __is_zero(__and(__a, __b));
   213	  }
   214	
   215	// }}}
   216	// __testc{{{
   217	// requires SSE4.1 or above
   218	template
<typename _TI, typename _TVT = _VectorTraits<_TI>>
   219	  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
   220	  __testc(_TI __a, _TI __b)
   221	  {
   222	    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
   223	                                                    _TVT::_S_full_size>>);
   224	    if (__builtin_is_constant_evaluated())
   225	      return __is_zero(__andnot(__a, __b));
   226	
   227	    if constexpr (sizeof(_TI) == 32)
   228	      {
   229	        if constexpr (_TVT::template _S_is<float>)
   230	          return _mm256_testc_ps(__a, __b);
   231	        else if constexpr (_TVT::template _S_is<double>)
   232	          return _mm256_testc_pd(__a, __b);
   233	        else
   234	          return _mm256_testc_si256(__to_intrin(__a), __to_intrin(__b));
   235	      }
   236	    else if constexpr (_TVT::template _S_is<float> && __have_avx)
   237	      return _mm_testc_ps(__to_intrin(__a), __to_intrin(__b));
   238	    else if constexpr (_TVT::template _S_is<double> && __have_avx)
   239	      return _mm_testc_pd(__to_intrin(__a), __to_intrin(__b));
   240	    else
   241	      {
   242	        static_assert(is_same_v<_TI, _TI> && __have_sse4_1);
   243	        return _mm_testc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
   244	                               __intrin_bitcast<__m128i>(__to_intrin(__b)));
   245	      }
   246	  }
   247	
   248	// }}}
   249	// __testnzc{{{
   250	template
<typename _TI, typename _TVT = _VectorTraits<_TI>>
   251	  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr int
   252	  __testnzc(_TI __a, _TI __b)
   253	  {
   254	    static_assert(is_same_v<_TI, __intrinsic_type_t<typename _TVT::value_type,
   255	                                                    _TVT::_S_full_size>>);
   256	    if (!__builtin_is_constant_evaluated())
   257	      {
   258	        if constexpr (sizeof(_TI) == 32)
   259	          {
   260	            if constexpr (_TVT::template _S_is<float>)
   261	              return _mm256_testnzc_ps(__a, __b);
   262	            else if constexpr (_TVT::template _S_is<double>)
   263	              return _mm256_testnzc_pd(__a, __b);
   264	            else
   265	              return _mm256_testnzc_si256(__to_intrin(__a), __to_intrin(__b));
   266	          }
   267	        else if constexpr (_TVT::template _S_is<float> && __have_avx)
   268	          return _mm_testnzc_ps(__to_intrin(__a), __to_intrin(__b));
   269	        else if constexpr (_TVT::template _S_is<double> && __have_avx)
   270	          return _mm_testnzc_pd(__to_intrin(__a), __to_intrin(__b));
   271	        else if constexpr (__have_sse4_1)
   272	          return _mm_testnzc_si128(__intrin_bitcast<__m128i>(__to_intrin(__a)),
   273	                                   __intrin_bitcast<__m128i>(__to_intrin(__b)));
   274	        else
   275	          return __movemask(0 == __and(__a, __b)) == 0
   276	                   && __movemask(0 == __andnot(__a, __b)) == 0;
   277	      }
   278	    else
   279	      return !(__is_zero(__and(__a, __b)) || __is_zero(__andnot(__a, __b)));
   280	  }
   281	
   282	// }}}
   283	// __xzyw{{{
   284	// shuffles the complete vector, swapping the inner two quarters. Often useful
   285	// for AVX for fixing up a shuffle result.
   286	template
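
The effect described in the comment above ("swapping the inner two quarters"), shown on a 32-byte vector of eight ints as a standalone sketch (GNU vector extensions; intv8 and xzyw_demo are illustrative names only):

    #include <cstdio>

    typedef int intv8 __attribute__((vector_size(32)));

    // Quarters of {0,1, 2,3, 4,5, 6,7} are {0,1}, {2,3}, {4,5}, {6,7};
    // swapping the inner two gives {0,1, 4,5, 2,3, 6,7}.
    static intv8 xzyw_demo(intv8 v)
    { return intv8{v[0], v[1], v[4], v[5], v[2], v[3], v[6], v[7]}; }

    int main()
    {
      intv8 v = {0, 1, 2, 3, 4, 5, 6, 7};
      intv8 r = xzyw_demo(v);
      for (int i = 0; i < 8; ++i)
        std::printf("%d ", r[i]); // prints: 0 1 4 5 2 3 6 7
      std::printf("\n");
    }
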
<typename _Tp, typename _TVT = _VectorTraits<_Tp>>
   287	  _GLIBCXX_SIMD_INTRINSIC _Tp
   288	  __xzyw(_Tp __a)
   289	  {
   290	    if constexpr (sizeof(_Tp) == 16)
   291	      {
   292	        const auto __x = __vector_bitcast<conditional_t<
   293	          is_floating_point_v<typename _TVT::value_type>, float, int>>(__a);
   294	        return reinterpret_cast<_Tp>(
   295	          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
   296	      }
   297	    else if constexpr (sizeof(_Tp) == 32)
   298	      {
   299	        const auto __x = __vector_bitcast<conditional_t<
   300	          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
   301	        return reinterpret_cast<_Tp>(
   302	          decltype(__x){__x[0], __x[2], __x[1], __x[3]});
   303	      }
   304	    else if constexpr (sizeof(_Tp) == 64)
   305	      {
   306	        const auto __x = __vector_bitcast<conditional_t<
   307	          is_floating_point_v<typename _TVT::value_type>, double, _LLong>>(__a);
   308	        return reinterpret_cast<_Tp>(decltype(__x){__x[0], __x[1], __x[4],
   309	                                                    __x[5], __x[2], __x[3],
   310	                                                    __x[6], __x[7]});
   311	      }
   312	    else
   313	      __assert_unreachable<_Tp>();
   314	  }
   315	
   316	// }}}
   317	// __maskload_epi32{{{
   318	template <typename _Tp>
319 _GLIBCXX_SIMD_INTRINSIC auto 320 __maskload_epi32(const int* __ptr, _Tp __k) 321 { 322 if constexpr (sizeof(__k) == 16) 323 return _mm_maskload_epi32(__ptr, __k); 324 else 325 return _mm256_maskload_epi32(__ptr, __k); 326 } 327 328 // }}} 329 // __maskload_epi64{{{ 330 template
331 _GLIBCXX_SIMD_INTRINSIC auto 332 __maskload_epi64(const _LLong* __ptr, _Tp __k) 333 { 334 if constexpr (sizeof(__k) == 16) 335 return _mm_maskload_epi64(__ptr, __k); 336 else 337 return _mm256_maskload_epi64(__ptr, __k); 338 } 339 340 // }}} 341 // __maskload_ps{{{ 342 template
343 _GLIBCXX_SIMD_INTRINSIC auto 344 __maskload_ps(const float* __ptr, _Tp __k) 345 { 346 if constexpr (sizeof(__k) == 16) 347 return _mm_maskload_ps(__ptr, __k); 348 else 349 return _mm256_maskload_ps(__ptr, __k); 350 } 351 352 // }}} 353 // __maskload_pd{{{ 354 template
355 _GLIBCXX_SIMD_INTRINSIC auto 356 __maskload_pd(const double* __ptr, _Tp __k) 357 { 358 if constexpr (sizeof(__k) == 16) 359 return _mm_maskload_pd(__ptr, __k); 360 else 361 return _mm256_maskload_pd(__ptr, __k); 362 } 363 364 // }}} 365 366 #ifdef __clang__ 367 template
368 _GLIBCXX_SIMD_INTRINSIC constexpr auto 369 __movm(_Kp __k) noexcept 370 { 371 static_assert(is_unsigned_v<_Kp>); 372 if constexpr (sizeof(_Tp) == 1 && __have_avx512bw) 373 { 374 if constexpr (_Np <= 16 && __have_avx512vl) 375 return __builtin_ia32_cvtmask2b128(__k); 376 else if constexpr (_Np <= 32 && __have_avx512vl) 377 return __builtin_ia32_cvtmask2b256(__k); 378 else 379 return __builtin_ia32_cvtmask2b512(__k); 380 } 381 else if constexpr (sizeof(_Tp) == 2 && __have_avx512bw) 382 { 383 if constexpr (_Np <= 8 && __have_avx512vl) 384 return __builtin_ia32_cvtmask2w128(__k); 385 else if constexpr (_Np <= 16 && __have_avx512vl) 386 return __builtin_ia32_cvtmask2w256(__k); 387 else 388 return __builtin_ia32_cvtmask2w512(__k); 389 } 390 else if constexpr (sizeof(_Tp) == 4 && __have_avx512dq) 391 { 392 if constexpr (_Np <= 4 && __have_avx512vl) 393 return __builtin_ia32_cvtmask2d128(__k); 394 else if constexpr (_Np <= 8 && __have_avx512vl) 395 return __builtin_ia32_cvtmask2d256(__k); 396 else 397 return __builtin_ia32_cvtmask2d512(__k); 398 } 399 else if constexpr (sizeof(_Tp) == 8 && __have_avx512dq) 400 { 401 if constexpr (_Np <= 2 && __have_avx512vl) 402 return __builtin_ia32_cvtmask2q128(__k); 403 else if constexpr (_Np <= 4 && __have_avx512vl) 404 return __builtin_ia32_cvtmask2q256(__k); 405 else 406 return __builtin_ia32_cvtmask2q512(__k); 407 } 408 else 409 __assert_unreachable<_Tp>(); 410 } 411 #endif // __clang__ 412 413 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 414 #include "simd_x86_conversions.h" 415 #endif 416 417 // ISA & type detection {{{ 418 template
419 constexpr bool 420 __is_sse_ps() 421 { 422 return __have_sse 423 && is_same_v<_Tp, 424 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; 425 } 426 427 template
428 constexpr bool 429 __is_sse_pd() 430 { 431 return __have_sse2 432 && is_same_v<_Tp, 433 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 16; 434 } 435 436 template
437 constexpr bool 438 __is_avx_ps() 439 { 440 return __have_avx 441 && is_same_v<_Tp, 442 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; 443 } 444 445 template
446 constexpr bool 447 __is_avx_pd() 448 { 449 return __have_avx 450 && is_same_v<_Tp, 451 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 32; 452 } 453 454 template
455 constexpr bool 456 __is_avx512_ps() 457 { 458 return __have_avx512f 459 && is_same_v<_Tp, 460 float> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; 461 } 462 463 template
464 constexpr bool 465 __is_avx512_pd() 466 { 467 return __have_avx512f 468 && is_same_v<_Tp, 469 double> && sizeof(__intrinsic_type_t<_Tp, _Np>) == 64; 470 } 471 472 // }}} 473 struct _MaskImplX86Mixin; 474 475 // _CommonImplX86 {{{ 476 struct _CommonImplX86 : _CommonImplBuiltin 477 { 478 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 479 // _S_converts_via_decomposition {{{ 480 template
481 static constexpr bool 482 _S_converts_via_decomposition() 483 { 484 if constexpr (is_integral_v< 485 _From> && is_integral_v<_To> && sizeof(_From) == 8 486 && _ToSize == 16) 487 return (sizeof(_To) == 2 && !__have_ssse3) 488 || (sizeof(_To) == 1 && !__have_avx512f); 489 else if constexpr (is_floating_point_v<_From> && is_integral_v<_To>) 490 return ((sizeof(_From) == 4 || sizeof(_From) == 8) && sizeof(_To) == 8 491 && !__have_avx512dq) 492 || (sizeof(_From) == 8 && sizeof(_To) == 4 && !__have_sse4_1 493 && _ToSize == 16); 494 else if constexpr ( 495 is_integral_v<_From> && is_floating_point_v<_To> && sizeof(_From) == 8 496 && !__have_avx512dq) 497 return (sizeof(_To) == 4 && _ToSize == 16) 498 || (sizeof(_To) == 8 && _ToSize < 64); 499 else 500 return false; 501 } 502 503 template
504 static inline constexpr bool __converts_via_decomposition_v 505 = _S_converts_via_decomposition<_From, _To, _ToSize>(); 506 507 // }}} 508 #endif 509 // _S_store {{{ 510 using _CommonImplBuiltin::_S_store; 511 512 template
513 _GLIBCXX_SIMD_INTRINSIC static constexpr void 514 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) 515 { 516 constexpr size_t _Bytes = _Np * sizeof(_Tp); 517 518 if (__builtin_is_constant_evaluated()) 519 _CommonImplBuiltin::_S_store(__x, __addr); 520 else if constexpr ((_Bytes & (_Bytes - 1)) != 0 && __have_avx512bw_vl) 521 { 522 const auto __v = __to_intrin(__x); 523 524 if constexpr (_Bytes & 1) 525 { 526 if constexpr (_Bytes < 16) 527 _mm_mask_storeu_epi8(__addr, 0xffffu >> (16 - _Bytes), 528 __intrin_bitcast<__m128i>(__v)); 529 else if constexpr (_Bytes < 32) 530 _mm256_mask_storeu_epi8(__addr, 0xffffffffu >> (32 - _Bytes), 531 __intrin_bitcast<__m256i>(__v)); 532 else 533 _mm512_mask_storeu_epi8(__addr, 534 0xffffffffffffffffull >> (64 - _Bytes), 535 __intrin_bitcast<__m512i>(__v)); 536 } 537 else if constexpr (_Bytes & 2) 538 { 539 if constexpr (_Bytes < 16) 540 _mm_mask_storeu_epi16(__addr, 0xffu >> (8 - _Bytes / 2), 541 __intrin_bitcast<__m128i>(__v)); 542 else if constexpr (_Bytes < 32) 543 _mm256_mask_storeu_epi16(__addr, 0xffffu >> (16 - _Bytes / 2), 544 __intrin_bitcast<__m256i>(__v)); 545 else 546 _mm512_mask_storeu_epi16(__addr, 547 0xffffffffull >> (32 - _Bytes / 2), 548 __intrin_bitcast<__m512i>(__v)); 549 } 550 else if constexpr (_Bytes & 4) 551 { 552 if constexpr (_Bytes < 16) 553 _mm_mask_storeu_epi32(__addr, 0xfu >> (4 - _Bytes / 4), 554 __intrin_bitcast<__m128i>(__v)); 555 else if constexpr (_Bytes < 32) 556 _mm256_mask_storeu_epi32(__addr, 0xffu >> (8 - _Bytes / 4), 557 __intrin_bitcast<__m256i>(__v)); 558 else 559 _mm512_mask_storeu_epi32(__addr, 0xffffull >> (16 - _Bytes / 4), 560 __intrin_bitcast<__m512i>(__v)); 561 } 562 else 563 { 564 static_assert( 565 _Bytes > 16, 566 "_Bytes < 16 && (_Bytes & 7) == 0 && (_Bytes & (_Bytes " 567 "- 1)) != 0 is impossible"); 568 if constexpr (_Bytes < 32) 569 _mm256_mask_storeu_epi64(__addr, 0xfu >> (4 - _Bytes / 8), 570 __intrin_bitcast<__m256i>(__v)); 571 else 572 _mm512_mask_storeu_epi64(__addr, 0xffull >> (8 - _Bytes / 8), 573 __intrin_bitcast<__m512i>(__v)); 574 } 575 } 576 else 577 _CommonImplBuiltin::_S_store(__x, __addr); 578 } 579 580 // }}} 581 // _S_store_bool_array(_BitMask) {{{ 582 template
583 _GLIBCXX_SIMD_INTRINSIC static constexpr void 584 _S_store_bool_array(const _BitMask<_Np, _Sanitized> __x, bool* __mem) 585 { 586 if (__builtin_is_constant_evaluated()) 587 _CommonImplBuiltin::_S_store_bool_array(__x, __mem); 588 else if constexpr (__have_avx512bw_vl) // don't care for BW w/o VL 589 _S_store<_Np>(1 & __vector_bitcast<_UChar, _Np>( 590 [=]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 591 if constexpr (_Np <= 16) 592 return _mm_movm_epi8(__x._M_to_bits()); 593 else if constexpr (_Np <= 32) 594 return _mm256_movm_epi8(__x._M_to_bits()); 595 else if constexpr (_Np <= 64) 596 return _mm512_movm_epi8(__x._M_to_bits()); 597 else 598 __assert_unreachable<_SizeConstant<_Np>>(); 599 }()), 600 __mem); 601 else if constexpr (__have_bmi2) 602 { 603 if constexpr (_Np <= 4) 604 _S_store<_Np>(_pdep_u32(__x._M_to_bits(), 0x01010101U), __mem); 605 else 606 __execute_n_times<__div_roundup(_Np, sizeof(size_t))>( 607 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 608 constexpr size_t __offset = __i * sizeof(size_t); 609 constexpr int __todo = std::min(sizeof(size_t), _Np - __offset); 610 if constexpr (__todo == 1) 611 __mem[__offset] = __x[__offset]; 612 else 613 { 614 const auto __bools = 615 #ifdef __x86_64__ 616 _pdep_u64(__x.template _M_extract<__offset>().to_ullong(), 617 0x0101010101010101ULL); 618 #else // __x86_64__ 619 _pdep_u32( 620 __x.template _M_extract<__offset>()._M_to_bits(), 621 0x01010101U); 622 #endif // __x86_64__ 623 _S_store<__todo>(__bools, __mem + __offset); 624 } 625 }); 626 } 627 else if constexpr (__have_sse2 && _Np > 7) 628 __execute_n_times<__div_roundup(_Np, 16)>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 629 constexpr int __offset = __i * 16; 630 constexpr int __todo = std::min(16, int(_Np) - __offset); 631 const int __bits = __x.template _M_extract<__offset>()._M_to_bits(); 632 __vector_type16_t<_UChar> __bools; 633 if constexpr (__have_avx512f) 634 { 635 auto __as32bits 636 = _mm512_maskz_mov_epi32(__bits, __to_intrin( 637 __vector_broadcast<16>(1))); 638 auto __as16bits 639 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 640 __todo > 8 ? __hi256(__as32bits) 641 : __m256i())); 642 __bools = __vector_bitcast<_UChar>( 643 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 644 } 645 else 646 { 647 using _V = __vector_type_t<_UChar, 16>; 648 auto __tmp = _mm_cvtsi32_si128(__bits); 649 __tmp = _mm_unpacklo_epi8(__tmp, __tmp); 650 __tmp = _mm_unpacklo_epi16(__tmp, __tmp); 651 __tmp = _mm_unpacklo_epi32(__tmp, __tmp); 652 _V __tmp2 = reinterpret_cast<_V>(__tmp); 653 __tmp2 &= _V{1, 2, 4, 8, 16, 32, 64, 128, 654 1, 2, 4, 8, 16, 32, 64, 128}; // mask bit index 655 __bools = (__tmp2 == 0) + 1; // 0xff -> 0x00 | 0x00 -> 0x01 656 } 657 _S_store<__todo>(__bools, __mem + __offset); 658 }); 659 else 660 _CommonImplBuiltin::_S_store_bool_array(__x, __mem); 661 } 662 663 // }}} 664 // _S_blend_avx512 {{{ 665 // Returns: __k ? __b : __a 666 // TODO: reverse __a and __b to match COND_EXPR 667 // Requires: _TV to be a __vector_type_t matching valuetype for the bitmask 668 // __k 669 template
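
One step worth calling out in _S_store_bool_array above: on BMI2 targets the bitmask is expanded to one bool per byte by depositing the bits against the mask 0x0101...01 with pdep. A standalone sketch of just that step (requires -mbmi2 and assumes a 1-byte bool; not part of the header):

    #include <immintrin.h>
    #include <cassert>
    #include <cstring>

    int main()
    {
      const unsigned bits  = 0b1011u;                       // four mask bits
      const unsigned bytes = _pdep_u32(bits, 0x01010101u);  // -> 0x01000101
      bool out[4];
      static_assert(sizeof(out) == 4, "assumes 1-byte bool");
      std::memcpy(out, &bytes, sizeof(out));
      assert(out[0] && out[1] && !out[2] && out[3]);
    }
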
670 _GLIBCXX_SIMD_INTRINSIC static _TV 671 _S_blend_avx512(const _Kp __k, const _TV __a, const _TV __b) noexcept 672 { 673 static_assert(__is_vector_type_v<_TV>); 674 using _Tp = typename _VectorTraits<_TV>::value_type; 675 static_assert(sizeof(_TV) >= 16); 676 static_assert(sizeof(_Tp) <= 8); 677 #ifdef __clang__ 678 return __movm<_VectorTraits<_TV>::_S_full_size, _Tp>(__k) ? __b : __a; 679 #else 680 using _IntT 681 = conditional_t<(sizeof(_Tp) > 2), 682 conditional_t
, 683 conditional_t
>; 684 [[maybe_unused]] const auto __aa = __vector_bitcast<_IntT>(__a); 685 [[maybe_unused]] const auto __bb = __vector_bitcast<_IntT>(__b); 686 if constexpr (sizeof(_TV) == 64) 687 { 688 if constexpr (sizeof(_Tp) == 1) 689 return reinterpret_cast<_TV>( 690 __builtin_ia32_blendmb_512_mask(__aa, __bb, __k)); 691 else if constexpr (sizeof(_Tp) == 2) 692 return reinterpret_cast<_TV>( 693 __builtin_ia32_blendmw_512_mask(__aa, __bb, __k)); 694 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 695 return __builtin_ia32_blendmps_512_mask(__a, __b, __k); 696 else if constexpr (sizeof(_Tp) == 4) 697 return reinterpret_cast<_TV>( 698 __builtin_ia32_blendmd_512_mask(__aa, __bb, __k)); 699 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 700 return __builtin_ia32_blendmpd_512_mask(__a, __b, __k); 701 else if constexpr (sizeof(_Tp) == 8) 702 return reinterpret_cast<_TV>( 703 __builtin_ia32_blendmq_512_mask(__aa, __bb, __k)); 704 } 705 else if constexpr (sizeof(_TV) == 32) 706 { 707 if constexpr (sizeof(_Tp) == 1) 708 return reinterpret_cast<_TV>( 709 __builtin_ia32_blendmb_256_mask(__aa, __bb, __k)); 710 else if constexpr (sizeof(_Tp) == 2) 711 return reinterpret_cast<_TV>( 712 __builtin_ia32_blendmw_256_mask(__aa, __bb, __k)); 713 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 714 return __builtin_ia32_blendmps_256_mask(__a, __b, __k); 715 else if constexpr (sizeof(_Tp) == 4) 716 return reinterpret_cast<_TV>( 717 __builtin_ia32_blendmd_256_mask(__aa, __bb, __k)); 718 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 719 return __builtin_ia32_blendmpd_256_mask(__a, __b, __k); 720 else if constexpr (sizeof(_Tp) == 8) 721 return reinterpret_cast<_TV>( 722 __builtin_ia32_blendmq_256_mask(__aa, __bb, __k)); 723 } 724 else if constexpr (sizeof(_TV) == 16) 725 { 726 if constexpr (sizeof(_Tp) == 1) 727 return reinterpret_cast<_TV>( 728 __builtin_ia32_blendmb_128_mask(__aa, __bb, __k)); 729 else if constexpr (sizeof(_Tp) == 2) 730 return reinterpret_cast<_TV>( 731 __builtin_ia32_blendmw_128_mask(__aa, __bb, __k)); 732 else if constexpr (sizeof(_Tp) == 4 && is_floating_point_v<_Tp>) 733 return __builtin_ia32_blendmps_128_mask(__a, __b, __k); 734 else if constexpr (sizeof(_Tp) == 4) 735 return reinterpret_cast<_TV>( 736 __builtin_ia32_blendmd_128_mask(__aa, __bb, __k)); 737 else if constexpr (sizeof(_Tp) == 8 && is_floating_point_v<_Tp>) 738 return __builtin_ia32_blendmpd_128_mask(__a, __b, __k); 739 else if constexpr (sizeof(_Tp) == 8) 740 return reinterpret_cast<_TV>( 741 __builtin_ia32_blendmq_128_mask(__aa, __bb, __k)); 742 } 743 #endif 744 } 745 746 // }}} 747 // _S_blend_intrin {{{ 748 // Returns: __k ? __b : __a 749 // TODO: reverse __a and __b to match COND_EXPR 750 // Requires: _Tp to be an intrinsic type (integers blend per byte) and 16/32 751 // Bytes wide 752 template
753 _GLIBCXX_SIMD_INTRINSIC static _Tp 754 _S_blend_intrin(_Tp __k, _Tp __a, _Tp __b) noexcept 755 { 756 static_assert(is_same_v
); 757 constexpr struct 758 { 759 _GLIBCXX_SIMD_INTRINSIC __m128 operator()(__m128 __a, __m128 __b, 760 __m128 __k) const noexcept 761 { 762 return __builtin_ia32_blendvps(__a, __b, __k); 763 } 764 _GLIBCXX_SIMD_INTRINSIC __m128d operator()(__m128d __a, __m128d __b, 765 __m128d __k) const noexcept 766 { 767 return __builtin_ia32_blendvpd(__a, __b, __k); 768 } 769 _GLIBCXX_SIMD_INTRINSIC __m128i operator()(__m128i __a, __m128i __b, 770 __m128i __k) const noexcept 771 { 772 return reinterpret_cast<__m128i>( 773 __builtin_ia32_pblendvb128(reinterpret_cast<__v16qi>(__a), 774 reinterpret_cast<__v16qi>(__b), 775 reinterpret_cast<__v16qi>(__k))); 776 } 777 _GLIBCXX_SIMD_INTRINSIC __m256 operator()(__m256 __a, __m256 __b, 778 __m256 __k) const noexcept 779 { 780 return __builtin_ia32_blendvps256(__a, __b, __k); 781 } 782 _GLIBCXX_SIMD_INTRINSIC __m256d operator()(__m256d __a, __m256d __b, 783 __m256d __k) const noexcept 784 { 785 return __builtin_ia32_blendvpd256(__a, __b, __k); 786 } 787 _GLIBCXX_SIMD_INTRINSIC __m256i operator()(__m256i __a, __m256i __b, 788 __m256i __k) const noexcept 789 { 790 if constexpr (__have_avx2) 791 return reinterpret_cast<__m256i>( 792 __builtin_ia32_pblendvb256(reinterpret_cast<__v32qi>(__a), 793 reinterpret_cast<__v32qi>(__b), 794 reinterpret_cast<__v32qi>(__k))); 795 else 796 return reinterpret_cast<__m256i>( 797 __builtin_ia32_blendvps256(reinterpret_cast<__v8sf>(__a), 798 reinterpret_cast<__v8sf>(__b), 799 reinterpret_cast<__v8sf>(__k))); 800 } 801 } __eval; 802 return __eval(__a, __b, __k); 803 } 804 805 // }}} 806 // _S_blend {{{ 807 // Returns: __k ? __at1 : __at0 808 // TODO: reverse __at0 and __at1 to match COND_EXPR 809 template
810 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 811 _S_blend(_SimdWrapper
__k, _SimdWrapper<_Tp, _Np> __at0, 812 _SimdWrapper<_Tp, _Np> __at1) 813 { 814 static_assert(is_same_v<_Tp, _Tp> && __have_avx512f); 815 if (__k._M_is_constprop() && __at0._M_is_constprop() 816 && __at1._M_is_constprop()) 817 return __generate_from_n_evaluations<_Np, __vector_type_t<_Tp, _Np>>( 818 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 819 return __k[__i] ? __at1[__i] : __at0[__i]; 820 }); 821 else if constexpr (sizeof(__at0) == 64 822 || (__have_avx512vl && sizeof(__at0) >= 16)) 823 return _S_blend_avx512(__k._M_data, __at0._M_data, __at1._M_data); 824 else 825 { 826 static_assert((__have_avx512vl && sizeof(__at0) < 16) 827 || !__have_avx512vl); 828 constexpr size_t __size = (__have_avx512vl ? 16 : 64) / sizeof(_Tp); 829 return __vector_bitcast<_Tp, _Np>( 830 _S_blend_avx512(__k._M_data, __vector_bitcast<_Tp, __size>(__at0), 831 __vector_bitcast<_Tp, __size>(__at1))); 832 } 833 } 834 835 template
836 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 837 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, 838 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) 839 { 840 const auto __kk = __wrapper_bitcast<_Tp>(__k); 841 if (__builtin_is_constant_evaluated() 842 || (__kk._M_is_constprop() && __at0._M_is_constprop() 843 && __at1._M_is_constprop())) 844 { 845 auto __r = __or(__andnot(__kk, __at0), __and(__kk, __at1)); 846 if (__r._M_is_constprop()) 847 return __r; 848 } 849 if constexpr (((__have_avx512f && sizeof(__at0) == 64) || __have_avx512vl) 850 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 851 // convert to bitmask and call overload above 852 return _S_blend( 853 _SimdWrapper
( 854 __make_dependent_t<_Tp, _MaskImplX86Mixin>::_S_to_bits(__k) 855 ._M_to_bits()), 856 __at0, __at1); 857 else 858 { 859 // Since GCC does not assume __k to be a mask, using the builtin 860 // conditional operator introduces an extra compare against 0 before 861 // blending. So we rather call the intrinsic here. 862 if constexpr (__have_sse4_1) 863 return _S_blend_intrin(__to_intrin(__kk), __to_intrin(__at0), 864 __to_intrin(__at1)); 865 else 866 return __or(__andnot(__kk, __at0), __and(__kk, __at1)); 867 } 868 } 869 870 // }}} 871 }; 872 873 // }}} 874 // _SimdImplX86 {{{ 875 template
876 struct _SimdImplX86 : _SimdImplBuiltin<_Abi> 877 { 878 using _Base = _SimdImplBuiltin<_Abi>; 879 880 template
881 using _MaskMember = typename _Base::template _MaskMember<_Tp>; 882 883 template
884 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; 885 886 template
887 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; 888 889 template
890 static constexpr size_t _S_max_store_size 891 = (sizeof(_Tp) >= 4 && __have_avx512f) || __have_avx512bw ? 64 892 : (is_floating_point_v<_Tp>&& __have_avx) || __have_avx2 ? 32 893 : 16; 894 895 using _MaskImpl = typename _Abi::_MaskImpl; 896 897 // _S_masked_load {{{ 898 template
899 static inline _SimdWrapper<_Tp, _Np> 900 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 901 const _Up* __mem) noexcept 902 { 903 static_assert(_Np == _S_size<_Tp>); 904 if constexpr (is_same_v<_Tp, _Up> || // no conversion 905 (sizeof(_Tp) == sizeof(_Up) 906 && is_integral_v< 907 _Tp> == is_integral_v<_Up>) // conversion via bit 908 // reinterpretation 909 ) 910 { 911 [[maybe_unused]] const auto __intrin = __to_intrin(__merge); 912 if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) 913 && sizeof(_Tp) == 1) 914 { 915 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 916 if constexpr (sizeof(__intrin) == 16) 917 __merge = __vector_bitcast<_Tp, _Np>( 918 _mm_mask_loadu_epi8(__intrin, __kk, __mem)); 919 else if constexpr (sizeof(__merge) == 32) 920 __merge = __vector_bitcast<_Tp, _Np>( 921 _mm256_mask_loadu_epi8(__intrin, __kk, __mem)); 922 else if constexpr (sizeof(__merge) == 64) 923 __merge = __vector_bitcast<_Tp, _Np>( 924 _mm512_mask_loadu_epi8(__intrin, __kk, __mem)); 925 else 926 __assert_unreachable<_Tp>(); 927 } 928 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512bw_vl) 929 && sizeof(_Tp) == 2) 930 { 931 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 932 if constexpr (sizeof(__intrin) == 16) 933 __merge = __vector_bitcast<_Tp, _Np>( 934 _mm_mask_loadu_epi16(__intrin, __kk, __mem)); 935 else if constexpr (sizeof(__intrin) == 32) 936 __merge = __vector_bitcast<_Tp, _Np>( 937 _mm256_mask_loadu_epi16(__intrin, __kk, __mem)); 938 else if constexpr (sizeof(__intrin) == 64) 939 __merge = __vector_bitcast<_Tp, _Np>( 940 _mm512_mask_loadu_epi16(__intrin, __kk, __mem)); 941 else 942 __assert_unreachable<_Tp>(); 943 } 944 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 945 && sizeof(_Tp) == 4 && is_integral_v<_Up>) 946 { 947 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 948 if constexpr (sizeof(__intrin) == 16) 949 __merge = __vector_bitcast<_Tp, _Np>( 950 _mm_mask_loadu_epi32(__intrin, __kk, __mem)); 951 else if constexpr (sizeof(__intrin) == 32) 952 __merge = __vector_bitcast<_Tp, _Np>( 953 _mm256_mask_loadu_epi32(__intrin, __kk, __mem)); 954 else if constexpr (sizeof(__intrin) == 64) 955 __merge = __vector_bitcast<_Tp, _Np>( 956 _mm512_mask_loadu_epi32(__intrin, __kk, __mem)); 957 else 958 __assert_unreachable<_Tp>(); 959 } 960 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 961 && sizeof(_Tp) == 4 && is_floating_point_v<_Up>) 962 { 963 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 964 if constexpr (sizeof(__intrin) == 16) 965 __merge = __vector_bitcast<_Tp, _Np>( 966 _mm_mask_loadu_ps(__intrin, __kk, __mem)); 967 else if constexpr (sizeof(__intrin) == 32) 968 __merge = __vector_bitcast<_Tp, _Np>( 969 _mm256_mask_loadu_ps(__intrin, __kk, __mem)); 970 else if constexpr (sizeof(__intrin) == 64) 971 __merge = __vector_bitcast<_Tp, _Np>( 972 _mm512_mask_loadu_ps(__intrin, __kk, __mem)); 973 else 974 __assert_unreachable<_Tp>(); 975 } 976 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 977 && is_integral_v<_Up>) 978 { 979 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 980 __merge 981 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 982 __vector_bitcast<_Tp, _Np>( 983 __maskload_epi32(reinterpret_cast
(__mem), 984 __to_intrin(__k)))); 985 } 986 else if constexpr (__have_avx && sizeof(_Tp) == 4) 987 { 988 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 989 __merge 990 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 991 __vector_bitcast<_Tp, _Np>( 992 __maskload_ps(reinterpret_cast
(__mem), 993 __to_intrin(__k)))); 994 } 995 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 996 && sizeof(_Tp) == 8 && is_integral_v<_Up>) 997 { 998 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 999 if constexpr (sizeof(__intrin) == 16) 1000 __merge = __vector_bitcast<_Tp, _Np>( 1001 _mm_mask_loadu_epi64(__intrin, __kk, __mem)); 1002 else if constexpr (sizeof(__intrin) == 32) 1003 __merge = __vector_bitcast<_Tp, _Np>( 1004 _mm256_mask_loadu_epi64(__intrin, __kk, __mem)); 1005 else if constexpr (sizeof(__intrin) == 64) 1006 __merge = __vector_bitcast<_Tp, _Np>( 1007 _mm512_mask_loadu_epi64(__intrin, __kk, __mem)); 1008 else 1009 __assert_unreachable<_Tp>(); 1010 } 1011 else if constexpr ((__is_avx512_abi<_Abi>() || __have_avx512vl) 1012 && sizeof(_Tp) == 8 && is_floating_point_v<_Up>) 1013 { 1014 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 1015 if constexpr (sizeof(__intrin) == 16) 1016 __merge = __vector_bitcast<_Tp, _Np>( 1017 _mm_mask_loadu_pd(__intrin, __kk, __mem)); 1018 else if constexpr (sizeof(__intrin) == 32) 1019 __merge = __vector_bitcast<_Tp, _Np>( 1020 _mm256_mask_loadu_pd(__intrin, __kk, __mem)); 1021 else if constexpr (sizeof(__intrin) == 64) 1022 __merge = __vector_bitcast<_Tp, _Np>( 1023 _mm512_mask_loadu_pd(__intrin, __kk, __mem)); 1024 else 1025 __assert_unreachable<_Tp>(); 1026 } 1027 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1028 && is_integral_v<_Up>) 1029 { 1030 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 1031 __merge 1032 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 1033 __vector_bitcast<_Tp, _Np>(__maskload_epi64( 1034 reinterpret_cast
(__mem), 1035 __to_intrin(__k)))); 1036 } 1037 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1038 { 1039 static_assert(sizeof(__intrin) == 16 || sizeof(__intrin) == 32); 1040 __merge 1041 = __or(__andnot(__vector_bitcast<_Tp>(__k), __merge._M_data), 1042 __vector_bitcast<_Tp, _Np>( 1043 __maskload_pd(reinterpret_cast
(__mem), 1044 __to_intrin(__k)))); 1045 } 1046 else 1047 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1048 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1049 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1050 }); 1051 } 1052 /* Very uncertain, that the following improves anything. Needs 1053 benchmarking 1054 * before it's activated. 1055 else if constexpr (sizeof(_Up) <= 8 && // no long double 1056 !__converts_via_decomposition_v< 1057 _Up, _Tp, 1058 sizeof(__merge)> // conversion via decomposition 1059 // is better handled via the 1060 // bit_iteration fallback below 1061 ) 1062 { 1063 // TODO: copy pattern from _S_masked_store, which doesn't resort to 1064 // fixed_size 1065 using _Ap = simd_abi::deduce_t<_Up, _Np>; 1066 using _ATraits = _SimdTraits<_Up, _Ap>; 1067 using _AImpl = typename _ATraits::_SimdImpl; 1068 typename _ATraits::_SimdMember __uncvted{}; 1069 typename _ATraits::_MaskMember __kk = _Ap::_MaskImpl::template 1070 _S_convert<_Up>(__k); 1071 __uncvted = _AImpl::_S_masked_load(__uncvted, __kk, __mem); 1072 _SimdConverter<_Up, _Ap, _Tp, _Abi> __converter; 1073 _Base::_S_masked_assign(__k, __merge, __converter(__uncvted)); 1074 } 1075 */ 1076 else 1077 __merge = _Base::_S_masked_load(__merge, __k, __mem); 1078 return __merge; 1079 } 1080 1081 // }}} 1082 // _S_masked_store_nocvt {{{ 1083 template
1084 _GLIBCXX_SIMD_INTRINSIC static void 1085 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _SimdWrapper
__k) 1086 { 1087 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1088 if constexpr (sizeof(__vi) == 64) 1089 { 1090 static_assert(sizeof(__v) == 64 && __have_avx512f); 1091 if constexpr (__have_avx512bw && sizeof(_Tp) == 1) 1092 _mm512_mask_storeu_epi8(__mem, __k, __vi); 1093 else if constexpr (__have_avx512bw && sizeof(_Tp) == 2) 1094 _mm512_mask_storeu_epi16(__mem, __k, __vi); 1095 else if constexpr (__have_avx512f && sizeof(_Tp) == 4) 1096 { 1097 if constexpr (is_integral_v<_Tp>) 1098 _mm512_mask_storeu_epi32(__mem, __k, __vi); 1099 else 1100 _mm512_mask_storeu_ps(__mem, __k, __vi); 1101 } 1102 else if constexpr (__have_avx512f && sizeof(_Tp) == 8) 1103 { 1104 if constexpr (is_integral_v<_Tp>) 1105 _mm512_mask_storeu_epi64(__mem, __k, __vi); 1106 else 1107 _mm512_mask_storeu_pd(__mem, __k, __vi); 1108 } 1109 else 1110 __assert_unreachable<_Tp>(); 1111 } 1112 else if constexpr (sizeof(__vi) == 32) 1113 { 1114 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1115 _mm256_mask_storeu_epi8(__mem, __k, __vi); 1116 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1117 _mm256_mask_storeu_epi16(__mem, __k, __vi); 1118 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) 1119 { 1120 if constexpr (is_integral_v<_Tp>) 1121 _mm256_mask_storeu_epi32(__mem, __k, __vi); 1122 else 1123 _mm256_mask_storeu_ps(__mem, __k, __vi); 1124 } 1125 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) 1126 { 1127 if constexpr (is_integral_v<_Tp>) 1128 _mm256_mask_storeu_epi64(__mem, __k, __vi); 1129 else 1130 _mm256_mask_storeu_pd(__mem, __k, __vi); 1131 } 1132 else if constexpr (__have_avx512f 1133 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 1134 { 1135 // use a 512-bit maskstore, using zero-extension of the bitmask 1136 _S_masked_store_nocvt( 1137 _SimdWrapper64<_Tp>( 1138 __intrin_bitcast<__vector_type64_t<_Tp>>(__v._M_data)), 1139 __mem, _SimdWrapper
(__k._M_data)); 1140 } 1141 else 1142 _S_masked_store_nocvt(__v, __mem, 1143 _MaskImpl::template _S_to_maskvector< 1144 __int_for_sizeof_t<_Tp>, _Np>(__k)); 1145 } 1146 else if constexpr (sizeof(__vi) == 16) 1147 { 1148 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1149 _mm_mask_storeu_epi8(__mem, __k, __vi); 1150 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1151 _mm_mask_storeu_epi16(__mem, __k, __vi); 1152 else if constexpr (__have_avx512vl && sizeof(_Tp) == 4) 1153 { 1154 if constexpr (is_integral_v<_Tp>) 1155 _mm_mask_storeu_epi32(__mem, __k, __vi); 1156 else 1157 _mm_mask_storeu_ps(__mem, __k, __vi); 1158 } 1159 else if constexpr (__have_avx512vl && sizeof(_Tp) == 8) 1160 { 1161 if constexpr (is_integral_v<_Tp>) 1162 _mm_mask_storeu_epi64(__mem, __k, __vi); 1163 else 1164 _mm_mask_storeu_pd(__mem, __k, __vi); 1165 } 1166 else if constexpr (__have_avx512f 1167 && (sizeof(_Tp) >= 4 || __have_avx512bw)) 1168 { 1169 // use a 512-bit maskstore, using zero-extension of the bitmask 1170 _S_masked_store_nocvt( 1171 _SimdWrapper64<_Tp>( 1172 __intrin_bitcast<__intrinsic_type64_t<_Tp>>(__v._M_data)), 1173 __mem, _SimdWrapper
(__k._M_data)); 1174 } 1175 else 1176 _S_masked_store_nocvt(__v, __mem, 1177 _MaskImpl::template _S_to_maskvector< 1178 __int_for_sizeof_t<_Tp>, _Np>(__k)); 1179 } 1180 else 1181 __assert_unreachable<_Tp>(); 1182 } 1183 1184 template
1185 _GLIBCXX_SIMD_INTRINSIC static void 1186 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, 1187 _SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k) 1188 { 1189 if constexpr (sizeof(__v) <= 16) 1190 { 1191 [[maybe_unused]] const auto __vi 1192 = __intrin_bitcast<__m128i>(__as_vector(__v)); 1193 [[maybe_unused]] const auto __ki 1194 = __intrin_bitcast<__m128i>(__as_vector(__k)); 1195 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1196 _mm_mask_storeu_epi8(__mem, _mm_movepi8_mask(__ki), __vi); 1197 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1198 _mm_mask_storeu_epi16(__mem, _mm_movepi16_mask(__ki), __vi); 1199 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 1200 && is_integral_v<_Tp>) 1201 _mm_maskstore_epi32(reinterpret_cast
(__mem), __ki, __vi); 1202 else if constexpr (__have_avx && sizeof(_Tp) == 4) 1203 _mm_maskstore_ps(reinterpret_cast
(__mem), __ki, 1204 __vector_bitcast
(__vi)); 1205 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1206 && is_integral_v<_Tp>) 1207 _mm_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, __vi); 1208 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1209 _mm_maskstore_pd(reinterpret_cast
(__mem), __ki, 1210 __vector_bitcast
(__vi)); 1211 else 1212 _Base::_S_masked_store_nocvt(__v, __mem, __k); 1213 } 1214 else if constexpr (sizeof(__v) == 32) 1215 { 1216 [[maybe_unused]] const auto __vi 1217 = __intrin_bitcast<__m256i>(__as_vector(__v)); 1218 [[maybe_unused]] const auto __ki 1219 = __intrin_bitcast<__m256i>(__as_vector(__k)); 1220 if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 1) 1221 _mm256_mask_storeu_epi8(__mem, _mm256_movepi8_mask(__ki), __vi); 1222 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 2) 1223 _mm256_mask_storeu_epi16(__mem, _mm256_movepi16_mask(__ki), __vi); 1224 else if constexpr (__have_avx2 && sizeof(_Tp) == 4 1225 && is_integral_v<_Tp>) 1226 _mm256_maskstore_epi32(reinterpret_cast
(__mem), __ki, __vi); 1227 else if constexpr (sizeof(_Tp) == 4) 1228 _mm256_maskstore_ps(reinterpret_cast
(__mem), __ki, 1229 __vector_bitcast
(__v)); 1230 else if constexpr (__have_avx2 && sizeof(_Tp) == 8 1231 && is_integral_v<_Tp>) 1232 _mm256_maskstore_epi64(reinterpret_cast<_LLong*>(__mem), __ki, 1233 __vi); 1234 else if constexpr (__have_avx && sizeof(_Tp) == 8) 1235 _mm256_maskstore_pd(reinterpret_cast
(__mem), __ki, 1236 __vector_bitcast
(__v)); 1237 else 1238 _Base::_S_masked_store_nocvt(__v, __mem, __k); 1239 } 1240 else 1241 __assert_unreachable<_Tp>(); 1242 } 1243 1244 // }}} 1245 // _S_masked_store {{{ 1246 template
1247 _GLIBCXX_SIMD_INTRINSIC static void 1248 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, _Up* __mem, 1249 const _MaskMember<_Tp> __k) noexcept 1250 { 1251 if constexpr (is_integral_v< 1252 _Tp> && is_integral_v<_Up> && sizeof(_Tp) > sizeof(_Up) 1253 && __have_avx512f && (sizeof(_Tp) >= 4 || __have_avx512bw) 1254 && (sizeof(__v) == 64 || __have_avx512vl)) 1255 { // truncating store 1256 const auto __vi = __to_intrin(__v); 1257 const auto __kk = _MaskImpl::_S_to_bits(__k)._M_to_bits(); 1258 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1259 && sizeof(__vi) == 64) 1260 _mm512_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1261 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1262 && sizeof(__vi) == 32) 1263 _mm256_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1264 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 1265 && sizeof(__vi) == 16) 1266 _mm_mask_cvtepi64_storeu_epi32(__mem, __kk, __vi); 1267 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1268 && sizeof(__vi) == 64) 1269 _mm512_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1270 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1271 && sizeof(__vi) == 32) 1272 _mm256_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1273 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 1274 && sizeof(__vi) == 16) 1275 _mm_mask_cvtepi64_storeu_epi16(__mem, __kk, __vi); 1276 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1277 && sizeof(__vi) == 64) 1278 _mm512_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1279 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1280 && sizeof(__vi) == 32) 1281 _mm256_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1282 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1 1283 && sizeof(__vi) == 16) 1284 _mm_mask_cvtepi64_storeu_epi8(__mem, __kk, __vi); 1285 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1286 && sizeof(__vi) == 64) 1287 _mm512_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1288 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1289 && sizeof(__vi) == 32) 1290 _mm256_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1291 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 1292 && sizeof(__vi) == 16) 1293 _mm_mask_cvtepi32_storeu_epi16(__mem, __kk, __vi); 1294 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1295 && sizeof(__vi) == 64) 1296 _mm512_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1297 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1298 && sizeof(__vi) == 32) 1299 _mm256_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1300 else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1 1301 && sizeof(__vi) == 16) 1302 _mm_mask_cvtepi32_storeu_epi8(__mem, __kk, __vi); 1303 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1304 && sizeof(__vi) == 64) 1305 _mm512_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1306 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1307 && sizeof(__vi) == 32) 1308 _mm256_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1309 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 1310 && sizeof(__vi) == 16) 1311 _mm_mask_cvtepi16_storeu_epi8(__mem, __kk, __vi); 1312 else 1313 __assert_unreachable<_Tp>(); 1314 } 1315 else 1316 _Base::_S_masked_store(__v, __mem, __k); 1317 } 1318 1319 // }}} 1320 // _S_multiplies {{{ 1321 template
> 1322 _GLIBCXX_SIMD_INTRINSIC static constexpr _V 1323 _S_multiplies(_V __x, _V __y) 1324 { 1325 using _Tp = typename _VVT::value_type; 1326 if (__builtin_is_constant_evaluated() || __x._M_is_constprop() 1327 || __y._M_is_constprop()) 1328 return __as_vector(__x) * __as_vector(__y); 1329 else if constexpr (sizeof(_Tp) == 1) 1330 { 1331 if constexpr (sizeof(_V) == 2) 1332 { 1333 const auto __xs = reinterpret_cast
(__x._M_data); 1334 const auto __ys = reinterpret_cast
(__y._M_data); 1335 return reinterpret_cast<__vector_type_t<_Tp, 2>>(short( 1336 ((__xs * __ys) & 0xff) | ((__xs >> 8) * (__ys & 0xff00)))); 1337 } 1338 else if constexpr (sizeof(_V) == 4 && _VVT::_S_partial_width == 3) 1339 { 1340 const auto __xi = reinterpret_cast
(__x._M_data); 1341 const auto __yi = reinterpret_cast
(__y._M_data); 1342 return reinterpret_cast<__vector_type_t<_Tp, 3>>( 1343 ((__xi * __yi) & 0xff) 1344 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) 1345 | ((__xi >> 16) * (__yi & 0xff0000))); 1346 } 1347 else if constexpr (sizeof(_V) == 4) 1348 { 1349 const auto __xi = reinterpret_cast
(__x._M_data); 1350 const auto __yi = reinterpret_cast
(__y._M_data); 1351 return reinterpret_cast<__vector_type_t<_Tp, 4>>( 1352 ((__xi * __yi) & 0xff) 1353 | (((__xi >> 8) * (__yi & 0xff00)) & 0xff00) 1354 | (((__xi >> 16) * (__yi & 0xff0000)) & 0xff0000) 1355 | ((__xi >> 24) * (__yi & 0xff000000u))); 1356 } 1357 else if constexpr (sizeof(_V) == 8 && __have_avx2 1358 && is_signed_v<_Tp>) 1359 return __convert
( 1360 __vector_bitcast
(_mm_cvtepi8_epi16(__to_intrin(__x))) 1361 * __vector_bitcast
(_mm_cvtepi8_epi16(__to_intrin(__y)))); 1362 else if constexpr (sizeof(_V) == 8 && __have_avx2 1363 && is_unsigned_v<_Tp>) 1364 return __convert
( 1365 __vector_bitcast
(_mm_cvtepu8_epi16(__to_intrin(__x))) 1366 * __vector_bitcast
(_mm_cvtepu8_epi16(__to_intrin(__y)))); 1367 else 1368 { 1369 // codegen of `x*y` is suboptimal (as of GCC 9.0.1) 1370 constexpr size_t __full_size = _VVT::_S_full_size; 1371 constexpr int _Np = sizeof(_V) >= 16 ? __full_size / 2 : 8; 1372 using _ShortW = _SimdWrapper
; 1373 const _ShortW __even = __vector_bitcast
(__x) 1374 * __vector_bitcast
(__y); 1375 _ShortW __high_byte = _ShortW()._M_data - 256; 1376 //[&]() { asm("" : "+x"(__high_byte._M_data)); }(); 1377 const _ShortW __odd 1378 = (__vector_bitcast
(__x) >> 8) 1379 * (__vector_bitcast
(__y) & __high_byte._M_data); 1380 if constexpr (__have_avx512bw && sizeof(_V) > 2) 1381 return _CommonImplX86::_S_blend_avx512( 1382 0xaaaa'aaaa'aaaa'aaaaLL, __vector_bitcast<_Tp>(__even), 1383 __vector_bitcast<_Tp>(__odd)); 1384 else if constexpr (__have_sse4_1 && sizeof(_V) > 2) 1385 return _CommonImplX86::_S_blend_intrin(__to_intrin( 1386 __high_byte), 1387 __to_intrin(__even), 1388 __to_intrin(__odd)); 1389 else 1390 return __to_intrin( 1391 __or(__andnot(__high_byte, __even), __odd)); 1392 } 1393 } 1394 else 1395 return _Base::_S_multiplies(__x, __y); 1396 } 1397 1398 // }}} 1399 // _S_divides {{{ 1400 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90993 1401 template
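
The _S_divides overload that starts here converts narrow integer vectors to float/double, divides, and converts back, because one vector floating-point division plus two conversions is cheaper than the scalarized integer division GCC would otherwise emit. Roughly, this is safe because 8-, 16- and 32-bit integers convert exactly to the chosen floating-point type, and the rounded quotient always truncates back to the same value as integer division. A scalar model of the idea (illustrative only, not the library's code path):

    #include <cassert>
    #include <cstdint>

    // Divide 32-bit integers via double: int32 -> double is exact, and
    // truncating the double quotient matches C++ integer division.
    static std::int32_t div_via_double(std::int32_t x, std::int32_t y)
    { return static_cast<std::int32_t>(static_cast<double>(x) / static_cast<double>(y)); }

    int main()
    {
      assert(div_via_double(7, 2) == 3);
      assert(div_via_double(-7, 2) == -3);                  // truncation toward zero
      assert(div_via_double(INT32_MAX, 3) == INT32_MAX / 3);
    }
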
1402 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1403 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1404 { 1405 if (!__builtin_is_constant_evaluated() 1406 && !__builtin_constant_p(__y._M_data)) 1407 if constexpr (is_integral_v<_Tp> && sizeof(_Tp) <= 4) 1408 { // use divps - codegen of `x/y` is suboptimal (as of GCC 9.0.1) 1409 // Note that using floating-point division is likely to raise the 1410 // *Inexact* exception flag and thus appears like an invalid 1411 // "as-if" transformation. However, C++ doesn't specify how the 1412 // fpenv can be observed and points to C. C says that function 1413 // calls are assumed to potentially raise fp exceptions, unless 1414 // documented otherwise. Consequently, operator/, which is a 1415 // function call, may raise fp exceptions. 1416 /*const struct _CsrGuard 1417 { 1418 const unsigned _M_data = _mm_getcsr(); 1419 _CsrGuard() 1420 { 1421 _mm_setcsr(0x9f80); // turn off FP exceptions and 1422 flush-to-zero 1423 } 1424 ~_CsrGuard() { _mm_setcsr(_M_data); } 1425 } __csr;*/ 1426 using _Float = conditional_t
; 1427 constexpr size_t __n_intermediate 1428 = std::min(_Np, (__have_avx512f ? 64 1429 : __have_avx ? 32 1430 : 16) 1431 / sizeof(_Float)); 1432 using _FloatV = __vector_type_t<_Float, __n_intermediate>; 1433 constexpr size_t __n_floatv 1434 = __div_roundup(_Np, __n_intermediate); 1435 using _R = __vector_type_t<_Tp, _Np>; 1436 const auto __xf = __convert_all<_FloatV, __n_floatv>(__x); 1437 const auto __yf = __convert_all<_FloatV, __n_floatv>( 1438 _Abi::__make_padding_nonzero(__as_vector(__y))); 1439 return __call_with_n_evaluations<__n_floatv>( 1440 [](auto... __quotients) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1441 return __vector_convert<_R>(__quotients...); 1442 }, 1443 [&__xf, &__yf](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 1444 -> _SimdWrapper<_Float, __n_intermediate> 1445 { 1446 #if __RECIPROCAL_MATH__ 1447 // If -freciprocal-math is active, using the `/` operator is 1448 // incorrect because it may be translated to an imprecise 1449 // multiplication with reciprocal. We need to use inline 1450 // assembly to force a real division. 1451 _FloatV __r; 1452 if constexpr (__have_avx) // -mno-sse2avx is irrelevant 1453 // because once -mavx is given, GCC 1454 // emits VEX encoded vdivp[sd] 1455 { 1456 if constexpr (sizeof(_Tp) == 4) 1457 asm("vdivpd\t{%2, %1, %0|%0, %1, %2}" 1458 : "=x"(__r) 1459 : "x"(__xf[__i]), "x"(__yf[__i])); 1460 else 1461 asm("vdivps\t{%2, %1, %0|%0, %1, %2}" 1462 : "=x"(__r) 1463 : "x"(__xf[__i]), "x"(__yf[__i])); 1464 } 1465 else 1466 { 1467 __r = __xf[__i]; 1468 if constexpr (sizeof(_Tp) == 4) 1469 asm("divpd\t{%1, %0|%0, %1}" 1470 : "=x"(__r) 1471 : "x"(__yf[__i])); 1472 else 1473 asm("divps\t{%1, %0|%0, %1}" 1474 : "=x"(__r) 1475 : "x"(__yf[__i])); 1476 } 1477 return __r; 1478 #else 1479 return __xf[__i] / __yf[__i]; 1480 #endif 1481 }); 1482 } 1483 /* 64-bit int division is potentially optimizable via double division if 1484 * the value in __x is small enough and the conversion between 1485 * int<->double is efficient enough: 1486 else if constexpr (is_integral_v<_Tp> && is_unsigned_v<_Tp> && 1487 sizeof(_Tp) == 8) 1488 { 1489 if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1490 { 1491 if (_mm_test_all_zeros(__x, __m128i{0xffe0'0000'0000'0000ull, 1492 0xffe0'0000'0000'0000ull})) 1493 { 1494 __x._M_data | 0x __vector_convert<__m128d>(__x._M_data) 1495 } 1496 } 1497 } 1498 */ 1499 return _Base::_S_divides(__x, __y); 1500 } 1501 #else 1502 using _Base::_S_divides; 1503 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90993 1504 1505 // }}} 1506 // _S_modulus {{{ 1507 template
1508 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1509 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1510 { 1511 if (__builtin_is_constant_evaluated() 1512 || __builtin_constant_p(__y._M_data) || sizeof(_Tp) >= 8) 1513 return _Base::_S_modulus(__x, __y); 1514 else 1515 return _Base::_S_minus(__x, _S_multiplies(__y, _S_divides(__x, __y))); 1516 } 1517 1518 // }}} 1519 // _S_bit_shift_left {{{ 1520 // Notes on UB. C++2a [expr.shift] says: 1521 // -1- [...] The operands shall be of integral or unscoped enumeration type 1522 // and integral promotions are performed. The type of the result is that 1523 // of the promoted left operand. The behavior is undefined if the right 1524 // operand is negative, or greater than or equal to the width of the 1525 // promoted left operand. 1526 // -2- The value of E1 << E2 is the unique value congruent to E1×2^E2 modulo 1527 // 2^N, where N is the width of the type of the result. 1528 // 1529 // C++17 [expr.shift] says: 1530 // -2- The value of E1 << E2 is E1 left-shifted E2 bit positions; vacated 1531 // bits are zero-filled. If E1 has an unsigned type, the value of the 1532 // result is E1 × 2^E2 , reduced modulo one more than the maximum value 1533 // representable in the result type. Otherwise, if E1 has a signed type 1534 // and non-negative value, and E1 × 2^E2 is representable in the 1535 // corresponding unsigned type of the result type, then that value, 1536 // converted to the result type, is the resulting value; otherwise, the 1537 // behavior is undefined. 1538 // 1539 // Consequences: 1540 // With C++2a signed and unsigned types have the same UB 1541 // characteristics: 1542 // - left shift is not UB for 0 <= RHS < max(32, #bits(T)) 1543 // 1544 // With C++17 there's little room for optimizations because the standard 1545 // requires all shifts to happen on promoted integrals (i.e. int). Thus, 1546 // short and char shifts must assume shifts affect bits of neighboring 1547 // values. 1548 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1549 template
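
The _S_bit_shift_left overloads that follow deal with the fact that x86 has no 8-bit vector shift: bytes are shifted inside wider lanes and the bits that would cross into the neighbouring byte are masked off, i.e. each byte becomes (b << y) & (0xff << y) — compare the (0xff & (0xff << __y)) * 0x01010101u mask below. A scalar model of that masking step (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Shift a whole 32-bit word (standing in for a wider lane) left by y,
    // then mask out the bits that leaked in from the byte to the right.
    static std::uint32_t shift_bytes_left(std::uint32_t word, int y)
    {
      const std::uint32_t mask = (0xffu & (0xffu << y)) * 0x01010101u;
      return (word << y) & mask;
    }

    int main()
    {
      // Bytes 0x0f, 0xff, 0x80, 0x01 each shifted left by 1, independently.
      assert(shift_bytes_left(0x0fff8001u, 1) == 0x1efe0002u);
    }
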
> 1550 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1551 _S_bit_shift_left(_Tp __xx, int __y) 1552 { 1553 using _V = typename _TVT::type; 1554 using _Up = typename _TVT::value_type; 1555 _V __x = __xx; 1556 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1557 if (__builtin_is_constant_evaluated()) 1558 return __x << __y; 1559 #if __cplusplus > 201703 1560 // after C++17, signed shifts have no UB, and behave just like unsigned 1561 // shifts 1562 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) 1563 return __vector_bitcast<_Up>( 1564 _S_bit_shift_left(__vector_bitcast
>(__x), 1565 __y)); 1566 #endif 1567 else if constexpr (sizeof(_Up) == 1) 1568 { 1569 // (cf. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83894) 1570 if (__builtin_constant_p(__y)) 1571 { 1572 if (__y == 0) 1573 return __x; 1574 else if (__y == 1) 1575 return __x + __x; 1576 else if (__y == 2) 1577 { 1578 __x = __x + __x; 1579 return __x + __x; 1580 } 1581 else if (__y > 2 && __y < 8) 1582 { 1583 if constexpr (sizeof(__x) > sizeof(unsigned)) 1584 { 1585 const _UChar __mask = 0xff << __y; // precomputed vector 1586 return __vector_bitcast<_Up>( 1587 __vector_bitcast<_UChar>( 1588 __vector_bitcast
(__x) << __y) 1589 & __mask); 1590 } 1591 else 1592 { 1593 const unsigned __mask 1594 = (0xff & (0xff << __y)) * 0x01010101u; 1595 return reinterpret_cast<_V>( 1596 static_cast<__int_for_sizeof_t<_V>>( 1597 unsigned( 1598 reinterpret_cast<__int_for_sizeof_t<_V>>(__x) 1599 << __y) 1600 & __mask)); 1601 } 1602 } 1603 else if (__y >= 8 && __y < 32) 1604 return _V(); 1605 else 1606 __builtin_unreachable(); 1607 } 1608 // general strategy in the following: use an sllv instead of sll 1609 // instruction, because it's 2 to 4 times faster: 1610 else if constexpr (__have_avx512bw_vl && sizeof(__x) == 16) 1611 return __vector_bitcast<_Up>(_mm256_cvtepi16_epi8( 1612 _mm256_sllv_epi16(_mm256_cvtepi8_epi16(__ix), 1613 _mm256_set1_epi16(__y)))); 1614 else if constexpr (__have_avx512bw && sizeof(__x) == 32) 1615 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1616 _mm512_sllv_epi16(_mm512_cvtepi8_epi16(__ix), 1617 _mm512_set1_epi16(__y)))); 1618 else if constexpr (__have_avx512bw && sizeof(__x) == 64) 1619 { 1620 const auto __shift = _mm512_set1_epi16(__y); 1621 return __vector_bitcast<_Up>( 1622 __concat(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1623 _mm512_cvtepi8_epi16(__lo256(__ix)), __shift)), 1624 _mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1625 _mm512_cvtepi8_epi16(__hi256(__ix)), __shift)))); 1626 } 1627 else if constexpr (__have_avx2 && sizeof(__x) == 32) 1628 { 1629 #if 1 1630 const auto __shift = _mm_cvtsi32_si128(__y); 1631 auto __k 1632 = _mm256_sll_epi16(_mm256_slli_epi16(~__m256i(), 8), __shift); 1633 __k |= _mm256_srli_epi16(__k, 8); 1634 return __vector_bitcast<_Up>(_mm256_sll_epi32(__ix, __shift) 1635 & __k); 1636 #else 1637 const _Up __k = 0xff << __y; 1638 return __vector_bitcast<_Up>(__vector_bitcast
(__x) << __y) 1639 & __k; 1640 #endif 1641 } 1642 else 1643 { 1644 const auto __shift = _mm_cvtsi32_si128(__y); 1645 auto __k 1646 = _mm_sll_epi16(_mm_slli_epi16(~__m128i(), 8), __shift); 1647 __k |= _mm_srli_epi16(__k, 8); 1648 return __intrin_bitcast<_V>(_mm_sll_epi16(__ix, __shift) & __k); 1649 } 1650 } 1651 return __x << __y; 1652 } 1653 1654 template
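x86 has no 8-bit shift instruction, so the overload above shifts wider lanes and masks away the bits that crossed a byte boundary (the 0xff << y mask). A standalone scalar sketch of the same idea in plain ISO C++ (hypothetical helper; two uint8 lanes packed into one uint16):

#include <cassert>
#include <cstdint>

// One 16-bit shift plus the 0xff << y byte mask reproduces an independent
// left shift of each packed byte lane.
std::uint16_t shl_per_byte(std::uint16_t lanes, int y) // y in [0, 7]
{
    const unsigned mask = (0xffu << y) & 0xffu;
    return static_cast<std::uint16_t>((lanes << y) & (mask * 0x0101u));
}

int main()
{
    assert(shl_per_byte(0x817f, 1) == 0x02fe); // 0x81 -> 0x02, 0x7f -> 0xfe
    assert(shl_per_byte(0x817f, 3) == 0x08f8); // 0x81 -> 0x08, 0x7f -> 0xf8
}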
<typename _Tp, typename _TVT = _VectorTraits<_Tp>> 1655 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1656 _S_bit_shift_left(_Tp __xx, typename _TVT::type __y) 1657 { 1658 using _V = typename _TVT::type; 1659 using _Up = typename _TVT::value_type; 1660 _V __x = __xx; 1661 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1662 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1663 if (__builtin_is_constant_evaluated()) 1664 return __x << __y; 1665 #if __cplusplus > 201703 1666 // after C++17, signed shifts have no UB, and behave just like unsigned 1667 // shifts 1668 else if constexpr (is_signed_v<_Up>) 1669 return __vector_bitcast<_Up>( 1670 _S_bit_shift_left(__vector_bitcast<make_unsigned_t<_Up>>(__x), 1671 __vector_bitcast<make_unsigned_t<_Up>
>(__y))); 1672 #endif 1673 else if constexpr (sizeof(_Up) == 1) 1674 { 1675 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1676 return __vector_bitcast<_Up>(__concat( 1677 _mm512_cvtepi16_epi8( 1678 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__lo256(__ix)), 1679 _mm512_cvtepu8_epi16(__lo256(__iy)))), 1680 _mm512_cvtepi16_epi8( 1681 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__hi256(__ix)), 1682 _mm512_cvtepu8_epi16(__hi256(__iy)))))); 1683 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1684 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1685 _mm512_sllv_epi16(_mm512_cvtepu8_epi16(__ix), 1686 _mm512_cvtepu8_epi16(__iy)))); 1687 else if constexpr (sizeof __x <= 8 && __have_avx512bw_vl) 1688 return __intrin_bitcast<_V>( 1689 _mm_cvtepi16_epi8(_mm_sllv_epi16(_mm_cvtepu8_epi16(__ix), 1690 _mm_cvtepu8_epi16(__iy)))); 1691 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1692 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1693 _mm256_sllv_epi16(_mm256_cvtepu8_epi16(__ix), 1694 _mm256_cvtepu8_epi16(__iy)))); 1695 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1696 return __intrin_bitcast<_V>( 1697 __lo128(_mm512_cvtepi16_epi8(_mm512_sllv_epi16( 1698 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__ix)), 1699 _mm512_cvtepu8_epi16(_mm256_castsi128_si256(__iy)))))); 1700 else if constexpr (__have_sse4_1 && sizeof(__x) == 16) 1701 { 1702 auto __mask 1703 = __vector_bitcast<_Up>(__vector_bitcast
<short>(__y) << 5); 1704 auto __x4 1705 = __vector_bitcast<_Up>(__vector_bitcast<short>
(__x) << 4); 1706 __x4 &= char(0xf0); 1707 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1708 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x4))); 1709 __mask += __mask; 1710 auto __x2 1711 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 2); 1712 __x2 &= char(0xfc); 1713 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1714 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x2))); 1715 __mask += __mask; 1716 auto __x1 = __x + __x; 1717 __x = reinterpret_cast<_V>(_CommonImplX86::_S_blend_intrin( 1718 __to_intrin(__mask), __to_intrin(__x), __to_intrin(__x1))); 1719 return __x 1720 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1721 } 1722 else if constexpr (sizeof(__x) == 16) 1723 { 1724 auto __mask 1725 = __vector_bitcast<_UChar>(__vector_bitcast
<short>(__y) << 5); 1726 auto __x4 1727 = __vector_bitcast<_Up>(__vector_bitcast<short>
(__x) << 4); 1728 __x4 &= char(0xf0); 1729 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x4 : __x; 1730 __mask += __mask; 1731 auto __x2 1732 = __vector_bitcast<_Up>(__vector_bitcast
(__x) << 2); 1733 __x2 &= char(0xfc); 1734 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x2 : __x; 1735 __mask += __mask; 1736 auto __x1 = __x + __x; 1737 __x = __vector_bitcast<_SChar>(__mask) < 0 ? __x1 : __x; 1738 return __x 1739 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 1740 } 1741 else 1742 return __x << __y; 1743 } 1744 else if constexpr (sizeof(_Up) == 2) 1745 { 1746 if constexpr (sizeof __ix == 64 && __have_avx512bw) 1747 return __vector_bitcast<_Up>(_mm512_sllv_epi16(__ix, __iy)); 1748 else if constexpr (sizeof __ix == 32 && __have_avx512bw_vl) 1749 return __vector_bitcast<_Up>(_mm256_sllv_epi16(__ix, __iy)); 1750 else if constexpr (sizeof __ix == 32 && __have_avx512bw) 1751 return __vector_bitcast<_Up>( 1752 __lo256(_mm512_sllv_epi16(_mm512_castsi256_si512(__ix), 1753 _mm512_castsi256_si512(__iy)))); 1754 else if constexpr (sizeof __ix == 32 && __have_avx2) 1755 { 1756 const auto __ux = __vector_bitcast
<unsigned>(__x); 1757 const auto __uy = __vector_bitcast<unsigned>
(__y); 1758 return __vector_bitcast<_Up>(_mm256_blend_epi16( 1759 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1760 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1761 } 1762 else if constexpr (sizeof __ix == 16 && __have_avx512bw_vl) 1763 return __intrin_bitcast<_V>(_mm_sllv_epi16(__ix, __iy)); 1764 else if constexpr (sizeof __ix == 16 && __have_avx512bw) 1765 return __intrin_bitcast<_V>( 1766 __lo128(_mm512_sllv_epi16(_mm512_castsi128_si512(__ix), 1767 _mm512_castsi128_si512(__iy)))); 1768 else if constexpr (sizeof __ix == 16 && __have_avx2) 1769 { 1770 const auto __ux = __vector_bitcast
<unsigned>(__ix); 1771 const auto __uy = __vector_bitcast<unsigned>
(__iy); 1772 return __intrin_bitcast<_V>(_mm_blend_epi16( 1773 __auto_bitcast(__ux << (__uy & 0x0000ffffu)), 1774 __auto_bitcast((__ux & 0xffff0000u) << (__uy >> 16)), 0xaa)); 1775 } 1776 else if constexpr (sizeof __ix == 16) 1777 { 1778 using _Float4 = __vector_type_t
<float, 4>; 1779 using _Int4 = __vector_type_t<int, 4>; 1780 using _UInt4 = __vector_type_t<unsigned, 4>
; 1781 const _UInt4 __yu 1782 = reinterpret_cast<_UInt4>(__to_intrin(__y + (0x3f8 >> 3))); 1783 return __x 1784 * __intrin_bitcast<_V>( 1785 __vector_convert<_Int4>(_SimdWrapper
<float, 4>( 1786 reinterpret_cast<_Float4>(__yu << 23))) 1787 | (__vector_convert<_Int4>(_SimdWrapper<float, 4>
( 1788 reinterpret_cast<_Float4>((__yu >> 16) << 23))) 1789 << 16)); 1790 } 1791 else 1792 __assert_unreachable<_Tp>(); 1793 } 1794 else if constexpr (sizeof(_Up) == 4 && sizeof __ix == 16 1795 && !__have_avx2) 1796 // latency is suboptimal, but throughput is at full speedup 1797 return __intrin_bitcast<_V>( 1798 __vector_bitcast
<unsigned>(__ix) 1799 * __vector_convert<__vector_type16_t<int>>( 1800 _SimdWrapper<float, 4>(__vector_bitcast<float>( 1801 (__vector_bitcast<unsigned, 4>
(__y) << 23) + 0x3f80'0000)))); 1802 else if constexpr (sizeof(_Up) == 8 && sizeof __ix == 16 1803 && !__have_avx2) 1804 { 1805 const auto __lo = _mm_sll_epi64(__ix, __iy); 1806 const auto __hi 1807 = _mm_sll_epi64(__ix, _mm_unpackhi_epi64(__iy, __iy)); 1808 if constexpr (__have_sse4_1) 1809 return __vector_bitcast<_Up>(_mm_blend_epi16(__lo, __hi, 0xf0)); 1810 else 1811 return __vector_bitcast<_Up>( 1812 _mm_move_sd(__vector_bitcast
<double>(__hi), 1813 __vector_bitcast<double>
(__lo))); 1814 } 1815 else 1816 return __x << __y; 1817 } 1818 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 1819 1820 // }}} 1821 // _S_bit_shift_right {{{ 1822 #ifndef _GLIBCXX_SIMD_NO_SHIFT_OPT 1823 template
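The 16-bit and 32-bit paths above turn a variable left shift into a multiplication by 2^y, where 2^y is manufactured by writing y + 127 into a float's exponent field: (y << 23) + 0x3f80'0000 is the bit pattern of the float 2^y. A scalar sketch of that exponent trick, assuming only standard C++ (names are illustrative):

#include <cassert>
#include <cstdint>
#include <cstring>

// Build the float 2^y from its bit pattern, then read it back as the
// integer factor 1 << y.
std::uint32_t pow2_via_exponent(int y) // y in [0, 30]
{
    const std::uint32_t bits
        = (static_cast<std::uint32_t>(y) << 23) + 0x3f80'0000u;
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return static_cast<std::uint32_t>(f);
}

int main()
{
    for (int y = 0; y <= 30; ++y)
        assert(pow2_via_exponent(y) == (1u << y));
}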
> 1824 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1825 _S_bit_shift_right(_Tp __xx, int __y) 1826 { 1827 using _V = typename _TVT::type; 1828 using _Up = typename _TVT::value_type; 1829 _V __x = __xx; 1830 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1831 if (__builtin_is_constant_evaluated()) 1832 return __x >> __y; 1833 else if (__builtin_constant_p(__y) 1834 && is_unsigned_v< 1835 _Up> && __y >= int(sizeof(_Up) * __CHAR_BIT__)) 1836 return _V(); 1837 else if constexpr (sizeof(_Up) == 1 && is_unsigned_v<_Up>) //{{{ 1838 return __intrin_bitcast<_V>(__vector_bitcast<_UShort>(__ix) >> __y) 1839 & _Up(0xff >> __y); 1840 //}}} 1841 else if constexpr (sizeof(_Up) == 1 && is_signed_v<_Up>) //{{{ 1842 return __intrin_bitcast<_V>( 1843 (__vector_bitcast<_UShort>(__vector_bitcast
<short>(__ix) 1844 >> (__y + 8)) 1845 << 8) 1846 | (__vector_bitcast<_UShort>( 1847 __vector_bitcast<short>
(__vector_bitcast<_UShort>(__ix) << 8) 1848 >> __y) 1849 >> 8)); 1850 //}}} 1851 // GCC optimizes sizeof == 2, 4, and unsigned 8 as expected 1852 else if constexpr (sizeof(_Up) == 8 && is_signed_v<_Up>) //{{{ 1853 { 1854 if (__y > 32) 1855 return (__intrin_bitcast<_V>(__vector_bitcast
(__ix) >> 32) 1856 & _Up(0xffff'ffff'0000'0000ull)) 1857 | __vector_bitcast<_Up>( 1858 __vector_bitcast
(__vector_bitcast<_ULLong>(__ix) 1859 >> 32) 1860 >> (__y - 32)); 1861 else 1862 return __intrin_bitcast<_V>(__vector_bitcast<_ULLong>(__ix) 1863 >> __y) 1864 | __vector_bitcast<_Up>( 1865 __vector_bitcast
(__ix & -0x8000'0000'0000'0000ll) 1866 >> __y); 1867 } 1868 //}}} 1869 else 1870 return __x >> __y; 1871 } 1872 1873 template
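For signed 8-bit elements the overload above synthesizes an arithmetic shift from 16-bit shifts: the high byte of each 16-bit unit is shifted down by y + 8 so its sign extends correctly, and the low byte is first moved into the high half so that its own sign bit becomes the 16-bit sign bit. A scalar sketch of the same splitting, assuming two's-complement conversions and arithmetic >> on negative values (both guaranteed since C++20; GCC provides them earlier as well):

#include <cassert>
#include <cstdint>

// Arithmetic right shift of the two int8 lanes packed in a uint16,
// using only 16-bit shifts.
std::uint16_t sar_per_int8(std::uint16_t lanes, int y) // y in [0, 7]
{
    const std::int16_t whole = static_cast<std::int16_t>(lanes);
    const std::uint16_t hi = static_cast<std::uint16_t>(
        static_cast<std::uint16_t>(whole >> (y + 8)) << 8);
    const std::int16_t low_up = static_cast<std::int16_t>(
        static_cast<std::uint16_t>(lanes << 8));
    const std::uint16_t lo = static_cast<std::uint16_t>(
        static_cast<std::uint16_t>(low_up >> y) >> 8);
    return static_cast<std::uint16_t>(hi | lo);
}

int main()
{
    assert(sar_per_int8(0x40fe, 1) == 0x20ff); // 0x40 >> 1 == 0x20, -2 >> 1 == -1
    assert(sar_per_int8(0x40fe, 7) == 0x00ff); // 0x40 >> 7 == 0,    -2 >> 7 == -1
}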
> 1874 constexpr inline _GLIBCXX_CONST static typename _TVT::type 1875 _S_bit_shift_right(_Tp __xx, typename _TVT::type __y) 1876 { 1877 using _V = typename _TVT::type; 1878 using _Up = typename _TVT::value_type; 1879 _V __x = __xx; 1880 [[maybe_unused]] const auto __ix = __to_intrin(__x); 1881 [[maybe_unused]] const auto __iy = __to_intrin(__y); 1882 if (__builtin_is_constant_evaluated() 1883 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 1884 return __x >> __y; 1885 else if constexpr (sizeof(_Up) == 1) //{{{ 1886 { 1887 if constexpr (sizeof(__x) <= 8 && __have_avx512bw_vl) 1888 return __intrin_bitcast<_V>(_mm_cvtepi16_epi8( 1889 is_signed_v<_Up> ? _mm_srav_epi16(_mm_cvtepi8_epi16(__ix), 1890 _mm_cvtepi8_epi16(__iy)) 1891 : _mm_srlv_epi16(_mm_cvtepu8_epi16(__ix), 1892 _mm_cvtepu8_epi16(__iy)))); 1893 if constexpr (sizeof(__x) == 16 && __have_avx512bw_vl) 1894 return __intrin_bitcast<_V>(_mm256_cvtepi16_epi8( 1895 is_signed_v<_Up> 1896 ? _mm256_srav_epi16(_mm256_cvtepi8_epi16(__ix), 1897 _mm256_cvtepi8_epi16(__iy)) 1898 : _mm256_srlv_epi16(_mm256_cvtepu8_epi16(__ix), 1899 _mm256_cvtepu8_epi16(__iy)))); 1900 else if constexpr (sizeof(__x) == 32 && __have_avx512bw) 1901 return __vector_bitcast<_Up>(_mm512_cvtepi16_epi8( 1902 is_signed_v<_Up> 1903 ? _mm512_srav_epi16(_mm512_cvtepi8_epi16(__ix), 1904 _mm512_cvtepi8_epi16(__iy)) 1905 : _mm512_srlv_epi16(_mm512_cvtepu8_epi16(__ix), 1906 _mm512_cvtepu8_epi16(__iy)))); 1907 else if constexpr (sizeof(__x) == 64 && is_signed_v<_Up>) 1908 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1909 _mm512_srav_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1910 0x5555'5555'5555'5555ull, 1911 _mm512_srav_epi16( 1912 _mm512_slli_epi16(__ix, 8), 1913 _mm512_maskz_add_epi8(0x5555'5555'5555'5555ull, __iy, 1914 _mm512_set1_epi16(8))))); 1915 else if constexpr (sizeof(__x) == 64 && is_unsigned_v<_Up>) 1916 return __vector_bitcast<_Up>(_mm512_mask_mov_epi8( 1917 _mm512_srlv_epi16(__ix, _mm512_srli_epi16(__iy, 8)), 1918 0x5555'5555'5555'5555ull, 1919 _mm512_srlv_epi16( 1920 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __ix), 1921 _mm512_maskz_mov_epi8(0x5555'5555'5555'5555ull, __iy)))); 1922 /* This has better throughput but higher latency than the impl below 1923 else if constexpr (__have_avx2 && sizeof(__x) == 16 && 1924 is_unsigned_v<_Up>) 1925 { 1926 const auto __shorts = __to_intrin(_S_bit_shift_right( 1927 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__ix)), 1928 __vector_bitcast<_UShort>(_mm256_cvtepu8_epi16(__iy)))); 1929 return __vector_bitcast<_Up>( 1930 _mm_packus_epi16(__lo128(__shorts), __hi128(__shorts))); 1931 } 1932 */ 1933 else if constexpr (__have_avx2 && sizeof(__x) > 8) 1934 // the following uses vpsr[al]vd, which requires AVX2 1935 if constexpr (is_signed_v<_Up>) 1936 { 1937 const auto r3 = __vector_bitcast<_UInt>( 1938 (__vector_bitcast
<int>(__x) 1939 >> (__vector_bitcast<_UInt>(__y) >> 24))) 1940 & 0xff000000u; 1941 const auto r2 1942 = __vector_bitcast<_UInt>( 1943 ((__vector_bitcast<int>(__x) << 8) 1944 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24))) 1945 & 0xff000000u; 1946 const auto r1 1947 = __vector_bitcast<_UInt>( 1948 ((__vector_bitcast<int>(__x) << 16) 1949 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24))) 1950 & 0xff000000u; 1951 const auto r0 = __vector_bitcast<_UInt>( 1952 (__vector_bitcast<int>
(__x) << 24) 1953 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24)); 1954 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1955 | (r0 >> 24)); 1956 } 1957 else 1958 { 1959 const auto r3 = (__vector_bitcast<_UInt>(__x) 1960 >> (__vector_bitcast<_UInt>(__y) >> 24)) 1961 & 0xff000000u; 1962 const auto r2 1963 = ((__vector_bitcast<_UInt>(__x) << 8) 1964 >> ((__vector_bitcast<_UInt>(__y) << 8) >> 24)) 1965 & 0xff000000u; 1966 const auto r1 1967 = ((__vector_bitcast<_UInt>(__x) << 16) 1968 >> ((__vector_bitcast<_UInt>(__y) << 16) >> 24)) 1969 & 0xff000000u; 1970 const auto r0 1971 = (__vector_bitcast<_UInt>(__x) << 24) 1972 >> ((__vector_bitcast<_UInt>(__y) << 24) >> 24); 1973 return __vector_bitcast<_Up>(r3 | (r2 >> 8) | (r1 >> 16) 1974 | (r0 >> 24)); 1975 } 1976 else if constexpr (__have_sse4_1 1977 && is_unsigned_v<_Up> && sizeof(__x) > 2) 1978 { 1979 auto __x128 = __vector_bitcast<_Up>(__ix); 1980 auto __mask 1981 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__iy) << 5); 1982 auto __x4 = __vector_bitcast<_Up>( 1983 (__vector_bitcast<_UShort>(__x128) >> 4) & _UShort(0xff0f)); 1984 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1985 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x4))); 1986 __mask += __mask; 1987 auto __x2 = __vector_bitcast<_Up>( 1988 (__vector_bitcast<_UShort>(__x128) >> 2) & _UShort(0xff3f)); 1989 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1990 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x2))); 1991 __mask += __mask; 1992 auto __x1 = __vector_bitcast<_Up>( 1993 (__vector_bitcast<_UShort>(__x128) >> 1) & _UShort(0xff7f)); 1994 __x128 = __vector_bitcast<_Up>(_CommonImplX86::_S_blend_intrin( 1995 __to_intrin(__mask), __to_intrin(__x128), __to_intrin(__x1))); 1996 return __intrin_bitcast<_V>( 1997 __x128 1998 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 1999 == 0)); // y > 7 nulls the result 2000 } 2001 else if constexpr (__have_sse4_1 2002 && is_signed_v<_Up> && sizeof(__x) > 2) 2003 { 2004 auto __mask = __vector_bitcast<_UChar>( 2005 __vector_bitcast<_UShort>(__iy) << 5); 2006 auto __maskl = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2007 return __to_intrin(__vector_bitcast<_UShort>(__mask) << 8); 2008 }; 2009 auto __xh = __vector_bitcast
<short>(__ix); 2010 auto __xl = __vector_bitcast<short>(__ix) << 8; 2011 auto __xh4 = __xh >> 4; 2012 auto __xl4 = __xl >> 4; 2013 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2014 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh4))); 2015 __xl = __vector_bitcast<short>( 2016 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2017 __to_intrin(__xl4))); 2018 __mask += __mask; 2019 auto __xh2 = __xh >> 2; 2020 auto __xl2 = __xl >> 2; 2021 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2022 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh2))); 2023 __xl = __vector_bitcast<short>( 2024 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2025 __to_intrin(__xl2))); 2026 __mask += __mask; 2027 auto __xh1 = __xh >> 1; 2028 auto __xl1 = __xl >> 1; 2029 __xh = __vector_bitcast<short>(_CommonImplX86::_S_blend_intrin( 2030 __to_intrin(__mask), __to_intrin(__xh), __to_intrin(__xh1))); 2031 __xl = __vector_bitcast<short>
( 2032 _CommonImplX86::_S_blend_intrin(__maskl(), __to_intrin(__xl), 2033 __to_intrin(__xl1))); 2034 return __intrin_bitcast<_V>( 2035 (__vector_bitcast<_Up>((__xh & short(0xff00))) 2036 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2037 >> 8)) 2038 & ((__vector_bitcast<_Up>(__iy) & char(0xf8)) 2039 == 0)); // y > 7 nulls the result 2040 } 2041 else if constexpr (is_unsigned_v<_Up> && sizeof(__x) > 2) // SSE2 2042 { 2043 auto __mask 2044 = __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__y) << 5); 2045 auto __x4 = __vector_bitcast<_Up>( 2046 (__vector_bitcast<_UShort>(__x) >> 4) & _UShort(0xff0f)); 2047 __x = __mask > 0x7f ? __x4 : __x; 2048 __mask += __mask; 2049 auto __x2 = __vector_bitcast<_Up>( 2050 (__vector_bitcast<_UShort>(__x) >> 2) & _UShort(0xff3f)); 2051 __x = __mask > 0x7f ? __x2 : __x; 2052 __mask += __mask; 2053 auto __x1 = __vector_bitcast<_Up>( 2054 (__vector_bitcast<_UShort>(__x) >> 1) & _UShort(0xff7f)); 2055 __x = __mask > 0x7f ? __x1 : __x; 2056 return __x 2057 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2058 } 2059 else if constexpr (sizeof(__x) > 2) // signed SSE2 2060 { 2061 static_assert(is_signed_v<_Up>); 2062 auto __maskh = __vector_bitcast<_UShort>(__y) << 5; 2063 auto __maskl = __vector_bitcast<_UShort>(__y) << (5 + 8); 2064 auto __xh = __vector_bitcast
<short>(__x); 2065 auto __xl = __vector_bitcast<short>
(__x) << 8; 2066 auto __xh4 = __xh >> 4; 2067 auto __xl4 = __xl >> 4; 2068 __xh = __maskh > 0x7fff ? __xh4 : __xh; 2069 __xl = __maskl > 0x7fff ? __xl4 : __xl; 2070 __maskh += __maskh; 2071 __maskl += __maskl; 2072 auto __xh2 = __xh >> 2; 2073 auto __xl2 = __xl >> 2; 2074 __xh = __maskh > 0x7fff ? __xh2 : __xh; 2075 __xl = __maskl > 0x7fff ? __xl2 : __xl; 2076 __maskh += __maskh; 2077 __maskl += __maskl; 2078 auto __xh1 = __xh >> 1; 2079 auto __xl1 = __xl >> 1; 2080 __xh = __maskh > 0x7fff ? __xh1 : __xh; 2081 __xl = __maskl > 0x7fff ? __xl1 : __xl; 2082 __x = __vector_bitcast<_Up>((__xh & short(0xff00))) 2083 | __vector_bitcast<_Up>(__vector_bitcast<_UShort>(__xl) 2084 >> 8); 2085 return __x 2086 & ((__y & char(0xf8)) == 0); // y > 7 nulls the result 2087 } 2088 else 2089 return __x >> __y; 2090 } //}}} 2091 else if constexpr (sizeof(_Up) == 2 && sizeof(__x) >= 4) //{{{ 2092 { 2093 [[maybe_unused]] auto __blend_0xaa 2094 = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2095 if constexpr (sizeof(__a) == 16) 2096 return _mm_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2097 0xaa); 2098 else if constexpr (sizeof(__a) == 32) 2099 return _mm256_blend_epi16(__to_intrin(__a), __to_intrin(__b), 2100 0xaa); 2101 else if constexpr (sizeof(__a) == 64) 2102 return _mm512_mask_blend_epi16(0xaaaa'aaaaU, __to_intrin(__a), 2103 __to_intrin(__b)); 2104 else 2105 __assert_unreachable
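The byte paths above (both the SSE4.1 blends and the SSE2 selects) apply a per-element shift count without any per-byte shift instruction: bits 2, 1 and 0 of the count are moved into the sign-bit position one after another and select a fixed shift by 4, 2 and 1. The scalar equivalent of that ladder (illustrative helper; counts 0 to 7):

#include <cassert>
#include <cstdint>

// Variable per-element shift from three conditional fixed shifts.  In the
// vector code each "if" is a blend keyed off the mask sign bit (__y << 5,
// doubled after every step).
std::uint8_t srl_var(std::uint8_t x, std::uint8_t y) // y in [0, 7]
{
    if (y & 4) x = static_cast<std::uint8_t>(x >> 4);
    if (y & 2) x = static_cast<std::uint8_t>(x >> 2);
    if (y & 1) x = static_cast<std::uint8_t>(x >> 1);
    return x; // counts >= 8 additionally need the (__y & 0xf8) == 0 zeroing
}

int main()
{
    for (unsigned y = 0; y < 8; ++y)
        assert(srl_var(0xd5, static_cast<std::uint8_t>(y)) == (0xd5u >> y));
}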
(); 2106 }; 2107 if constexpr (__have_avx512bw_vl && sizeof(_Tp) <= 16) 2108 return __intrin_bitcast<_V>(is_signed_v<_Up> 2109 ? _mm_srav_epi16(__ix, __iy) 2110 : _mm_srlv_epi16(__ix, __iy)); 2111 else if constexpr (__have_avx512bw_vl && sizeof(_Tp) == 32) 2112 return __vector_bitcast<_Up>(is_signed_v<_Up> 2113 ? _mm256_srav_epi16(__ix, __iy) 2114 : _mm256_srlv_epi16(__ix, __iy)); 2115 else if constexpr (__have_avx512bw && sizeof(_Tp) == 64) 2116 return __vector_bitcast<_Up>(is_signed_v<_Up> 2117 ? _mm512_srav_epi16(__ix, __iy) 2118 : _mm512_srlv_epi16(__ix, __iy)); 2119 else if constexpr (__have_avx2 && is_signed_v<_Up>) 2120 return __intrin_bitcast<_V>( 2121 __blend_0xaa(((__vector_bitcast
<int>(__ix) << 16) 2122 >> (__vector_bitcast<int>(__iy) & 0xffffu)) 2123 >> 16, 2124 __vector_bitcast<int>(__ix) 2125 >> (__vector_bitcast<_UInt>
(__iy) >> 16))); 2126 else if constexpr (__have_avx2 && is_unsigned_v<_Up>) 2127 return __intrin_bitcast<_V>( 2128 __blend_0xaa((__vector_bitcast<_UInt>(__ix) & 0xffffu) 2129 >> (__vector_bitcast<_UInt>(__iy) & 0xffffu), 2130 __vector_bitcast<_UInt>(__ix) 2131 >> (__vector_bitcast<_UInt>(__iy) >> 16))); 2132 else if constexpr (__have_sse4_1) 2133 { 2134 auto __mask = __vector_bitcast<_UShort>(__iy); 2135 auto __x128 = __vector_bitcast<_Up>(__ix); 2136 //__mask *= 0x0808; 2137 __mask = (__mask << 3) | (__mask << 11); 2138 // do __x128 = 0 where __y[4] is set 2139 __x128 = __vector_bitcast<_Up>( 2140 _mm_blendv_epi8(__to_intrin(__x128), __m128i(), 2141 __to_intrin(__mask))); 2142 // do __x128 =>> 8 where __y[3] is set 2143 __x128 = __vector_bitcast<_Up>( 2144 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 8), 2145 __to_intrin(__mask += __mask))); 2146 // do __x128 =>> 4 where __y[2] is set 2147 __x128 = __vector_bitcast<_Up>( 2148 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 4), 2149 __to_intrin(__mask += __mask))); 2150 // do __x128 =>> 2 where __y[1] is set 2151 __x128 = __vector_bitcast<_Up>( 2152 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 2), 2153 __to_intrin(__mask += __mask))); 2154 // do __x128 =>> 1 where __y[0] is set 2155 return __intrin_bitcast<_V>( 2156 _mm_blendv_epi8(__to_intrin(__x128), __to_intrin(__x128 >> 1), 2157 __to_intrin(__mask + __mask))); 2158 } 2159 else 2160 { 2161 auto __k = __vector_bitcast<_UShort>(__iy) << 11; 2162 auto __x128 = __vector_bitcast<_Up>(__ix); 2163 auto __mask 2164 = [](__vector_type16_t<_UShort> __kk) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2165 return __vector_bitcast
(__kk) < 0; 2166 }; 2167 // do __x128 = 0 where __y[4] is set 2168 __x128 = __mask(__k) ? decltype(__x128)() : __x128; 2169 // do __x128 =>> 8 where __y[3] is set 2170 __x128 = __mask(__k += __k) ? __x128 >> 8 : __x128; 2171 // do __x128 =>> 4 where __y[2] is set 2172 __x128 = __mask(__k += __k) ? __x128 >> 4 : __x128; 2173 // do __x128 =>> 2 where __y[1] is set 2174 __x128 = __mask(__k += __k) ? __x128 >> 2 : __x128; 2175 // do __x128 =>> 1 where __y[0] is set 2176 return __intrin_bitcast<_V>(__mask(__k + __k) ? __x128 >> 1 2177 : __x128); 2178 } 2179 } //}}} 2180 else if constexpr (sizeof(_Up) == 4 && !__have_avx2) //{{{ 2181 { 2182 if constexpr (is_unsigned_v<_Up>) 2183 { 2184 // x >> y == x * 2^-y == (x * 2^(31-y)) >> 31 2185 const __m128 __factor_f = reinterpret_cast<__m128>( 2186 0x4f00'0000u - (__vector_bitcast
<_UInt>(__y) << 23)); 2187 const __m128i __factor 2188 = __builtin_constant_p(__factor_f) 2189 ? __to_intrin( 2190 __make_vector<unsigned>
(__factor_f[0], __factor_f[1], 2191 __factor_f[2], __factor_f[3])) 2192 : _mm_cvttps_epi32(__factor_f); 2193 const auto __r02 2194 = _mm_srli_epi64(_mm_mul_epu32(__ix, __factor), 31); 2195 const auto __r13 = _mm_mul_epu32(_mm_srli_si128(__ix, 4), 2196 _mm_srli_si128(__factor, 4)); 2197 if constexpr (__have_sse4_1) 2198 return __intrin_bitcast<_V>( 2199 _mm_blend_epi16(_mm_slli_epi64(__r13, 1), __r02, 0x33)); 2200 else 2201 return __intrin_bitcast<_V>( 2202 __r02 | _mm_slli_si128(_mm_srli_epi64(__r13, 31), 4)); 2203 } 2204 else 2205 { 2206 auto __shift = [](auto __a, auto __b) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2207 if constexpr (is_signed_v<_Up>) 2208 return _mm_sra_epi32(__a, __b); 2209 else 2210 return _mm_srl_epi32(__a, __b); 2211 }; 2212 const auto __r0 2213 = __shift(__ix, _mm_unpacklo_epi32(__iy, __m128i())); 2214 const auto __r1 = __shift(__ix, _mm_srli_epi64(__iy, 32)); 2215 const auto __r2 2216 = __shift(__ix, _mm_unpackhi_epi32(__iy, __m128i())); 2217 const auto __r3 = __shift(__ix, _mm_srli_si128(__iy, 12)); 2218 if constexpr (__have_sse4_1) 2219 return __intrin_bitcast<_V>( 2220 _mm_blend_epi16(_mm_blend_epi16(__r1, __r0, 0x3), 2221 _mm_blend_epi16(__r3, __r2, 0x30), 0xf0)); 2222 else 2223 return __intrin_bitcast<_V>(_mm_unpacklo_epi64( 2224 _mm_unpacklo_epi32(__r0, _mm_srli_si128(__r1, 4)), 2225 _mm_unpackhi_epi32(__r2, _mm_srli_si128(__r3, 4)))); 2226 } 2227 } //}}} 2228 else 2229 return __x >> __y; 2230 } 2231 #endif // _GLIBCXX_SIMD_NO_SHIFT_OPT 2232 2233 // }}} 2234 // compares {{{ 2235 // _S_equal_to {{{ 2236 template
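The !__have_avx2 path above implements the comment's identity x >> y == (x * 2^(31 - y)) >> 31: the per-lane factor 2^(31 - y) is built from float exponent bits (0x4f00'0000 is the bit pattern of the float 2^31) and the widening multiply comes from _mm_mul_epu32. A scalar check of the identity itself, in plain C++ with illustrative names:

#include <cassert>
#include <cstdint>

// x >> y == (x * 2^(31 - y)) >> 31 for 0 <= y <= 31, evaluated in the
// 64-bit product that _mm_mul_epu32 provides.
std::uint32_t srl_via_mul(std::uint32_t x, unsigned y) // y in [0, 31]
{
    const std::uint64_t factor = std::uint64_t{1} << (31 - y);
    return static_cast<std::uint32_t>((std::uint64_t{x} * factor) >> 31);
}

int main()
{
    for (unsigned y = 0; y < 32; ++y)
        assert(srl_via_mul(0xdeadbeefu, y) == (0xdeadbeefu >> y));
}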
2237 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2238 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2239 { 2240 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2241 { 2242 if (__builtin_is_constant_evaluated() 2243 || (__x._M_is_constprop() && __y._M_is_constprop())) 2244 return _MaskImpl::_S_to_bits( 2245 __as_wrapper<_Np>(__x._M_data == __y._M_data)); 2246 2247 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2248 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2249 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2250 if constexpr (is_floating_point_v<_Tp>) 2251 { 2252 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2253 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2254 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2255 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2256 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2257 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2258 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2259 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2260 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2261 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2262 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2263 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_EQ_OQ); 2264 else 2265 __assert_unreachable<_Tp>(); 2266 } 2267 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2268 return _mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2269 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2270 return _mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2271 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2272 return _mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2273 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2274 return _mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2275 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2276 return _mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2277 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2278 return _mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2279 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2280 return _mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2281 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2282 return _mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2283 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2284 return _mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); 2285 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2286 return _mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); 2287 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2288 return _mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); 2289 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2290 return _mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); 2291 else 2292 __assert_unreachable<_Tp>(); 2293 } // }}} 2294 else if (__builtin_is_constant_evaluated()) 2295 return _Base::_S_equal_to(__x, __y); 2296 else if constexpr (sizeof(__x) == 8) 2297 { 2298 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2299 == __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2300 _MaskMember<_Tp> __r64{}; 2301 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2302 return __r64; 2303 } 2304 else 2305 return _Base::_S_equal_to(__x, __y); 2306 } 2307 2308 // }}} 2309 // _S_not_equal_to {{{ 2310 template
2311 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2312 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2313 { 2314 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2315 { 2316 if (__builtin_is_constant_evaluated() 2317 || (__x._M_is_constprop() && __y._M_is_constprop())) 2318 return _MaskImpl::_S_to_bits( 2319 __as_wrapper<_Np>(__x._M_data != __y._M_data)); 2320 2321 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2322 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2323 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2324 if constexpr (is_floating_point_v<_Tp>) 2325 { 2326 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2327 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2328 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2329 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2330 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2331 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2332 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2333 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2334 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2335 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2336 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2337 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_UQ); 2338 else 2339 __assert_unreachable<_Tp>(); 2340 } 2341 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2342 return _mm512_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2343 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2344 return _mm512_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2345 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 2) 2346 return _mm512_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2347 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 1) 2348 return _mm512_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2349 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2350 return _mm256_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2351 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2352 return _mm256_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2353 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 2) 2354 return _mm256_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2355 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 1) 2356 return _mm256_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2357 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2358 return _mm_mask_cmpneq_epi64_mask(__k1, __xi, __yi); 2359 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2360 return _mm_mask_cmpneq_epi32_mask(__k1, __xi, __yi); 2361 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 2) 2362 return _mm_mask_cmpneq_epi16_mask(__k1, __xi, __yi); 2363 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 1) 2364 return _mm_mask_cmpneq_epi8_mask(__k1, __xi, __yi); 2365 else 2366 __assert_unreachable<_Tp>(); 2367 } // }}} 2368 else if (__builtin_is_constant_evaluated()) 2369 return _Base::_S_not_equal_to(__x, __y); 2370 else if constexpr (sizeof(__x) == 8) 2371 { 2372 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2373 != __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2374 _MaskMember<_Tp> __r64{}; 2375 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2376 return __r64; 2377 } 2378 else 2379 return _Base::_S_not_equal_to(__x, __y); 2380 } 2381 2382 // }}} 2383 // _S_less {{{ 2384 template
2385 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2386 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2387 { 2388 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2389 { 2390 if (__builtin_is_constant_evaluated() 2391 || (__x._M_is_constprop() && __y._M_is_constprop())) 2392 return _MaskImpl::_S_to_bits( 2393 __as_wrapper<_Np>(__x._M_data < __y._M_data)); 2394 2395 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2396 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2397 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2398 if constexpr (sizeof(__xi) == 64) 2399 { 2400 if constexpr (is_same_v<_Tp, float>) 2401 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2402 else if constexpr (is_same_v<_Tp, double>) 2403 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2404 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2405 return _mm512_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2406 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2407 return _mm512_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2408 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2409 return _mm512_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2410 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2411 return _mm512_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2412 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2413 return _mm512_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2414 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2415 return _mm512_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2416 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2417 return _mm512_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2418 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2419 return _mm512_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2420 else 2421 __assert_unreachable<_Tp>(); 2422 } 2423 else if constexpr (sizeof(__xi) == 32) 2424 { 2425 if constexpr (is_same_v<_Tp, float>) 2426 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2427 else if constexpr (is_same_v<_Tp, double>) 2428 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2429 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2430 return _mm256_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2431 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2432 return _mm256_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2433 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2434 return _mm256_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2435 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2436 return _mm256_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2437 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2438 return _mm256_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2439 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2440 return _mm256_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2441 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2442 return _mm256_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2443 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2444 return _mm256_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2445 else 2446 __assert_unreachable<_Tp>(); 2447 } 2448 else if constexpr (sizeof(__xi) == 16) 2449 { 2450 if constexpr (is_same_v<_Tp, float>) 2451 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OS); 2452 else if constexpr (is_same_v<_Tp, double>) 2453 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OS); 2454 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2455 return 
_mm_mask_cmplt_epi8_mask(__k1, __xi, __yi); 2456 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2457 return _mm_mask_cmplt_epi16_mask(__k1, __xi, __yi); 2458 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2459 return _mm_mask_cmplt_epi32_mask(__k1, __xi, __yi); 2460 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2461 return _mm_mask_cmplt_epi64_mask(__k1, __xi, __yi); 2462 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2463 return _mm_mask_cmplt_epu8_mask(__k1, __xi, __yi); 2464 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2465 return _mm_mask_cmplt_epu16_mask(__k1, __xi, __yi); 2466 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2467 return _mm_mask_cmplt_epu32_mask(__k1, __xi, __yi); 2468 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2469 return _mm_mask_cmplt_epu64_mask(__k1, __xi, __yi); 2470 else 2471 __assert_unreachable<_Tp>(); 2472 } 2473 else 2474 __assert_unreachable<_Tp>(); 2475 } // }}} 2476 else if (__builtin_is_constant_evaluated()) 2477 return _Base::_S_less(__x, __y); 2478 else if constexpr (sizeof(__x) == 8) 2479 { 2480 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2481 < __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2482 _MaskMember<_Tp> __r64{}; 2483 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2484 return __r64; 2485 } 2486 else 2487 return _Base::_S_less(__x, __y); 2488 } 2489 2490 // }}} 2491 // _S_less_equal {{{ 2492 template
2493 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2494 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2495 { 2496 if constexpr (__is_avx512_abi<_Abi>()) // {{{ 2497 { 2498 if (__builtin_is_constant_evaluated() 2499 || (__x._M_is_constprop() && __y._M_is_constprop())) 2500 return _MaskImpl::_S_to_bits( 2501 __as_wrapper<_Np>(__x._M_data <= __y._M_data)); 2502 2503 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2504 [[maybe_unused]] const auto __xi = __to_intrin(__x); 2505 [[maybe_unused]] const auto __yi = __to_intrin(__y); 2506 if constexpr (sizeof(__xi) == 64) 2507 { 2508 if constexpr (is_same_v<_Tp, float>) 2509 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2510 else if constexpr (is_same_v<_Tp, double>) 2511 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2512 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2513 return _mm512_mask_cmple_epi8_mask(__k1, __xi, __yi); 2514 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2515 return _mm512_mask_cmple_epi16_mask(__k1, __xi, __yi); 2516 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2517 return _mm512_mask_cmple_epi32_mask(__k1, __xi, __yi); 2518 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2519 return _mm512_mask_cmple_epi64_mask(__k1, __xi, __yi); 2520 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2521 return _mm512_mask_cmple_epu8_mask(__k1, __xi, __yi); 2522 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2523 return _mm512_mask_cmple_epu16_mask(__k1, __xi, __yi); 2524 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2525 return _mm512_mask_cmple_epu32_mask(__k1, __xi, __yi); 2526 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2527 return _mm512_mask_cmple_epu64_mask(__k1, __xi, __yi); 2528 else 2529 __assert_unreachable<_Tp>(); 2530 } 2531 else if constexpr (sizeof(__xi) == 32) 2532 { 2533 if constexpr (is_same_v<_Tp, float>) 2534 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2535 else if constexpr (is_same_v<_Tp, double>) 2536 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2537 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2538 return _mm256_mask_cmple_epi8_mask(__k1, __xi, __yi); 2539 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2540 return _mm256_mask_cmple_epi16_mask(__k1, __xi, __yi); 2541 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2542 return _mm256_mask_cmple_epi32_mask(__k1, __xi, __yi); 2543 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2544 return _mm256_mask_cmple_epi64_mask(__k1, __xi, __yi); 2545 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2546 return _mm256_mask_cmple_epu8_mask(__k1, __xi, __yi); 2547 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2548 return _mm256_mask_cmple_epu16_mask(__k1, __xi, __yi); 2549 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2550 return _mm256_mask_cmple_epu32_mask(__k1, __xi, __yi); 2551 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2552 return _mm256_mask_cmple_epu64_mask(__k1, __xi, __yi); 2553 else 2554 __assert_unreachable<_Tp>(); 2555 } 2556 else if constexpr (sizeof(__xi) == 16) 2557 { 2558 if constexpr (is_same_v<_Tp, float>) 2559 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OS); 2560 else if constexpr (is_same_v<_Tp, double>) 2561 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OS); 2562 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 1) 2563 return 
_mm_mask_cmple_epi8_mask(__k1, __xi, __yi); 2564 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 2) 2565 return _mm_mask_cmple_epi16_mask(__k1, __xi, __yi); 2566 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 4) 2567 return _mm_mask_cmple_epi32_mask(__k1, __xi, __yi); 2568 else if constexpr (is_signed_v<_Tp> && sizeof(_Tp) == 8) 2569 return _mm_mask_cmple_epi64_mask(__k1, __xi, __yi); 2570 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 1) 2571 return _mm_mask_cmple_epu8_mask(__k1, __xi, __yi); 2572 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 2) 2573 return _mm_mask_cmple_epu16_mask(__k1, __xi, __yi); 2574 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 4) 2575 return _mm_mask_cmple_epu32_mask(__k1, __xi, __yi); 2576 else if constexpr (is_unsigned_v<_Tp> && sizeof(_Tp) == 8) 2577 return _mm_mask_cmple_epu64_mask(__k1, __xi, __yi); 2578 else 2579 __assert_unreachable<_Tp>(); 2580 } 2581 else 2582 __assert_unreachable<_Tp>(); 2583 } // }}} 2584 else if (__builtin_is_constant_evaluated()) 2585 return _Base::_S_less_equal(__x, __y); 2586 else if constexpr (sizeof(__x) == 8) 2587 { 2588 const auto __r128 = __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__x) 2589 <= __vector_bitcast<_Tp, 16 / sizeof(_Tp)>(__y); 2590 _MaskMember<_Tp> __r64{}; 2591 __builtin_memcpy(&__r64._M_data, &__r128, sizeof(__r64)); 2592 return __r64; 2593 } 2594 else 2595 return _Base::_S_less_equal(__x, __y); 2596 } 2597 2598 // }}} }}} 2599 // negation {{{ 2600 template
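For 8-byte-wide vectors the comparison operators above fall back to widening: the operands are reinterpreted as the corresponding 16-byte vector, compared there, and only the low 8 bytes of the mask are copied back. A sketch of that pattern using the GCC/Clang vector extensions this header itself builds on (types and names here are illustrative, not the library's own):

#include <cstdint>
#include <cstring>

typedef std::uint16_t u16x4 __attribute__((vector_size(8)));   // 8-byte vector
typedef std::uint16_t u16x8 __attribute__((vector_size(16)));  // 16-byte vector

// Compare 8-byte operands via the 16-byte vector type and keep only the
// low half of the resulting mask.
u16x4 equal_to_8byte(u16x4 a, u16x4 b)
{
    u16x8 a16{}, b16{};
    std::memcpy(&a16, &a, sizeof a);
    std::memcpy(&b16, &b, sizeof b);
    const auto m16 = (a16 == b16);     // -1 / 0 per lane, e.g. one pcmpeqw
    u16x4 m8{};
    std::memcpy(&m8, &m16, sizeof m8); // low 8 bytes only
    return m8;
}

int main()
{
    const u16x4 a{1, 2, 3, 4}, b{1, 0, 3, 0};
    const u16x4 m = equal_to_8byte(a, b);
    return (m[0] != 0 && m[1] == 0 && m[2] != 0 && m[3] == 0) ? 0 : 1;
}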
2601 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2602 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 2603 { 2604 if constexpr (__is_avx512_abi<_Abi>()) 2605 return _S_equal_to(__x, _SimdWrapper<_Tp, _Np>()); 2606 else 2607 return _Base::_S_negate(__x); 2608 } 2609 2610 // }}} 2611 // math {{{ 2612 using _Base::_S_abs; 2613 2614 // _S_sqrt {{{ 2615 template
2616 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2617 _S_sqrt(_SimdWrapper<_Tp, _Np> __x) 2618 { 2619 if constexpr (__is_sse_ps<_Tp, _Np>()) 2620 return __auto_bitcast(_mm_sqrt_ps(__to_intrin(__x))); 2621 else if constexpr (__is_sse_pd<_Tp, _Np>()) 2622 return _mm_sqrt_pd(__x); 2623 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2624 return _mm256_sqrt_ps(__x); 2625 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2626 return _mm256_sqrt_pd(__x); 2627 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 2628 return _mm512_sqrt_ps(__x); 2629 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2630 return _mm512_sqrt_pd(__x); 2631 else 2632 __assert_unreachable<_Tp>(); 2633 } 2634 2635 // }}} 2636 // _S_ldexp {{{ 2637 template
2638 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2639 _S_ldexp(_SimdWrapper<_Tp, _Np> __x, 2640 __fixed_size_storage_t
<int, _Np> __exp) 2641 { 2642 if constexpr (__is_avx512_abi<_Abi>()) 2643 { 2644 const auto __xi = __to_intrin(__x); 2645 constexpr _SimdConverter<int, simd_abi::fixed_size<_Np>
, _Tp, _Abi> 2646 __cvt; 2647 const auto __expi = __to_intrin(__cvt(__exp)); 2648 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2649 if constexpr (sizeof(__xi) == 16) 2650 { 2651 if constexpr (sizeof(_Tp) == 8) 2652 return _mm_maskz_scalef_pd(__k1, __xi, __expi); 2653 else 2654 return _mm_maskz_scalef_ps(__k1, __xi, __expi); 2655 } 2656 else if constexpr (sizeof(__xi) == 32) 2657 { 2658 if constexpr (sizeof(_Tp) == 8) 2659 return _mm256_maskz_scalef_pd(__k1, __xi, __expi); 2660 else 2661 return _mm256_maskz_scalef_ps(__k1, __xi, __expi); 2662 } 2663 else 2664 { 2665 static_assert(sizeof(__xi) == 64); 2666 if constexpr (sizeof(_Tp) == 8) 2667 return _mm512_maskz_scalef_pd(__k1, __xi, __expi); 2668 else 2669 return _mm512_maskz_scalef_ps(__k1, __xi, __expi); 2670 } 2671 } 2672 else 2673 return _Base::_S_ldexp(__x, __exp); 2674 } 2675 2676 // }}} 2677 // _S_trunc {{{ 2678 template
2679 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2680 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2681 { 2682 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2683 return _mm512_roundscale_ps(__x, 0x0b); 2684 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2685 return _mm512_roundscale_pd(__x, 0x0b); 2686 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2687 return _mm256_round_ps(__x, 0x3); 2688 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2689 return _mm256_round_pd(__x, 0x3); 2690 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2691 return __auto_bitcast(_mm_round_ps(__to_intrin(__x), 0x3)); 2692 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2693 return _mm_round_pd(__x, 0x3); 2694 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2695 { 2696 auto __truncated 2697 = _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x))); 2698 const auto __no_fractional_values 2699 = __vector_bitcast
(__vector_bitcast<_UInt>(__to_intrin(__x)) 2700 & 0x7f800000u) 2701 < 0x4b000000; // the exponent is so large that no mantissa bits 2702 // signify fractional values (0x3f8 + 23*8 = 2703 // 0x4b0) 2704 return __no_fractional_values ? __truncated : __to_intrin(__x); 2705 } 2706 else 2707 return _Base::_S_trunc(__x); 2708 } 2709 2710 // }}} 2711 // _S_round {{{ 2712 template
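The SSE2 fallback in _S_trunc above rounds by converting to int32 and back, and the guard checks the exponent field: any float with |x| >= 2^23 (exponent bits >= 0x4b00'0000) already has no fractional bits, so x is returned unchanged, which also covers NaN and values that would not fit in an int32. A scalar sketch of that guard (illustrative name):

#include <cassert>
#include <cmath>

float trunc_sse2_style(float x)
{
    if (!(std::fabs(x) < 8388608.0f))                  // 2^23, bits 0x4b00'0000
        return x;                                      // already integral (or NaN)
    return static_cast<float>(static_cast<int>(x));    // cvttps_epi32 / cvtepi32_ps
}

int main()
{
    assert(trunc_sse2_style(2.75f) == 2.0f);
    assert(trunc_sse2_style(-2.75f) == -2.0f);
    assert(trunc_sse2_style(8388607.5f) == 8388607.0f);
    assert(trunc_sse2_style(1.5e10f) == 1.5e10f);      // >= 2^23: unchanged
}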
2713 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2714 _S_round(_SimdWrapper<_Tp, _Np> __x) 2715 { 2716 // Note that _MM_FROUND_TO_NEAREST_INT rounds ties to even, not away 2717 // from zero as required by std::round. Therefore this function is more 2718 // complicated. 2719 using _V = __vector_type_t<_Tp, _Np>; 2720 _V __truncated; 2721 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2722 __truncated = _mm512_roundscale_ps(__x._M_data, 0x0b); 2723 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2724 __truncated = _mm512_roundscale_pd(__x._M_data, 0x0b); 2725 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2726 __truncated = _mm256_round_ps(__x._M_data, 2727 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2728 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2729 __truncated = _mm256_round_pd(__x._M_data, 2730 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2731 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2732 __truncated = __auto_bitcast( 2733 _mm_round_ps(__to_intrin(__x), 2734 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); 2735 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2736 __truncated 2737 = _mm_round_pd(__x._M_data, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); 2738 else if constexpr (__is_sse_ps<_Tp, _Np>()) 2739 __truncated = __auto_bitcast( 2740 _mm_cvtepi32_ps(_mm_cvttps_epi32(__to_intrin(__x)))); 2741 else 2742 return _Base::_S_round(__x); 2743 2744 // x < 0 => truncated <= 0 && truncated >= x => x - truncated <= 0 2745 // x > 0 => truncated >= 0 && truncated <= x => x - truncated >= 0 2746 2747 const _V __rounded 2748 = __truncated 2749 + (__and(_S_absmask<_V>, __x._M_data - __truncated) >= _Tp(.5) 2750 ? __or(__and(_S_signmask<_V>, __x._M_data), _V() + 1) 2751 : _V()); 2752 if constexpr (__have_sse4_1) 2753 return __rounded; 2754 else // adjust for missing range in cvttps_epi32 2755 return __and(_S_absmask<_V>, __x._M_data) < 0x1p23f ? __rounded 2756 : __x._M_data; 2757 } 2758 2759 // }}} 2760 // _S_nearbyint {{{ 2761 template
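As the comment in _S_round above notes, the hardware nearest-int rounding breaks ties to even, while std::round must round halfway cases away from zero; the implementation therefore truncates and adds a unit carrying the sign of x whenever the discarded fraction is at least one half. The same logic in scalar form (illustrative name):

#include <cassert>
#include <cmath>

float round_away_from_zero(float x)
{
    const float t = std::trunc(x);
    return std::fabs(x - t) >= 0.5f ? t + std::copysign(1.0f, x) : t;
}

int main()
{
    assert(round_away_from_zero(2.5f) == 3.0f);  // ties-to-even would give 2
    assert(round_away_from_zero(-2.5f) == -3.0f);
    assert(round_away_from_zero(2.4f) == 2.0f);
    assert(round_away_from_zero(-0.5f) == -1.0f);
}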
> 2762 _GLIBCXX_SIMD_INTRINSIC static _Tp 2763 _S_nearbyint(_Tp __x) noexcept 2764 { 2765 if constexpr (_TVT::template _S_is
) 2766 return _mm512_roundscale_ps(__x, 0x0c); 2767 else if constexpr (_TVT::template _S_is
) 2768 return _mm512_roundscale_pd(__x, 0x0c); 2769 else if constexpr (_TVT::template _S_is
) 2770 return _mm256_round_ps(__x, 2771 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2772 else if constexpr (_TVT::template _S_is
) 2773 return _mm256_round_pd(__x, 2774 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2775 else if constexpr (__have_sse4_1 && _TVT::template _S_is
) 2776 return _mm_round_ps(__x, 2777 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2778 else if constexpr (__have_sse4_1 && _TVT::template _S_is
) 2779 return _mm_round_pd(__x, 2780 _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC); 2781 else 2782 return _Base::_S_nearbyint(__x); 2783 } 2784 2785 // }}} 2786 // _S_rint {{{ 2787 template
> 2788 _GLIBCXX_SIMD_INTRINSIC static _Tp 2789 _S_rint(_Tp __x) noexcept 2790 { 2791 if constexpr (_TVT::template _S_is
) 2792 return _mm512_roundscale_ps(__x, 0x04); 2793 else if constexpr (_TVT::template _S_is
) 2794 return _mm512_roundscale_pd(__x, 0x04); 2795 else if constexpr (_TVT::template _S_is
) 2796 return _mm256_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2797 else if constexpr (_TVT::template _S_is
) 2798 return _mm256_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2799 else if constexpr (__have_sse4_1 && _TVT::template _S_is
) 2800 return _mm_round_ps(__x, _MM_FROUND_CUR_DIRECTION); 2801 else if constexpr (__have_sse4_1 && _TVT::template _S_is
) 2802 return _mm_round_pd(__x, _MM_FROUND_CUR_DIRECTION); 2803 else 2804 return _Base::_S_rint(__x); 2805 } 2806 2807 // }}} 2808 // _S_floor {{{ 2809 template
2810 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2811 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2812 { 2813 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2814 return _mm512_roundscale_ps(__x, 0x09); 2815 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2816 return _mm512_roundscale_pd(__x, 0x09); 2817 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2818 return _mm256_round_ps(__x, 0x1); 2819 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2820 return _mm256_round_pd(__x, 0x1); 2821 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2822 return __auto_bitcast(_mm_floor_ps(__to_intrin(__x))); 2823 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2824 return _mm_floor_pd(__x); 2825 else 2826 return _Base::_S_floor(__x); 2827 } 2828 2829 // }}} 2830 // _S_ceil {{{ 2831 template
2832 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2833 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2834 { 2835 if constexpr (__is_avx512_ps<_Tp, _Np>()) 2836 return _mm512_roundscale_ps(__x, 0x0a); 2837 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 2838 return _mm512_roundscale_pd(__x, 0x0a); 2839 else if constexpr (__is_avx_ps<_Tp, _Np>()) 2840 return _mm256_round_ps(__x, 0x2); 2841 else if constexpr (__is_avx_pd<_Tp, _Np>()) 2842 return _mm256_round_pd(__x, 0x2); 2843 else if constexpr (__have_sse4_1 && __is_sse_ps<_Tp, _Np>()) 2844 return __auto_bitcast(_mm_ceil_ps(__to_intrin(__x))); 2845 else if constexpr (__have_sse4_1 && __is_sse_pd<_Tp, _Np>()) 2846 return _mm_ceil_pd(__x); 2847 else 2848 return _Base::_S_ceil(__x); 2849 } 2850 2851 // }}} 2852 // _S_signbit {{{ 2853 template
2854 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2855 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2856 { 2857 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 2858 { 2859 if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 4) 2860 return _mm512_movepi32_mask( 2861 __intrin_bitcast<__m512i>(__x._M_data)); 2862 else if constexpr (sizeof(__x) == 64 && sizeof(_Tp) == 8) 2863 return _mm512_movepi64_mask( 2864 __intrin_bitcast<__m512i>(__x._M_data)); 2865 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 4) 2866 return _mm256_movepi32_mask( 2867 __intrin_bitcast<__m256i>(__x._M_data)); 2868 else if constexpr (sizeof(__x) == 32 && sizeof(_Tp) == 8) 2869 return _mm256_movepi64_mask( 2870 __intrin_bitcast<__m256i>(__x._M_data)); 2871 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 4) 2872 return _mm_movepi32_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2873 else if constexpr (sizeof(__x) <= 16 && sizeof(_Tp) == 8) 2874 return _mm_movepi64_mask(__intrin_bitcast<__m128i>(__x._M_data)); 2875 } 2876 else if constexpr (__is_avx512_abi<_Abi>()) 2877 { 2878 const auto __xi = __to_intrin(__x); 2879 [[maybe_unused]] constexpr auto __k1 2880 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 2881 if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 2882 return _mm_movemask_ps(__xi); 2883 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 2884 return _mm_movemask_pd(__xi); 2885 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 2886 return _mm256_movemask_ps(__xi); 2887 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 2888 return _mm256_movemask_pd(__xi); 2889 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 2890 return _mm512_mask_cmplt_epi32_mask( 2891 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2892 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 2893 return _mm512_mask_cmplt_epi64_mask( 2894 __k1, __intrin_bitcast<__m512i>(__xi), __m512i()); 2895 else 2896 __assert_unreachable<_Tp>(); 2897 } 2898 else 2899 return _Base::_S_signbit(__x); 2900 /*{ 2901 using _I = __int_for_sizeof_t<_Tp>; 2902 if constexpr (sizeof(__x) == 64) 2903 return _S_less(__vector_bitcast<_I>(__x), _I()); 2904 else 2905 { 2906 const auto __xx = __vector_bitcast<_I>(__x._M_data); 2907 [[maybe_unused]] constexpr _I __signmask = __finite_min_v<_I>; 2908 if constexpr ((sizeof(_Tp) == 4 && 2909 (__have_avx2 || sizeof(__x) == 16)) || 2910 __have_avx512vl) 2911 { 2912 return __vector_bitcast<_Tp>(__xx >> __digits_v<_I>); 2913 } 2914 else if constexpr ((__have_avx2 || 2915 (__have_ssse3 && sizeof(__x) == 16))) 2916 { 2917 return __vector_bitcast<_Tp>((__xx & __signmask) == 2918 __signmask); 2919 } 2920 else 2921 { // SSE2/3 or AVX (w/o AVX2) 2922 constexpr auto __one = __vector_broadcast<_Np, _Tp>(1); 2923 return __vector_bitcast<_Tp>( 2924 __vector_bitcast<_Tp>( 2925 (__xx & __signmask) | 2926 __vector_bitcast<_I>(__one)) // -1 or 1 2927 != __one); 2928 } 2929 } 2930 }*/ 2931 } 2932 2933 // }}} 2934 // _S_isnonzerovalue_mask {{{ 2935 // (isnormal | is subnormal == !isinf & !isnan & !is zero) 2936 template
2937 _GLIBCXX_SIMD_INTRINSIC static auto 2938 _S_isnonzerovalue_mask(_Tp __x) 2939 { 2940 using _Traits = _VectorTraits<_Tp>; 2941 if constexpr (__have_avx512dq_vl) 2942 { 2943 if constexpr (_Traits::template _S_is< 2944 float, 2> || _Traits::template _S_is
) 2945 return _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), 0x9f)); 2946 else if constexpr (_Traits::template _S_is
) 2947 return _knot_mask8(_mm256_fpclass_ps_mask(__x, 0x9f)); 2948 else if constexpr (_Traits::template _S_is
) 2949 return _knot_mask16(_mm512_fpclass_ps_mask(__x, 0x9f)); 2950 else if constexpr (_Traits::template _S_is
) 2951 return _knot_mask8(_mm_fpclass_pd_mask(__x, 0x9f)); 2952 else if constexpr (_Traits::template _S_is
) 2953 return _knot_mask8(_mm256_fpclass_pd_mask(__x, 0x9f)); 2954 else if constexpr (_Traits::template _S_is
) 2955 return _knot_mask8(_mm512_fpclass_pd_mask(__x, 0x9f)); 2956 else 2957 __assert_unreachable<_Tp>(); 2958 } 2959 else 2960 { 2961 using _Up = typename _Traits::value_type; 2962 constexpr size_t _Np = _Traits::_S_full_size; 2963 const auto __a = __x * __infinity_v<_Up>; // NaN if __x == 0 2964 const auto __b = __x * _Up(); // NaN if __x == inf 2965 if constexpr (__have_avx512vl && __is_sse_ps<_Up, _Np>()) 2966 return _mm_cmp_ps_mask(__to_intrin(__a), __to_intrin(__b), 2967 _CMP_ORD_Q); 2968 else if constexpr (__have_avx512f && __is_sse_ps<_Up, _Np>()) 2969 return __mmask8(0xf 2970 & _mm512_cmp_ps_mask(__auto_bitcast(__a), 2971 __auto_bitcast(__b), 2972 _CMP_ORD_Q)); 2973 else if constexpr (__have_avx512vl && __is_sse_pd<_Up, _Np>()) 2974 return _mm_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2975 else if constexpr (__have_avx512f && __is_sse_pd<_Up, _Np>()) 2976 return __mmask8(0x3 2977 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2978 __auto_bitcast(__b), 2979 _CMP_ORD_Q)); 2980 else if constexpr (__have_avx512vl && __is_avx_ps<_Up, _Np>()) 2981 return _mm256_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2982 else if constexpr (__have_avx512f && __is_avx_ps<_Up, _Np>()) 2983 return __mmask8(_mm512_cmp_ps_mask(__auto_bitcast(__a), 2984 __auto_bitcast(__b), 2985 _CMP_ORD_Q)); 2986 else if constexpr (__have_avx512vl && __is_avx_pd<_Up, _Np>()) 2987 return _mm256_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2988 else if constexpr (__have_avx512f && __is_avx_pd<_Up, _Np>()) 2989 return __mmask8(0xf 2990 & _mm512_cmp_pd_mask(__auto_bitcast(__a), 2991 __auto_bitcast(__b), 2992 _CMP_ORD_Q)); 2993 else if constexpr (__is_avx512_ps<_Up, _Np>()) 2994 return _mm512_cmp_ps_mask(__a, __b, _CMP_ORD_Q); 2995 else if constexpr (__is_avx512_pd<_Up, _Np>()) 2996 return _mm512_cmp_pd_mask(__a, __b, _CMP_ORD_Q); 2997 else 2998 __assert_unreachable<_Tp>(); 2999 } 3000 } 3001 3002 // }}} 3003 // _S_isfinite {{{ 3004 template
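Without AVX-512DQ, _S_isnonzerovalue_mask above classifies "finite and non-zero" with two multiplications: x * inf is NaN exactly when x is zero (or NaN), and x * 0 is NaN exactly when x is infinite (or NaN), so an ordered compare (_CMP_ORD_Q) of the two products is true only for finite, non-zero x. A scalar sketch (plain C++; must not be compiled with -ffinite-math-only):

#include <cassert>
#include <cmath>
#include <limits>

bool is_nonzero_finite(float x)
{
    const float a = x * std::numeric_limits<float>::infinity(); // NaN iff x == 0 or NaN
    const float b = x * 0.0f;                                   // NaN iff x is +-inf or NaN
    return !std::isnan(a) && !std::isnan(b);                    // the ordered compare
}

int main()
{
    assert(is_nonzero_finite(1.5f));
    assert(is_nonzero_finite(std::numeric_limits<float>::denorm_min()));
    assert(!is_nonzero_finite(0.0f));
    assert(!is_nonzero_finite(std::numeric_limits<float>::infinity()));
    assert(!is_nonzero_finite(std::nanf("")));
}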
3005 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3006 _S_isfinite(_SimdWrapper<_Tp, _Np> __x) 3007 { 3008 static_assert(is_floating_point_v<_Tp>); 3009 #if !__FINITE_MATH_ONLY__ 3010 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3011 { 3012 const auto __xi = __to_intrin(__x); 3013 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3014 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3015 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3016 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3017 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3018 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3019 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3020 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3021 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3022 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3023 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, 0x99); 3024 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3025 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, 0x99); 3026 } 3027 else if constexpr (__is_avx512_abi<_Abi>()) 3028 { 3029 // if all exponent bits are set, __x is either inf or NaN 3030 using _I = __int_for_sizeof_t<_Tp>; 3031 const auto __inf = __vector_bitcast<_I>( 3032 __vector_broadcast<_Np>(__infinity_v<_Tp>)); 3033 return _S_less<_I, _Np>(__vector_bitcast<_I>(__x) & __inf, __inf); 3034 } 3035 else 3036 #endif 3037 return _Base::_S_isfinite(__x); 3038 } 3039 3040 // }}} 3041 // _S_isinf {{{ 3042 template
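The integer fallback in _S_isfinite above relies on the IEEE layout: a value is finite exactly when its exponent field is not all ones, i.e. when (bits(x) & bits(inf)) < bits(inf); masking with bits(inf) also clears the sign bit, so the signed comparison is safe. The same check for a single float, as a standalone sketch (illustrative name):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

bool isfinite_bits(float x)
{
    const float inf = std::numeric_limits<float>::infinity();
    std::int32_t xb, infb;                   // infb == 0x7f80'0000
    std::memcpy(&xb, &x, sizeof xb);
    std::memcpy(&infb, &inf, sizeof infb);
    return (xb & infb) < infb;
}

int main()
{
    assert(isfinite_bits(0.0f) && isfinite_bits(-123.5f));
    assert(isfinite_bits(std::numeric_limits<float>::max()));
    assert(!isfinite_bits(std::numeric_limits<float>::infinity()));
    assert(!isfinite_bits(-std::numeric_limits<float>::infinity()));
    assert(!isfinite_bits(std::nanf("")));
}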
3043 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3044 _S_isinf(_SimdWrapper<_Tp, _Np> __x) 3045 { 3046 #if !__FINITE_MATH_ONLY__ 3047 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3048 { 3049 const auto __xi = __to_intrin(__x); 3050 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3051 return _mm512_fpclass_ps_mask(__xi, 0x18); 3052 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3053 return _mm512_fpclass_pd_mask(__xi, 0x18); 3054 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3055 return _mm256_fpclass_ps_mask(__xi, 0x18); 3056 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3057 return _mm256_fpclass_pd_mask(__xi, 0x18); 3058 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3059 return _mm_fpclass_ps_mask(__xi, 0x18); 3060 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3061 return _mm_fpclass_pd_mask(__xi, 0x18); 3062 else 3063 __assert_unreachable<_Tp>(); 3064 } 3065 else if constexpr (__have_avx512dq_vl) 3066 { 3067 if constexpr (__is_sse_pd<_Tp, _Np>()) 3068 return _mm_movm_epi64(_mm_fpclass_pd_mask(__x, 0x18)); 3069 else if constexpr (__is_avx_pd<_Tp, _Np>()) 3070 return _mm256_movm_epi64(_mm256_fpclass_pd_mask(__x, 0x18)); 3071 else if constexpr (__is_sse_ps<_Tp, _Np>()) 3072 return _mm_movm_epi32( 3073 _mm_fpclass_ps_mask(__to_intrin(__x), 0x18)); 3074 else if constexpr (__is_avx_ps<_Tp, _Np>()) 3075 return _mm256_movm_epi32(_mm256_fpclass_ps_mask(__x, 0x18)); 3076 else 3077 __assert_unreachable<_Tp>(); 3078 } 3079 else 3080 #endif 3081 return _Base::_S_isinf(__x); 3082 } 3083 3084 // }}} 3085 // _S_isnormal {{{ 3086 template
3087 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3088 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 3089 { 3090 #if __FINITE_MATH_ONLY__ 3091 [[maybe_unused]] constexpr int __mode = 0x26; 3092 #else 3093 [[maybe_unused]] constexpr int __mode = 0xbf; 3094 #endif 3095 if constexpr (__is_avx512_abi<_Abi>() && __have_avx512dq) 3096 { 3097 const auto __xi = __to_intrin(__x); 3098 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3099 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3100 return __k1 ^ _mm512_mask_fpclass_ps_mask(__k1, __xi, __mode); 3101 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3102 return __k1 ^ _mm512_mask_fpclass_pd_mask(__k1, __xi, __mode); 3103 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3104 return __k1 ^ _mm256_mask_fpclass_ps_mask(__k1, __xi, __mode); 3105 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3106 return __k1 ^ _mm256_mask_fpclass_pd_mask(__k1, __xi, __mode); 3107 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3108 return __k1 ^ _mm_mask_fpclass_ps_mask(__k1, __xi, __mode); 3109 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3110 return __k1 ^ _mm_mask_fpclass_pd_mask(__k1, __xi, __mode); 3111 else 3112 __assert_unreachable<_Tp>(); 3113 } 3114 else if constexpr (__have_avx512dq) 3115 { 3116 if constexpr (__have_avx512vl && __is_sse_ps<_Tp, _Np>()) 3117 return _mm_movm_epi32( 3118 _knot_mask8(_mm_fpclass_ps_mask(__to_intrin(__x), __mode))); 3119 else if constexpr (__have_avx512vl && __is_avx_ps<_Tp, _Np>()) 3120 return _mm256_movm_epi32( 3121 _knot_mask8(_mm256_fpclass_ps_mask(__x, __mode))); 3122 else if constexpr (__is_avx512_ps<_Tp, _Np>()) 3123 return _knot_mask16(_mm512_fpclass_ps_mask(__x, __mode)); 3124 else if constexpr (__have_avx512vl && __is_sse_pd<_Tp, _Np>()) 3125 return _mm_movm_epi64( 3126 _knot_mask8(_mm_fpclass_pd_mask(__x, __mode))); 3127 else if constexpr (__have_avx512vl && __is_avx_pd<_Tp, _Np>()) 3128 return _mm256_movm_epi64( 3129 _knot_mask8(_mm256_fpclass_pd_mask(__x, __mode))); 3130 else if constexpr (__is_avx512_pd<_Tp, _Np>()) 3131 return _knot_mask8(_mm512_fpclass_pd_mask(__x, __mode)); 3132 else 3133 __assert_unreachable<_Tp>(); 3134 } 3135 else if constexpr (__is_avx512_abi<_Abi>()) 3136 { 3137 using _I = __int_for_sizeof_t<_Tp>; 3138 const auto absn = __vector_bitcast<_I>(_S_abs(__x)); 3139 const auto minn = __vector_bitcast<_I>( 3140 __vector_broadcast<_Np>(__norm_min_v<_Tp>)); 3141 #if __FINITE_MATH_ONLY__ 3142 return _S_less_equal<_I, _Np>(minn, absn); 3143 #else 3144 const auto infn 3145 = __vector_bitcast<_I>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 3146 return __and(_S_less_equal<_I, _Np>(minn, absn), 3147 _S_less<_I, _Np>(absn, infn)); 3148 #endif 3149 } 3150 else 3151 return _Base::_S_isnormal(__x); 3152 } 3153 3154 // }}} 3155 // _S_isnan {{{ 3156 template
<typename _Tp, size_t _Np> 3157 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3158 _S_isnan(_SimdWrapper<_Tp, _Np> __x) 3159 { return _S_isunordered(__x, __x); } 3160 3161 // }}} 3162 // _S_isunordered {{{ 3163 template <typename _Tp, size_t _Np>
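/* [Editor's note, not part of the installed header] _S_isnan simply forwards to
   _S_isunordered(x, x), mirroring the scalar identity isnan(x) == isunordered(x, x).
   User code reaches it through the std::experimental::isnan overload for simd; a
   minimal sketch:

     #include <experimental/simd>
     #include <cassert>
     #include <limits>
     namespace stdx = std::experimental;

     int main()
     {
       // lane 1 holds a NaN, every other lane holds its index
       stdx::fixed_size_simd<float, 4> x([](int i) {
         return i == 1 ? std::numeric_limits<float>::quiet_NaN() : float(i);
       });
       auto nan_lanes = stdx::isnan(x);          // one bool per lane
       assert(stdx::popcount(nan_lanes) == 1);
       assert(stdx::find_first_set(nan_lanes) == 1);
     }
*/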
3164 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 3165 _S_isunordered([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x, 3166 [[maybe_unused]] _SimdWrapper<_Tp, _Np> __y) 3167 { 3168 #if __FINITE_MATH_ONLY__ 3169 return {}; // false 3170 #else 3171 const auto __xi = __to_intrin(__x); 3172 const auto __yi = __to_intrin(__y); 3173 if constexpr (__is_avx512_abi<_Abi>()) 3174 { 3175 constexpr auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3176 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3177 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3178 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3179 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3180 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3181 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3182 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3183 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3184 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3185 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3186 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3187 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_UNORD_Q); 3188 } 3189 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3190 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_UNORD_Q)); 3191 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3192 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_UNORD_Q)); 3193 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3194 return __auto_bitcast(_mm_cmpunord_ps(__xi, __yi)); 3195 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3196 return __to_masktype(_mm_cmpunord_pd(__xi, __yi)); 3197 else 3198 __assert_unreachable<_Tp>(); 3199 #endif 3200 } 3201 3202 // }}} 3203 // _S_isgreater {{{ 3204 template
3205 static constexpr _MaskMember<_Tp> 3206 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3207 { 3208 const auto __xi = __to_intrin(__x); 3209 const auto __yi = __to_intrin(__y); 3210 if constexpr (__is_avx512_abi<_Abi>()) 3211 { 3212 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3213 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3214 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3215 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3216 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3217 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3218 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3219 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3220 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3221 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3222 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3223 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3224 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GT_OQ); 3225 else 3226 __assert_unreachable<_Tp>(); 3227 } 3228 else if constexpr (__have_avx) 3229 { 3230 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3231 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3232 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3233 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3234 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3235 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GT_OQ)); 3236 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3237 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GT_OQ)); 3238 else 3239 __assert_unreachable<_Tp>(); 3240 } 3241 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3242 && sizeof(_Tp) == 4) 3243 { 3244 const auto __xn = __vector_bitcast
<int>(__xi); 3245 const auto __yn = __vector_bitcast<int>
(__yi); 3246 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3247 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3248 return __auto_bitcast( 3249 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp > __yp)); 3250 } 3251 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3252 && sizeof(_Tp) == 8) 3253 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3254 -_mm_ucomigt_sd(__xi, __yi), 3255 -_mm_ucomigt_sd(_mm_unpackhi_pd(__xi, __xi), 3256 _mm_unpackhi_pd(__yi, __yi))}; 3257 else 3258 return _Base::_S_isgreater(__x, __y); 3259 } 3260 3261 // }}} 3262 // _S_isgreaterequal {{{ 3263 template
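/* [Editor's note, not part of the installed header] The SSE2-only path of _S_isgreater
   above compares the float bit patterns as signed integers after folding negative values
   with -(bits & 0x7fff'ffff); that mapping is monotone in the float ordering for ordered
   operands, and the _mm_cmpord_ps result masks out any lane containing a NaN. A scalar
   model of the integer key (illustrative only):

     #include <cassert>
     #include <cstdint>
     #include <cstring>

     std::int32_t ordered_key(float f)
     {
       std::int32_t bits;
       std::memcpy(&bits, &f, sizeof bits);
       return bits < 0 ? -(bits & 0x7fff'ffff) : bits;
     }

     int main()
     {
       assert((ordered_key(2.5f) > ordered_key(1.0f)) == (2.5f > 1.0f));
       assert((ordered_key(-1.0f) > ordered_key(-2.0f)) == (-1.0f > -2.0f));
       assert((ordered_key(-0.0f) > ordered_key(0.0f)) == (-0.0f > 0.0f));
     }
*/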
3264 static constexpr _MaskMember<_Tp> 3265 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3266 { 3267 const auto __xi = __to_intrin(__x); 3268 const auto __yi = __to_intrin(__y); 3269 if constexpr (__is_avx512_abi<_Abi>()) 3270 { 3271 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3272 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3273 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3274 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3275 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3276 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3277 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3278 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3279 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3280 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3281 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3282 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3283 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_GE_OQ); 3284 else 3285 __assert_unreachable<_Tp>(); 3286 } 3287 else if constexpr (__have_avx) 3288 { 3289 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3290 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3291 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3292 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3293 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3294 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_GE_OQ)); 3295 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3296 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_GE_OQ)); 3297 else 3298 __assert_unreachable<_Tp>(); 3299 } 3300 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3301 && sizeof(_Tp) == 4) 3302 { 3303 const auto __xn = __vector_bitcast
<int>(__xi); 3304 const auto __yn = __vector_bitcast<int>
(__yi); 3305 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3306 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3307 return __auto_bitcast( 3308 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp >= __yp)); 3309 } 3310 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3311 && sizeof(_Tp) == 8) 3312 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3313 -_mm_ucomige_sd(__xi, __yi), 3314 -_mm_ucomige_sd(_mm_unpackhi_pd(__xi, __xi), 3315 _mm_unpackhi_pd(__yi, __yi))}; 3316 else 3317 return _Base::_S_isgreaterequal(__x, __y); 3318 } 3319 3320 // }}} 3321 // _S_isless {{{ 3322 template
3323 static constexpr _MaskMember<_Tp> 3324 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3325 { 3326 const auto __xi = __to_intrin(__x); 3327 const auto __yi = __to_intrin(__y); 3328 if constexpr (__is_avx512_abi<_Abi>()) 3329 { 3330 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3331 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3332 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3333 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3334 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3335 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3336 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3337 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3338 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3339 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3340 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3341 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3342 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LT_OQ); 3343 else 3344 __assert_unreachable<_Tp>(); 3345 } 3346 else if constexpr (__have_avx) 3347 { 3348 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3349 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3350 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3351 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3352 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3353 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LT_OQ)); 3354 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3355 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LT_OQ)); 3356 else 3357 __assert_unreachable<_Tp>(); 3358 } 3359 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3360 && sizeof(_Tp) == 4) 3361 { 3362 const auto __xn = __vector_bitcast
<int>(__xi); 3363 const auto __yn = __vector_bitcast<int>
(__yi); 3364 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3365 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3366 return __auto_bitcast( 3367 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp < __yp)); 3368 } 3369 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3370 && sizeof(_Tp) == 8) 3371 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3372 -_mm_ucomigt_sd(__yi, __xi), 3373 -_mm_ucomigt_sd(_mm_unpackhi_pd(__yi, __yi), 3374 _mm_unpackhi_pd(__xi, __xi))}; 3375 else 3376 return _Base::_S_isless(__x, __y); 3377 } 3378 3379 // }}} 3380 // _S_islessequal {{{ 3381 template
3382 static constexpr _MaskMember<_Tp> 3383 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3384 { 3385 const auto __xi = __to_intrin(__x); 3386 const auto __yi = __to_intrin(__y); 3387 if constexpr (__is_avx512_abi<_Abi>()) 3388 { 3389 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3390 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3391 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3392 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3393 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3394 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3395 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3396 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3397 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3398 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3399 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3400 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3401 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_LE_OQ); 3402 else 3403 __assert_unreachable<_Tp>(); 3404 } 3405 else if constexpr (__have_avx) 3406 { 3407 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3408 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3409 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3410 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3411 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3412 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_LE_OQ)); 3413 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3414 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_LE_OQ)); 3415 else 3416 __assert_unreachable<_Tp>(); 3417 } 3418 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3419 && sizeof(_Tp) == 4) 3420 { 3421 const auto __xn = __vector_bitcast
<int>(__xi); 3422 const auto __yn = __vector_bitcast<int>
(__yi); 3423 const auto __xp = __xn < 0 ? -(__xn & 0x7fff'ffff) : __xn; 3424 const auto __yp = __yn < 0 ? -(__yn & 0x7fff'ffff) : __yn; 3425 return __auto_bitcast( 3426 __and(__to_masktype(_mm_cmpord_ps(__xi, __yi)), __xp <= __yp)); 3427 } 3428 else if constexpr (__have_sse2 && sizeof(__xi) == 16 3429 && sizeof(_Tp) == 8) 3430 return __vector_type_t<__int_with_sizeof_t<8>, 2>{ 3431 -_mm_ucomige_sd(__yi, __xi), 3432 -_mm_ucomige_sd(_mm_unpackhi_pd(__yi, __yi), 3433 _mm_unpackhi_pd(__xi, __xi))}; 3434 else 3435 return _Base::_S_islessequal(__x, __y); 3436 } 3437 3438 // }}} 3439 // _S_islessgreater {{{ 3440 template
3441 static constexpr _MaskMember<_Tp> 3442 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 3443 { 3444 const auto __xi = __to_intrin(__x); 3445 const auto __yi = __to_intrin(__y); 3446 if constexpr (__is_avx512_abi<_Abi>()) 3447 { 3448 const auto __k1 = _Abi::template _S_implicit_mask_intrin<_Tp>(); 3449 if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 4) 3450 return _mm512_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3451 else if constexpr (sizeof(__xi) == 64 && sizeof(_Tp) == 8) 3452 return _mm512_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3453 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3454 return _mm256_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3455 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3456 return _mm256_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3457 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3458 return _mm_mask_cmp_ps_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3459 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3460 return _mm_mask_cmp_pd_mask(__k1, __xi, __yi, _CMP_NEQ_OQ); 3461 else 3462 __assert_unreachable<_Tp>(); 3463 } 3464 else if constexpr (__have_avx) 3465 { 3466 if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 4) 3467 return __to_masktype(_mm256_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3468 else if constexpr (sizeof(__xi) == 32 && sizeof(_Tp) == 8) 3469 return __to_masktype(_mm256_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3470 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3471 return __auto_bitcast(_mm_cmp_ps(__xi, __yi, _CMP_NEQ_OQ)); 3472 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3473 return __to_masktype(_mm_cmp_pd(__xi, __yi, _CMP_NEQ_OQ)); 3474 else 3475 __assert_unreachable<_Tp>(); 3476 } 3477 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 4) 3478 return __auto_bitcast( 3479 __and(_mm_cmpord_ps(__xi, __yi), _mm_cmpneq_ps(__xi, __yi))); 3480 else if constexpr (sizeof(__xi) == 16 && sizeof(_Tp) == 8) 3481 return __to_masktype( 3482 __and(_mm_cmpord_pd(__xi, __yi), _mm_cmpneq_pd(__xi, __yi))); 3483 else 3484 __assert_unreachable<_Tp>(); 3485 } 3486 3487 //}}} }}} 3488 template
<template <typename> class _Op, typename _Tp, typename _K, size_t _Np> 3489 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 3490 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, const _SimdWrapper<_Tp, _Np> __v) 3491 { 3492 if (__k._M_is_constprop_none_of()) 3493 return __v; 3494 else if (__k._M_is_constprop_all_of()) 3495 { 3496 auto __vv = _Base::_M_make_simd(__v); 3497 _Op<decltype(__vv)> __op; 3498 return __data(__op(__vv)); 3499 } 3500 else if constexpr (__is_bitmask_v<decltype(__k)> 3501 && (is_same_v<_Op<void>, __increment<void>> 3502 || is_same_v<_Op<void>, __decrement<void>>)) 3503 { 3504 // optimize masked unary increment and decrement as masked sub +/-1 3505 constexpr int __pm_one 3506 = is_same_v<_Op<void>, __increment<void>
> ? -1 : 1; 3507 #ifdef __clang__ 3508 return __movm<_Np, _Tp>(__k._M_data) ? __v._M_data - __pm_one : __v._M_data; 3509 #else // __clang__ 3510 using _TV = __vector_type_t<_Tp, _Np>; 3511 constexpr size_t __bytes = sizeof(__v) < 16 ? 16 : sizeof(__v); 3512 constexpr size_t __width = __bytes / sizeof(_Tp); 3513 if constexpr (is_integral_v<_Tp>) 3514 { 3515 constexpr bool __lp64 = sizeof(long) == sizeof(long long); 3516 using _Ip = std::make_signed_t<_Tp>; 3517 using _Up = std::conditional_t< 3518 std::is_same_v<_Ip, long>, 3519 std::conditional_t<__lp64, long long, int>, 3520 std::conditional_t< 3521 std::is_same_v<_Ip, signed char>, char, _Ip>>; 3522 const auto __value = __intrin_bitcast<__vector_type_t<_Up, __width>>(__v._M_data); 3523 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3524 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3525 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask(__value, \ 3526 __vector_broadcast<__width>(_Up(__pm_one)), __value, __k._M_data)) 3527 _GLIBCXX_SIMD_MASK_SUB(1, 64, psubb512); 3528 _GLIBCXX_SIMD_MASK_SUB(1, 32, psubb256); 3529 _GLIBCXX_SIMD_MASK_SUB(1, 16, psubb128); 3530 _GLIBCXX_SIMD_MASK_SUB(2, 64, psubw512); 3531 _GLIBCXX_SIMD_MASK_SUB(2, 32, psubw256); 3532 _GLIBCXX_SIMD_MASK_SUB(2, 16, psubw128); 3533 _GLIBCXX_SIMD_MASK_SUB(4, 64, psubd512); 3534 _GLIBCXX_SIMD_MASK_SUB(4, 32, psubd256); 3535 _GLIBCXX_SIMD_MASK_SUB(4, 16, psubd128); 3536 _GLIBCXX_SIMD_MASK_SUB(8, 64, psubq512); 3537 _GLIBCXX_SIMD_MASK_SUB(8, 32, psubq256); 3538 _GLIBCXX_SIMD_MASK_SUB(8, 16, psubq128); 3539 #undef _GLIBCXX_SIMD_MASK_SUB 3540 } 3541 else 3542 { 3543 const auto __value = __intrin_bitcast<__vector_type_t<_Tp, __width>>(__v._M_data); 3544 #define _GLIBCXX_SIMD_MASK_SUB_512(_Sizeof, _Width, _Instr) \ 3545 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3546 return __builtin_ia32_##_Instr##_mask( \ 3547 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \ 3548 __k._M_data, _MM_FROUND_CUR_DIRECTION) 3549 #define _GLIBCXX_SIMD_MASK_SUB(_Sizeof, _Width, _Instr) \ 3550 if constexpr (sizeof(_Tp) == _Sizeof && sizeof(__value) == _Width) \ 3551 return __intrin_bitcast<_TV>(__builtin_ia32_##_Instr##_mask( \ 3552 __value, __vector_broadcast<__width>(_Tp(__pm_one)), __value, \ 3553 __k._M_data)) 3554 _GLIBCXX_SIMD_MASK_SUB_512(4, 64, subps512); 3555 _GLIBCXX_SIMD_MASK_SUB(4, 32, subps256); 3556 _GLIBCXX_SIMD_MASK_SUB(4, 16, subps128); 3557 _GLIBCXX_SIMD_MASK_SUB_512(8, 64, subpd512); 3558 _GLIBCXX_SIMD_MASK_SUB(8, 32, subpd256); 3559 _GLIBCXX_SIMD_MASK_SUB(8, 16, subpd128); 3560 #undef _GLIBCXX_SIMD_MASK_SUB_512 3561 #undef _GLIBCXX_SIMD_MASK_SUB 3562 } 3563 #endif // __clang__ 3564 } 3565 else 3566 return _Base::template _S_masked_unary<_Op>(__k, __v); 3567 } 3568 }; 3569 3570 // }}} 3571 // _MaskImplX86Mixin {{{ 3572 struct _MaskImplX86Mixin 3573 { 3574 template
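/* [Editor's note, not part of the installed header] The masked-subtract trick in
   _S_masked_unary above is what a masked increment or decrement compiles to on AVX-512:
   subtracting -1 (or +1) only in the lanes selected by the write mask __k. At the user
   level such a masked update is expressed through where(); a minimal sketch, assuming
   the where_expression increment operators from the Parallelism TS v2:

     #include <experimental/simd>
     namespace stdx = std::experimental;

     void bump_negative_lanes(stdx::native_simd<int>& v)
     {
       ++stdx::where(v < 0, v);   // add 1 only in lanes where v is negative
     }
*/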
<typename _Tp> 3575 using _TypeTag = _Tp*; 3576 3577 using _Base = _MaskImplBuiltinMixin; 3578 3579 // _S_to_maskvector(bool) {{{ 3580 template <typename _Up, size_t _ToN = 1, typename _Tp> 3581 _GLIBCXX_SIMD_INTRINSIC static constexpr 3582 enable_if_t<is_same_v<_Tp, bool>
, _SimdWrapper<_Up, _ToN>> 3583 _S_to_maskvector(_Tp __x) 3584 { 3585 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3586 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 3587 : __vector_type_t<_Up, _ToN>(); 3588 } 3589 3590 // }}} 3591 // _S_to_maskvector(_SanitizedBitMask) {{{ 3592 template
3593 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3594 _S_to_maskvector(_SanitizedBitMask<_Np> __x) 3595 { 3596 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3597 using _UV = __vector_type_t<_Up, _ToN>; 3598 using _UI = __intrinsic_type_t<_Up, _ToN>; 3599 [[maybe_unused]] const auto __k = __x._M_to_bits(); 3600 if constexpr (_Np == 1) 3601 return _S_to_maskvector<_Up, _ToN>(__k); 3602 else if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3603 return __generate_from_n_evaluations
( 3604 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return -__x[__i.value]; }); 3605 else if constexpr (sizeof(_Up) == 1) 3606 { 3607 if constexpr (sizeof(_UI) == 16) 3608 { 3609 if constexpr (__have_avx512bw_vl) 3610 return __intrin_bitcast<_UV>(_mm_movm_epi8(__k)); 3611 else if constexpr (__have_avx512bw) 3612 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi8(__k))); 3613 else if constexpr (__have_avx512f) 3614 { 3615 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3616 auto __as16bits 3617 = __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3618 __hi256(__as32bits))); 3619 return __intrin_bitcast<_UV>( 3620 _mm_packs_epi16(__lo128(__as16bits), __hi128(__as16bits))); 3621 } 3622 else if constexpr (__have_ssse3) 3623 { 3624 const auto __bitmask = __to_intrin( 3625 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 3626 8, 16, 32, 64, 128)); 3627 return __intrin_bitcast<_UV>( 3628 __vector_bitcast<_Up>( 3629 _mm_shuffle_epi8(__to_intrin( 3630 __vector_type_t<_ULLong, 2>{__k}), 3631 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3632 1, 1, 1, 1, 1, 1, 1)) 3633 & __bitmask) 3634 != 0); 3635 } 3636 // else fall through 3637 } 3638 else if constexpr (sizeof(_UI) == 32) 3639 { 3640 if constexpr (__have_avx512bw_vl) 3641 return __vector_bitcast<_Up>(_mm256_movm_epi8(__k)); 3642 else if constexpr (__have_avx512bw) 3643 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi8(__k))); 3644 else if constexpr (__have_avx512f) 3645 { 3646 auto __as16bits = // 0 16 1 17 ... 15 31 3647 _mm512_srli_epi32(_mm512_maskz_mov_epi32(__k, ~__m512i()), 3648 16) 3649 | _mm512_slli_epi32(_mm512_maskz_mov_epi32(__k >> 16, 3650 ~__m512i()), 3651 16); 3652 auto __0_16_1_17 = __xzyw(_mm256_packs_epi16( 3653 __lo256(__as16bits), 3654 __hi256(__as16bits)) // 0 16 1 17 2 18 3 19 8 24 9 25 ... 3655 ); 3656 // deinterleave: 3657 return __vector_bitcast<_Up>(__xzyw(_mm256_shuffle_epi8( 3658 __0_16_1_17, // 0 16 1 17 2 ... 
3659 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 3660 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3661 3, 5, 7, 9, 11, 13, 3662 15)))); // 0-7 16-23 8-15 24-31 -> xzyw 3663 // 0-3 8-11 16-19 24-27 3664 // 4-7 12-15 20-23 28-31 3665 } 3666 else if constexpr (__have_avx2) 3667 { 3668 const auto __bitmask 3669 = _mm256_broadcastsi128_si256(__to_intrin( 3670 __make_vector<_UChar>(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 3671 4, 8, 16, 32, 64, 128))); 3672 return __vector_bitcast<_Up>( 3673 __vector_bitcast<_Up>( 3674 _mm256_shuffle_epi8( 3675 _mm256_broadcastsi128_si256( 3676 __to_intrin(__vector_type_t<_ULLong, 2>{__k})), 3677 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3678 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3679 3, 3, 3, 3, 3, 3)) 3680 & __bitmask) 3681 != 0); 3682 } 3683 // else fall through 3684 } 3685 else if constexpr (sizeof(_UI) == 64) 3686 return reinterpret_cast<_UV>(_mm512_movm_epi8(__k)); 3687 if constexpr (std::min(_ToN, _Np) <= 4) 3688 { 3689 if constexpr (_Np > 7) // avoid overflow 3690 __x &= _SanitizedBitMask<_Np>(0x0f); 3691 const _UInt __char_mask 3692 = ((_UInt(__x.to_ulong()) * 0x00204081U) & 0x01010101ULL) 3693 * 0xff; 3694 _UV __r = {}; 3695 __builtin_memcpy(&__r, &__char_mask, 3696 std::min(sizeof(__r), sizeof(__char_mask))); 3697 return __r; 3698 } 3699 else if constexpr (std::min(_ToN, _Np) <= 7) 3700 { 3701 if constexpr (_Np > 7) // avoid overflow 3702 __x &= _SanitizedBitMask<_Np>(0x7f); 3703 const _ULLong __char_mask 3704 = ((__x.to_ulong() * 0x40810204081ULL) & 0x0101010101010101ULL) 3705 * 0xff; 3706 _UV __r = {}; 3707 __builtin_memcpy(&__r, &__char_mask, 3708 std::min(sizeof(__r), sizeof(__char_mask))); 3709 return __r; 3710 } 3711 } 3712 else if constexpr (sizeof(_Up) == 2) 3713 { 3714 if constexpr (sizeof(_UI) == 16) 3715 { 3716 if constexpr (__have_avx512bw_vl) 3717 return __intrin_bitcast<_UV>(_mm_movm_epi16(__k)); 3718 else if constexpr (__have_avx512bw) 3719 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi16(__k))); 3720 else if constexpr (__have_avx512f) 3721 { 3722 __m256i __as32bits = {}; 3723 if constexpr (__have_avx512vl) 3724 __as32bits = _mm256_maskz_mov_epi32(__k, ~__m256i()); 3725 else 3726 __as32bits 3727 = __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i())); 3728 return __intrin_bitcast<_UV>( 3729 _mm_packs_epi32(__lo128(__as32bits), __hi128(__as32bits))); 3730 } 3731 // else fall through 3732 } 3733 else if constexpr (sizeof(_UI) == 32) 3734 { 3735 if constexpr (__have_avx512bw_vl) 3736 return __vector_bitcast<_Up>(_mm256_movm_epi16(__k)); 3737 else if constexpr (__have_avx512bw) 3738 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi16(__k))); 3739 else if constexpr (__have_avx512f) 3740 { 3741 auto __as32bits = _mm512_maskz_mov_epi32(__k, ~__m512i()); 3742 return __vector_bitcast<_Up>( 3743 __xzyw(_mm256_packs_epi32(__lo256(__as32bits), 3744 __hi256(__as32bits)))); 3745 } 3746 // else fall through 3747 } 3748 else if constexpr (sizeof(_UI) == 64) 3749 return __vector_bitcast<_Up>(_mm512_movm_epi16(__k)); 3750 } 3751 else if constexpr (sizeof(_Up) == 4) 3752 { 3753 if constexpr (sizeof(_UI) == 16) 3754 { 3755 if constexpr (__have_avx512dq_vl) 3756 return __intrin_bitcast<_UV>(_mm_movm_epi32(__k)); 3757 else if constexpr (__have_avx512dq) 3758 return __intrin_bitcast<_UV>(__lo128(_mm512_movm_epi32(__k))); 3759 else if constexpr (__have_avx512vl) 3760 return __intrin_bitcast<_UV>( 3761 _mm_maskz_mov_epi32(__k, ~__m128i())); 3762 else if constexpr (__have_avx512f) 3763 return __intrin_bitcast<_UV>( 3764 
__lo128(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3765 // else fall through 3766 } 3767 else if constexpr (sizeof(_UI) == 32) 3768 { 3769 if constexpr (__have_avx512dq_vl) 3770 return __vector_bitcast<_Up>(_mm256_movm_epi32(__k)); 3771 else if constexpr (__have_avx512dq) 3772 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi32(__k))); 3773 else if constexpr (__have_avx512vl) 3774 return __vector_bitcast<_Up>( 3775 _mm256_maskz_mov_epi32(__k, ~__m256i())); 3776 else if constexpr (__have_avx512f) 3777 return __vector_bitcast<_Up>( 3778 __lo256(_mm512_maskz_mov_epi32(__k, ~__m512i()))); 3779 // else fall through 3780 } 3781 else if constexpr (sizeof(_UI) == 64) 3782 return __vector_bitcast<_Up>( 3783 __have_avx512dq ? _mm512_movm_epi32(__k) 3784 : _mm512_maskz_mov_epi32(__k, ~__m512i())); 3785 } 3786 else if constexpr (sizeof(_Up) == 8) 3787 { 3788 if constexpr (sizeof(_UI) == 16) 3789 { 3790 if constexpr (__have_avx512dq_vl) 3791 return __vector_bitcast<_Up>(_mm_movm_epi64(__k)); 3792 else if constexpr (__have_avx512dq) 3793 return __vector_bitcast<_Up>(__lo128(_mm512_movm_epi64(__k))); 3794 else if constexpr (__have_avx512vl) 3795 return __vector_bitcast<_Up>( 3796 _mm_maskz_mov_epi64(__k, ~__m128i())); 3797 else if constexpr (__have_avx512f) 3798 return __vector_bitcast<_Up>( 3799 __lo128(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3800 // else fall through 3801 } 3802 else if constexpr (sizeof(_UI) == 32) 3803 { 3804 if constexpr (__have_avx512dq_vl) 3805 return __vector_bitcast<_Up>(_mm256_movm_epi64(__k)); 3806 else if constexpr (__have_avx512dq) 3807 return __vector_bitcast<_Up>(__lo256(_mm512_movm_epi64(__k))); 3808 else if constexpr (__have_avx512vl) 3809 return __vector_bitcast<_Up>( 3810 _mm256_maskz_mov_epi64(__k, ~__m256i())); 3811 else if constexpr (__have_avx512f) 3812 return __vector_bitcast<_Up>( 3813 __lo256(_mm512_maskz_mov_epi64(__k, ~__m512i()))); 3814 // else fall through 3815 } 3816 else if constexpr (sizeof(_UI) == 64) 3817 return __vector_bitcast<_Up>( 3818 __have_avx512dq ? _mm512_movm_epi64(__k) 3819 : _mm512_maskz_mov_epi64(__k, ~__m512i())); 3820 } 3821 3822 using _UpUInt = make_unsigned_t<_Up>; 3823 using _V = __vector_type_t<_UpUInt, _ToN>; 3824 constexpr size_t __bits_per_element = sizeof(_Up) * __CHAR_BIT__; 3825 if constexpr (_ToN == 2) 3826 { 3827 return __vector_bitcast<_Up>(_V{_UpUInt(-__x[0]), _UpUInt(-__x[1])}); 3828 } 3829 else if constexpr (!__have_avx2 && __have_avx && sizeof(_V) == 32) 3830 { 3831 if constexpr (sizeof(_Up) == 4) 3832 return __vector_bitcast<_Up>(_mm256_cmp_ps( 3833 _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(__k)), 3834 _mm256_castsi256_ps(_mm256_setr_epi32( 3835 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80))), 3836 _mm256_setzero_ps(), _CMP_NEQ_UQ)); 3837 else if constexpr (sizeof(_Up) == 8) 3838 return __vector_bitcast<_Up>(_mm256_cmp_pd( 3839 _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(__k)), 3840 _mm256_castsi256_pd( 3841 _mm256_setr_epi64x(0x01, 0x02, 0x04, 0x08))), 3842 _mm256_setzero_pd(), _CMP_NEQ_UQ)); 3843 else 3844 __assert_unreachable<_Up>(); 3845 } 3846 else if constexpr (__bits_per_element >= _ToN) 3847 { 3848 constexpr auto __bitmask 3849 = __generate_vector<_V>([](auto __i) 3850 constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _UpUInt 3851 { return __i < _ToN ? 
1ull << __i : 0; }); 3852 const auto __bits 3853 = __vector_broadcast<_ToN, _UpUInt>(__k) & __bitmask; 3854 if constexpr (__bits_per_element > _ToN) 3855 return __vector_bitcast<_Up>(__bits) > 0; 3856 else 3857 return __vector_bitcast<_Up>(__bits != 0); 3858 } 3859 else 3860 { 3861 const _V __tmp 3862 = __generate_vector<_V>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3863 return static_cast<_UpUInt>( 3864 __k >> (__bits_per_element * (__i / __bits_per_element))); 3865 }) 3866 & __generate_vector<_V>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 3867 return static_cast<_UpUInt>(1ull 3868 << (__i % __bits_per_element)); 3869 }); // mask bit index 3870 return __intrin_bitcast<_UV>(__tmp != _V()); 3871 } 3872 } 3873 3874 // }}} 3875 // _S_to_maskvector(_SimdWrapper) {{{ 3876 template
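/* [Editor's note, not part of the installed header] The fallback for very short masks in
   _S_to_maskvector above expands up to four mask bits into four bytes with a single
   multiply: multiplying by 0x00204081 places copies of the low nibble at bit offsets 0,
   7, 14 and 21, the AND with 0x01010101 keeps exactly bit i in byte i, and the final
   multiply by 0xff widens each kept bit to a full 0xff byte. A scalar model
   (illustrative only):

     #include <cassert>
     #include <cstdint>

     std::uint32_t spread4(unsigned bits4)   // bits4 must fit in 4 bits
     {
       return ((bits4 * 0x00204081u) & 0x01010101u) * 0xffu;
     }

     int main()
     {
       assert(spread4(0b0101) == 0x00ff00ffu);   // bytes ff 00 ff 00, LSB first
       assert(spread4(0b1111) == 0xffffffffu);
       assert(spread4(0b0000) == 0x00000000u);
     }
*/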
3878 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 3879 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 3880 { 3881 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 3882 using _TW = _SimdWrapper<_Tp, _Np>; 3883 using _UW = _SimdWrapper<_Up, _ToN>; 3884 using _UI = __intrinsic_type_t<_Up, _ToN>; 3885 if constexpr (is_same_v<_Tp, bool>) // bits -> vector 3886 return _S_to_maskvector<_Up, _ToN>( 3887 _BitMask<_Np>(__x._M_data)._M_sanitized()); 3888 // vector -> vector bitcast 3889 else if constexpr (sizeof(_Up) == sizeof(_Tp) 3890 && sizeof(_TW) == sizeof(_UW)) 3891 return __wrapper_bitcast<_Up, _ToN>( 3892 _ToN <= _Np 3893 ? __x 3894 : simd_abi::_VecBuiltin
<sizeof(_Tp) * _Np>::_S_masked(__x)); 3895 else // vector -> vector {{{ 3896 { 3897 if (__x._M_is_constprop() || __builtin_is_constant_evaluated()) 3898 { 3899 const auto __y = __vector_bitcast<__int_for_sizeof_t<_Tp>>(__x); 3900 return __generate_from_n_evaluations<std::min(_ToN, _Np), 3901 __vector_type_t<_Up, _ToN>
>( 3902 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Up { return __y[__i.value]; }); 3903 } 3904 using _To = __vector_type_t<_Up, _ToN>; 3905 [[maybe_unused]] constexpr size_t _FromN = _Np; 3906 constexpr int _FromBytes = sizeof(_Tp); 3907 constexpr int _ToBytes = sizeof(_Up); 3908 const auto __k = __x._M_data; 3909 3910 if constexpr (_FromBytes == _ToBytes) 3911 return __intrin_bitcast<_To>(__k); 3912 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 16) 3913 { // SSE -> SSE {{{ 3914 if constexpr (_FromBytes == 4 && _ToBytes == 8) 3915 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3916 else if constexpr (_FromBytes == 2 && _ToBytes == 8) 3917 { 3918 const auto __y 3919 = __vector_bitcast
<int>(__interleave128_lo(__k, __k)); 3920 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3921 } 3922 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 3923 { 3924 auto __y 3925 = __vector_bitcast<short>(__interleave128_lo(__k, __k)); 3926 auto __z 3927 = __vector_bitcast<int>(__interleave128_lo(__y, __y)); 3928 return __intrin_bitcast<_To>(__interleave128_lo(__z, __z)); 3929 } 3930 else if constexpr (_FromBytes == 8 && _ToBytes == 4 3931 && __have_sse2) 3932 return __intrin_bitcast<_To>( 3933 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3934 else if constexpr (_FromBytes == 8 && _ToBytes == 4) 3935 return __vector_shuffle<1, 3, 6, 7>(__vector_bitcast<_Up>(__k), 3936 _UI()); 3937 else if constexpr (_FromBytes == 2 && _ToBytes == 4) 3938 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3939 else if constexpr (_FromBytes == 1 && _ToBytes == 4) 3940 { 3941 const auto __y 3942 = __vector_bitcast<short>
(__interleave128_lo(__k, __k)); 3943 return __intrin_bitcast<_To>(__interleave128_lo(__y, __y)); 3944 } 3945 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 3946 { 3947 if constexpr (__have_sse2 && !__have_ssse3) 3948 return __intrin_bitcast<_To>(_mm_packs_epi32( 3949 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()), 3950 __m128i())); 3951 else 3952 return __intrin_bitcast<_To>( 3953 __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>( 3954 __vector_bitcast<_Up>(__k))); 3955 } 3956 else if constexpr (_FromBytes == 4 && _ToBytes == 2) 3957 return __intrin_bitcast<_To>( 3958 _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i())); 3959 else if constexpr (_FromBytes == 1 && _ToBytes == 2) 3960 return __intrin_bitcast<_To>(__interleave128_lo(__k, __k)); 3961 else if constexpr (_FromBytes == 8 && _ToBytes == 1 3962 && __have_ssse3) 3963 return __intrin_bitcast<_To>( 3964 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3965 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, -1, 3966 -1, -1, -1, -1, -1, -1, -1, 3967 -1))); 3968 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 3969 { 3970 auto __y 3971 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3972 __y = _mm_packs_epi32(__y, __m128i()); 3973 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3974 } 3975 else if constexpr (_FromBytes == 4 && _ToBytes == 1 3976 && __have_ssse3) 3977 return __intrin_bitcast<_To>( 3978 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 3979 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 3980 -1, -1, -1, -1, -1, -1, -1, 3981 -1))); 3982 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 3983 { 3984 const auto __y 3985 = _mm_packs_epi32(__vector_bitcast<_LLong>(__k), __m128i()); 3986 return __intrin_bitcast<_To>(_mm_packs_epi16(__y, __m128i())); 3987 } 3988 else if constexpr (_FromBytes == 2 && _ToBytes == 1) 3989 return __intrin_bitcast<_To>( 3990 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())); 3991 else 3992 __assert_unreachable<_Tp>(); 3993 } // }}} 3994 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 32) 3995 { // AVX -> AVX {{{ 3996 if constexpr (_FromBytes == _ToBytes) 3997 __assert_unreachable<_Tp>(); 3998 else if constexpr (_FromBytes == _ToBytes * 2) 3999 { 4000 const auto __y = __vector_bitcast<_LLong>(__k); 4001 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4002 _mm_packs_epi16(__lo128(__y), __hi128(__y)))); 4003 } 4004 else if constexpr (_FromBytes == _ToBytes * 4) 4005 { 4006 const auto __y = __vector_bitcast<_LLong>(__k); 4007 return __intrin_bitcast<_To>(_mm256_castsi128_si256( 4008 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4009 __m128i()))); 4010 } 4011 else if constexpr (_FromBytes == _ToBytes * 8) 4012 { 4013 const auto __y = __vector_bitcast<_LLong>(__k); 4014 return __intrin_bitcast<_To>( 4015 _mm256_castsi128_si256(_mm_shuffle_epi8( 4016 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4017 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 4018 -1, -1, -1, -1, -1)))); 4019 } 4020 else if constexpr (_FromBytes * 2 == _ToBytes) 4021 { 4022 auto __y = __xzyw(__to_intrin(__k)); 4023 if constexpr (is_floating_point_v< 4024 _Tp> || (!__have_avx2 && _FromBytes == 4)) 4025 { 4026 const auto __yy = __vector_bitcast
(__y); 4027 return __intrin_bitcast<_To>( 4028 _mm256_unpacklo_ps(__yy, __yy)); 4029 } 4030 else 4031 return __intrin_bitcast<_To>( 4032 _mm256_unpacklo_epi8(__y, __y)); 4033 } 4034 else if constexpr (_FromBytes * 4 == _ToBytes) 4035 { 4036 auto __y 4037 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4038 __lo128(__vector_bitcast<_LLong>( 4039 __k))); // drops 3/4 of input 4040 return __intrin_bitcast<_To>( 4041 __concat(_mm_unpacklo_epi16(__y, __y), 4042 _mm_unpackhi_epi16(__y, __y))); 4043 } 4044 else if constexpr (_FromBytes == 1 && _ToBytes == 8) 4045 { 4046 auto __y 4047 = _mm_unpacklo_epi8(__lo128(__vector_bitcast<_LLong>(__k)), 4048 __lo128(__vector_bitcast<_LLong>( 4049 __k))); // drops 3/4 of input 4050 __y 4051 = _mm_unpacklo_epi16(__y, 4052 __y); // drops another 1/2 => 7/8 total 4053 return __intrin_bitcast<_To>( 4054 __concat(_mm_unpacklo_epi32(__y, __y), 4055 _mm_unpackhi_epi32(__y, __y))); 4056 } 4057 else 4058 __assert_unreachable<_Tp>(); 4059 } // }}} 4060 else if constexpr (sizeof(_UI) == 32 && sizeof(__k) == 16) 4061 { // SSE -> AVX {{{ 4062 if constexpr (_FromBytes == _ToBytes) 4063 return __intrin_bitcast<_To>( 4064 __intrinsic_type_t<_Tp, 32 / sizeof(_Tp)>( 4065 __zero_extend(__to_intrin(__k)))); 4066 else if constexpr (_FromBytes * 2 == _ToBytes) 4067 { // keep all 4068 return __intrin_bitcast<_To>( 4069 __concat(_mm_unpacklo_epi8(__vector_bitcast<_LLong>(__k), 4070 __vector_bitcast<_LLong>(__k)), 4071 _mm_unpackhi_epi8(__vector_bitcast<_LLong>(__k), 4072 __vector_bitcast<_LLong>(__k)))); 4073 } 4074 else if constexpr (_FromBytes * 4 == _ToBytes) 4075 { 4076 if constexpr (__have_avx2) 4077 { 4078 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4079 __concat(__vector_bitcast<_LLong>(__k), 4080 __vector_bitcast<_LLong>(__k)), 4081 _mm256_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4082 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 4083 6, 6, 7, 7, 7, 7))); 4084 } 4085 else 4086 { 4087 return __intrin_bitcast<_To>(__concat( 4088 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4089 _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 4090 2, 2, 2, 2, 3, 3, 3, 3)), 4091 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4092 _mm_setr_epi8(4, 4, 4, 4, 5, 5, 5, 5, 4093 6, 6, 6, 6, 7, 7, 7, 4094 7)))); 4095 } 4096 } 4097 else if constexpr (_FromBytes * 8 == _ToBytes) 4098 { 4099 if constexpr (__have_avx2) 4100 { 4101 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 4102 __concat(__vector_bitcast<_LLong>(__k), 4103 __vector_bitcast<_LLong>(__k)), 4104 _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 4105 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4106 3, 3, 3, 3, 3, 3))); 4107 } 4108 else 4109 { 4110 return __intrin_bitcast<_To>(__concat( 4111 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4112 _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 4113 1, 1, 1, 1, 1, 1, 1, 1)), 4114 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4115 _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 4116 3, 3, 3, 3, 3, 3, 3, 4117 3)))); 4118 } 4119 } 4120 else if constexpr (_FromBytes == _ToBytes * 2) 4121 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4122 _mm_packs_epi16(__vector_bitcast<_LLong>(__k), __m128i())))); 4123 else if constexpr (_FromBytes == 8 && _ToBytes == 2) 4124 { 4125 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4126 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4127 _mm_setr_epi8(6, 7, 14, 15, -1, -1, -1, -1, 4128 -1, -1, -1, -1, -1, -1, -1, 4129 -1))))); 4130 } 4131 else if constexpr (_FromBytes == 4 && _ToBytes == 1) 4132 { 4133 return __intrin_bitcast<_To>(__m256i(__zero_extend( 
4134 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4135 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, 4136 -1, -1, -1, -1, -1, -1, -1, 4137 -1))))); 4138 } 4139 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4140 { 4141 return __intrin_bitcast<_To>(__m256i(__zero_extend( 4142 _mm_shuffle_epi8(__vector_bitcast<_LLong>(__k), 4143 _mm_setr_epi8(7, 15, -1, -1, -1, -1, -1, 4144 -1, -1, -1, -1, -1, -1, -1, 4145 -1, -1))))); 4146 } 4147 else 4148 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4149 } // }}} 4150 else if constexpr (sizeof(_UI) == 16 && sizeof(__k) == 32) 4151 { // AVX -> SSE {{{ 4152 if constexpr (_FromBytes == _ToBytes) 4153 { // keep low 1/2 4154 return __intrin_bitcast<_To>(__lo128(__k)); 4155 } 4156 else if constexpr (_FromBytes == _ToBytes * 2) 4157 { // keep all 4158 auto __y = __vector_bitcast<_LLong>(__k); 4159 return __intrin_bitcast<_To>( 4160 _mm_packs_epi16(__lo128(__y), __hi128(__y))); 4161 } 4162 else if constexpr (_FromBytes == _ToBytes * 4) 4163 { // add 1/2 undef 4164 auto __y = __vector_bitcast<_LLong>(__k); 4165 return __intrin_bitcast<_To>( 4166 _mm_packs_epi16(_mm_packs_epi16(__lo128(__y), __hi128(__y)), 4167 __m128i())); 4168 } 4169 else if constexpr (_FromBytes == 8 && _ToBytes == 1) 4170 { // add 3/4 undef 4171 auto __y = __vector_bitcast<_LLong>(__k); 4172 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 4173 _mm_packs_epi16(__lo128(__y), __hi128(__y)), 4174 _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, 4175 -1, -1, -1, -1))); 4176 } 4177 else if constexpr (_FromBytes * 2 == _ToBytes) 4178 { // keep low 1/4 4179 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4180 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4181 } 4182 else if constexpr (_FromBytes * 4 == _ToBytes) 4183 { // keep low 1/8 4184 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4185 __y = _mm_unpacklo_epi8(__y, __y); 4186 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4187 } 4188 else if constexpr (_FromBytes * 8 == _ToBytes) 4189 { // keep low 1/16 4190 auto __y = __lo128(__vector_bitcast<_LLong>(__k)); 4191 __y = _mm_unpacklo_epi8(__y, __y); 4192 __y = _mm_unpacklo_epi8(__y, __y); 4193 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__y, __y)); 4194 } 4195 else 4196 static_assert(!is_same_v<_Tp, _Tp>, "should be unreachable"); 4197 } // }}} 4198 else 4199 return _Base::template _S_to_maskvector<_Up, _ToN>(__x); 4200 /* 4201 if constexpr (_FromBytes > _ToBytes) { 4202 const _To __y = __vector_bitcast<_Up>(__k); 4203 return [&]
(index_sequence<_Is...>) { 4204 constexpr int _Stride = _FromBytes / _ToBytes; 4205 return _To{__y[(_Is + 1) * _Stride - 1]...}; 4206 }(make_index_sequence
()); 4207 } else { 4208 // {0, 0, 1, 1} (_Dups = 2, _Is<4>) 4209 // {0, 0, 0, 0, 1, 1, 1, 1} (_Dups = 4, _Is<8>) 4210 // {0, 0, 1, 1, 2, 2, 3, 3} (_Dups = 2, _Is<8>) 4211 // ... 4212 return [&]
<size_t... _Is>(index_sequence<_Is...>) { 4213 constexpr int __dup = _ToBytes / _FromBytes; 4214 return __intrin_bitcast<_To>(_From{__k[_Is / __dup]...}); 4215 }(make_index_sequence<_FromN>()); 4216 } 4217 */ 4218 } // }}} 4219 } 4220 4221 // }}} 4222 // _S_to_bits {{{ 4223 template <typename _Tp, size_t _Np>
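/* [Editor's note, not part of the installed header] _S_to_bits below turns a vector mask
   into a bit mask (via movemask/movepi*_mask or the AVX-512 compare-mask intrinsics).
   User code reaches it indirectly through the mask reductions of the Parallelism TS; a
   minimal sketch:

     #include <experimental/simd>
     #include <cstdio>
     namespace stdx = std::experimental;

     int main()
     {
       stdx::fixed_size_simd<float, 8> x([](int i) { return i - 2.5f; });
       auto m = x < 0.f;                         // per-lane mask
       std::printf("%d negative lanes, first at lane %d\n",
                   stdx::popcount(m), stdx::find_first_set(m));
     }
*/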
4224 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 4225 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 4226 { 4227 if constexpr (is_same_v<_Tp, bool>) 4228 return _BitMask<_Np>(__x._M_data)._M_sanitized(); 4229 else 4230 { 4231 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4232 if (__builtin_is_constant_evaluated() 4233 || __builtin_constant_p(__x._M_data)) 4234 { 4235 const auto __bools = -__x._M_data; 4236 const _ULLong __k = __call_with_n_evaluations<_Np>( 4237 [](auto... __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4238 return (__bits | ...); 4239 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 4240 return _ULLong(__bools[+__i]) << __i; 4241 }); 4242 if (__builtin_is_constant_evaluated() 4243 || __builtin_constant_p(__k)) 4244 return __k; 4245 } 4246 const auto __xi = __to_intrin(__x); 4247 if constexpr (sizeof(_Tp) == 1) 4248 if constexpr (sizeof(__xi) == 16) 4249 if constexpr (__have_avx512bw_vl) 4250 return _BitMask<_Np>(_mm_movepi8_mask(__xi)); 4251 else // implies SSE2 4252 return _BitMask<_Np>(_mm_movemask_epi8(__xi)); 4253 else if constexpr (sizeof(__xi) == 32) 4254 if constexpr (__have_avx512bw_vl) 4255 return _BitMask<_Np>(_mm256_movepi8_mask(__xi)); 4256 else // implies AVX2 4257 return _BitMask<_Np>(_mm256_movemask_epi8(__xi)); 4258 else // implies AVX512BW 4259 return _BitMask<_Np>(_mm512_movepi8_mask(__xi)); 4260 4261 else if constexpr (sizeof(_Tp) == 2) 4262 if constexpr (sizeof(__xi) == 16) 4263 if constexpr (__have_avx512bw_vl) 4264 return _BitMask<_Np>(_mm_movepi16_mask(__xi)); 4265 else if constexpr (__have_avx512bw) 4266 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4267 else // implies SSE2 4268 return _BitMask<_Np>( 4269 _mm_movemask_epi8(_mm_packs_epi16(__xi, __m128i()))); 4270 else if constexpr (sizeof(__xi) == 32) 4271 if constexpr (__have_avx512bw_vl) 4272 return _BitMask<_Np>(_mm256_movepi16_mask(__xi)); 4273 else if constexpr (__have_avx512bw) 4274 return _BitMask<_Np>(_mm512_movepi16_mask(__zero_extend(__xi))); 4275 else // implies SSE2 4276 return _BitMask<_Np>(_mm_movemask_epi8( 4277 _mm_packs_epi16(__lo128(__xi), __hi128(__xi)))); 4278 else // implies AVX512BW 4279 return _BitMask<_Np>(_mm512_movepi16_mask(__xi)); 4280 4281 else if constexpr (sizeof(_Tp) == 4) 4282 if constexpr (sizeof(__xi) == 16) 4283 if constexpr (__have_avx512dq_vl) 4284 return _BitMask<_Np>(_mm_movepi32_mask(__xi)); 4285 else if constexpr (__have_avx512vl) 4286 return _BitMask<_Np>(_mm_cmplt_epi32_mask(__xi, __m128i())); 4287 else if constexpr (__have_avx512dq) 4288 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4289 else if constexpr (__have_avx512f) 4290 return _BitMask<_Np>( 4291 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4292 else // implies SSE 4293 return _BitMask<_Np>( 4294 _mm_movemask_ps(reinterpret_cast<__m128>(__xi))); 4295 else if constexpr (sizeof(__xi) == 32) 4296 if constexpr (__have_avx512dq_vl) 4297 return _BitMask<_Np>(_mm256_movepi32_mask(__xi)); 4298 else if constexpr (__have_avx512dq) 4299 return _BitMask<_Np>(_mm512_movepi32_mask(__zero_extend(__xi))); 4300 else if constexpr (__have_avx512vl) 4301 return _BitMask<_Np>(_mm256_cmplt_epi32_mask(__xi, __m256i())); 4302 else if constexpr (__have_avx512f) 4303 return _BitMask<_Np>( 4304 _mm512_cmplt_epi32_mask(__zero_extend(__xi), __m512i())); 4305 else // implies AVX 4306 return _BitMask<_Np>( 4307 _mm256_movemask_ps(reinterpret_cast<__m256>(__xi))); 4308 else // implies AVX512?? 
4309 if constexpr (__have_avx512dq) 4310 return _BitMask<_Np>(_mm512_movepi32_mask(__xi)); 4311 else // implies AVX512F 4312 return _BitMask<_Np>(_mm512_cmplt_epi32_mask(__xi, __m512i())); 4313 4314 else if constexpr (sizeof(_Tp) == 8) 4315 if constexpr (sizeof(__xi) == 16) 4316 if constexpr (__have_avx512dq_vl) 4317 return _BitMask<_Np>(_mm_movepi64_mask(__xi)); 4318 else if constexpr (__have_avx512dq) 4319 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4320 else if constexpr (__have_avx512vl) 4321 return _BitMask<_Np>(_mm_cmplt_epi64_mask(__xi, __m128i())); 4322 else if constexpr (__have_avx512f) 4323 return _BitMask<_Np>( 4324 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4325 else // implies SSE2 4326 return _BitMask<_Np>( 4327 _mm_movemask_pd(reinterpret_cast<__m128d>(__xi))); 4328 else if constexpr (sizeof(__xi) == 32) 4329 if constexpr (__have_avx512dq_vl) 4330 return _BitMask<_Np>(_mm256_movepi64_mask(__xi)); 4331 else if constexpr (__have_avx512dq) 4332 return _BitMask<_Np>(_mm512_movepi64_mask(__zero_extend(__xi))); 4333 else if constexpr (__have_avx512vl) 4334 return _BitMask<_Np>(_mm256_cmplt_epi64_mask(__xi, __m256i())); 4335 else if constexpr (__have_avx512f) 4336 return _BitMask<_Np>( 4337 _mm512_cmplt_epi64_mask(__zero_extend(__xi), __m512i())); 4338 else // implies AVX 4339 return _BitMask<_Np>( 4340 _mm256_movemask_pd(reinterpret_cast<__m256d>(__xi))); 4341 else // implies AVX512?? 4342 if constexpr (__have_avx512dq) 4343 return _BitMask<_Np>(_mm512_movepi64_mask(__xi)); 4344 else // implies AVX512F 4345 return _BitMask<_Np>(_mm512_cmplt_epi64_mask(__xi, __m512i())); 4346 4347 else 4348 __assert_unreachable<_Tp>(); 4349 } 4350 } 4351 // }}} 4352 }; 4353 4354 // }}} 4355 // _MaskImplX86 {{{ 4356 template
<typename _Abi, typename> 4357 struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi> 4358 { 4359 using _MaskImplX86Mixin::_S_to_bits; 4360 using _MaskImplX86Mixin::_S_to_maskvector; 4361 using _MaskImplBuiltin<_Abi>::_S_convert; 4362 4363 // member types {{{ 4364 template <typename _Tp> 4365 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 4366 4367 template <typename _Tp> 4368 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 4369 4370 template <typename _Tp> 4371 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 4372 4373 using _Base = _MaskImplBuiltin<_Abi>; 4374 4375 // }}} 4376 // _S_broadcast {{{ 4377 template <typename _Tp> 4378 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4379 _S_broadcast(bool __x) 4380 { 4381 if constexpr (__is_avx512_abi<_Abi>()) 4382 return __x ? _Abi::_S_masked(_MaskMember<_Tp>(-1)) 4383 : _MaskMember<_Tp>(); 4384 else 4385 return _Base::template _S_broadcast<_Tp>(__x); 4386 } 4387 4388 // }}} 4389 // _S_load {{{ 4390 template <typename _Tp> 4391
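/* [Editor's note, not part of the installed header] _S_load below converts an array of
   bool into the ABI's mask representation (AVX-512 bit masks or SSE/AVX vector masks).
   The user-facing entry point is the simd_mask load constructor; a minimal sketch:

     #include <experimental/simd>
     namespace stdx = std::experimental;

     stdx::native_simd_mask<float> load_mask(const bool* flags)
     {
       // flags must point to at least native_simd<float>::size() bools
       return stdx::native_simd_mask<float>(flags, stdx::element_aligned);
     }
*/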
4391 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 4392 _S_load(const bool* __mem) 4393 { 4394 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 4395 if (__builtin_is_constant_evaluated()) 4396 { 4397 if constexpr (__is_avx512_abi<_Abi>()) 4398 { 4399 _MaskMember<_Tp> __r{}; 4400 for (size_t __i = 0; __i < _S_size<_Tp>; ++__i) 4401 __r._M_data |= _ULLong(__mem[__i]) << __i; 4402 return __r; 4403 } 4404 else 4405 return _Base::template _S_load<_Tp>(__mem); 4406 } 4407 else if constexpr (__have_avx512bw) 4408 { 4409 const auto __to_vec_or_bits 4410 = [](auto __bits) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> decltype(auto) { 4411 if constexpr (__is_avx512_abi<_Abi>()) 4412 return __bits; 4413 else 4414 return _S_to_maskvector<_Tp>( 4415 _BitMask<_S_size<_Tp>>(__bits)._M_sanitized()); 4416 }; 4417 4418 if constexpr (_S_size<_Tp> <= 16 && __have_avx512vl) 4419 { 4420 __m128i __a = {}; 4421 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4422 return __to_vec_or_bits(_mm_test_epi8_mask(__a, __a)); 4423 } 4424 else if constexpr (_S_size<_Tp> <= 32 && __have_avx512vl) 4425 { 4426 __m256i __a = {}; 4427 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4428 return __to_vec_or_bits(_mm256_test_epi8_mask(__a, __a)); 4429 } 4430 else if constexpr (_S_size<_Tp> <= 64) 4431 { 4432 __m512i __a = {}; 4433 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4434 return __to_vec_or_bits(_mm512_test_epi8_mask(__a, __a)); 4435 } 4436 } 4437 else if constexpr (__is_avx512_abi<_Abi>()) 4438 { 4439 if constexpr (_S_size<_Tp> <= 8) 4440 { 4441 __m128i __a = {}; 4442 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4443 const auto __b = _mm512_cvtepi8_epi64(__a); 4444 return _mm512_test_epi64_mask(__b, __b); 4445 } 4446 else if constexpr (_S_size<_Tp> <= 16) 4447 { 4448 __m128i __a = {}; 4449 __builtin_memcpy(&__a, __mem, _S_size<_Tp>); 4450 const auto __b = _mm512_cvtepi8_epi32(__a); 4451 return _mm512_test_epi32_mask(__b, __b); 4452 } 4453 else if constexpr (_S_size<_Tp> <= 32) 4454 { 4455 __m128i __a = {}; 4456 __builtin_memcpy(&__a, __mem, 16); 4457 const auto __b = _mm512_cvtepi8_epi32(__a); 4458 __builtin_memcpy(&__a, __mem + 16, _S_size<_Tp> - 16); 4459 const auto __c = _mm512_cvtepi8_epi32(__a); 4460 return _mm512_test_epi32_mask(__b, __b) 4461 | (_mm512_test_epi32_mask(__c, __c) << 16); 4462 } 4463 else if constexpr (_S_size<_Tp> <= 64) 4464 { 4465 __m128i __a = {}; 4466 __builtin_memcpy(&__a, __mem, 16); 4467 const auto __b = _mm512_cvtepi8_epi32(__a); 4468 __builtin_memcpy(&__a, __mem + 16, 16); 4469 const auto __c = _mm512_cvtepi8_epi32(__a); 4470 if constexpr (_S_size<_Tp> <= 48) 4471 { 4472 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 32); 4473 const auto __d = _mm512_cvtepi8_epi32(__a); 4474 return _mm512_test_epi32_mask(__b, __b) 4475 | (_mm512_test_epi32_mask(__c, __c) << 16) 4476 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32); 4477 } 4478 else 4479 { 4480 __builtin_memcpy(&__a, __mem + 16, 16); 4481 const auto __d = _mm512_cvtepi8_epi32(__a); 4482 __builtin_memcpy(&__a, __mem + 32, _S_size<_Tp> - 48); 4483 const auto __e = _mm512_cvtepi8_epi32(__a); 4484 return _mm512_test_epi32_mask(__b, __b) 4485 | (_mm512_test_epi32_mask(__c, __c) << 16) 4486 | (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32) 4487 | (_ULLong(_mm512_test_epi32_mask(__e, __e)) << 48); 4488 } 4489 } 4490 else 4491 __assert_unreachable<_Tp>(); 4492 } 4493 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> == 2) 4494 return __vector_bitcast<_Tp>( 4495 __vector_type16_t
<int>{-int(__mem[0]), -int(__mem[0]), 4496 -int(__mem[1]), -int(__mem[1])}); 4497 else if constexpr (sizeof(_Tp) == 8 && _S_size<_Tp> <= 4 && __have_avx) 4498 { 4499 int __bool4 = 0; 4500 __builtin_memcpy(&__bool4, __mem, _S_size<_Tp>); 4501 const auto __k = __to_intrin( 4502 (__vector_broadcast<4>(__bool4) 4503 & __make_vector