The C and C++ Include Header Files
/usr/include/c++/11/experimental/bits/simd_builtin.h
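simd_builtin.h is an internal header of libstdc++'s std::experimental::simd implementation (Parallelism TS v2); it provides the generic, GCC-vector-extension-based backends behind the user-facing <experimental/simd> header, which is what application code should include. As a rough, minimal sketch (assuming GCC with -std=c++17) of the user-level API that the machinery listed below implements:

#include <experimental/simd>
#include <iostream>

namespace stdx = std::experimental;

int main()
{
  stdx::native_simd<float> a = 1.0f;                          // broadcast one value to all lanes
  stdx::native_simd<float> b([](int i) { return float(i); }); // generator ctor: lane i gets the value i
  auto c = a + b * b;                                         // element-wise arithmetic on whole registers
  for (std::size_t i = 0; i < c.size(); ++i)                  // lane count depends on the target ISA
    std::cout << c[i] << ' ';
  std::cout << '\n';
}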
$ cat -n /usr/include/c++/11/experimental/bits/simd_builtin.h 1 // Simd Abi specific implementations -*- C++ -*- 2 3 // Copyright (C) 2020-2021 Free Software Foundation, Inc. 4 // 5 // This file is part of the GNU ISO C++ Library. This library is free 6 // software; you can redistribute it and/or modify it under the 7 // terms of the GNU General Public License as published by the 8 // Free Software Foundation; either version 3, or (at your option) 9 // any later version. 10 11 // This library is distributed in the hope that it will be useful, 12 // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 // GNU General Public License for more details. 15 16 // Under Section 7 of GPL version 3, you are granted additional 17 // permissions described in the GCC Runtime Library Exception, version 18 // 3.1, as published by the Free Software Foundation. 19 20 // You should have received a copy of the GNU General Public License and 21 // a copy of the GCC Runtime Library Exception along with this program; 22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 //
<http://www.gnu.org/licenses/>. 24 25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_ 26 #define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_ 27 28 #if __cplusplus >= 201703L 29 30 #include <array>
31 #include <cmath>
32 #include <cstdlib>
33 34 _GLIBCXX_SIMD_BEGIN_NAMESPACE 35 // _S_allbits{{{ 36 template <typename _V>
37 static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits 38 = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>
()); 39 40 // }}} 41 // _S_signmask, _S_absmask{{{ 42 template <typename _V, typename = _VectorTraits<_V>>
43 static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask 44 = __xor(_V() + 1, _V() - 1); 45 46 template <typename _V, typename = _VectorTraits<_V>>
47 static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask 48 = __andnot(_S_signmask<_V>, _S_allbits<_V>); 49 50 //}}}
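// NOTE (explanatory comment added for this listing, not part of the upstream
// header): _S_signmask relies on IEEE-754 bit patterns. For a float vector _V,
// _V() + 1 is all lanes == +1.0f (bits 0x3f800000) and _V() - 1 is all lanes
// == -1.0f (bits 0xbf800000); the two differ only in the sign bit, so
// __xor(...) leaves exactly 0x80000000 in every lane. __andnot(signmask,
// allbits) then yields the complementary mask 0x7fffffff per lane, i.e. the
// abs-mask used to clear the sign bit in fabs()-style operations.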
51 // __vector_permute<Indices...>{{{ 52 // Index == -1 requests zeroing of the output element 53 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
54 constexpr _Tp 55 __vector_permute(_Tp __x) 56 { 57 static_assert(sizeof...(_Indices) == _TVT::_S_full_size); 58 return __make_vector<typename _TVT::value_type>
( 59 (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...); 60 } 61 62 // }}} 63 // __vector_shuffle<Indices...>
{{{ 64 // Index == -1 requests zeroing of the output element 65 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
66 constexpr _Tp 67 __vector_shuffle(_Tp __x, _Tp __y) 68 { 69 return _Tp{(_Indices == -1 ? 0 70 : _Indices < _TVT::_S_full_size 71 ? __x[_Indices] 72 : __y[_Indices - _TVT::_S_full_size])...}; 73 } 74 75 // }}}
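// NOTE (explanatory comment added for this listing, not part of the upstream
// header): both helpers build a new vector by subscripting the input(s) with a
// compile-time index pack; index -1 selects a zero instead of an input lane.
// For a 4-lane float vector __x, __vector_permute<3, 2, 1, 0>(__x) reverses
// the lanes, and __vector_shuffle<0, 4, 1, 5>(__x, __y) interleaves the low
// halves of __x and __y (the same result _mm_unpacklo_ps produces on SSE).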
76 // __make_wrapper{{{ 77 template <typename _Tp, typename... _Args> 78 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)> 79 __make_wrapper(const _Args&... __args) 80 { return __make_vector<_Tp>(__args...); } 81 82 // }}}
83 // __wrapper_bitcast{{{ 84 template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M, 85 size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)> 86 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np> 87 __wrapper_bitcast(_SimdWrapper<_Up, _M> __x) 88 { 89 static_assert(_Np > 1); 90 return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data); 91 } 92 93 // }}} 94 // __shift_elements_right{{{ 95 // if (__shift % 2ⁿ == 0) => the low n Bytes are correct 96 template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>
> 97 _GLIBCXX_SIMD_INTRINSIC _Tp 98 __shift_elements_right(_Tp __v) 99 { 100 [[maybe_unused]] const auto __iv = __to_intrin(__v); 101 static_assert(__shift <= sizeof(_Tp)); 102 if constexpr (__shift == 0) 103 return __v; 104 else if constexpr (__shift == sizeof(_Tp)) 105 return _Tp(); 106 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 107 else if constexpr (__have_sse && __shift == 8 108 && _TVT::template _S_is
) 109 return _mm_movehl_ps(__iv, __iv); 110 else if constexpr (__have_sse2 && __shift == 8 111 && _TVT::template _S_is
) 112 return _mm_unpackhi_pd(__iv, __iv); 113 else if constexpr (__have_sse2 && sizeof(_Tp) == 16) 114 return reinterpret_cast
( 115 _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift)); 116 else if constexpr (__shift == 16 && sizeof(_Tp) == 32) 117 { 118 /*if constexpr (__have_avx && _TVT::template _S_is
) 119 return _mm256_permute2f128_pd(__iv, __iv, 0x81); 120 else if constexpr (__have_avx && _TVT::template _S_is
) 121 return _mm256_permute2f128_ps(__iv, __iv, 0x81); 122 else if constexpr (__have_avx) 123 return reinterpret_cast
( 124 _mm256_permute2f128_si256(__iv, __iv, 0x81)); 125 else*/ 126 return __zero_extend(__hi128(__v)); 127 } 128 else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16) 129 { 130 const auto __vll = __vector_bitcast<_LLong>(__v); 131 return reinterpret_cast
( 132 _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81), 133 __vll, __shift)); 134 } 135 else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16) 136 { 137 const auto __vll = __vector_bitcast<_LLong>(__v); 138 return reinterpret_cast
( 139 __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift), 140 _mm_srli_si128(__hi128(__vll), __shift))); 141 } 142 else if constexpr (sizeof(_Tp) == 32 && __shift > 16) 143 return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v))); 144 else if constexpr (sizeof(_Tp) == 64 && __shift == 32) 145 return __zero_extend(__hi256(__v)); 146 else if constexpr (__have_avx512f && sizeof(_Tp) == 64) 147 { 148 if constexpr (__shift >= 48) 149 return __zero_extend( 150 __shift_elements_right<__shift - 48>(__extract<3, 4>(__v))); 151 else if constexpr (__shift >= 32) 152 return __zero_extend( 153 __shift_elements_right<__shift - 32>(__hi256(__v))); 154 else if constexpr (__shift % 8 == 0) 155 return reinterpret_cast
( 156 _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v), 157 __shift / 8)); 158 else if constexpr (__shift % 4 == 0) 159 return reinterpret_cast
( 160 _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v), 161 __shift / 4)); 162 else if constexpr (__have_avx512bw && __shift < 16) 163 { 164 const auto __vll = __vector_bitcast<_LLong>(__v); 165 return reinterpret_cast
( 166 _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9), 167 __vll, __shift)); 168 } 169 else if constexpr (__have_avx512bw && __shift < 32) 170 { 171 const auto __vll = __vector_bitcast<_LLong>(__v); 172 return reinterpret_cast
( 173 _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee), 174 _mm512_shuffle_i32x4(__vll, __vll, 0xf9), 175 __shift - 16)); 176 } 177 else 178 __assert_unreachable<_Tp>(); 179 } 180 /* 181 } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64) 182 return __auto_bitcast(__extract<__shift / 16, 4>(__v)); 183 */ 184 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 185 else 186 { 187 constexpr int __chunksize = __shift % 8 == 0 ? 8 188 : __shift % 4 == 0 ? 4 189 : __shift % 2 == 0 ? 2 190 : 1; 191 auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v); 192 using _Up = decltype(__w); 193 return __intrin_bitcast<_Tp>( 194 __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>( 195 [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 196 return _Up{__chunks...}; 197 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 198 return __w[__shift / __chunksize + __i]; 199 })); 200 } 201 } 202 203 // }}} 204 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{ 205 template
206 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr 207 _SimdWrapper<_Tp, _Np / _Total * _Combine> 208 __extract_part(const _SimdWrapper<_Tp, _Np> __x) 209 { 210 if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0) 211 return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x); 212 else 213 { 214 constexpr size_t __values_per_part = _Np / _Total; 215 constexpr size_t __values_to_skip = _Index * __values_per_part; 216 constexpr size_t __return_size = __values_per_part * _Combine; 217 using _R = __vector_type_t<_Tp, __return_size>; 218 static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp) 219 <= sizeof(__x), 220 "out of bounds __extract_part"); 221 // the following assertion would ensure no "padding" to be read 222 // static_assert(_Total >= _Index + _Combine, "_Total must be greater 223 // than _Index"); 224 225 // static_assert(__return_size * _Total == _Np, "_Np must be divisible 226 // by _Total"); 227 if (__x._M_is_constprop()) 228 return __generate_from_n_evaluations<__return_size, _R>( 229 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 230 return __x[__values_to_skip + __i]; 231 }); 232 if constexpr (_Index == 0 && _Total == 1) 233 return __x; 234 else if constexpr (_Index == 0) 235 return __intrin_bitcast<_R>(__as_vector(__x)); 236 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 237 else if constexpr (sizeof(__x) == 32 238 && __return_size * sizeof(_Tp) <= 16) 239 { 240 constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp); 241 if constexpr (__bytes_to_skip == 16) 242 return __vector_bitcast<_Tp, __return_size>( 243 __hi128(__as_vector(__x))); 244 else 245 return __vector_bitcast<_Tp, __return_size>( 246 _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)), 247 __lo128(__vector_bitcast<_LLong>(__x)), 248 __bytes_to_skip)); 249 } 250 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 251 else if constexpr (_Index > 0 252 && (__values_to_skip % __return_size != 0 253 || sizeof(_R) >= 8) 254 && (__values_to_skip + __return_size) * sizeof(_Tp) 255 <= 64 256 && sizeof(__x) >= 16) 257 return __intrin_bitcast<_R>( 258 __shift_elements_right<__values_to_skip * sizeof(_Tp)>( 259 __as_vector(__x))); 260 else 261 { 262 _R __r = {}; 263 __builtin_memcpy(&__r, 264 reinterpret_cast
(&__x) 265 + sizeof(_Tp) * __values_to_skip, 266 __return_size * sizeof(_Tp)); 267 return __r; 268 } 269 } 270 } 271 272 // }}} 273 // __extract_part(_SimdWrapper
) {{{ 274 template
275 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper
276 __extract_part(const _SimdWrapper
__x) 277 { 278 static_assert(_Combine == 1, "_Combine != 1 not implemented"); 279 static_assert(__have_avx512f && _Total >= 2 && _Index + _Combine <= _Total && _Index >= 0); 280 return __x._M_data >> (_Index * _Np / _Total); 281 } 282 283 // }}} 284 285 // __vector_convert {{{ 286 // implementation requires an index sequence 287 template
288 _GLIBCXX_SIMD_INTRINSIC constexpr _To 289 __vector_convert(_From __a, index_sequence<_I...>) 290 { 291 using _Tp = typename _VectorTraits<_To>::value_type; 292 return _To{static_cast<_Tp>(__a[_I])...}; 293 } 294 295 template
296 _GLIBCXX_SIMD_INTRINSIC constexpr _To 297 __vector_convert(_From __a, _From __b, index_sequence<_I...>) 298 { 299 using _Tp = typename _VectorTraits<_To>::value_type; 300 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...}; 301 } 302 303 template
304 _GLIBCXX_SIMD_INTRINSIC constexpr _To 305 __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>) 306 { 307 using _Tp = typename _VectorTraits<_To>::value_type; 308 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 309 static_cast<_Tp>(__c[_I])...}; 310 } 311 312 template
313 _GLIBCXX_SIMD_INTRINSIC constexpr _To 314 __vector_convert(_From __a, _From __b, _From __c, _From __d, 315 index_sequence<_I...>) 316 { 317 using _Tp = typename _VectorTraits<_To>::value_type; 318 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 319 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...}; 320 } 321 322 template
323 _GLIBCXX_SIMD_INTRINSIC constexpr _To 324 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 325 index_sequence<_I...>) 326 { 327 using _Tp = typename _VectorTraits<_To>::value_type; 328 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 329 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 330 static_cast<_Tp>(__e[_I])...}; 331 } 332 333 template
334 _GLIBCXX_SIMD_INTRINSIC constexpr _To 335 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 336 _From __f, index_sequence<_I...>) 337 { 338 using _Tp = typename _VectorTraits<_To>::value_type; 339 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 340 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 341 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...}; 342 } 343 344 template
345 _GLIBCXX_SIMD_INTRINSIC constexpr _To 346 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 347 _From __f, _From __g, index_sequence<_I...>) 348 { 349 using _Tp = typename _VectorTraits<_To>::value_type; 350 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 351 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 352 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 353 static_cast<_Tp>(__g[_I])...}; 354 } 355 356 template
357 _GLIBCXX_SIMD_INTRINSIC constexpr _To 358 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 359 _From __f, _From __g, _From __h, index_sequence<_I...>) 360 { 361 using _Tp = typename _VectorTraits<_To>::value_type; 362 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 363 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 364 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 365 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...}; 366 } 367 368 template
369 _GLIBCXX_SIMD_INTRINSIC constexpr _To 370 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 371 _From __f, _From __g, _From __h, _From __i, 372 index_sequence<_I...>) 373 { 374 using _Tp = typename _VectorTraits<_To>::value_type; 375 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 376 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 377 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 378 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 379 static_cast<_Tp>(__i[_I])...}; 380 } 381 382 template
383 _GLIBCXX_SIMD_INTRINSIC constexpr _To 384 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 385 _From __f, _From __g, _From __h, _From __i, _From __j, 386 index_sequence<_I...>) 387 { 388 using _Tp = typename _VectorTraits<_To>::value_type; 389 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 390 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 391 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 392 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 393 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...}; 394 } 395 396 template
397 _GLIBCXX_SIMD_INTRINSIC constexpr _To 398 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 399 _From __f, _From __g, _From __h, _From __i, _From __j, 400 _From __k, index_sequence<_I...>) 401 { 402 using _Tp = typename _VectorTraits<_To>::value_type; 403 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 404 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 405 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 406 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 407 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 408 static_cast<_Tp>(__k[_I])...}; 409 } 410 411 template
412 _GLIBCXX_SIMD_INTRINSIC constexpr _To 413 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 414 _From __f, _From __g, _From __h, _From __i, _From __j, 415 _From __k, _From __l, index_sequence<_I...>) 416 { 417 using _Tp = typename _VectorTraits<_To>::value_type; 418 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 419 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 420 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 421 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 422 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 423 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...}; 424 } 425 426 template
427 _GLIBCXX_SIMD_INTRINSIC constexpr _To 428 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 429 _From __f, _From __g, _From __h, _From __i, _From __j, 430 _From __k, _From __l, _From __m, index_sequence<_I...>) 431 { 432 using _Tp = typename _VectorTraits<_To>::value_type; 433 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 434 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 435 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 436 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 437 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 438 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 439 static_cast<_Tp>(__m[_I])...}; 440 } 441 442 template
443 _GLIBCXX_SIMD_INTRINSIC constexpr _To 444 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 445 _From __f, _From __g, _From __h, _From __i, _From __j, 446 _From __k, _From __l, _From __m, _From __n, 447 index_sequence<_I...>) 448 { 449 using _Tp = typename _VectorTraits<_To>::value_type; 450 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 451 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 452 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 453 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 454 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 455 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 456 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...}; 457 } 458 459 template
460 _GLIBCXX_SIMD_INTRINSIC constexpr _To 461 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 462 _From __f, _From __g, _From __h, _From __i, _From __j, 463 _From __k, _From __l, _From __m, _From __n, _From __o, 464 index_sequence<_I...>) 465 { 466 using _Tp = typename _VectorTraits<_To>::value_type; 467 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 468 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 469 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 470 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 471 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 472 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 473 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 474 static_cast<_Tp>(__o[_I])...}; 475 } 476 477 template
478 _GLIBCXX_SIMD_INTRINSIC constexpr _To 479 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 480 _From __f, _From __g, _From __h, _From __i, _From __j, 481 _From __k, _From __l, _From __m, _From __n, _From __o, 482 _From __p, index_sequence<_I...>) 483 { 484 using _Tp = typename _VectorTraits<_To>::value_type; 485 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 486 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 487 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 488 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 489 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 490 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 491 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 492 static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...}; 493 } 494 495 // Defer actual conversion to the overload that takes an index sequence. Note 496 // that this function adds zeros or drops values off the end if you don't ensure 497 // matching width. 498 template
499 _GLIBCXX_SIMD_INTRINSIC constexpr _To 500 __vector_convert(_SimdWrapper<_From, _FromSize>... __xs) 501 { 502 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 503 using _From0 = __first_of_pack_t<_From...>; 504 using _FW = _SimdWrapper<_From0, _FromSize>; 505 if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop())) 506 { 507 if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1)) 508 == 0) // power-of-two number of arguments 509 return __convert_x86<_To>(__as_vector(__xs)...); 510 else // append zeros and recurse until the above branch is taken 511 return __vector_convert<_To>(__xs..., _FW{}); 512 } 513 else 514 #endif 515 return __vector_convert<_To>( 516 __as_vector(__xs)..., 517 make_index_sequence<(sizeof...(__xs) == 1 ? std::min( 518 _VectorTraits<_To>::_S_full_size, int(_FromSize)) 519 : _FromSize)>()); 520 } 521 522 // }}} 523 // __convert function{{{ 524 template
525 _GLIBCXX_SIMD_INTRINSIC constexpr auto 526 __convert(_From __v0, _More... __vs) 527 { 528 static_assert((true && ... && is_same_v<_From, _More>) ); 529 if constexpr (__is_vectorizable_v<_From>) 530 { 531 using _V = typename _VectorTraits<_To>::type; 532 using _Tp = typename _VectorTraits<_To>::value_type; 533 return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...}; 534 } 535 else if constexpr (__is_vector_type_v<_From>) 536 return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...); 537 else // _SimdWrapper arguments 538 { 539 constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More)); 540 if constexpr (__is_vectorizable_v<_To>) 541 return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...); 542 else if constexpr (!__is_vector_type_v<_To>) 543 return _To(__convert
(__v0, __vs...)); 544 else 545 { 546 static_assert( 547 sizeof...(_More) == 0 548 || _VectorTraits<_To>::_S_full_size >= __input_size, 549 "__convert(...) requires the input to fit into the output"); 550 return __vector_convert<_To>(__v0, __vs...); 551 } 552 } 553 } 554 555 // }}} 556 // __convert_all{{{ 557 // Converts __v into array<_To, N>, where N is _NParts if non-zero or 558 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v). 559 // Note: this function may return less than all converted elements 560 template
> 566 _GLIBCXX_SIMD_INTRINSIC auto 567 __convert_all(_From __v) 568 { 569 if constexpr (is_arithmetic_v<_To> && _NParts != 1) 570 { 571 static_assert(_Offset < _FromVT::_S_full_size); 572 constexpr auto _Np 573 = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts; 574 return __generate_from_n_evaluations<_Np, array<_To, _Np>>( 575 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 576 return static_cast<_To>(__v[__i + _Offset]); 577 }); 578 } 579 else 580 { 581 static_assert(__is_vector_type_v<_To>); 582 using _ToVT = _VectorTraits<_To>; 583 if constexpr (__is_vector_type_v<_From>) 584 return __convert_all<_To, _NParts>(__as_wrapper(__v)); 585 else if constexpr (_NParts == 1) 586 { 587 static_assert(_Offset % _ToVT::_S_full_size == 0); 588 return array<_To, 1>{__vector_convert<_To>( 589 __extract_part<_Offset / _ToVT::_S_full_size, 590 __div_roundup(_FromVT::_S_partial_width, 591 _ToVT::_S_full_size)>(__v))}; 592 } 593 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 594 else if constexpr (!__have_sse4_1 && _Offset == 0 595 && is_integral_v
596 && sizeof(typename _FromVT::value_type) 597 < sizeof(typename _ToVT::value_type) 598 && !(sizeof(typename _FromVT::value_type) == 4 599 && is_same_v
)) 600 { 601 using _ToT = typename _ToVT::value_type; 602 using _FromT = typename _FromVT::value_type; 603 constexpr size_t _Np 604 = _NParts != 0 605 ? _NParts 606 : (_FromVT::_S_partial_width / _ToVT::_S_full_size); 607 using _R = array<_To, _Np>; 608 // __adjust modifies its input to have _Np (use _SizeConstant) 609 // entries so that no unnecessary intermediate conversions are 610 // requested and, more importantly, no intermediate conversions are 611 // missing 612 [[maybe_unused]] auto __adjust 613 = [](auto __n, 614 auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> { 615 return __vector_bitcast<_FromT, decltype(__n)::value>(__vv); 616 }; 617 [[maybe_unused]] const auto __vi = __to_intrin(__v); 618 auto&& __make_array 619 = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 620 if constexpr (_Np == 1) 621 return _R{__intrin_bitcast<_To>(__x0)}; 622 else 623 return _R{__intrin_bitcast<_To>(__x0), 624 __intrin_bitcast<_To>(__x1)}; 625 }; 626 627 if constexpr (_Np == 0) 628 return _R{}; 629 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2) 630 { 631 static_assert(is_integral_v<_FromT>); 632 static_assert(is_integral_v<_ToT>); 633 if constexpr (is_unsigned_v<_FromT>) 634 return __make_array(_mm_unpacklo_epi8(__vi, __m128i()), 635 _mm_unpackhi_epi8(__vi, __m128i())); 636 else 637 return __make_array( 638 _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8), 639 _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8)); 640 } 641 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4) 642 { 643 static_assert(is_integral_v<_FromT>); 644 if constexpr (is_floating_point_v<_ToT>) 645 { 646 const auto __ints 647 = __convert_all<__vector_type16_t
, _Np>( 648 __adjust(_SizeConstant<_Np * 4>(), __v)); 649 return __generate_from_n_evaluations<_Np, _R>( 650 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 651 return __vector_convert<_To>(__as_wrapper(__ints[__i])); 652 }); 653 } 654 else if constexpr (is_unsigned_v<_FromT>) 655 return __make_array(_mm_unpacklo_epi16(__vi, __m128i()), 656 _mm_unpackhi_epi16(__vi, __m128i())); 657 else 658 return __make_array( 659 _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16), 660 _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16)); 661 } 662 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 663 && is_integral_v<_FromT> && is_integral_v<_ToT>) 664 { 665 if constexpr (is_unsigned_v<_FromT>) 666 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 667 _mm_unpackhi_epi32(__vi, __m128i())); 668 else 669 return __make_array( 670 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 671 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 672 } 673 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 674 && is_integral_v<_FromT> && is_integral_v<_ToT>) 675 { 676 if constexpr (is_unsigned_v<_FromT>) 677 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 678 _mm_unpackhi_epi32(__vi, __m128i())); 679 else 680 return __make_array( 681 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 682 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 683 } 684 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4 685 && is_signed_v<_FromT>) 686 { 687 const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi), 688 _mm_unpackhi_epi8(__vi, __vi)}; 689 const __vector_type_t
__vvvv[4] = { 690 __vector_bitcast
(_mm_unpacklo_epi16(__vv[0], __vv[0])), 691 __vector_bitcast
(_mm_unpackhi_epi16(__vv[0], __vv[0])), 692 __vector_bitcast
(_mm_unpacklo_epi16(__vv[1], __vv[1])), 693 __vector_bitcast
(_mm_unpackhi_epi16(__vv[1], __vv[1]))}; 694 if constexpr (sizeof(_ToT) == 4) 695 return __generate_from_n_evaluations<_Np, _R>( 696 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 697 return __vector_convert<_To>( 698 _SimdWrapper
(__vvvv[__i] >> 24)); 699 }); 700 else if constexpr (is_integral_v<_ToT>) 701 return __generate_from_n_evaluations<_Np, _R>( 702 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 703 const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31); 704 const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24); 705 return __vector_bitcast<_ToT>( 706 __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits) 707 : _mm_unpackhi_epi32(__sx32, __signbits)); 708 }); 709 else 710 return __generate_from_n_evaluations<_Np, _R>( 711 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 712 const _SimdWrapper
__int4 = __vvvv[__i / 2] >> 24; 713 return __vector_convert<_To>( 714 __i % 2 == 0 ? __int4 715 : _SimdWrapper
( 716 _mm_unpackhi_epi64(__to_intrin(__int4), 717 __to_intrin(__int4)))); 718 }); 719 } 720 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4) 721 { 722 const auto __shorts = __convert_all<__vector_type16_t< 723 conditional_t
, short, unsigned short>>>( 724 __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v)); 725 return __generate_from_n_evaluations<_Np, _R>( 726 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 727 return __convert_all<_To>(__shorts[__i / 2])[__i % 2]; 728 }); 729 } 730 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8 731 && is_signed_v<_FromT> && is_integral_v<_ToT>) 732 { 733 const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi), 734 _mm_unpackhi_epi16(__vi, __vi)}; 735 const __vector_type16_t
__vvvv[4] 736 = {__vector_bitcast
( 737 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16), 738 _mm_srai_epi32(__vv[0], 31))), 739 __vector_bitcast
( 740 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16), 741 _mm_srai_epi32(__vv[0], 31))), 742 __vector_bitcast
( 743 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16), 744 _mm_srai_epi32(__vv[1], 31))), 745 __vector_bitcast
( 746 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16), 747 _mm_srai_epi32(__vv[1], 31)))}; 748 return __generate_from_n_evaluations<_Np, _R>( 749 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 750 return __vector_bitcast<_ToT>(__vvvv[__i]); 751 }); 752 } 753 else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8) 754 { 755 const auto __ints 756 = __convert_all<__vector_type16_t
|| is_floating_point_v<_ToT>, int, 758 unsigned int>>>( 759 __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v)); 760 return __generate_from_n_evaluations<_Np, _R>( 761 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 762 return __convert_all<_To>(__ints[__i / 2])[__i % 2]; 763 }); 764 } 765 else 766 __assert_unreachable<_To>(); 767 } 768 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 769 else if constexpr ((_FromVT::_S_partial_width - _Offset) 770 > _ToVT::_S_full_size) 771 { 772 /* 773 static_assert( 774 (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) == 775 0, 776 "__convert_all only supports power-of-2 number of elements. 777 Otherwise " "the return type cannot be array<_To, N>."); 778 */ 779 constexpr size_t _NTotal 780 = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size; 781 constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts; 782 static_assert( 783 _Np <= _NTotal 784 || (_Np == _NTotal + 1 785 && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size 786 > 0)); 787 using _R = array<_To, _Np>; 788 if constexpr (_Np == 1) 789 return _R{__vector_convert<_To>( 790 __extract_part<_Offset, _FromVT::_S_partial_width, 791 _ToVT::_S_full_size>(__v))}; 792 else 793 return __generate_from_n_evaluations<_Np, _R>( 794 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 795 auto __part 796 = __extract_part<__i * _ToVT::_S_full_size + _Offset, 797 _FromVT::_S_partial_width, 798 _ToVT::_S_full_size>(__v); 799 return __vector_convert<_To>(__part); 800 }); 801 } 802 else if constexpr (_Offset == 0) 803 return array<_To, 1>{__vector_convert<_To>(__v)}; 804 else 805 return array<_To, 1>{__vector_convert<_To>( 806 __extract_part<_Offset, _FromVT::_S_partial_width, 807 _FromVT::_S_partial_width - _Offset>(__v))}; 808 } 809 } 810 811 // }}} 812 813 // _GnuTraits {{{ 814 template
815 struct _GnuTraits 816 { 817 using _IsValid = true_type; 818 using _SimdImpl = typename _Abi::_SimdImpl; 819 using _MaskImpl = typename _Abi::_MaskImpl; 820 821 // simd and simd_mask member types {{{ 822 using _SimdMember = _SimdWrapper<_Tp, _Np>; 823 using _MaskMember = _SimdWrapper<_Mp, _Np>; 824 static constexpr size_t _S_simd_align = alignof(_SimdMember); 825 static constexpr size_t _S_mask_align = alignof(_MaskMember); 826 827 // }}} 828 // size metadata {{{ 829 static constexpr size_t _S_full_size = _SimdMember::_S_full_size; 830 static constexpr bool _S_is_partial = _SimdMember::_S_is_partial; 831 832 // }}} 833 // _SimdBase / base class for simd, providing extra conversions {{{ 834 struct _SimdBase2 835 { 836 explicit 837 operator __intrinsic_type_t<_Tp, _Np>() const 838 { return __to_intrin(static_cast
*>(this)->_M_data); } 839 840 explicit 841 operator __vector_type_t<_Tp, _Np>() const 842 { return __data(*static_cast
*>(this)); } 843 }; 844 845 struct _SimdBase1 846 { 847 explicit 848 operator __intrinsic_type_t<_Tp, _Np>() const 849 { return __data(*static_cast
*>(this)); } 850 }; 851 852 using _SimdBase = conditional_t< 853 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 854 _SimdBase1, _SimdBase2>; 855 856 // }}} 857 // _MaskBase {{{ 858 struct _MaskBase2 859 { 860 explicit 861 operator __intrinsic_type_t<_Tp, _Np>() const 862 { return static_cast
*>(this) ->_M_data.__intrin(); } 863 864 explicit 865 operator __vector_type_t<_Tp, _Np>() const 866 { return static_cast
*>(this)->_M_data._M_data; } 867 }; 868 869 struct _MaskBase1 870 { 871 explicit 872 operator __intrinsic_type_t<_Tp, _Np>() const 873 { return __data(*static_cast
*>(this)); } 874 }; 875 876 using _MaskBase = conditional_t< 877 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 878 _MaskBase1, _MaskBase2>; 879 880 // }}} 881 // _MaskCastType {{{ 882 // parameter type of one explicit simd_mask constructor 883 class _MaskCastType 884 { 885 using _Up = __intrinsic_type_t<_Tp, _Np>; 886 _Up _M_data; 887 888 public: 889 _MaskCastType(_Up __x) : _M_data(__x) {} 890 891 operator _MaskMember() const { return _M_data; } 892 }; 893 894 // }}} 895 // _SimdCastType {{{ 896 // parameter type of one explicit simd constructor 897 class _SimdCastType1 898 { 899 using _Ap = __intrinsic_type_t<_Tp, _Np>; 900 _SimdMember _M_data; 901 902 public: 903 constexpr 904 _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 905 906 constexpr 907 operator _SimdMember() const { return _M_data; } 908 }; 909 910 class _SimdCastType2 911 { 912 using _Ap = __intrinsic_type_t<_Tp, _Np>; 913 using _Bp = __vector_type_t<_Tp, _Np>; 914 _SimdMember _M_data; 915 916 public: 917 constexpr 918 _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 919 920 constexpr 921 _SimdCastType2(_Bp __b) : _M_data(__b) {} 922 923 constexpr 924 operator _SimdMember() const { return _M_data; } 925 }; 926 927 using _SimdCastType = conditional_t< 928 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 929 _SimdCastType1, _SimdCastType2>; 930 //}}} 931 }; 932 933 // }}} 934 struct _CommonImplX86; 935 struct _CommonImplNeon; 936 struct _CommonImplBuiltin; 937 template
struct _SimdImplBuiltin; 938 template
struct _MaskImplBuiltin; 939 template
struct _SimdImplX86; 940 template
struct _MaskImplX86; 941 template
struct _SimdImplNeon; 942 template
struct _MaskImplNeon; 943 template
struct _SimdImplPpc; 944 template
struct _MaskImplPpc; 945 946 // simd_abi::_VecBuiltin {{{ 947 template
948 struct simd_abi::_VecBuiltin 949 { 950 template
951 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 952 953 // validity traits {{{ 954 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 955 956 template
957 struct _IsValidSizeFor 958 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 959 && _UsedBytes % sizeof(_Tp) == 0 960 && _UsedBytes <= __vectorized_sizeof<_Tp>() 961 && (!__have_avx512f || _UsedBytes <= 32))> {}; 962 963 template
964 struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>, 965 _IsValidSizeFor<_Tp>> {}; 966 967 template
968 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 969 970 // }}} 971 // _SimdImpl/_MaskImpl {{{ 972 #if _GLIBCXX_SIMD_X86INTRIN 973 using _CommonImpl = _CommonImplX86; 974 using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>; 975 using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>; 976 #elif _GLIBCXX_SIMD_HAVE_NEON 977 using _CommonImpl = _CommonImplNeon; 978 using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>; 979 using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>; 980 #else 981 using _CommonImpl = _CommonImplBuiltin; 982 #ifdef __ALTIVEC__ 983 using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>; 984 using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>; 985 #else 986 using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>; 987 using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>; 988 #endif 989 #endif 990 991 // }}} 992 // __traits {{{ 993 template
994 using _MaskValueType = __int_for_sizeof_t<_Tp>; 995 996 template
997 using __traits 998 = conditional_t<_S_is_valid_v<_Tp>, 999 _GnuTraits<_Tp, _MaskValueType<_Tp>, 1000 _VecBuiltin<_UsedBytes>, _S_size<_Tp>>, 1001 _InvalidTraits>; 1002 1003 //}}} 1004 // size metadata {{{ 1005 template
1006 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 1007 1008 template
1009 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 1010 1011 // }}} 1012 // implicit masks {{{ 1013 template
1014 using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>; 1015 1016 template
1017 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1018 _S_implicit_mask() 1019 { 1020 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 1021 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 1022 return ~_UV(); 1023 else 1024 { 1025 constexpr auto __size = _S_size<_Tp>; 1026 _GLIBCXX_SIMD_USE_CONSTEXPR auto __r 1027 = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 1028 { return __i < __size ? -1 : 0; }); 1029 return __r; 1030 } 1031 } 1032 1033 template
1034 _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>> 1035 _S_implicit_mask_intrin() 1036 { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); } 1037 1038 template
> 1039 _GLIBCXX_SIMD_INTRINSIC static constexpr _TW 1040 _S_masked(_TW __x) 1041 { 1042 using _Tp = typename _TVT::value_type; 1043 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 1044 return __x; 1045 else 1046 return __and(__as_vector(__x), 1047 __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>())); 1048 } 1049 1050 template
> 1051 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1052 __make_padding_nonzero(_TW __x) 1053 { 1054 using _Tp = typename _TVT::value_type; 1055 if constexpr (!_S_is_partial<_Tp>) 1056 return __x; 1057 else 1058 { 1059 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask 1060 = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()); 1061 if constexpr (is_integral_v<_Tp>) 1062 return __or(__x, ~__implicit_mask); 1063 else 1064 { 1065 _GLIBCXX_SIMD_USE_CONSTEXPR auto __one 1066 = __andnot(__implicit_mask, 1067 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))); 1068 // it's not enough to return `x | 1_in_padding` because the 1069 // padding in x might be inf or nan (independent of 1070 // __FINITE_MATH_ONLY__, because it's about padding bits) 1071 return __or(__and(__x, __implicit_mask), __one); 1072 } 1073 } 1074 } 1075 // }}} 1076 }; 1077 1078 // }}} 1079 // simd_abi::_VecBltnBtmsk {{{ 1080 template
1081 struct simd_abi::_VecBltnBtmsk 1082 { 1083 template
1084 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 1085 1086 // validity traits {{{ 1087 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 1088 1089 template
1090 struct _IsValidSizeFor 1091 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 1092 && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64 1093 && (_UsedBytes > 32 || __have_avx512vl))> {}; 1094 1095 // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4 the AVX512BW is also 1096 // required. 1097 template
1098 struct _IsValid 1099 : conjunction< 1100 _IsValidAbiTag, __bool_constant<__have_avx512f>, 1101 __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>, 1102 __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>, 1103 _IsValidSizeFor<_Tp>> {}; 1104 1105 template
1106 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 1107 1108 // }}} 1109 // simd/_MaskImpl {{{ 1110 #if _GLIBCXX_SIMD_X86INTRIN 1111 using _CommonImpl = _CommonImplX86; 1112 using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>; 1113 using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>; 1114 #else 1115 template
1116 struct _MissingImpl; 1117 1118 using _CommonImpl = _MissingImpl<_UsedBytes>; 1119 using _SimdImpl = _MissingImpl<_UsedBytes>; 1120 using _MaskImpl = _MissingImpl<_UsedBytes>; 1121 #endif 1122 1123 // }}} 1124 // __traits {{{ 1125 template
1126 using _MaskMember = _SimdWrapper
>; 1127 1128 template
1129 using __traits = conditional_t< 1130 _S_is_valid_v<_Tp>, 1131 _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>, 1132 _InvalidTraits>; 1133 1134 //}}} 1135 // size metadata {{{ 1136 template
1137 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 1138 template
1139 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 1140 1141 // }}} 1142 // implicit mask {{{ 1143 private: 1144 template
1145 using _ImplicitMask = _SimdWrapper
>; 1146 1147 public: 1148 template
1149 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np> 1150 __implicit_mask_n() 1151 { 1152 using _Tp = __bool_storage_member_type_t<_Np>; 1153 return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp(); 1154 } 1155 1156 template
1157 _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp> 1158 _S_implicit_mask() 1159 { return __implicit_mask_n<_S_size<_Tp>>(); } 1160 1161 template
1162 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>> 1163 _S_implicit_mask_intrin() 1164 { return __implicit_mask_n<_S_size<_Tp>>(); } 1165 1166 template
1167 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1168 _S_masked(_SimdWrapper<_Tp, _Np> __x) 1169 { 1170 if constexpr (is_same_v<_Tp, bool>) 1171 if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0) 1172 return _MaskImpl::_S_bit_and( 1173 __x, _SimdWrapper<_Tp, _Np>( 1174 __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1))); 1175 else 1176 return __x; 1177 else 1178 return _S_masked(__x._M_data); 1179 } 1180 1181 template
1182 _GLIBCXX_SIMD_INTRINSIC static constexpr _TV 1183 _S_masked(_TV __x) 1184 { 1185 using _Tp = typename _VectorTraits<_TV>::value_type; 1186 static_assert( 1187 !__is_bitmask_v<_TV>, 1188 "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't " 1189 "know the number of elements. Use _SimdWrapper
instead."); 1190 if constexpr (_S_is_partial<_Tp>) 1191 { 1192 constexpr size_t _Np = _S_size<_Tp>; 1193 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1194 _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(), 1195 _SimdWrapper<_Tp, _Np>(__x)); 1196 } 1197 else 1198 return __x; 1199 } 1200 1201 template
> 1202 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1203 __make_padding_nonzero(_TV __x) 1204 { 1205 using _Tp = typename _TVT::value_type; 1206 if constexpr (!_S_is_partial<_Tp>) 1207 return __x; 1208 else 1209 { 1210 constexpr size_t _Np = _S_size<_Tp>; 1211 if constexpr (is_integral_v
) 1212 return __x 1213 | __generate_vector<_Tp, _S_full_size<_Tp>>( 1214 [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp { 1215 if (__i < _Np) 1216 return 0; 1217 else 1218 return 1; 1219 }); 1220 else 1221 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1222 _S_implicit_mask<_Tp>(), 1223 _SimdWrapper<_Tp, _Np>( 1224 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))), 1225 _SimdWrapper<_Tp, _Np>(__x)) 1226 ._M_data; 1227 } 1228 } 1229 1230 // }}} 1231 }; 1232 1233 //}}} 1234 // _CommonImplBuiltin {{{ 1235 struct _CommonImplBuiltin 1236 { 1237 // _S_converts_via_decomposition{{{ 1238 // This lists all cases where a __vector_convert needs to fall back to 1239 // conversion of individual scalars (i.e. decompose the input vector into 1240 // scalars, convert, compose output vector). In those cases, _S_masked_load & 1241 // _S_masked_store prefer to use the _S_bit_iteration implementation. 1242 template
1243 static inline constexpr bool __converts_via_decomposition_v 1244 = sizeof(_From) != sizeof(_To); 1245 1246 // }}} 1247 // _S_load{{{ 1248 template
1249 _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np> 1250 _S_load(const void* __p) 1251 { 1252 static_assert(_Np > 1); 1253 static_assert(_Bytes % sizeof(_Tp) == 0); 1254 using _Rp = __vector_type_t<_Tp, _Np>; 1255 if constexpr (sizeof(_Rp) == _Bytes) 1256 { 1257 _Rp __r; 1258 __builtin_memcpy(&__r, __p, _Bytes); 1259 return __r; 1260 } 1261 else 1262 { 1263 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 1264 using _Up = conditional_t< 1265 is_integral_v<_Tp>, 1266 conditional_t<_Bytes % 4 == 0, 1267 conditional_t<_Bytes % 8 == 0, long long, int>, 1268 conditional_t<_Bytes % 2 == 0, short, signed char>>, 1269 conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp, 1270 double>>; 1271 using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>; 1272 if constexpr (sizeof(_V) != sizeof(_Rp)) 1273 { // on i386 with 4 < _Bytes <= 8 1274 _Rp __r{}; 1275 __builtin_memcpy(&__r, __p, _Bytes); 1276 return __r; 1277 } 1278 else 1279 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 1280 using _V = _Rp; 1281 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1282 { 1283 _V __r{}; 1284 static_assert(_Bytes <= sizeof(_V)); 1285 __builtin_memcpy(&__r, __p, _Bytes); 1286 return reinterpret_cast<_Rp>(__r); 1287 } 1288 } 1289 } 1290 1291 // }}} 1292 // _S_store {{{ 1293 template
1294 _GLIBCXX_SIMD_INTRINSIC static void 1295 _S_memcpy(char* __dst, const char* __src) 1296 { 1297 if constexpr (_Bytes > 0) 1298 { 1299 constexpr size_t _Ns = std::__bit_floor(_Bytes); 1300 __builtin_memcpy(__dst, __src, _Ns); 1301 _S_memcpy<_Bytes - _Ns>(__dst + _Ns, __src + _Ns); 1302 } 1303 } 1304 1305 template
1306 _GLIBCXX_SIMD_INTRINSIC static void 1307 _S_store(_TV __x, void* __addr) 1308 { 1309 constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes; 1310 static_assert(sizeof(__x) >= _Bytes); 1311 1312 #if !defined __clang__ && _GLIBCXX_SIMD_WORKAROUND_PR90424 1313 if constexpr (__is_vector_type_v<_TV>) 1314 _S_memcpy<_Bytes>(reinterpret_cast
(__addr), reinterpret_cast
(&__x)); 1315 else 1316 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1317 __builtin_memcpy(__addr, &__x, _Bytes); 1318 } 1319 1320 template
1321 _GLIBCXX_SIMD_INTRINSIC static void 1322 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) 1323 { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); } 1324 1325 // }}} 1326 // _S_store_bool_array(_BitMask) {{{ 1327 template
1328 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1329 _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem) 1330 { 1331 if constexpr (_Np == 1) 1332 __mem[0] = __x[0]; 1333 else if (__builtin_is_constant_evaluated()) 1334 { 1335 for (size_t __i = 0; __i < _Np; ++__i) 1336 __mem[__i] = __x[__i]; 1337 } 1338 else if constexpr (_Np == 2) 1339 { 1340 short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101; 1341 _S_store<_Np>(__bool2, __mem); 1342 } 1343 else if constexpr (_Np == 3) 1344 { 1345 int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101; 1346 _S_store<_Np>(__bool3, __mem); 1347 } 1348 else 1349 { 1350 __execute_n_times<__div_roundup(_Np, 4)>( 1351 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1352 constexpr int __offset = __i * 4; 1353 constexpr int __remaining = _Np - __offset; 1354 if constexpr (__remaining > 4 && __remaining <= 7) 1355 { 1356 const _ULLong __bool7 1357 = (__x.template _M_extract<__offset>()._M_to_bits() 1358 * 0x40810204081ULL) 1359 & 0x0101010101010101ULL; 1360 _S_store<__remaining>(__bool7, __mem + __offset); 1361 } 1362 else if constexpr (__remaining >= 4) 1363 { 1364 int __bits = __x.template _M_extract<__offset>()._M_to_bits(); 1365 if constexpr (__remaining > 7) 1366 __bits &= 0xf; 1367 const int __bool4 = (__bits * 0x204081) & 0x01010101; 1368 _S_store<4>(__bool4, __mem + __offset); 1369 } 1370 }); 1371 } 1372 } 1373 1374 // }}} 1375 // _S_blend{{{ 1376 template
1377 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1378 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, 1379 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) 1380 { return __k._M_data ? __at1._M_data : __at0._M_data; } 1381 1382 // }}} 1383 }; 1384 1385 // }}} 1386 // _SimdImplBuiltin {{{1 1387 template
1388 struct _SimdImplBuiltin 1389 { 1390 // member types {{{2 1391 template
1392 static constexpr size_t _S_max_store_size = 16; 1393 1394 using abi_type = _Abi; 1395 1396 template
1397 using _TypeTag = _Tp*; 1398 1399 template
1400 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 1401 1402 template
1403 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 1404 1405 template
1406 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; 1407 1408 template
1409 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; 1410 1411 using _CommonImpl = typename _Abi::_CommonImpl; 1412 using _SuperImpl = typename _Abi::_SimdImpl; 1413 using _MaskImpl = typename _Abi::_MaskImpl; 1414 1415 // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2 1416 template
1417 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1418 _M_make_simd(_SimdWrapper<_Tp, _Np> __x) 1419 { return {__private_init, __x}; } 1420 1421 template
1422 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1423 _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x) 1424 { return {__private_init, __vector_bitcast<_Tp>(__x)}; } 1425 1426 // _S_broadcast {{{2 1427 template
1428 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1429 _S_broadcast(_Tp __x) noexcept 1430 { return __vector_broadcast<_S_full_size<_Tp>>(__x); } 1431 1432 // _S_generator {{{2 1433 template
1434 inline static constexpr _SimdMember<_Tp> 1435 _S_generator(_Fp&& __gen, _TypeTag<_Tp>) 1436 { 1437 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1438 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1439 if constexpr (__i < _S_size<_Tp>) 1440 return __gen(__i); 1441 else 1442 return 0; 1443 }); 1444 } 1445 1446 // _S_load {{{2 1447 template
1448 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1449 _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept 1450 { 1451 constexpr size_t _Np = _S_size<_Tp>; 1452 constexpr size_t __max_load_size 1453 = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64 1454 : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32 1455 : 16; 1456 constexpr size_t __bytes_to_load = sizeof(_Up) * _Np; 1457 if (__builtin_is_constant_evaluated()) 1458 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1459 [&](auto __i) constexpr { 1460 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1461 }); 1462 else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up)) 1463 return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>( 1464 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1465 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1466 }); 1467 else if constexpr (is_same_v<_Up, _Tp>) 1468 return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>, 1469 _Np * sizeof(_Tp)>(__mem); 1470 else if constexpr (__bytes_to_load <= __max_load_size) 1471 return __convert<_SimdMember<_Tp>>( 1472 _CommonImpl::template _S_load<_Up, _Np>(__mem)); 1473 else if constexpr (__bytes_to_load % __max_load_size == 0) 1474 { 1475 constexpr size_t __n_loads = __bytes_to_load / __max_load_size; 1476 constexpr size_t __elements_per_load = _Np / __n_loads; 1477 return __call_with_n_evaluations<__n_loads>( 1478 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1479 return __convert<_SimdMember<_Tp>>(__uncvted...); 1480 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1481 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1482 __mem + __i * __elements_per_load); 1483 }); 1484 } 1485 else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0 1486 && __max_load_size > 16) 1487 { // e.g. int[] ->
with AVX2 1488 constexpr size_t __n_loads 1489 = __bytes_to_load / (__max_load_size / 2); 1490 constexpr size_t __elements_per_load = _Np / __n_loads; 1491 return __call_with_n_evaluations<__n_loads>( 1492 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1493 return __convert<_SimdMember<_Tp>>(__uncvted...); 1494 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1495 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1496 __mem + __i * __elements_per_load); 1497 }); 1498 } 1499 else // e.g. int[] ->
1500 return __call_with_subscripts( 1501 __mem, make_index_sequence<_Np>(), 1502 [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1503 return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...}; 1504 }); 1505 } 1506 1507 // _S_masked_load {{{2 1508 template
1509 static constexpr inline _SimdWrapper<_Tp, _Np> 1510 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 1511 const _Up* __mem) noexcept 1512 { 1513 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1514 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1515 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1516 }); 1517 return __merge; 1518 } 1519 1520 // _S_store {{{2 1521 template
1522 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1523 _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept 1524 { 1525 // TODO: converting int -> "smaller int" can be optimized with AVX512 1526 constexpr size_t _Np = _S_size<_Tp>; 1527 constexpr size_t __max_store_size 1528 = _SuperImpl::template _S_max_store_size<_Up>; 1529 if (__builtin_is_constant_evaluated()) 1530 { 1531 for (size_t __i = 0; __i < _Np; ++__i) 1532 __mem[__i] = __v[__i]; 1533 } 1534 else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up)) 1535 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1536 __mem[__i] = __v[__i]; 1537 }); 1538 else if constexpr (is_same_v<_Up, _Tp>) 1539 _CommonImpl::_S_store(__v, __mem); 1540 else if constexpr (sizeof(_Up) * _Np <= __max_store_size) 1541 _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)), 1542 __mem); 1543 else 1544 { 1545 constexpr size_t __vsize = __max_store_size / sizeof(_Up); 1546 // round up to convert the last partial vector as well: 1547 constexpr size_t __stores = __div_roundup(_Np, __vsize); 1548 constexpr size_t __full_stores = _Np / __vsize; 1549 using _V = __vector_type_t<_Up, __vsize>; 1550 const array<_V, __stores> __converted 1551 = __convert_all<_V, __stores>(__v); 1552 __execute_n_times<__full_stores>( 1553 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1554 _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); 1555 }); 1556 if constexpr (__full_stores < __stores) 1557 _CommonImpl::template _S_store<(_Np - __full_stores * __vsize) 1558 * sizeof(_Up)>( 1559 __converted[__full_stores], __mem + __full_stores * __vsize); 1560 } 1561 } 1562 1563 // _S_masked_store_nocvt {{{2 1564 template
1565 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1566 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) 1567 { 1568 _BitOps::_S_bit_iteration( 1569 _MaskImpl::_S_to_bits(__k), 1570 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1571 __mem[__i] = __v[__i]; 1572 }); 1573 } 1574 1575 // _S_masked_store {{{2 1576 template
, 1577 typename _Tp = typename _TVT::value_type, typename _Up> 1578 static constexpr inline void 1579 _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept 1580 { 1581 constexpr size_t _TV_size = _S_size<_Tp>; 1582 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1583 constexpr size_t __max_store_size 1584 = _SuperImpl::template _S_max_store_size<_Up>; 1585 if constexpr ( 1586 is_same_v< 1587 _Tp, 1588 _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up))) 1589 { 1590 // bitwise or no conversion, reinterpret: 1591 const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1592 if constexpr (__is_bitmask_v
) 1593 return _MaskMember<_Up>(__k._M_data); 1594 else 1595 return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k); 1596 }(); 1597 _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v), 1598 __mem, __kk); 1599 } 1600 else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up) 1601 && !_CommonImpl:: 1602 template __converts_via_decomposition_v< 1603 _Tp, _Up, __max_store_size>) 1604 { // conversion via decomposition is better handled via the 1605 // bit_iteration 1606 // fallback below 1607 constexpr size_t _UW_size 1608 = std::min(_TV_size, __max_store_size / sizeof(_Up)); 1609 static_assert(_UW_size <= _TV_size); 1610 using _UW = _SimdWrapper<_Up, _UW_size>; 1611 using _UV = __vector_type_t<_Up, _UW_size>; 1612 using _UAbi = simd_abi::deduce_t<_Up, _UW_size>; 1613 if constexpr (_UW_size == _TV_size) // one convert+store 1614 { 1615 const _UW __converted = __convert<_UW>(__v); 1616 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1617 __converted, __mem, 1618 _UAbi::_MaskImpl::template _S_convert< 1619 __int_for_sizeof_t<_Up>>(__k)); 1620 } 1621 else 1622 { 1623 static_assert(_UW_size * sizeof(_Up) == __max_store_size); 1624 constexpr size_t _NFullStores = _TV_size / _UW_size; 1625 constexpr size_t _NAllStores 1626 = __div_roundup(_TV_size, _UW_size); 1627 constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size; 1628 const array<_UV, _NAllStores> __converted 1629 = __convert_all<_UV, _NAllStores>(__v); 1630 __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1631 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1632 _UW(__converted[__i]), __mem + __i * _UW_size, 1633 _UAbi::_MaskImpl::template _S_convert< 1634 __int_for_sizeof_t<_Up>>( 1635 __extract_part<__i, _NParts>(__k.__as_full_vector()))); 1636 }); 1637 if constexpr (_NAllStores 1638 > _NFullStores) // one partial at the end 1639 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1640 _UW(__converted[_NFullStores]), 1641 __mem + _NFullStores * _UW_size, 1642 _UAbi::_MaskImpl::template _S_convert< 1643 __int_for_sizeof_t<_Up>>( 1644 __extract_part<_NFullStores, _NParts>( 1645 __k.__as_full_vector()))); 1646 } 1647 } 1648 else 1649 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1650 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1651 __mem[__i] = static_cast<_Up>(__v[__i]); 1652 }); 1653 } 1654 1655 // _S_complement {{{2 1656 template
      <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
      { return ~__x._M_data; }

    // _S_unary_minus {{{2
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
      {
        // GCC doesn't use the psign instructions, but pxor & psub seem to be
        // just as good a choice as pcmpeqd & psign. So meh.
        return -__x._M_data;
      }

    // arithmetic operators {{{2
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data + __y._M_data; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data - __y._M_data; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data * __y._M_data; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
        // Note that division by 0 is always UB, so we must ensure we avoid the
        // case for partial registers
        if constexpr (!_Abi::template _S_is_partial<_Tp>)
          return __x._M_data / __y._M_data;
        else
          return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
      }
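    // Why the padding lanes matter, as a plain-C++ sketch (illustrative only;
    // the 4-lane arrays stand in for the register backing a 3-element simd):
    //
    //   int __num[4] = {6, 9, 12, 0};  // lane 3 is padding the user never sees
    //   int __den[4] = {3, 3, 3, 0};   // a zero in the padding lane ...
    //   // __num[0]/__den[0] .. __num[2]/__den[2] are fine, but a full-width
    //   // vector division would also evaluate __num[3]/__den[3], i.e. 0/0.
    //   __den[3] = 1;                  // ... is therefore forced to be nonzero
    //   for (int __i = 0; __i < 4; ++__i)
    //     __num[__i] /= __den[__i];    // now well-defined in every lane
    //
    // __make_padding_nonzero does the same patching on the vector operand.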
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      {
        if constexpr (!_Abi::template _S_is_partial<_Tp>)
          return __x._M_data % __y._M_data;
        else
          return __as_vector(__x)
                 % _Abi::__make_padding_nonzero(__as_vector(__y));
      }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __and(__x, __y); }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __or(__x, __y); }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __xor(__x, __y); }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data << __y._M_data; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data >> __y._M_data; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
      { return __x._M_data << __y; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
      _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
      { return __x._M_data >> __y; }

    // compares {{{2
    // _S_equal_to {{{3
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data == __y._M_data; }

    // _S_not_equal_to {{{3
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data != __y._M_data; }

    // _S_less {{{3
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data < __y._M_data; }

    // _S_less_equal {{{3
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
      { return __x._M_data <= __y._M_data; }

    // _S_negate {{{2
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
      _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
      { return !__x._M_data; }
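    // User-visible effect of the compare hooks above, as a small sketch (uses
    // only documented std::experimental::parallelism_v2 API; illustrative, not
    // part of the implementation):
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd<float> __a = 1.f, __b = 2.f;      // broadcasts
    //   stdx::native_simd_mask<float> __m = __a < __b;      // dispatches to _S_less
    //   bool __all  = stdx::all_of(__m);                    // true
    //   bool __none = stdx::none_of(__a == __b);            // dispatches to _S_equal_to
    //
    // Each comparison yields one mask element per simd element rather than a
    // single bool, which is why these hooks return _MaskMember<_Tp>.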
    // _S_min, _S_max, _S_minmax {{{2
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
      _SimdWrapper<_Tp, _Np>
      _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
      { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
      _SimdWrapper<_Tp, _Np>
      _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
      { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }

    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
      pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
      _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
      {
        return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
                __a._M_data < __b._M_data ? __b._M_data : __a._M_data};
      }

    // reductions {{{2
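    // What these reduction hooks implement at the user level (illustrative
    // sketch using only documented std::experimental API and std::plus<> /
    // std::multiplies<> from <functional>):
    //
    //   namespace stdx = std::experimental;
    //   stdx::native_simd<int> __v = 2;                       // all elements == 2
    //   int __sum  = stdx::reduce(__v);                       // 2 * __v.size(), uses plus<>
    //   int __prod = stdx::reduce(__v, std::multiplies<>());  // 2 to the power __v.size()
    //
    // _S_reduce below folds the vector in halves (log2(_Np) steps) instead of
    // looping over the elements one by one.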
    template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
              typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
      _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>,
                        simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        using _V = __vector_type_t<_Tp, _Np / 2>;
        static_assert(sizeof(_V) <= sizeof(__x));
        // _S_full_size is the size of the smallest native SIMD register that
        // can store _Np/2 elements:
        using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>;
        using _HalfSimd = __deduced_simd<_Tp, _Np / 2>;
        const auto __xx = __as_vector(__x);
        return _HalfSimd::abi_type::_SimdImpl::_S_reduce(
          static_cast<_HalfSimd>(__as_vector(__binary_op(
            static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)),
            static_cast<_FullSimd>(__intrin_bitcast<_V>(
              __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>(
                __xx)))))),
          __binary_op);
      }

    template <typename _Tp, typename _BinaryOperation>
1822 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp 1823 _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1824 { 1825 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 1826 if constexpr (_Np == 1) 1827 return __x[0]; 1828 else if constexpr (_Np == 2) 1829 return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1830 simd<_Tp, simd_abi::scalar>(__x[1]))[0]; 1831 else if (__builtin_is_constant_evaluated()) 1832 { 1833 simd<_Tp, simd_abi::scalar> __acc = __x[0]; 1834 for (size_t __i = 1; __i < _Np; ++__i) 1835 __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i])); 1836 return __acc[0]; 1837 } 1838 else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{ 1839 { 1840 [[maybe_unused]] constexpr auto __full_size 1841 = _Abi::template _S_full_size<_Tp>; 1842 if constexpr (_Np == 3) 1843 return __binary_op( 1844 __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1845 simd<_Tp, simd_abi::scalar>(__x[1])), 1846 simd<_Tp, simd_abi::scalar>(__x[2]))[0]; 1847 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1848 plus<>>) 1849 { 1850 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1851 return _Ap::_SimdImpl::_S_reduce( 1852 simd<_Tp, _Ap>(__private_init, 1853 _Abi::_S_masked(__as_vector(__x))), 1854 __binary_op); 1855 } 1856 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1857 multiplies<>>) 1858 { 1859 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1860 using _TW = _SimdWrapper<_Tp, __full_size>; 1861 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full 1862 = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector(); 1863 _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one 1864 = __vector_broadcast<__full_size>(_Tp(1)); 1865 const _TW __x_full = __data(__x).__as_full_vector(); 1866 const _TW __x_padded_with_ones 1867 = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one, 1868 __x_full); 1869 return _Ap::_SimdImpl::_S_reduce( 1870 simd<_Tp, _Ap>(__private_init, __x_padded_with_ones), 1871 __binary_op); 1872 } 1873 else if constexpr (_Np & 1) 1874 { 1875 using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>; 1876 return __binary_op( 1877 simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce( 1878 simd<_Tp, _Ap>( 1879 __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>( 1880 __as_vector(__x))), 1881 __binary_op)), 1882 simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0]; 1883 } 1884 else 1885 return _S_reduce_partial<_Np>( 1886 make_index_sequence<_Np / 2>(), 1887 make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op); 1888 } //}}} 1889 else if constexpr (sizeof(__x) == 16) //{{{ 1890 { 1891 if constexpr (_Np == 16) 1892 { 1893 const auto __y = __data(__x); 1894 __x = __binary_op( 1895 _M_make_simd<_Tp, _Np>( 1896 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1897 7, 7>(__y)), 1898 _M_make_simd<_Tp, _Np>( 1899 __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 1900 14, 14, 15, 15>(__y))); 1901 } 1902 if constexpr (_Np >= 8) 1903 { 1904 const auto __y = __vector_bitcast
(__data(__x)); 1905 __x = __binary_op( 1906 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1907 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))), 1908 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1909 __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y)))); 1910 } 1911 if constexpr (_Np >= 4) 1912 { 1913 using _Up = conditional_t
, float, int>; 1914 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1915 __x = __binary_op(__x, 1916 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1917 __vector_permute<3, 2, 1, 0>(__y)))); 1918 } 1919 using _Up = conditional_t
, double, _LLong>; 1920 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1921 __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1922 __vector_permute<1, 1>(__y)))); 1923 return __x[0]; 1924 } //}}} 1925 else 1926 { 1927 static_assert(sizeof(__x) > __min_vector_size<_Tp>); 1928 static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2 1929 using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>; 1930 using _V = simd<_Tp, _Ap>; 1931 return _Ap::_SimdImpl::_S_reduce( 1932 __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))), 1933 _V(__private_init, 1934 __extract<1, 2>(__as_vector(__x)))), 1935 static_cast<_BinaryOperation&&>(__binary_op)); 1936 } 1937 } 1938 1939 // math {{{2 1940 // frexp, modf and copysign implemented in simd_math.h 1941 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \ 1942 template
\ 1943 static _Tp \ 1944 _S_##__name(const _Tp& __x, const _More&... __more) \ 1945 { \ 1946 return __generate_vector<_Tp>( \ 1947 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1948 return __name(__x[__i], __more[__i]...); \ 1949 }); \ 1950 } 1951 1952 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \ 1953 template
\ 1954 static typename _Tp::mask_type \ 1955 _S_##__name(const _Tp& __x, const _More&... __more) \ 1956 { \ 1957 return __generate_vector<_Tp>( \ 1958 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1959 return __name(__x[__i], __more[__i]...); \ 1960 }); \ 1961 } 1962 1963 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \ 1964 template
\ 1965 static auto \ 1966 _S_##__name(const _Tp& __x, const _More&... __more) \ 1967 { \ 1968 return __fixed_size_storage_t<_RetTp, \ 1969 _VectorTraits<_Tp>::_S_partial_width>:: \ 1970 _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1971 return __meta._S_generator( \ 1972 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1973 return __name(__x[__meta._S_offset + __i], \ 1974 __more[__meta._S_offset + __i]...); \ 1975 }, \ 1976 static_cast<_RetTp*>(nullptr)); \ 1977 }); \ 1978 } 1979 1980 _GLIBCXX_SIMD_MATH_FALLBACK(acos) 1981 _GLIBCXX_SIMD_MATH_FALLBACK(asin) 1982 _GLIBCXX_SIMD_MATH_FALLBACK(atan) 1983 _GLIBCXX_SIMD_MATH_FALLBACK(atan2) 1984 _GLIBCXX_SIMD_MATH_FALLBACK(cos) 1985 _GLIBCXX_SIMD_MATH_FALLBACK(sin) 1986 _GLIBCXX_SIMD_MATH_FALLBACK(tan) 1987 _GLIBCXX_SIMD_MATH_FALLBACK(acosh) 1988 _GLIBCXX_SIMD_MATH_FALLBACK(asinh) 1989 _GLIBCXX_SIMD_MATH_FALLBACK(atanh) 1990 _GLIBCXX_SIMD_MATH_FALLBACK(cosh) 1991 _GLIBCXX_SIMD_MATH_FALLBACK(sinh) 1992 _GLIBCXX_SIMD_MATH_FALLBACK(tanh) 1993 _GLIBCXX_SIMD_MATH_FALLBACK(exp) 1994 _GLIBCXX_SIMD_MATH_FALLBACK(exp2) 1995 _GLIBCXX_SIMD_MATH_FALLBACK(expm1) 1996 _GLIBCXX_SIMD_MATH_FALLBACK(ldexp) 1997 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) 1998 _GLIBCXX_SIMD_MATH_FALLBACK(log) 1999 _GLIBCXX_SIMD_MATH_FALLBACK(log10) 2000 _GLIBCXX_SIMD_MATH_FALLBACK(log1p) 2001 _GLIBCXX_SIMD_MATH_FALLBACK(log2) 2002 _GLIBCXX_SIMD_MATH_FALLBACK(logb) 2003 2004 // modf implemented in simd_math.h 2005 _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) 2006 _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) 2007 _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) 2008 _GLIBCXX_SIMD_MATH_FALLBACK(fabs) 2009 _GLIBCXX_SIMD_MATH_FALLBACK(pow) 2010 _GLIBCXX_SIMD_MATH_FALLBACK(sqrt) 2011 _GLIBCXX_SIMD_MATH_FALLBACK(erf) 2012 _GLIBCXX_SIMD_MATH_FALLBACK(erfc) 2013 _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) 2014 _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) 2015 2016 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) 2017 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) 2018 2019 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) 2020 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) 2021 2022 _GLIBCXX_SIMD_MATH_FALLBACK(fmod) 2023 _GLIBCXX_SIMD_MATH_FALLBACK(remainder) 2024 2025 template
> 2026 static _Tp 2027 _S_remquo(const _Tp __x, const _Tp __y, 2028 __fixed_size_storage_t
* __z) 2029 { 2030 return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2031 int __tmp; 2032 auto __r = remquo(__x[__i], __y[__i], &__tmp); 2033 __z->_M_set(__i, __tmp); 2034 return __r; 2035 }); 2036 } 2037 2038 // copysign in simd_math.h 2039 _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) 2040 _GLIBCXX_SIMD_MATH_FALLBACK(fdim) 2041 _GLIBCXX_SIMD_MATH_FALLBACK(fmax) 2042 _GLIBCXX_SIMD_MATH_FALLBACK(fmin) 2043 _GLIBCXX_SIMD_MATH_FALLBACK(fma) 2044 2045 template
2046 static constexpr _MaskMember<_Tp> 2047 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, 2048 _SimdWrapper<_Tp, _Np> __y) noexcept 2049 { 2050 using _Ip = __int_for_sizeof_t<_Tp>; 2051 const auto __xn = __vector_bitcast<_Ip>(__x); 2052 const auto __yn = __vector_bitcast<_Ip>(__y); 2053 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2054 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2055 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2056 __xp > __yp); 2057 } 2058 2059 template
2060 static constexpr _MaskMember<_Tp> 2061 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, 2062 _SimdWrapper<_Tp, _Np> __y) noexcept 2063 { 2064 using _Ip = __int_for_sizeof_t<_Tp>; 2065 const auto __xn = __vector_bitcast<_Ip>(__x); 2066 const auto __yn = __vector_bitcast<_Ip>(__y); 2067 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2068 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2069 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2070 __xp >= __yp); 2071 } 2072 2073 template
2074 static constexpr _MaskMember<_Tp> 2075 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept 2076 { 2077 using _Ip = __int_for_sizeof_t<_Tp>; 2078 const auto __xn = __vector_bitcast<_Ip>(__x); 2079 const auto __yn = __vector_bitcast<_Ip>(__y); 2080 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2081 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2082 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2083 __xp < __yp); 2084 } 2085 2086 template
2087 static constexpr _MaskMember<_Tp> 2088 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, 2089 _SimdWrapper<_Tp, _Np> __y) noexcept 2090 { 2091 using _Ip = __int_for_sizeof_t<_Tp>; 2092 const auto __xn = __vector_bitcast<_Ip>(__x); 2093 const auto __yn = __vector_bitcast<_Ip>(__y); 2094 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2095 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2096 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2097 __xp <= __yp); 2098 } 2099 2100 template
2101 static constexpr _MaskMember<_Tp> 2102 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, 2103 _SimdWrapper<_Tp, _Np> __y) noexcept 2104 { 2105 return __andnot(_SuperImpl::_S_isunordered(__x, __y), 2106 _SuperImpl::_S_not_equal_to(__x, __y)); 2107 } 2108 2109 #undef _GLIBCXX_SIMD_MATH_FALLBACK 2110 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET 2111 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET 2112 // _S_abs {{{3 2113 template
      <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept
      {
        // if (__builtin_is_constant_evaluated())
        //   {
        //     return __x._M_data < 0 ? -__x._M_data : __x._M_data;
        //   }
        if constexpr (is_floating_point_v<_Tp>)
          // `v < 0 ? -v : v` cannot compile to the efficient implementation of
          // masking the signbit off because it must consider v == -0

          // ~(-0.) & v would be easy, but breaks with fno-signed-zeros
          return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data);
        else
          return __x._M_data < 0 ? -__x._M_data : __x._M_data;
      }

    // }}}3
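    // Scalar model of the floating-point branch above (illustrative sketch
    // only; it uses C++20 std::bit_cast from <bit>, which this C++17 header
    // does not, and assumes 32-bit IEEE float):
    //
    //   float __abs_via_mask(float __v)
    //   {
    //     const unsigned __bits = std::bit_cast<unsigned>(__v);
    //     return std::bit_cast<float>(__bits & 0x7fff'ffffu); // clear sign bit
    //   }
    //
    // Unlike `__v < 0 ? -__v : __v`, this maps -0.0f to +0.0f and needs no
    // branch or compare, which is what the __and with _S_absmask does per
    // element.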
    // _S_plus_minus {{{
    // Returns __x + __y - __y without -fassociative-math optimizing to __x.
    // - _TV must be __vector_type_t<floating-point type, N>.
    // - _UV must be _TV or floating-point type.
    template <typename _TV, typename _UV>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
      _S_plus_minus(_TV __x, _UV __y) noexcept
      {
#if defined __i386__ && !defined __SSE_MATH__
        if constexpr (sizeof(__x) == 8)
          { // operations on __x would use the FPU
            static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
            const auto __x4 = __vector_bitcast<float, 4>(__x);
            if constexpr (is_same_v<_TV, _UV>)
              return __vector_bitcast<float, 2>(
                       _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
            else
              return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
          }
#endif
#if !defined __clang__ && __GCC_IEC_559 == 0
        if (__builtin_is_constant_evaluated()
              || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
          return (__x + __y) - __y;
        else
          return [&] {
            __x += __y;
            if constexpr(__have_sse)
              {
                if constexpr (sizeof(__x) >= 16)
                  asm("" : "+x"(__x));
                else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
                  asm("" : "+x"(__x[0]), "+x"(__x[1]));
                else
                  __assert_unreachable<_TV>();
              }
            else if constexpr(__have_neon)
              asm("" : "+w"(__x));
            else if constexpr (__have_power_vmx)
              {
                if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
                  asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
                else
                  asm("" : "+v"(__x));
              }
            else
              asm("" : "+g"(__x));
            return __x - __y;
          }();
#else
        return (__x + __y) - __y;
#endif
      }

    // }}}
    // _S_nearbyint {{{3
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_nearbyint(_Tp __x_) noexcept
      {
        using value_type = typename _TVT::value_type;
        using _V = typename _TVT::type;
        const _V __x = __x_;
        const _V __absx = __and(__x, _S_absmask<_V>);
        static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
        _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
          = _V() + (1ull << (__digits_v<value_type> - 1));
        const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
        const _V __shifted = _S_plus_minus(__x, __shifter);
        return __absx < __shifter_abs ? __shifted : __x;
      }
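    // Scalar model of the "magic shifter" used above (illustrative sketch,
    // float only, assuming round-to-nearest and |__v| < 2^23):
    //
    //   float __nearbyint_via_shifter(float __v)
    //   {
    //     const float __shifter = 8388608.f;             // 2^23 == 1 << (digits - 1)
    //     const float __s = __v < 0 ? -__shifter : __shifter;
    //     float __r = __v + __s;   // fractional bits no longer representable: rounds
    //     __r -= __s;              // shift the value range back
    //     return __r;
    //   }
    //
    // _S_plus_minus exists so the compiler cannot fold `(__v + __s) - __s`
    // back into plain `__v` under -fassociative-math; the vector version also
    // keeps the original __x for elements where |__x| >= 2^(digits - 1).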
    // _S_rint {{{3
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_rint(_Tp __x) noexcept
      { return _SuperImpl::_S_nearbyint(__x); }

    // _S_trunc {{{3
    template <typename _Tp, size_t _Np>
2212 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2213 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2214 { 2215 using _V = __vector_type_t<_Tp, _Np>; 2216 const _V __absx = __and(__x._M_data, _S_absmask<_V>); 2217 static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>); 2218 constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1); 2219 _V __truncated = _S_plus_minus(__absx, __shifter); 2220 __truncated -= __truncated > __absx ? _V() + 1 : _V(); 2221 return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated) 2222 : __x._M_data; 2223 } 2224 2225 // _S_round {{{3 2226 template
2227 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2228 _S_round(_SimdWrapper<_Tp, _Np> __x) 2229 { 2230 const auto __abs_x = _SuperImpl::_S_abs(__x); 2231 const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data; 2232 const auto __r_abs // round(abs(x)) = 2233 = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0); 2234 return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs); 2235 } 2236 2237 // _S_floor {{{3 2238 template
2239 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2240 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2241 { 2242 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2243 const auto __negative_input 2244 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2245 const auto __mask 2246 = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2247 return __or(__andnot(__mask, __y), 2248 __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1))); 2249 } 2250 2251 // _S_ceil {{{3 2252 template
2253 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2254 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2255 { 2256 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2257 const auto __negative_input 2258 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2259 const auto __inv_mask 2260 = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2261 return __or(__and(__inv_mask, __y), 2262 __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1))); 2263 } 2264 2265 // _S_isnan {{{3 2266 template
2267 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2268 _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2269 { 2270 #if __FINITE_MATH_ONLY__ 2271 return {}; // false 2272 #elif !defined __SUPPORT_SNAN__ 2273 return ~(__x._M_data == __x._M_data); 2274 #elif defined __STDC_IEC_559__ 2275 using _Ip = __int_for_sizeof_t<_Tp>; 2276 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2277 const auto __infn 2278 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 2279 return __infn < __absn; 2280 #else 2281 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?" 2282 #endif 2283 } 2284 2285 // _S_isfinite {{{3 2286 template
2287 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2288 _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2289 { 2290 #if __FINITE_MATH_ONLY__ 2291 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 2292 _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV(); 2293 return __alltrue; 2294 #else 2295 // if all exponent bits are set, __x is either inf or NaN 2296 using _Ip = __int_for_sizeof_t<_Tp>; 2297 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2298 const auto __maxn 2299 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2300 return __absn <= __maxn; 2301 #endif 2302 } 2303 2304 // _S_isunordered {{{3 2305 template
2306 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2307 _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2308 { return __or(_S_isnan(__x), _S_isnan(__y)); } 2309 2310 // _S_signbit {{{3 2311 template
2312 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2313 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2314 { 2315 using _Ip = __int_for_sizeof_t<_Tp>; 2316 return __vector_bitcast<_Ip>(__x) < 0; 2317 // Arithmetic right shift (SRA) would also work (instead of compare), but 2318 // 64-bit SRA isn't available on x86 before AVX512. And in general, 2319 // compares are more likely to be efficient than SRA. 2320 } 2321 2322 // _S_isinf {{{3 2323 template
2324 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2325 _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2326 { 2327 #if __FINITE_MATH_ONLY__ 2328 return {}; // false 2329 #else 2330 return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x), 2331 __vector_broadcast<_Np>( 2332 __infinity_v<_Tp>)); 2333 // alternative: 2334 // compare to inf using the corresponding integer type 2335 /* 2336 return 2337 __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>( 2338 _S_abs(__x)._M_data) 2339 == 2340 __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>( 2341 __infinity_v<_Tp>))); 2342 */ 2343 #endif 2344 } 2345 2346 // _S_isnormal {{{3 2347 template
2348 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2349 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 2350 { 2351 using _Ip = __int_for_sizeof_t<_Tp>; 2352 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2353 const auto __minn 2354 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>)); 2355 #if __FINITE_MATH_ONLY__ 2356 return __absn >= __minn; 2357 #else 2358 const auto __maxn 2359 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2360 return __minn <= __absn && __absn <= __maxn; 2361 #endif 2362 } 2363 2364 // _S_fpclassify {{{3 2365 template
2366 _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t
2367 _S_fpclassify(_SimdWrapper<_Tp, _Np> __x) 2368 { 2369 using _I = __int_for_sizeof_t<_Tp>; 2370 const auto __xn 2371 = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x))); 2372 constexpr size_t _NI = sizeof(__xn) / sizeof(_I); 2373 _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn 2374 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>)); 2375 2376 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal 2377 = __vector_broadcast<_NI, _I>(FP_NORMAL); 2378 #if !__FINITE_MATH_ONLY__ 2379 _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn 2380 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>)); 2381 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan 2382 = __vector_broadcast<_NI, _I>(FP_NAN); 2383 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite 2384 = __vector_broadcast<_NI, _I>(FP_INFINITE); 2385 #endif 2386 #ifndef __FAST_MATH__ 2387 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal 2388 = __vector_broadcast<_NI, _I>(FP_SUBNORMAL); 2389 #endif 2390 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero 2391 = __vector_broadcast<_NI, _I>(FP_ZERO); 2392 2393 __vector_type_t<_I, _NI> 2394 __tmp = __xn < __minn 2395 #ifdef __FAST_MATH__ 2396 ? __fp_zero 2397 #else 2398 ? (__xn == 0 ? __fp_zero : __fp_subnormal) 2399 #endif 2400 #if __FINITE_MATH_ONLY__ 2401 : __fp_normal; 2402 #else 2403 : (__xn < __infn ? __fp_normal 2404 : (__xn == __infn ? __fp_infinite : __fp_nan)); 2405 #endif 2406 2407 if constexpr (sizeof(_I) == sizeof(int)) 2408 { 2409 using _FixedInt = __fixed_size_storage_t
; 2410 const auto __as_int = __vector_bitcast
(__tmp); 2411 if constexpr (_FixedInt::_S_tuple_size == 1) 2412 return {__as_int}; 2413 else if constexpr (_FixedInt::_S_tuple_size == 2 2414 && is_same_v< 2415 typename _FixedInt::_SecondType::_FirstAbi, 2416 simd_abi::scalar>) 2417 return {__extract<0, 2>(__as_int), __as_int[_Np - 1]}; 2418 else if constexpr (_FixedInt::_S_tuple_size == 2) 2419 return {__extract<0, 2>(__as_int), 2420 __auto_bitcast(__extract<1, 2>(__as_int))}; 2421 else 2422 __assert_unreachable<_Tp>(); 2423 } 2424 else if constexpr (_Np == 2 && sizeof(_I) == 8 2425 && __fixed_size_storage_t
::_S_tuple_size == 2) 2426 { 2427 const auto __aslong = __vector_bitcast<_LLong>(__tmp); 2428 return {int(__aslong[0]), {int(__aslong[1])}}; 2429 } 2430 #if _GLIBCXX_SIMD_X86INTRIN 2431 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32 2432 && __fixed_size_storage_t
::_S_tuple_size == 1) 2433 return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)), 2434 __to_intrin(__hi128(__tmp)))}; 2435 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64 2436 && __fixed_size_storage_t
::_S_tuple_size == 1) 2437 return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))}; 2438 #endif // _GLIBCXX_SIMD_X86INTRIN 2439 else if constexpr (__fixed_size_storage_t
::_S_tuple_size == 1) 2440 return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp), 2441 [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2442 return __make_wrapper
(__l...); 2443 })}; 2444 else 2445 __assert_unreachable<_Tp>(); 2446 } 2447 2448 // _S_increment & _S_decrement{{{2 2449 template
2450 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2451 _S_increment(_SimdWrapper<_Tp, _Np>& __x) 2452 { __x = __x._M_data + 1; } 2453 2454 template
2455 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2456 _S_decrement(_SimdWrapper<_Tp, _Np>& __x) 2457 { __x = __x._M_data - 1; } 2458 2459 // smart_reference access {{{2 2460 template
2461 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2462 _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept 2463 { __v._M_set(__i, static_cast<_Up&&>(__x)); } 2464 2465 // _S_masked_assign{{{2 2466 template
2467 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2468 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2469 __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs) 2470 { 2471 if (__k._M_is_constprop_none_of()) 2472 return; 2473 else if (__k._M_is_constprop_all_of()) 2474 __lhs = __rhs; 2475 else 2476 __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); 2477 } 2478 2479 template
2480 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2481 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2482 __type_identity_t<_Tp> __rhs) 2483 { 2484 if (__k._M_is_constprop_none_of()) 2485 return; 2486 else if (__k._M_is_constprop_all_of()) 2487 __lhs = __vector_broadcast<_Np>(__rhs); 2488 else if (__builtin_constant_p(__rhs) && __rhs == 0) 2489 { 2490 if constexpr (!is_same_v
) 2491 // the __andnot optimization only makes sense if __k._M_data is a 2492 // vector register 2493 __lhs._M_data 2494 = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data); 2495 else 2496 // for AVX512/__mmask, a _mm512_maskz_mov is best 2497 __lhs 2498 = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>()); 2499 } 2500 else 2501 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2502 _SimdWrapper<_Tp, _Np>( 2503 __vector_broadcast<_Np>(__rhs))); 2504 } 2505 2506 // _S_masked_cassign {{{2 2507 template
2508 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2509 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2510 _SimdWrapper<_Tp, _Np>& __lhs, 2511 const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs, 2512 _Op __op) 2513 { 2514 if (__k._M_is_constprop_none_of()) 2515 return; 2516 else if (__k._M_is_constprop_all_of()) 2517 __lhs = __op(_SuperImpl{}, __lhs, __rhs); 2518 else 2519 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2520 __op(_SuperImpl{}, __lhs, __rhs)); 2521 } 2522 2523 template
2524 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2525 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2526 _SimdWrapper<_Tp, _Np>& __lhs, 2527 const __type_identity_t<_Tp> __rhs, _Op __op) 2528 { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); } 2529 2530 // _S_masked_unary {{{2 2531 template
class _Op, typename _Tp, typename _K, 2532 size_t _Np> 2533 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2534 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, 2535 const _SimdWrapper<_Tp, _Np> __v) 2536 { 2537 if (__k._M_is_constprop_none_of()) 2538 return __v; 2539 auto __vv = _M_make_simd(__v); 2540 _Op
__op; 2541 if (__k._M_is_constprop_all_of()) 2542 return __data(__op(__vv)); 2543 else if constexpr (is_same_v<_Op
, __increment
>) 2544 { 2545 static_assert(not std::is_same_v<_K, bool>); 2546 if constexpr (is_integral_v<_Tp>) 2547 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2548 return __v._M_data - __vector_bitcast<_Tp>(__k._M_data); 2549 else if constexpr (not __have_avx2) 2550 return __v._M_data 2551 + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2552 _K, _Tp(1))); 2553 // starting with AVX2 it is more efficient to blend after add 2554 } 2555 else if constexpr (is_same_v<_Op
, __decrement
>) 2556 { 2557 static_assert(not std::is_same_v<_K, bool>); 2558 if constexpr (is_integral_v<_Tp>) 2559 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2560 return __v._M_data + __vector_bitcast<_Tp>(__k._M_data); 2561 else if constexpr (not __have_avx2) 2562 return __v._M_data 2563 - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2564 _K, _Tp(1))); 2565 // starting with AVX2 it is more efficient to blend after sub 2566 } 2567 return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv))); 2568 } 2569 2570 //}}}2 2571 }; 2572 2573 // _MaskImplBuiltinMixin {{{1 2574 struct _MaskImplBuiltinMixin 2575 { 2576 template
2577 using _TypeTag = _Tp*; 2578 2579 // _S_to_maskvector {{{ 2580 template
2581 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2582 _S_to_maskvector(bool __x) 2583 { 2584 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2585 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 2586 : __vector_type_t<_Up, _ToN>{}; 2587 } 2588 2589 template
2591 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2592 _S_to_maskvector(_BitMask<_Np, _Sanitized> __x) 2593 { 2594 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2595 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2596 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2597 if constexpr (__i < _Np) 2598 return __x[__i] ? ~_Up() : _Up(); 2599 else 2600 return _Up(); 2601 }); 2602 } 2603 2604 template
2606 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2607 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 2608 { 2609 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2610 using _TW = _SimdWrapper<_Tp, _Np>; 2611 using _UW = _SimdWrapper<_Up, _ToN>; 2612 if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW)) 2613 return __wrapper_bitcast<_Up, _ToN>(__x); 2614 else if constexpr (is_same_v<_Tp, bool>) // bits -> vector 2615 return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data)); 2616 else 2617 { // vector -> vector 2618 /* 2619 [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data); 2620 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == 2621 16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr 2622 (sizeof(_Tp) == 4 && sizeof(_Up) == 2 2623 && sizeof(__y) == 16) 2624 return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y); 2625 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 2626 && sizeof(__y) == 16) 2627 return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y); 2628 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 2629 && sizeof(__y) == 16) 2630 return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, 2631 -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 && 2632 sizeof(_Up) == 1 2633 && sizeof(__y) == 16) 2634 return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 2635 -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 && 2636 sizeof(_Up) == 1 2637 && sizeof(__y) == 16) 2638 return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2639 -1, -1, -1, -1, -1>(__y); else 2640 */ 2641 { 2642 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2643 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2644 if constexpr (__i < _Np) 2645 return _Up(__x[__i.value]); 2646 else 2647 return _Up(); 2648 }); 2649 } 2650 } 2651 } 2652 2653 // }}} 2654 // _S_to_bits {{{ 2655 template
2656 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 2657 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 2658 { 2659 static_assert(!is_same_v<_Tp, bool>); 2660 static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong)); 2661 using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; 2662 const auto __bools 2663 = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1); 2664 _ULLong __r = 0; 2665 __execute_n_times<_Np>( 2666 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2667 __r |= _ULLong(__bools[__i.value]) << __i; 2668 }); 2669 return __r; 2670 } 2671 2672 // }}} 2673 }; 2674 2675 // _MaskImplBuiltin {{{1 2676 template
2677 struct _MaskImplBuiltin : _MaskImplBuiltinMixin 2678 { 2679 using _MaskImplBuiltinMixin::_S_to_bits; 2680 using _MaskImplBuiltinMixin::_S_to_maskvector; 2681 2682 // member types {{{ 2683 template
2684 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 2685 2686 template
2687 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 2688 2689 using _SuperImpl = typename _Abi::_MaskImpl; 2690 using _CommonImpl = typename _Abi::_CommonImpl; 2691 2692 template
2693 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 2694 2695 // }}} 2696 // _S_broadcast {{{ 2697 template
2698 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2699 _S_broadcast(bool __x) 2700 { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); } 2701 2702 // }}} 2703 // _S_load {{{ 2704 template
2705 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2706 _S_load(const bool* __mem) 2707 { 2708 using _I = __int_for_sizeof_t<_Tp>; 2709 if (not __builtin_is_constant_evaluated()) 2710 if constexpr (sizeof(_Tp) == sizeof(bool)) 2711 { 2712 const auto __bools 2713 = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem); 2714 // bool is {0, 1}, everything else is UB 2715 return __bools > 0; 2716 } 2717 return __generate_vector<_I, _S_size<_Tp>>( 2718 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2719 return __mem[__i] ? ~_I() : _I(); 2720 }); 2721 } 2722 2723 // }}} 2724 // _S_convert {{{ 2725 template
2726 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2727 _S_convert(_BitMask<_Np, _Sanitized> __x) 2728 { 2729 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2730 return _SimdWrapper
>(__x._M_to_bits()); 2731 else 2732 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2733 _S_size<_Tp>>( 2734 __x._M_sanitized()); 2735 } 2736 2737 template
2738 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2739 _S_convert(_SimdWrapper
__x) 2740 { 2741 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2742 return _SimdWrapper
>(__x._M_data); 2743 else 2744 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2745 _S_size<_Tp>>( 2746 _BitMask<_Np>(__x._M_data)._M_sanitized()); 2747 } 2748 2749 template
2750 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2751 _S_convert(_SimdWrapper<_Up, _Np> __x) 2752 { 2753 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2754 return _SimdWrapper
>( 2755 _SuperImpl::_S_to_bits(__x)); 2756 else 2757 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2758 _S_size<_Tp>>(__x); 2759 } 2760 2761 template
2762 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2763 _S_convert(simd_mask<_Up, _UAbi> __x) 2764 { 2765 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2766 { 2767 using _R = _SimdWrapper
>; 2768 if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits 2769 return _R(__data(__x)); 2770 else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits 2771 return _R(__data(__x)); 2772 else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits 2773 return _R(__data(__x)._M_to_bits()); 2774 else // vector -> bits 2775 return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits()); 2776 } 2777 else 2778 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2779 _S_size<_Tp>>( 2780 __data(__x)); 2781 } 2782 2783 // }}} 2784 // _S_masked_load {{{2 2785 template
2786 static inline _SimdWrapper<_Tp, _Np> 2787 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 2788 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 2789 { 2790 // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity 2791 auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge); 2792 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask), 2793 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2794 __tmp._M_set(__i, -__mem[__i]); 2795 }); 2796 __merge = __wrapper_bitcast<_Tp>(__tmp); 2797 return __merge; 2798 } 2799 2800 // _S_store {{{2 2801 template