The C and C++ Include Header Files
/usr/include/c++/11/experimental/bits/simd_builtin.h
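This is an internal implementation header of libstdc++'s std::experimental::simd support (Parallelism TS v2). It is pulled in by <experimental/simd> and is not meant to be included directly. For orientation, the short program below is a minimal sketch (not taken from the header) of the public API whose ABI-specific pieces (simd_abi::_VecBuiltin, simd_abi::_VecBltnBtmsk, _SimdImplBuiltin and related classes) are implemented here; it assumes GCC 11 or later compiled with -std=c++17.

// sketch.cpp: element-wise arithmetic through the public simd API
#include <experimental/simd>
#include <cstdio>

namespace stdx = std::experimental;

int main()
{
  // native_simd<float> picks an ABI tag such as simd_abi::_VecBuiltin or
  // simd_abi::_VecBltnBtmsk, whose operations are defined in this header.
  stdx::native_simd<float> a([](int i) { return float(i); }); // 0, 1, 2, ...
  stdx::native_simd<float> b = 2.0f;                          // broadcast
  const auto c = a * b + 1.0f;  // element-wise ops, cf. _S_multiplies/_S_plus below
  for (std::size_t i = 0; i < c.size(); ++i)
    std::printf("%g ", c[i]);
  std::printf("\n");
}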
$ cat /usr/include/c++/11/experimental/bits/simd_builtin.h
// Simd Abi specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_

#if __cplusplus >= 201703L

#include <array>
#include <cmath>
#include <cstdlib>

_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _S_allbits{{{
template <typename _V>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
    = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V)>());

// }}}
// _S_signmask, _S_absmask{{{
template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
    = __xor(_V() + 1, _V() - 1);

template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
    = __andnot(_S_signmask<_V>, _S_allbits<_V>);

//}}}
// __vector_permute<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  constexpr _Tp
  __vector_permute(_Tp __x)
  {
    static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
    return __make_vector<typename _TVT::value_type>(
             (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
  }

// }}}
// __vector_shuffle<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  constexpr _Tp
  __vector_shuffle(_Tp __x, _Tp __y)
  {
    return _Tp{(_Indices == -1 ? 0
                : _Indices < _TVT::_S_full_size
                  ? __x[_Indices]
                  : __y[_Indices - _TVT::_S_full_size])...};
  }

// }}}
// __make_wrapper{{{
template <typename _Tp, typename... _Args>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
  __make_wrapper(const _Args&... __args)
  { return __make_vector<_Tp>(__args...); }

// }}}
// __wrapper_bitcast{{{
template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
          size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
  {
    static_assert(_Np > 1);
    return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
  }

// }}}
// __shift_elements_right{{{
// if (__shift % 2ⁿ == 0) => the low n Bytes are correct
template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>
> 97 _GLIBCXX_SIMD_INTRINSIC _Tp 98 __shift_elements_right(_Tp __v) 99 { 100 [[maybe_unused]] const auto __iv = __to_intrin(__v); 101 static_assert(__shift <= sizeof(_Tp)); 102 if constexpr (__shift == 0) 103 return __v; 104 else if constexpr (__shift == sizeof(_Tp)) 105 return _Tp(); 106 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 107 else if constexpr (__have_sse && __shift == 8 108 && _TVT::template _S_is
) 109 return _mm_movehl_ps(__iv, __iv); 110 else if constexpr (__have_sse2 && __shift == 8 111 && _TVT::template _S_is
) 112 return _mm_unpackhi_pd(__iv, __iv); 113 else if constexpr (__have_sse2 && sizeof(_Tp) == 16) 114 return reinterpret_cast
( 115 _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift)); 116 else if constexpr (__shift == 16 && sizeof(_Tp) == 32) 117 { 118 /*if constexpr (__have_avx && _TVT::template _S_is
) 119 return _mm256_permute2f128_pd(__iv, __iv, 0x81); 120 else if constexpr (__have_avx && _TVT::template _S_is
) 121 return _mm256_permute2f128_ps(__iv, __iv, 0x81); 122 else if constexpr (__have_avx) 123 return reinterpret_cast
( 124 _mm256_permute2f128_si256(__iv, __iv, 0x81)); 125 else*/ 126 return __zero_extend(__hi128(__v)); 127 } 128 else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16) 129 { 130 const auto __vll = __vector_bitcast<_LLong>(__v); 131 return reinterpret_cast
( 132 _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81), 133 __vll, __shift)); 134 } 135 else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16) 136 { 137 const auto __vll = __vector_bitcast<_LLong>(__v); 138 return reinterpret_cast
( 139 __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift), 140 _mm_srli_si128(__hi128(__vll), __shift))); 141 } 142 else if constexpr (sizeof(_Tp) == 32 && __shift > 16) 143 return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v))); 144 else if constexpr (sizeof(_Tp) == 64 && __shift == 32) 145 return __zero_extend(__hi256(__v)); 146 else if constexpr (__have_avx512f && sizeof(_Tp) == 64) 147 { 148 if constexpr (__shift >= 48) 149 return __zero_extend( 150 __shift_elements_right<__shift - 48>(__extract<3, 4>(__v))); 151 else if constexpr (__shift >= 32) 152 return __zero_extend( 153 __shift_elements_right<__shift - 32>(__hi256(__v))); 154 else if constexpr (__shift % 8 == 0) 155 return reinterpret_cast
( 156 _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v), 157 __shift / 8)); 158 else if constexpr (__shift % 4 == 0) 159 return reinterpret_cast
( 160 _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v), 161 __shift / 4)); 162 else if constexpr (__have_avx512bw && __shift < 16) 163 { 164 const auto __vll = __vector_bitcast<_LLong>(__v); 165 return reinterpret_cast
( 166 _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9), 167 __vll, __shift)); 168 } 169 else if constexpr (__have_avx512bw && __shift < 32) 170 { 171 const auto __vll = __vector_bitcast<_LLong>(__v); 172 return reinterpret_cast
( 173 _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee), 174 _mm512_shuffle_i32x4(__vll, __vll, 0xf9), 175 __shift - 16)); 176 } 177 else 178 __assert_unreachable<_Tp>(); 179 } 180 /* 181 } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64) 182 return __auto_bitcast(__extract<__shift / 16, 4>(__v)); 183 */ 184 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 185 else 186 { 187 constexpr int __chunksize = __shift % 8 == 0 ? 8 188 : __shift % 4 == 0 ? 4 189 : __shift % 2 == 0 ? 2 190 : 1; 191 auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v); 192 using _Up = decltype(__w); 193 return __intrin_bitcast<_Tp>( 194 __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>( 195 [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 196 return _Up{__chunks...}; 197 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 198 return __w[__shift / __chunksize + __i]; 199 })); 200 } 201 } 202 203 // }}} 204 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{ 205 template
206 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr 207 _SimdWrapper<_Tp, _Np / _Total * _Combine> 208 __extract_part(const _SimdWrapper<_Tp, _Np> __x) 209 { 210 if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0) 211 return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x); 212 else 213 { 214 constexpr size_t __values_per_part = _Np / _Total; 215 constexpr size_t __values_to_skip = _Index * __values_per_part; 216 constexpr size_t __return_size = __values_per_part * _Combine; 217 using _R = __vector_type_t<_Tp, __return_size>; 218 static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp) 219 <= sizeof(__x), 220 "out of bounds __extract_part"); 221 // the following assertion would ensure no "padding" to be read 222 // static_assert(_Total >= _Index + _Combine, "_Total must be greater 223 // than _Index"); 224 225 // static_assert(__return_size * _Total == _Np, "_Np must be divisible 226 // by _Total"); 227 if (__x._M_is_constprop()) 228 return __generate_from_n_evaluations<__return_size, _R>( 229 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 230 return __x[__values_to_skip + __i]; 231 }); 232 if constexpr (_Index == 0 && _Total == 1) 233 return __x; 234 else if constexpr (_Index == 0) 235 return __intrin_bitcast<_R>(__as_vector(__x)); 236 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 237 else if constexpr (sizeof(__x) == 32 238 && __return_size * sizeof(_Tp) <= 16) 239 { 240 constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp); 241 if constexpr (__bytes_to_skip == 16) 242 return __vector_bitcast<_Tp, __return_size>( 243 __hi128(__as_vector(__x))); 244 else 245 return __vector_bitcast<_Tp, __return_size>( 246 _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)), 247 __lo128(__vector_bitcast<_LLong>(__x)), 248 __bytes_to_skip)); 249 } 250 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 251 else if constexpr (_Index > 0 252 && (__values_to_skip % __return_size != 0 253 || sizeof(_R) >= 8) 254 && (__values_to_skip + __return_size) * sizeof(_Tp) 255 <= 64 256 && sizeof(__x) >= 16) 257 return __intrin_bitcast<_R>( 258 __shift_elements_right<__values_to_skip * sizeof(_Tp)>( 259 __as_vector(__x))); 260 else 261 { 262 _R __r = {}; 263 __builtin_memcpy(&__r, 264 reinterpret_cast
(&__x) 265 + sizeof(_Tp) * __values_to_skip, 266 __return_size * sizeof(_Tp)); 267 return __r; 268 } 269 } 270 } 271 272 // }}} 273 // __extract_part(_SimdWrapper
) {{{ 274 template
275 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper
276 __extract_part(const _SimdWrapper
__x) 277 { 278 static_assert(_Combine == 1, "_Combine != 1 not implemented"); 279 static_assert(__have_avx512f && _Np == _Np); 280 static_assert(_Total >= 2 && _Index + _Combine <= _Total && _Index >= 0); 281 return __x._M_data >> (_Index * _Np / _Total); 282 } 283 284 // }}} 285 286 // __vector_convert {{{ 287 // implementation requires an index sequence 288 template
289 _GLIBCXX_SIMD_INTRINSIC constexpr _To 290 __vector_convert(_From __a, index_sequence<_I...>) 291 { 292 using _Tp = typename _VectorTraits<_To>::value_type; 293 return _To{static_cast<_Tp>(__a[_I])...}; 294 } 295 296 template
297 _GLIBCXX_SIMD_INTRINSIC constexpr _To 298 __vector_convert(_From __a, _From __b, index_sequence<_I...>) 299 { 300 using _Tp = typename _VectorTraits<_To>::value_type; 301 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...}; 302 } 303 304 template
305 _GLIBCXX_SIMD_INTRINSIC constexpr _To 306 __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>) 307 { 308 using _Tp = typename _VectorTraits<_To>::value_type; 309 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 310 static_cast<_Tp>(__c[_I])...}; 311 } 312 313 template
314 _GLIBCXX_SIMD_INTRINSIC constexpr _To 315 __vector_convert(_From __a, _From __b, _From __c, _From __d, 316 index_sequence<_I...>) 317 { 318 using _Tp = typename _VectorTraits<_To>::value_type; 319 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 320 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...}; 321 } 322 323 template
324 _GLIBCXX_SIMD_INTRINSIC constexpr _To 325 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 326 index_sequence<_I...>) 327 { 328 using _Tp = typename _VectorTraits<_To>::value_type; 329 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 330 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 331 static_cast<_Tp>(__e[_I])...}; 332 } 333 334 template
335 _GLIBCXX_SIMD_INTRINSIC constexpr _To 336 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 337 _From __f, index_sequence<_I...>) 338 { 339 using _Tp = typename _VectorTraits<_To>::value_type; 340 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 341 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 342 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...}; 343 } 344 345 template
346 _GLIBCXX_SIMD_INTRINSIC constexpr _To 347 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 348 _From __f, _From __g, index_sequence<_I...>) 349 { 350 using _Tp = typename _VectorTraits<_To>::value_type; 351 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 352 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 353 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 354 static_cast<_Tp>(__g[_I])...}; 355 } 356 357 template
358 _GLIBCXX_SIMD_INTRINSIC constexpr _To 359 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 360 _From __f, _From __g, _From __h, index_sequence<_I...>) 361 { 362 using _Tp = typename _VectorTraits<_To>::value_type; 363 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 364 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 365 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 366 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...}; 367 } 368 369 template
370 _GLIBCXX_SIMD_INTRINSIC constexpr _To 371 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 372 _From __f, _From __g, _From __h, _From __i, 373 index_sequence<_I...>) 374 { 375 using _Tp = typename _VectorTraits<_To>::value_type; 376 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 377 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 378 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 379 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 380 static_cast<_Tp>(__i[_I])...}; 381 } 382 383 template
384 _GLIBCXX_SIMD_INTRINSIC constexpr _To 385 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 386 _From __f, _From __g, _From __h, _From __i, _From __j, 387 index_sequence<_I...>) 388 { 389 using _Tp = typename _VectorTraits<_To>::value_type; 390 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 391 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 392 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 393 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 394 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...}; 395 } 396 397 template
398 _GLIBCXX_SIMD_INTRINSIC constexpr _To 399 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 400 _From __f, _From __g, _From __h, _From __i, _From __j, 401 _From __k, index_sequence<_I...>) 402 { 403 using _Tp = typename _VectorTraits<_To>::value_type; 404 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 405 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 406 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 407 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 408 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 409 static_cast<_Tp>(__k[_I])...}; 410 } 411 412 template
413 _GLIBCXX_SIMD_INTRINSIC constexpr _To 414 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 415 _From __f, _From __g, _From __h, _From __i, _From __j, 416 _From __k, _From __l, index_sequence<_I...>) 417 { 418 using _Tp = typename _VectorTraits<_To>::value_type; 419 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 420 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 421 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 422 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 423 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 424 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...}; 425 } 426 427 template
428 _GLIBCXX_SIMD_INTRINSIC constexpr _To 429 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 430 _From __f, _From __g, _From __h, _From __i, _From __j, 431 _From __k, _From __l, _From __m, index_sequence<_I...>) 432 { 433 using _Tp = typename _VectorTraits<_To>::value_type; 434 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 435 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 436 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 437 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 438 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 439 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 440 static_cast<_Tp>(__m[_I])...}; 441 } 442 443 template
444 _GLIBCXX_SIMD_INTRINSIC constexpr _To 445 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 446 _From __f, _From __g, _From __h, _From __i, _From __j, 447 _From __k, _From __l, _From __m, _From __n, 448 index_sequence<_I...>) 449 { 450 using _Tp = typename _VectorTraits<_To>::value_type; 451 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 452 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 453 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 454 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 455 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 456 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 457 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...}; 458 } 459 460 template
461 _GLIBCXX_SIMD_INTRINSIC constexpr _To 462 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 463 _From __f, _From __g, _From __h, _From __i, _From __j, 464 _From __k, _From __l, _From __m, _From __n, _From __o, 465 index_sequence<_I...>) 466 { 467 using _Tp = typename _VectorTraits<_To>::value_type; 468 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 469 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 470 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 471 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 472 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 473 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 474 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 475 static_cast<_Tp>(__o[_I])...}; 476 } 477 478 template
479 _GLIBCXX_SIMD_INTRINSIC constexpr _To 480 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 481 _From __f, _From __g, _From __h, _From __i, _From __j, 482 _From __k, _From __l, _From __m, _From __n, _From __o, 483 _From __p, index_sequence<_I...>) 484 { 485 using _Tp = typename _VectorTraits<_To>::value_type; 486 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 487 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 488 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 489 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 490 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 491 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 492 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 493 static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...}; 494 } 495 496 // Defer actual conversion to the overload that takes an index sequence. Note 497 // that this function adds zeros or drops values off the end if you don't ensure 498 // matching width. 499 template
500 _GLIBCXX_SIMD_INTRINSIC constexpr _To 501 __vector_convert(_SimdWrapper<_From, _FromSize>... __xs) 502 { 503 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 504 using _From0 = __first_of_pack_t<_From...>; 505 using _FW = _SimdWrapper<_From0, _FromSize>; 506 if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop())) 507 { 508 if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1)) 509 == 0) // power-of-two number of arguments 510 return __convert_x86<_To>(__as_vector(__xs)...); 511 else // append zeros and recurse until the above branch is taken 512 return __vector_convert<_To>(__xs..., _FW{}); 513 } 514 else 515 #endif 516 return __vector_convert<_To>( 517 __as_vector(__xs)..., 518 make_index_sequence<(sizeof...(__xs) == 1 ? std::min( 519 _VectorTraits<_To>::_S_full_size, int(_FromSize)) 520 : _FromSize)>()); 521 } 522 523 // }}} 524 // __convert function{{{ 525 template
526 _GLIBCXX_SIMD_INTRINSIC constexpr auto 527 __convert(_From __v0, _More... __vs) 528 { 529 static_assert((true && ... && is_same_v<_From, _More>) ); 530 if constexpr (__is_vectorizable_v<_From>) 531 { 532 using _V = typename _VectorTraits<_To>::type; 533 using _Tp = typename _VectorTraits<_To>::value_type; 534 return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...}; 535 } 536 else if constexpr (__is_vector_type_v<_From>) 537 return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...); 538 else // _SimdWrapper arguments 539 { 540 constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More)); 541 if constexpr (__is_vectorizable_v<_To>) 542 return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...); 543 else if constexpr (!__is_vector_type_v<_To>) 544 return _To(__convert
(__v0, __vs...)); 545 else 546 { 547 static_assert( 548 sizeof...(_More) == 0 549 || _VectorTraits<_To>::_S_full_size >= __input_size, 550 "__convert(...) requires the input to fit into the output"); 551 return __vector_convert<_To>(__v0, __vs...); 552 } 553 } 554 } 555 556 // }}} 557 // __convert_all{{{ 558 // Converts __v into array<_To, N>, where N is _NParts if non-zero or 559 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v). 560 // Note: this function may return less than all converted elements 561 template
> 567 _GLIBCXX_SIMD_INTRINSIC auto 568 __convert_all(_From __v) 569 { 570 if constexpr (is_arithmetic_v<_To> && _NParts != 1) 571 { 572 static_assert(_Offset < _FromVT::_S_full_size); 573 constexpr auto _Np 574 = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts; 575 return __generate_from_n_evaluations<_Np, array<_To, _Np>>( 576 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 577 return static_cast<_To>(__v[__i + _Offset]); 578 }); 579 } 580 else 581 { 582 static_assert(__is_vector_type_v<_To>); 583 using _ToVT = _VectorTraits<_To>; 584 if constexpr (__is_vector_type_v<_From>) 585 return __convert_all<_To, _NParts>(__as_wrapper(__v)); 586 else if constexpr (_NParts == 1) 587 { 588 static_assert(_Offset % _ToVT::_S_full_size == 0); 589 return array<_To, 1>{__vector_convert<_To>( 590 __extract_part<_Offset / _ToVT::_S_full_size, 591 __div_roundup(_FromVT::_S_partial_width, 592 _ToVT::_S_full_size)>(__v))}; 593 } 594 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 595 else if constexpr (!__have_sse4_1 && _Offset == 0 596 && is_integral_v
597 && sizeof(typename _FromVT::value_type) 598 < sizeof(typename _ToVT::value_type) 599 && !(sizeof(typename _FromVT::value_type) == 4 600 && is_same_v
)) 601 { 602 using _ToT = typename _ToVT::value_type; 603 using _FromT = typename _FromVT::value_type; 604 constexpr size_t _Np 605 = _NParts != 0 606 ? _NParts 607 : (_FromVT::_S_partial_width / _ToVT::_S_full_size); 608 using _R = array<_To, _Np>; 609 // __adjust modifies its input to have _Np (use _SizeConstant) 610 // entries so that no unnecessary intermediate conversions are 611 // requested and, more importantly, no intermediate conversions are 612 // missing 613 [[maybe_unused]] auto __adjust 614 = [](auto __n, 615 auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> { 616 return __vector_bitcast<_FromT, decltype(__n)::value>(__vv); 617 }; 618 [[maybe_unused]] const auto __vi = __to_intrin(__v); 619 auto&& __make_array 620 = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 621 if constexpr (_Np == 1) 622 return _R{__intrin_bitcast<_To>(__x0)}; 623 else 624 return _R{__intrin_bitcast<_To>(__x0), 625 __intrin_bitcast<_To>(__x1)}; 626 }; 627 628 if constexpr (_Np == 0) 629 return _R{}; 630 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2) 631 { 632 static_assert(is_integral_v<_FromT>); 633 static_assert(is_integral_v<_ToT>); 634 if constexpr (is_unsigned_v<_FromT>) 635 return __make_array(_mm_unpacklo_epi8(__vi, __m128i()), 636 _mm_unpackhi_epi8(__vi, __m128i())); 637 else 638 return __make_array( 639 _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8), 640 _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8)); 641 } 642 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4) 643 { 644 static_assert(is_integral_v<_FromT>); 645 if constexpr (is_floating_point_v<_ToT>) 646 { 647 const auto __ints 648 = __convert_all<__vector_type16_t
, _Np>( 649 __adjust(_SizeConstant<_Np * 4>(), __v)); 650 return __generate_from_n_evaluations<_Np, _R>( 651 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 652 return __vector_convert<_To>(__as_wrapper(__ints[__i])); 653 }); 654 } 655 else if constexpr (is_unsigned_v<_FromT>) 656 return __make_array(_mm_unpacklo_epi16(__vi, __m128i()), 657 _mm_unpackhi_epi16(__vi, __m128i())); 658 else 659 return __make_array( 660 _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16), 661 _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16)); 662 } 663 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 664 && is_integral_v<_FromT> && is_integral_v<_ToT>) 665 { 666 if constexpr (is_unsigned_v<_FromT>) 667 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 668 _mm_unpackhi_epi32(__vi, __m128i())); 669 else 670 return __make_array( 671 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 672 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 673 } 674 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 675 && is_integral_v<_FromT> && is_integral_v<_ToT>) 676 { 677 if constexpr (is_unsigned_v<_FromT>) 678 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 679 _mm_unpackhi_epi32(__vi, __m128i())); 680 else 681 return __make_array( 682 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 683 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 684 } 685 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4 686 && is_signed_v<_FromT>) 687 { 688 const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi), 689 _mm_unpackhi_epi8(__vi, __vi)}; 690 const __vector_type_t
__vvvv[4] = { 691 __vector_bitcast
(_mm_unpacklo_epi16(__vv[0], __vv[0])), 692 __vector_bitcast
(_mm_unpackhi_epi16(__vv[0], __vv[0])), 693 __vector_bitcast
(_mm_unpacklo_epi16(__vv[1], __vv[1])), 694 __vector_bitcast
(_mm_unpackhi_epi16(__vv[1], __vv[1]))}; 695 if constexpr (sizeof(_ToT) == 4) 696 return __generate_from_n_evaluations<_Np, _R>( 697 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 698 return __vector_convert<_To>( 699 _SimdWrapper
(__vvvv[__i] >> 24)); 700 }); 701 else if constexpr (is_integral_v<_ToT>) 702 return __generate_from_n_evaluations<_Np, _R>( 703 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 704 const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31); 705 const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24); 706 return __vector_bitcast<_ToT>( 707 __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits) 708 : _mm_unpackhi_epi32(__sx32, __signbits)); 709 }); 710 else 711 return __generate_from_n_evaluations<_Np, _R>( 712 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 713 const _SimdWrapper
__int4 = __vvvv[__i / 2] >> 24; 714 return __vector_convert<_To>( 715 __i % 2 == 0 ? __int4 716 : _SimdWrapper
( 717 _mm_unpackhi_epi64(__to_intrin(__int4), 718 __to_intrin(__int4)))); 719 }); 720 } 721 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4) 722 { 723 const auto __shorts = __convert_all<__vector_type16_t< 724 conditional_t
, short, unsigned short>>>( 725 __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v)); 726 return __generate_from_n_evaluations<_Np, _R>( 727 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 728 return __convert_all<_To>(__shorts[__i / 2])[__i % 2]; 729 }); 730 } 731 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8 732 && is_signed_v<_FromT> && is_integral_v<_ToT>) 733 { 734 const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi), 735 _mm_unpackhi_epi16(__vi, __vi)}; 736 const __vector_type16_t
__vvvv[4] 737 = {__vector_bitcast
( 738 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16), 739 _mm_srai_epi32(__vv[0], 31))), 740 __vector_bitcast
( 741 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16), 742 _mm_srai_epi32(__vv[0], 31))), 743 __vector_bitcast
( 744 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16), 745 _mm_srai_epi32(__vv[1], 31))), 746 __vector_bitcast
( 747 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16), 748 _mm_srai_epi32(__vv[1], 31)))}; 749 return __generate_from_n_evaluations<_Np, _R>( 750 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 751 return __vector_bitcast<_ToT>(__vvvv[__i]); 752 }); 753 } 754 else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8) 755 { 756 const auto __ints 757 = __convert_all<__vector_type16_t
|| is_floating_point_v<_ToT>, int, 759 unsigned int>>>( 760 __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v)); 761 return __generate_from_n_evaluations<_Np, _R>( 762 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 763 return __convert_all<_To>(__ints[__i / 2])[__i % 2]; 764 }); 765 } 766 else 767 __assert_unreachable<_To>(); 768 } 769 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 770 else if constexpr ((_FromVT::_S_partial_width - _Offset) 771 > _ToVT::_S_full_size) 772 { 773 /* 774 static_assert( 775 (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) == 776 0, 777 "__convert_all only supports power-of-2 number of elements. 778 Otherwise " "the return type cannot be array<_To, N>."); 779 */ 780 constexpr size_t _NTotal 781 = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size; 782 constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts; 783 static_assert( 784 _Np <= _NTotal 785 || (_Np == _NTotal + 1 786 && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size 787 > 0)); 788 using _R = array<_To, _Np>; 789 if constexpr (_Np == 1) 790 return _R{__vector_convert<_To>( 791 __extract_part<_Offset, _FromVT::_S_partial_width, 792 _ToVT::_S_full_size>(__v))}; 793 else 794 return __generate_from_n_evaluations<_Np, _R>( 795 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 796 auto __part 797 = __extract_part<__i * _ToVT::_S_full_size + _Offset, 798 _FromVT::_S_partial_width, 799 _ToVT::_S_full_size>(__v); 800 return __vector_convert<_To>(__part); 801 }); 802 } 803 else if constexpr (_Offset == 0) 804 return array<_To, 1>{__vector_convert<_To>(__v)}; 805 else 806 return array<_To, 1>{__vector_convert<_To>( 807 __extract_part<_Offset, _FromVT::_S_partial_width, 808 _FromVT::_S_partial_width - _Offset>(__v))}; 809 } 810 } 811 812 // }}} 813 814 // _GnuTraits {{{ 815 template
816 struct _GnuTraits 817 { 818 using _IsValid = true_type; 819 using _SimdImpl = typename _Abi::_SimdImpl; 820 using _MaskImpl = typename _Abi::_MaskImpl; 821 822 // simd and simd_mask member types {{{ 823 using _SimdMember = _SimdWrapper<_Tp, _Np>; 824 using _MaskMember = _SimdWrapper<_Mp, _Np>; 825 static constexpr size_t _S_simd_align = alignof(_SimdMember); 826 static constexpr size_t _S_mask_align = alignof(_MaskMember); 827 828 // }}} 829 // size metadata {{{ 830 static constexpr size_t _S_full_size = _SimdMember::_S_full_size; 831 static constexpr bool _S_is_partial = _SimdMember::_S_is_partial; 832 833 // }}} 834 // _SimdBase / base class for simd, providing extra conversions {{{ 835 struct _SimdBase2 836 { 837 explicit 838 operator __intrinsic_type_t<_Tp, _Np>() const 839 { return __to_intrin(static_cast
*>(this)->_M_data); } 840 841 explicit 842 operator __vector_type_t<_Tp, _Np>() const 843 { return static_cast
*>(this)->_M_data.__builtin(); } 844 }; 845 846 struct _SimdBase1 847 { 848 explicit 849 operator __intrinsic_type_t<_Tp, _Np>() const 850 { return __data(*static_cast
*>(this)); } 851 }; 852 853 using _SimdBase = conditional_t< 854 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 855 _SimdBase1, _SimdBase2>; 856 857 // }}} 858 // _MaskBase {{{ 859 struct _MaskBase2 860 { 861 explicit 862 operator __intrinsic_type_t<_Tp, _Np>() const 863 { return static_cast
*>(this) ->_M_data.__intrin(); } 864 865 explicit 866 operator __vector_type_t<_Tp, _Np>() const 867 { return static_cast
*>(this)->_M_data._M_data; } 868 }; 869 870 struct _MaskBase1 871 { 872 explicit 873 operator __intrinsic_type_t<_Tp, _Np>() const 874 { return __data(*static_cast
*>(this)); } 875 }; 876 877 using _MaskBase = conditional_t< 878 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 879 _MaskBase1, _MaskBase2>; 880 881 // }}} 882 // _MaskCastType {{{ 883 // parameter type of one explicit simd_mask constructor 884 class _MaskCastType 885 { 886 using _Up = __intrinsic_type_t<_Tp, _Np>; 887 _Up _M_data; 888 889 public: 890 _MaskCastType(_Up __x) : _M_data(__x) {} 891 892 operator _MaskMember() const { return _M_data; } 893 }; 894 895 // }}} 896 // _SimdCastType {{{ 897 // parameter type of one explicit simd constructor 898 class _SimdCastType1 899 { 900 using _Ap = __intrinsic_type_t<_Tp, _Np>; 901 _SimdMember _M_data; 902 903 public: 904 constexpr 905 _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 906 907 constexpr 908 operator _SimdMember() const { return _M_data; } 909 }; 910 911 class _SimdCastType2 912 { 913 using _Ap = __intrinsic_type_t<_Tp, _Np>; 914 using _Bp = __vector_type_t<_Tp, _Np>; 915 _SimdMember _M_data; 916 917 public: 918 constexpr 919 _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 920 921 constexpr 922 _SimdCastType2(_Bp __b) : _M_data(__b) {} 923 924 constexpr 925 operator _SimdMember() const { return _M_data; } 926 }; 927 928 using _SimdCastType = conditional_t< 929 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 930 _SimdCastType1, _SimdCastType2>; 931 //}}} 932 }; 933 934 // }}} 935 struct _CommonImplX86; 936 struct _CommonImplNeon; 937 struct _CommonImplBuiltin; 938 template
<typename _Abi> struct _SimdImplBuiltin;
template <typename _Abi> struct _MaskImplBuiltin;
template <typename _Abi> struct _SimdImplX86;
template <typename _Abi> struct _MaskImplX86;
template <typename _Abi> struct _SimdImplNeon;
template <typename _Abi> struct _MaskImplNeon;
template <typename _Abi> struct _SimdImplPpc;
template <typename _Abi> struct _MaskImplPpc;

// simd_abi::_VecBuiltin {{{
template <int _UsedBytes>
  struct simd_abi::_VecBuiltin
  {
    template <typename _Tp>
952 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 953 954 // validity traits {{{ 955 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 956 957 template
958 struct _IsValidSizeFor 959 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 960 && _UsedBytes % sizeof(_Tp) == 0 961 && _UsedBytes <= __vectorized_sizeof<_Tp>() 962 && (!__have_avx512f || _UsedBytes <= 32))> {}; 963 964 template
965 struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>, 966 _IsValidSizeFor<_Tp>> {}; 967 968 template
969 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 970 971 // }}} 972 // _SimdImpl/_MaskImpl {{{ 973 #if _GLIBCXX_SIMD_X86INTRIN 974 using _CommonImpl = _CommonImplX86; 975 using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>; 976 using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>; 977 #elif _GLIBCXX_SIMD_HAVE_NEON 978 using _CommonImpl = _CommonImplNeon; 979 using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>; 980 using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>; 981 #else 982 using _CommonImpl = _CommonImplBuiltin; 983 #ifdef __ALTIVEC__ 984 using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>; 985 using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>; 986 #else 987 using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>; 988 using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>; 989 #endif 990 #endif 991 992 // }}} 993 // __traits {{{ 994 template
995 using _MaskValueType = __int_for_sizeof_t<_Tp>; 996 997 template
998 using __traits 999 = conditional_t<_S_is_valid_v<_Tp>, 1000 _GnuTraits<_Tp, _MaskValueType<_Tp>, 1001 _VecBuiltin<_UsedBytes>, _S_size<_Tp>>, 1002 _InvalidTraits>; 1003 1004 //}}} 1005 // size metadata {{{ 1006 template
1007 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 1008 1009 template
1010 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 1011 1012 // }}} 1013 // implicit masks {{{ 1014 template
1015 using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>; 1016 1017 template
1018 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1019 _S_implicit_mask() 1020 { 1021 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 1022 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 1023 return ~_UV(); 1024 else 1025 { 1026 constexpr auto __size = _S_size<_Tp>; 1027 _GLIBCXX_SIMD_USE_CONSTEXPR auto __r 1028 = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 1029 { return __i < __size ? -1 : 0; }); 1030 return __r; 1031 } 1032 } 1033 1034 template
1035 _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>> 1036 _S_implicit_mask_intrin() 1037 { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); } 1038 1039 template
> 1040 _GLIBCXX_SIMD_INTRINSIC static constexpr _TW 1041 _S_masked(_TW __x) 1042 { 1043 using _Tp = typename _TVT::value_type; 1044 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 1045 return __x; 1046 else 1047 return __and(__as_vector(__x), 1048 __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>())); 1049 } 1050 1051 template
> 1052 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1053 __make_padding_nonzero(_TW __x) 1054 { 1055 using _Tp = typename _TVT::value_type; 1056 if constexpr (!_S_is_partial<_Tp>) 1057 return __x; 1058 else 1059 { 1060 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask 1061 = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()); 1062 if constexpr (is_integral_v<_Tp>) 1063 return __or(__x, ~__implicit_mask); 1064 else 1065 { 1066 _GLIBCXX_SIMD_USE_CONSTEXPR auto __one 1067 = __andnot(__implicit_mask, 1068 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))); 1069 // it's not enough to return `x | 1_in_padding` because the 1070 // padding in x might be inf or nan (independent of 1071 // __FINITE_MATH_ONLY__, because it's about padding bits) 1072 return __or(__and(__x, __implicit_mask), __one); 1073 } 1074 } 1075 } 1076 // }}} 1077 }; 1078 1079 // }}} 1080 // simd_abi::_VecBltnBtmsk {{{ 1081 template
1082 struct simd_abi::_VecBltnBtmsk 1083 { 1084 template
1085 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 1086 1087 // validity traits {{{ 1088 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 1089 1090 template
1091 struct _IsValidSizeFor 1092 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 1093 && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64 1094 && (_UsedBytes > 32 || __have_avx512vl))> {}; 1095 1096 // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4 the AVX512BW is also 1097 // required. 1098 template
1099 struct _IsValid 1100 : conjunction< 1101 _IsValidAbiTag, __bool_constant<__have_avx512f>, 1102 __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>, 1103 __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>, 1104 _IsValidSizeFor<_Tp>> {}; 1105 1106 template
1107 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 1108 1109 // }}} 1110 // simd/_MaskImpl {{{ 1111 #if _GLIBCXX_SIMD_X86INTRIN 1112 using _CommonImpl = _CommonImplX86; 1113 using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>; 1114 using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>; 1115 #else 1116 template
1117 struct _MissingImpl; 1118 1119 using _CommonImpl = _MissingImpl<_UsedBytes>; 1120 using _SimdImpl = _MissingImpl<_UsedBytes>; 1121 using _MaskImpl = _MissingImpl<_UsedBytes>; 1122 #endif 1123 1124 // }}} 1125 // __traits {{{ 1126 template
1127 using _MaskMember = _SimdWrapper
>; 1128 1129 template
1130 using __traits = conditional_t< 1131 _S_is_valid_v<_Tp>, 1132 _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>, 1133 _InvalidTraits>; 1134 1135 //}}} 1136 // size metadata {{{ 1137 template
1138 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 1139 template
1140 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 1141 1142 // }}} 1143 // implicit mask {{{ 1144 private: 1145 template
1146 using _ImplicitMask = _SimdWrapper
>; 1147 1148 public: 1149 template
1150 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np> 1151 __implicit_mask_n() 1152 { 1153 using _Tp = __bool_storage_member_type_t<_Np>; 1154 return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp(); 1155 } 1156 1157 template
1158 _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp> 1159 _S_implicit_mask() 1160 { return __implicit_mask_n<_S_size<_Tp>>(); } 1161 1162 template
1163 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>> 1164 _S_implicit_mask_intrin() 1165 { return __implicit_mask_n<_S_size<_Tp>>(); } 1166 1167 template
1168 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1169 _S_masked(_SimdWrapper<_Tp, _Np> __x) 1170 { 1171 if constexpr (is_same_v<_Tp, bool>) 1172 if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0) 1173 return _MaskImpl::_S_bit_and( 1174 __x, _SimdWrapper<_Tp, _Np>( 1175 __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1))); 1176 else 1177 return __x; 1178 else 1179 return _S_masked(__x._M_data); 1180 } 1181 1182 template
1183 _GLIBCXX_SIMD_INTRINSIC static constexpr _TV 1184 _S_masked(_TV __x) 1185 { 1186 using _Tp = typename _VectorTraits<_TV>::value_type; 1187 static_assert( 1188 !__is_bitmask_v<_TV>, 1189 "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't " 1190 "know the number of elements. Use _SimdWrapper
instead."); 1191 if constexpr (_S_is_partial<_Tp>) 1192 { 1193 constexpr size_t _Np = _S_size<_Tp>; 1194 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1195 _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(), 1196 _SimdWrapper<_Tp, _Np>(__x)); 1197 } 1198 else 1199 return __x; 1200 } 1201 1202 template
> 1203 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1204 __make_padding_nonzero(_TV __x) 1205 { 1206 using _Tp = typename _TVT::value_type; 1207 if constexpr (!_S_is_partial<_Tp>) 1208 return __x; 1209 else 1210 { 1211 constexpr size_t _Np = _S_size<_Tp>; 1212 if constexpr (is_integral_v
) 1213 return __x 1214 | __generate_vector<_Tp, _S_full_size<_Tp>>( 1215 [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp { 1216 if (__i < _Np) 1217 return 0; 1218 else 1219 return 1; 1220 }); 1221 else 1222 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1223 _S_implicit_mask<_Tp>(), 1224 _SimdWrapper<_Tp, _Np>( 1225 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))), 1226 _SimdWrapper<_Tp, _Np>(__x)) 1227 ._M_data; 1228 } 1229 } 1230 1231 // }}} 1232 }; 1233 1234 //}}} 1235 // _CommonImplBuiltin {{{ 1236 struct _CommonImplBuiltin 1237 { 1238 // _S_converts_via_decomposition{{{ 1239 // This lists all cases where a __vector_convert needs to fall back to 1240 // conversion of individual scalars (i.e. decompose the input vector into 1241 // scalars, convert, compose output vector). In those cases, _S_masked_load & 1242 // _S_masked_store prefer to use the _S_bit_iteration implementation. 1243 template
1244 static inline constexpr bool __converts_via_decomposition_v 1245 = sizeof(_From) != sizeof(_To); 1246 1247 // }}} 1248 // _S_load{{{ 1249 template
1250 _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np> 1251 _S_load(const void* __p) 1252 { 1253 static_assert(_Np > 1); 1254 static_assert(_Bytes % sizeof(_Tp) == 0); 1255 using _Rp = __vector_type_t<_Tp, _Np>; 1256 if constexpr (sizeof(_Rp) == _Bytes) 1257 { 1258 _Rp __r; 1259 __builtin_memcpy(&__r, __p, _Bytes); 1260 return __r; 1261 } 1262 else 1263 { 1264 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 1265 using _Up = conditional_t< 1266 is_integral_v<_Tp>, 1267 conditional_t<_Bytes % 4 == 0, 1268 conditional_t<_Bytes % 8 == 0, long long, int>, 1269 conditional_t<_Bytes % 2 == 0, short, signed char>>, 1270 conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp, 1271 double>>; 1272 using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>; 1273 if constexpr (sizeof(_V) != sizeof(_Rp)) 1274 { // on i386 with 4 < _Bytes <= 8 1275 _Rp __r{}; 1276 __builtin_memcpy(&__r, __p, _Bytes); 1277 return __r; 1278 } 1279 else 1280 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 1281 using _V = _Rp; 1282 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1283 { 1284 _V __r{}; 1285 static_assert(_Bytes <= sizeof(_V)); 1286 __builtin_memcpy(&__r, __p, _Bytes); 1287 return reinterpret_cast<_Rp>(__r); 1288 } 1289 } 1290 } 1291 1292 // }}} 1293 // _S_store {{{ 1294 template
1295 _GLIBCXX_SIMD_INTRINSIC static void 1296 _S_store(_TV __x, void* __addr) 1297 { 1298 constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes; 1299 static_assert(sizeof(__x) >= _Bytes); 1300 1301 if constexpr (__is_vector_type_v<_TV>) 1302 { 1303 using _Tp = typename _VectorTraits<_TV>::value_type; 1304 constexpr size_t _Np = _Bytes / sizeof(_Tp); 1305 static_assert(_Np * sizeof(_Tp) == _Bytes); 1306 1307 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 1308 using _Up = conditional_t< 1309 (is_integral_v<_Tp> || _Bytes < 4), 1310 conditional_t<(sizeof(__x) > sizeof(long long)), long long, _Tp>, 1311 float>; 1312 const auto __v = __vector_bitcast<_Up>(__x); 1313 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 1314 const __vector_type_t<_Tp, _Np> __v = __x; 1315 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1316 1317 if constexpr ((_Bytes & (_Bytes - 1)) != 0) 1318 { 1319 constexpr size_t _MoreBytes = std::__bit_ceil(_Bytes); 1320 alignas(decltype(__v)) char __tmp[_MoreBytes]; 1321 __builtin_memcpy(__tmp, &__v, _MoreBytes); 1322 __builtin_memcpy(__addr, __tmp, _Bytes); 1323 } 1324 else 1325 __builtin_memcpy(__addr, &__v, _Bytes); 1326 } 1327 else 1328 __builtin_memcpy(__addr, &__x, _Bytes); 1329 } 1330 1331 template
1332 _GLIBCXX_SIMD_INTRINSIC static void 1333 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) 1334 { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); } 1335 1336 // }}} 1337 // _S_store_bool_array(_BitMask) {{{ 1338 template
1339 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1340 _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem) 1341 { 1342 if constexpr (_Np == 1) 1343 __mem[0] = __x[0]; 1344 else if (__builtin_is_constant_evaluated()) 1345 { 1346 for (size_t __i = 0; __i < _Np; ++__i) 1347 __mem[__i] = __x[__i]; 1348 } 1349 else if constexpr (_Np == 2) 1350 { 1351 short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101; 1352 _S_store<_Np>(__bool2, __mem); 1353 } 1354 else if constexpr (_Np == 3) 1355 { 1356 int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101; 1357 _S_store<_Np>(__bool3, __mem); 1358 } 1359 else 1360 { 1361 __execute_n_times<__div_roundup(_Np, 4)>( 1362 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1363 constexpr int __offset = __i * 4; 1364 constexpr int __remaining = _Np - __offset; 1365 if constexpr (__remaining > 4 && __remaining <= 7) 1366 { 1367 const _ULLong __bool7 1368 = (__x.template _M_extract<__offset>()._M_to_bits() 1369 * 0x40810204081ULL) 1370 & 0x0101010101010101ULL; 1371 _S_store<__remaining>(__bool7, __mem + __offset); 1372 } 1373 else if constexpr (__remaining >= 4) 1374 { 1375 int __bits = __x.template _M_extract<__offset>()._M_to_bits(); 1376 if constexpr (__remaining > 7) 1377 __bits &= 0xf; 1378 const int __bool4 = (__bits * 0x204081) & 0x01010101; 1379 _S_store<4>(__bool4, __mem + __offset); 1380 } 1381 }); 1382 } 1383 } 1384 1385 // }}} 1386 // _S_blend{{{ 1387 template
1388 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1389 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, 1390 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) 1391 { return __k._M_data ? __at1._M_data : __at0._M_data; } 1392 1393 // }}} 1394 }; 1395 1396 // }}} 1397 // _SimdImplBuiltin {{{1 1398 template
1399 struct _SimdImplBuiltin 1400 { 1401 // member types {{{2 1402 template
1403 static constexpr size_t _S_max_store_size = 16; 1404 1405 using abi_type = _Abi; 1406 1407 template
1408 using _TypeTag = _Tp*; 1409 1410 template
1411 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 1412 1413 template
1414 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 1415 1416 template
1417 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; 1418 1419 template
1420 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; 1421 1422 using _CommonImpl = typename _Abi::_CommonImpl; 1423 using _SuperImpl = typename _Abi::_SimdImpl; 1424 using _MaskImpl = typename _Abi::_MaskImpl; 1425 1426 // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2 1427 template
1428 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1429 _M_make_simd(_SimdWrapper<_Tp, _Np> __x) 1430 { return {__private_init, __x}; } 1431 1432 template
1433 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1434 _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x) 1435 { return {__private_init, __vector_bitcast<_Tp>(__x)}; } 1436 1437 // _S_broadcast {{{2 1438 template
1439 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1440 _S_broadcast(_Tp __x) noexcept 1441 { return __vector_broadcast<_S_full_size<_Tp>>(__x); } 1442 1443 // _S_generator {{{2 1444 template
1445 inline static constexpr _SimdMember<_Tp> 1446 _S_generator(_Fp&& __gen, _TypeTag<_Tp>) 1447 { 1448 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1449 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1450 if constexpr (__i < _S_size<_Tp>) 1451 return __gen(__i); 1452 else 1453 return 0; 1454 }); 1455 } 1456 1457 // _S_load {{{2 1458 template
1459 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1460 _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept 1461 { 1462 constexpr size_t _Np = _S_size<_Tp>; 1463 constexpr size_t __max_load_size 1464 = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64 1465 : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32 1466 : 16; 1467 constexpr size_t __bytes_to_load = sizeof(_Up) * _Np; 1468 if (__builtin_is_constant_evaluated()) 1469 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1470 [&](auto __i) constexpr { 1471 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1472 }); 1473 else if constexpr (sizeof(_Up) > 8) 1474 return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>( 1475 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1476 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1477 }); 1478 else if constexpr (is_same_v<_Up, _Tp>) 1479 return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>, 1480 _Np * sizeof(_Tp)>(__mem); 1481 else if constexpr (__bytes_to_load <= __max_load_size) 1482 return __convert<_SimdMember<_Tp>>( 1483 _CommonImpl::template _S_load<_Up, _Np>(__mem)); 1484 else if constexpr (__bytes_to_load % __max_load_size == 0) 1485 { 1486 constexpr size_t __n_loads = __bytes_to_load / __max_load_size; 1487 constexpr size_t __elements_per_load = _Np / __n_loads; 1488 return __call_with_n_evaluations<__n_loads>( 1489 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1490 return __convert<_SimdMember<_Tp>>(__uncvted...); 1491 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1492 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1493 __mem + __i * __elements_per_load); 1494 }); 1495 } 1496 else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0 1497 && __max_load_size > 16) 1498 { // e.g. int[] ->
with AVX2 1499 constexpr size_t __n_loads 1500 = __bytes_to_load / (__max_load_size / 2); 1501 constexpr size_t __elements_per_load = _Np / __n_loads; 1502 return __call_with_n_evaluations<__n_loads>( 1503 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1504 return __convert<_SimdMember<_Tp>>(__uncvted...); 1505 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1506 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1507 __mem + __i * __elements_per_load); 1508 }); 1509 } 1510 else // e.g. int[] ->
1511 return __call_with_subscripts( 1512 __mem, make_index_sequence<_Np>(), 1513 [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1514 return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...}; 1515 }); 1516 } 1517 1518 // _S_masked_load {{{2 1519 template
1520 static constexpr inline _SimdWrapper<_Tp, _Np> 1521 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 1522 const _Up* __mem) noexcept 1523 { 1524 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1525 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1526 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1527 }); 1528 return __merge; 1529 } 1530 1531 // _S_store {{{2 1532 template
1533 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1534 _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept 1535 { 1536 // TODO: converting int -> "smaller int" can be optimized with AVX512 1537 constexpr size_t _Np = _S_size<_Tp>; 1538 constexpr size_t __max_store_size 1539 = _SuperImpl::template _S_max_store_size<_Up>; 1540 if (__builtin_is_constant_evaluated()) 1541 { 1542 for (size_t __i = 0; __i < _Np; ++__i) 1543 __mem[__i] = __v[__i]; 1544 } 1545 else if constexpr (sizeof(_Up) > 8) 1546 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1547 __mem[__i] = __v[__i]; 1548 }); 1549 else if constexpr (is_same_v<_Up, _Tp>) 1550 _CommonImpl::_S_store(__v, __mem); 1551 else if constexpr (sizeof(_Up) * _Np <= __max_store_size) 1552 _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)), 1553 __mem); 1554 else 1555 { 1556 constexpr size_t __vsize = __max_store_size / sizeof(_Up); 1557 // round up to convert the last partial vector as well: 1558 constexpr size_t __stores = __div_roundup(_Np, __vsize); 1559 constexpr size_t __full_stores = _Np / __vsize; 1560 using _V = __vector_type_t<_Up, __vsize>; 1561 const array<_V, __stores> __converted 1562 = __convert_all<_V, __stores>(__v); 1563 __execute_n_times<__full_stores>( 1564 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1565 _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); 1566 }); 1567 if constexpr (__full_stores < __stores) 1568 _CommonImpl::template _S_store<(_Np - __full_stores * __vsize) 1569 * sizeof(_Up)>( 1570 __converted[__full_stores], __mem + __full_stores * __vsize); 1571 } 1572 } 1573 1574 // _S_masked_store_nocvt {{{2 1575 template
1576 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1577 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) 1578 { 1579 _BitOps::_S_bit_iteration( 1580 _MaskImpl::_S_to_bits(__k), 1581 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1582 __mem[__i] = __v[__i]; 1583 }); 1584 } 1585 1586 // _S_masked_store {{{2 1587 template
, 1588 typename _Tp = typename _TVT::value_type, typename _Up> 1589 static constexpr inline void 1590 _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept 1591 { 1592 constexpr size_t _TV_size = _S_size<_Tp>; 1593 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1594 constexpr size_t __max_store_size 1595 = _SuperImpl::template _S_max_store_size<_Up>; 1596 if constexpr ( 1597 is_same_v< 1598 _Tp, 1599 _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up))) 1600 { 1601 // bitwise or no conversion, reinterpret: 1602 const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1603 if constexpr (__is_bitmask_v
) 1604 return _MaskMember<_Up>(__k._M_data); 1605 else 1606 return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k); 1607 }(); 1608 _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v), 1609 __mem, __kk); 1610 } 1611 else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up) 1612 && !_CommonImpl:: 1613 template __converts_via_decomposition_v< 1614 _Tp, _Up, __max_store_size>) 1615 { // conversion via decomposition is better handled via the 1616 // bit_iteration 1617 // fallback below 1618 constexpr size_t _UW_size 1619 = std::min(_TV_size, __max_store_size / sizeof(_Up)); 1620 static_assert(_UW_size <= _TV_size); 1621 using _UW = _SimdWrapper<_Up, _UW_size>; 1622 using _UV = __vector_type_t<_Up, _UW_size>; 1623 using _UAbi = simd_abi::deduce_t<_Up, _UW_size>; 1624 if constexpr (_UW_size == _TV_size) // one convert+store 1625 { 1626 const _UW __converted = __convert<_UW>(__v); 1627 _SuperImpl::_S_masked_store_nocvt( 1628 __converted, __mem, 1629 _UAbi::_MaskImpl::template _S_convert< 1630 __int_for_sizeof_t<_Up>>(__k)); 1631 } 1632 else 1633 { 1634 static_assert(_UW_size * sizeof(_Up) == __max_store_size); 1635 constexpr size_t _NFullStores = _TV_size / _UW_size; 1636 constexpr size_t _NAllStores 1637 = __div_roundup(_TV_size, _UW_size); 1638 constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size; 1639 const array<_UV, _NAllStores> __converted 1640 = __convert_all<_UV, _NAllStores>(__v); 1641 __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1642 _SuperImpl::_S_masked_store_nocvt( 1643 _UW(__converted[__i]), __mem + __i * _UW_size, 1644 _UAbi::_MaskImpl::template _S_convert< 1645 __int_for_sizeof_t<_Up>>( 1646 __extract_part<__i, _NParts>(__k.__as_full_vector()))); 1647 }); 1648 if constexpr (_NAllStores 1649 > _NFullStores) // one partial at the end 1650 _SuperImpl::_S_masked_store_nocvt( 1651 _UW(__converted[_NFullStores]), 1652 __mem + _NFullStores * _UW_size, 1653 _UAbi::_MaskImpl::template _S_convert< 1654 __int_for_sizeof_t<_Up>>( 1655 __extract_part<_NFullStores, _NParts>( 1656 __k.__as_full_vector()))); 1657 } 1658 } 1659 else 1660 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1661 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1662 __mem[__i] = static_cast<_Up>(__v[__i]); 1663 }); 1664 } 1665 1666 // _S_complement {{{2 1667 template
<typename _Tp, size_t _Np>
1668 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1669 _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1670 { return ~__x._M_data; }
1671
1672 // _S_unary_minus {{{2
1673 template <typename _Tp, size_t _Np>
1674 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1675 _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1676 {
1677 // GCC doesn't use the psign instructions, but pxor & psub seem to be
1678 // just as good a choice as pcmpeqd & psign. So meh.
1679 return -__x._M_data;
1680 }
1681
1682 // arithmetic operators {{{2
1683 template <typename _Tp, size_t _Np>
1684 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1685 _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1686 { return __x._M_data + __y._M_data; }
1687
1688 template <typename _Tp, size_t _Np>
1689 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1690 _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1691 { return __x._M_data - __y._M_data; }
1692
1693 template <typename _Tp, size_t _Np>
1694 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1695 _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1696 { return __x._M_data * __y._M_data; }
1697
1698 template <typename _Tp, size_t _Np>
1699 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1700 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1701 {
1702 // Note that division by 0 is always UB, so we must ensure we avoid the
1703 // case for partial registers
1704 if constexpr (!_Abi::template _S_is_partial<_Tp>)
1705 return __x._M_data / __y._M_data;
1706 else
1707 return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1708 }
1709
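A user-level illustration of why _S_divides guards the padding lanes: with an odd element count such as fixed_size_simd<int, 3>, the backing register has unused lanes, and the implementation substitutes a nonzero divisor there so the hardware division never divides by zero. A hedged sketch, assuming GCC 11+ with <experimental/simd>:

#include <experimental/simd>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<int, 3> num([](int i) { return 10 * (i + 1); }); // 10 20 30
  stdx::fixed_size_simd<int, 3> den([](int i) { return i + 1; });        // 1 2 3
  auto q = num / den;   // {10, 10, 10}; the unused padding lane is never divided by 0
  return q[0] == 10 ? 0 : 1;
}
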
1710 template <typename _Tp, size_t _Np>
1711 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1712 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1713 {
1714 if constexpr (!_Abi::template _S_is_partial<_Tp>)
1715 return __x._M_data % __y._M_data;
1716 else
1717 return __as_vector(__x)
1718 % _Abi::__make_padding_nonzero(__as_vector(__y));
1719 }
1720
1721 template <typename _Tp, size_t _Np>
1722 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1723 _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1724 { return __and(__x, __y); } 1725 1726 template
1727 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1728 _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1729 { return __or(__x, __y); } 1730 1731 template
1732 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1733 _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1734 { return __xor(__x, __y); } 1735 1736 template
1737 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 1738 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1739 { return __x._M_data << __y._M_data; } 1740 1741 template
1742 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 1743 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1744 { return __x._M_data >> __y._M_data; } 1745 1746 template
1747 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1748 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y) 1749 { return __x._M_data << __y; } 1750 1751 template
<typename _Tp, size_t _Np>
1752 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1753 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1754 { return __x._M_data >> __y; }
1755
1756 // compares {{{2
1757 // _S_equal_to {{{3
1758 template <typename _Tp, size_t _Np>
1759 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1760 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1761 { return __x._M_data == __y._M_data; }
1762
1763 // _S_not_equal_to {{{3
1764 template <typename _Tp, size_t _Np>
1765 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1766 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1767 { return __x._M_data != __y._M_data; }
1768
1769 // _S_less {{{3
1770 template <typename _Tp, size_t _Np>
1771 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1772 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1773 { return __x._M_data < __y._M_data; }
1774
1775 // _S_less_equal {{{3
1776 template <typename _Tp, size_t _Np>
1777 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1778 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1779 { return __x._M_data <= __y._M_data; }
1780
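The element-wise compares above produce simd_mask values; user code typically consumes them with the mask reductions from <experimental/simd>. A small sketch (assuming GCC 11+), not part of this header:

#include <experimental/simd>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<float, 4> x([](int i) { return i - 1.5f; }); // -1.5 -0.5 0.5 1.5
  auto negative = x < 0.f;                      // simd_mask
  std::printf("any: %d  all: %d  count: %d\n",
              int(stdx::any_of(negative)),      // 1
              int(stdx::all_of(negative)),      // 0
              stdx::popcount(negative));        // 2
  return 0;
}
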
1781 // _S_negate {{{2
1782 template <typename _Tp, size_t _Np>
1783 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1784 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1785 { return !__x._M_data; }
1786
1787 // _S_min, _S_max, _S_minmax {{{2
1788 template <typename _Tp, size_t _Np>
1789 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1790 _SimdWrapper<_Tp, _Np>
1791 _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1792 { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1793
1794 template <typename _Tp, size_t _Np>
1795 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1796 _SimdWrapper<_Tp, _Np>
1797 _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1798 { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1799
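_S_min/_S_max/_S_minmax back the element-wise min(), max(), minmax() and clamp() overloads for simd, and _S_reduce further below backs reduce(). A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
#include <functional>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<float, 4> a([](int i) { return float(i); }); // 0 1 2 3
  stdx::fixed_size_simd<float, 4> b(2.5f);                           // broadcast
  auto lo = stdx::min(a, b);                                         // 0 1 2 2.5
  auto hi = stdx::max(a, b);                                         // 2.5 2.5 2.5 3
  auto cl = stdx::clamp(a, stdx::fixed_size_simd<float, 4>(1.f), b); // 1 1 2 2.5
  float sum = stdx::reduce(a, std::plus<>());                        // 6
  return (lo[0] == 0.f && hi[3] == 3.f && cl[0] == 1.f && sum == 6.f) ? 0 : 1;
}
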
1800 template <typename _Tp, size_t _Np>
1801 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1802 pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1803 _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1804 {
1805 return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1806 __a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1807 }
1808
1809 // reductions {{{2
1810 template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1811 typename _BinaryOperation>
1812 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp 1813 _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>, 1814 simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1815 { 1816 using _V = __vector_type_t<_Tp, _Np / 2>; 1817 static_assert(sizeof(_V) <= sizeof(__x)); 1818 // _S_full_size is the size of the smallest native SIMD register that 1819 // can store _Np/2 elements: 1820 using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>; 1821 using _HalfSimd = __deduced_simd<_Tp, _Np / 2>; 1822 const auto __xx = __as_vector(__x); 1823 return _HalfSimd::abi_type::_SimdImpl::_S_reduce( 1824 static_cast<_HalfSimd>(__as_vector(__binary_op( 1825 static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)), 1826 static_cast<_FullSimd>(__intrin_bitcast<_V>( 1827 __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>( 1828 __xx)))))), 1829 __binary_op); 1830 } 1831 1832 template
1833 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp 1834 _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1835 { 1836 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 1837 if constexpr (_Np == 1) 1838 return __x[0]; 1839 else if constexpr (_Np == 2) 1840 return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1841 simd<_Tp, simd_abi::scalar>(__x[1]))[0]; 1842 else if (__builtin_is_constant_evaluated()) 1843 { 1844 simd<_Tp, simd_abi::scalar> __acc = __x[0]; 1845 for (size_t __i = 1; __i < _Np; ++__i) 1846 __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i])); 1847 return __acc[0]; 1848 } 1849 else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{ 1850 { 1851 [[maybe_unused]] constexpr auto __full_size 1852 = _Abi::template _S_full_size<_Tp>; 1853 if constexpr (_Np == 3) 1854 return __binary_op( 1855 __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1856 simd<_Tp, simd_abi::scalar>(__x[1])), 1857 simd<_Tp, simd_abi::scalar>(__x[2]))[0]; 1858 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1859 plus<>>) 1860 { 1861 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1862 return _Ap::_SimdImpl::_S_reduce( 1863 simd<_Tp, _Ap>(__private_init, 1864 _Abi::_S_masked(__as_vector(__x))), 1865 __binary_op); 1866 } 1867 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1868 multiplies<>>) 1869 { 1870 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1871 using _TW = _SimdWrapper<_Tp, __full_size>; 1872 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full 1873 = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector(); 1874 _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one 1875 = __vector_broadcast<__full_size>(_Tp(1)); 1876 const _TW __x_full = __data(__x).__as_full_vector(); 1877 const _TW __x_padded_with_ones 1878 = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one, 1879 __x_full); 1880 return _Ap::_SimdImpl::_S_reduce( 1881 simd<_Tp, _Ap>(__private_init, __x_padded_with_ones), 1882 __binary_op); 1883 } 1884 else if constexpr (_Np & 1) 1885 { 1886 using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>; 1887 return __binary_op( 1888 simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce( 1889 simd<_Tp, _Ap>( 1890 __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>( 1891 __as_vector(__x))), 1892 __binary_op)), 1893 simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0]; 1894 } 1895 else 1896 return _S_reduce_partial<_Np>( 1897 make_index_sequence<_Np / 2>(), 1898 make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op); 1899 } //}}} 1900 else if constexpr (sizeof(__x) == 16) //{{{ 1901 { 1902 if constexpr (_Np == 16) 1903 { 1904 const auto __y = __data(__x); 1905 __x = __binary_op( 1906 _M_make_simd<_Tp, _Np>( 1907 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1908 7, 7>(__y)), 1909 _M_make_simd<_Tp, _Np>( 1910 __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 1911 14, 14, 15, 15>(__y))); 1912 } 1913 if constexpr (_Np >= 8) 1914 { 1915 const auto __y = __vector_bitcast
<short>(__data(__x));
1916 __x = __binary_op(
1917 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1918 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))),
1919 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1920 __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y))));
1921 }
1922 if constexpr (_Np >= 4)
1923 {
1924 using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1925 const auto __y = __vector_bitcast<_Up>(__data(__x));
1926 __x = __binary_op(__x,
1927 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1928 __vector_permute<3, 2, 1, 0>(__y))));
1929 }
1930 using _Up = conditional_t<is_floating_point_v<_Tp>
, double, _LLong>; 1931 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1932 __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1933 __vector_permute<1, 1>(__y)))); 1934 return __x[0]; 1935 } //}}} 1936 else 1937 { 1938 static_assert(sizeof(__x) > __min_vector_size<_Tp>); 1939 static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2 1940 using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>; 1941 using _V = simd<_Tp, _Ap>; 1942 return _Ap::_SimdImpl::_S_reduce( 1943 __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))), 1944 _V(__private_init, 1945 __extract<1, 2>(__as_vector(__x)))), 1946 static_cast<_BinaryOperation&&>(__binary_op)); 1947 } 1948 } 1949 1950 // math {{{2 1951 // frexp, modf and copysign implemented in simd_math.h 1952 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \ 1953 template
<typename _Tp, typename... _More> \
1954 static _Tp \
1955 _S_##__name(const _Tp& __x, const _More&... __more) \
1956 { \
1957 return __generate_vector<_Tp>( \
1958 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1959 return __name(__x[__i], __more[__i]...); \
1960 }); \
1961 }
1962
1963 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
1964 template <typename _Tp, typename... _More> \
1965 static typename _Tp::mask_type \
1966 _S_##__name(const _Tp& __x, const _More&... __more) \
1967 { \
1968 return __generate_vector<_Tp>( \
1969 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1970 return __name(__x[__i], __more[__i]...); \
1971 }); \
1972 }
1973
1974 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
1975 template <typename _Tp, typename... _More>
\ 1976 static auto \ 1977 _S_##__name(const _Tp& __x, const _More&... __more) \ 1978 { \ 1979 return __fixed_size_storage_t<_RetTp, \ 1980 _VectorTraits<_Tp>::_S_partial_width>:: \ 1981 _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1982 return __meta._S_generator( \ 1983 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1984 return __name(__x[__meta._S_offset + __i], \ 1985 __more[__meta._S_offset + __i]...); \ 1986 }, \ 1987 static_cast<_RetTp*>(nullptr)); \ 1988 }); \ 1989 } 1990 1991 _GLIBCXX_SIMD_MATH_FALLBACK(acos) 1992 _GLIBCXX_SIMD_MATH_FALLBACK(asin) 1993 _GLIBCXX_SIMD_MATH_FALLBACK(atan) 1994 _GLIBCXX_SIMD_MATH_FALLBACK(atan2) 1995 _GLIBCXX_SIMD_MATH_FALLBACK(cos) 1996 _GLIBCXX_SIMD_MATH_FALLBACK(sin) 1997 _GLIBCXX_SIMD_MATH_FALLBACK(tan) 1998 _GLIBCXX_SIMD_MATH_FALLBACK(acosh) 1999 _GLIBCXX_SIMD_MATH_FALLBACK(asinh) 2000 _GLIBCXX_SIMD_MATH_FALLBACK(atanh) 2001 _GLIBCXX_SIMD_MATH_FALLBACK(cosh) 2002 _GLIBCXX_SIMD_MATH_FALLBACK(sinh) 2003 _GLIBCXX_SIMD_MATH_FALLBACK(tanh) 2004 _GLIBCXX_SIMD_MATH_FALLBACK(exp) 2005 _GLIBCXX_SIMD_MATH_FALLBACK(exp2) 2006 _GLIBCXX_SIMD_MATH_FALLBACK(expm1) 2007 _GLIBCXX_SIMD_MATH_FALLBACK(ldexp) 2008 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) 2009 _GLIBCXX_SIMD_MATH_FALLBACK(log) 2010 _GLIBCXX_SIMD_MATH_FALLBACK(log10) 2011 _GLIBCXX_SIMD_MATH_FALLBACK(log1p) 2012 _GLIBCXX_SIMD_MATH_FALLBACK(log2) 2013 _GLIBCXX_SIMD_MATH_FALLBACK(logb) 2014 2015 // modf implemented in simd_math.h 2016 _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) 2017 _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) 2018 _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) 2019 _GLIBCXX_SIMD_MATH_FALLBACK(fabs) 2020 _GLIBCXX_SIMD_MATH_FALLBACK(pow) 2021 _GLIBCXX_SIMD_MATH_FALLBACK(sqrt) 2022 _GLIBCXX_SIMD_MATH_FALLBACK(erf) 2023 _GLIBCXX_SIMD_MATH_FALLBACK(erfc) 2024 _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) 2025 _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) 2026 2027 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) 2028 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) 2029 2030 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) 2031 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) 2032 2033 _GLIBCXX_SIMD_MATH_FALLBACK(fmod) 2034 _GLIBCXX_SIMD_MATH_FALLBACK(remainder) 2035 2036 template
<typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2037 static _Tp
2038 _S_remquo(const _Tp __x, const _Tp __y,
2039 __fixed_size_storage_t<int, _TVT::_S_partial_width>
* __z) 2040 { 2041 return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2042 int __tmp; 2043 auto __r = remquo(__x[__i], __y[__i], &__tmp); 2044 __z->_M_set(__i, __tmp); 2045 return __r; 2046 }); 2047 } 2048 2049 // copysign in simd_math.h 2050 _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) 2051 _GLIBCXX_SIMD_MATH_FALLBACK(fdim) 2052 _GLIBCXX_SIMD_MATH_FALLBACK(fmax) 2053 _GLIBCXX_SIMD_MATH_FALLBACK(fmin) 2054 _GLIBCXX_SIMD_MATH_FALLBACK(fma) 2055 2056 template
2057 static constexpr _MaskMember<_Tp> 2058 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, 2059 _SimdWrapper<_Tp, _Np> __y) noexcept 2060 { 2061 using _Ip = __int_for_sizeof_t<_Tp>; 2062 const auto __xn = __vector_bitcast<_Ip>(__x); 2063 const auto __yn = __vector_bitcast<_Ip>(__y); 2064 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2065 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2066 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2067 __xp > __yp); 2068 } 2069 2070 template
2071 static constexpr _MaskMember<_Tp> 2072 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, 2073 _SimdWrapper<_Tp, _Np> __y) noexcept 2074 { 2075 using _Ip = __int_for_sizeof_t<_Tp>; 2076 const auto __xn = __vector_bitcast<_Ip>(__x); 2077 const auto __yn = __vector_bitcast<_Ip>(__y); 2078 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2079 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2080 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2081 __xp >= __yp); 2082 } 2083 2084 template
2085 static constexpr _MaskMember<_Tp> 2086 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept 2087 { 2088 using _Ip = __int_for_sizeof_t<_Tp>; 2089 const auto __xn = __vector_bitcast<_Ip>(__x); 2090 const auto __yn = __vector_bitcast<_Ip>(__y); 2091 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2092 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2093 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2094 __xp < __yp); 2095 } 2096 2097 template
2098 static constexpr _MaskMember<_Tp> 2099 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, 2100 _SimdWrapper<_Tp, _Np> __y) noexcept 2101 { 2102 using _Ip = __int_for_sizeof_t<_Tp>; 2103 const auto __xn = __vector_bitcast<_Ip>(__x); 2104 const auto __yn = __vector_bitcast<_Ip>(__y); 2105 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2106 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2107 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2108 __xp <= __yp); 2109 } 2110 2111 template
2112 static constexpr _MaskMember<_Tp> 2113 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, 2114 _SimdWrapper<_Tp, _Np> __y) noexcept 2115 { 2116 return __andnot(_SuperImpl::_S_isunordered(__x, __y), 2117 _SuperImpl::_S_not_equal_to(__x, __y)); 2118 } 2119 2120 #undef _GLIBCXX_SIMD_MATH_FALLBACK 2121 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET 2122 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET 2123 // _S_abs {{{3 2124 template
2125 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2126 _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept 2127 { 2128 // if (__builtin_is_constant_evaluated()) 2129 // { 2130 // return __x._M_data < 0 ? -__x._M_data : __x._M_data; 2131 // } 2132 if constexpr (is_floating_point_v<_Tp>) 2133 // `v < 0 ? -v : v` cannot compile to the efficient implementation of 2134 // masking the signbit off because it must consider v == -0 2135 2136 // ~(-0.) & v would be easy, but breaks with fno-signed-zeros 2137 return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data); 2138 else 2139 return __x._M_data < 0 ? -__x._M_data : __x._M_data; 2140 } 2141 2142 // }}}3 2143 // _S_plus_minus {{{ 2144 // Returns __x + __y - __y without -fassociative-math optimizing to __x. 2145 // - _TV must be __vector_type_t
<floating-point type, N>.
2146 // - _UV must be _TV or floating-point type.
2147 template <typename _TV, typename _UV>
2148 _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
2149 _S_plus_minus(_TV __x, _UV __y) noexcept
2150 {
2151 #if defined __i386__ && !defined __SSE_MATH__
2152 if constexpr (sizeof(__x) == 8)
2153 { // operations on __x would use the FPU
2154 static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
2155 const auto __x4 = __vector_bitcast<float, 4>(__x);
2156 if constexpr (is_same_v<_TV, _UV>)
2157 return __vector_bitcast<float, 2>(
2158 _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
2159 else
2160 return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
2161 }
2162 #endif
2163 #if !defined __clang__ && __GCC_IEC_559 == 0
2164 if (__builtin_is_constant_evaluated()
2165 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2166 return (__x + __y) - __y;
2167 else
2168 return [&] {
2169 __x += __y;
2170 if constexpr(__have_sse)
2171 {
2172 if constexpr (sizeof(__x) >= 16)
2173 asm("" : "+x"(__x));
2174 else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2175 asm("" : "+x"(__x[0]), "+x"(__x[1]));
2176 else
2177 __assert_unreachable<_TV>();
2178 }
2179 else if constexpr(__have_neon)
2180 asm("" : "+w"(__x));
2181 else if constexpr (__have_power_vmx)
2182 {
2183 if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2184 asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
2185 else
2186 asm("" : "+v"(__x));
2187 }
2188 else
2189 asm("" : "+g"(__x));
2190 return __x - __y;
2191 }();
2192 #else
2193 return (__x + __y) - __y;
2194 #endif
2195 }
2196
2197 // }}}
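The empty asm statements above only act as optimization barriers: they keep GCC from folding __x + __y - __y back to __x under -fassociative-math, which would defeat the "magic shifter" rounding used by _S_nearbyint and _S_trunc below. A scalar illustration of the same idea (volatile stands in for the barrier here); this is a sketch, not code from this header:

#include <cstdio>

int main()
{
  volatile double shifter = 4503599627370496.0; // 2^52; volatile keeps the
                                                // compiler from folding (x+s)-s to x
  double x = 2.5;
  double r = (x + shifter) - shifter;           // 2.0 under round-to-nearest-even;
                                                // valid for |x| < 2^52
  std::printf("%g -> %g\n", x, r);
  return 0;
}
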
2198 // _S_nearbyint {{{3
2199 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2200 _GLIBCXX_SIMD_INTRINSIC static _Tp
2201 _S_nearbyint(_Tp __x_) noexcept
2202 {
2203 using value_type = typename _TVT::value_type;
2204 using _V = typename _TVT::type;
2205 const _V __x = __x_;
2206 const _V __absx = __and(__x, _S_absmask<_V>);
2207 static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
2208 _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2209 = _V() + (1ull << (__digits_v<value_type> - 1));
2210 const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2211 const _V __shifted = _S_plus_minus(__x, __shifter);
2212 return __absx < __shifter_abs ? __shifted : __x;
2213 }
2214
2215 // _S_rint {{{3
2216 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>
>
2217 _GLIBCXX_SIMD_INTRINSIC static _Tp
2218 _S_rint(_Tp __x) noexcept
2219 { return _SuperImpl::_S_nearbyint(__x); }
2220
2221 // _S_trunc {{{3
2222 template <typename _Tp, size_t _Np>
2223 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2224 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2225 { 2226 using _V = __vector_type_t<_Tp, _Np>; 2227 const _V __absx = __and(__x._M_data, _S_absmask<_V>); 2228 static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>); 2229 constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1); 2230 _V __truncated = _S_plus_minus(__absx, __shifter); 2231 __truncated -= __truncated > __absx ? _V() + 1 : _V(); 2232 return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated) 2233 : __x._M_data; 2234 } 2235 2236 // _S_round {{{3 2237 template
2238 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2239 _S_round(_SimdWrapper<_Tp, _Np> __x) 2240 { 2241 const auto __abs_x = _SuperImpl::_S_abs(__x); 2242 const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data; 2243 const auto __r_abs // round(abs(x)) = 2244 = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0); 2245 return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs); 2246 } 2247 2248 // _S_floor {{{3 2249 template
2250 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2251 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2252 { 2253 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2254 const auto __negative_input 2255 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2256 const auto __mask 2257 = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2258 return __or(__andnot(__mask, __y), 2259 __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1))); 2260 } 2261 2262 // _S_ceil {{{3 2263 template
2264 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2265 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2266 { 2267 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2268 const auto __negative_input 2269 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2270 const auto __inv_mask 2271 = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2272 return __or(__and(__inv_mask, __y), 2273 __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1))); 2274 } 2275 2276 // _S_isnan {{{3 2277 template
2278 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2279 _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2280 { 2281 #if __FINITE_MATH_ONLY__ 2282 return {}; // false 2283 #elif !defined __SUPPORT_SNAN__ 2284 return ~(__x._M_data == __x._M_data); 2285 #elif defined __STDC_IEC_559__ 2286 using _Ip = __int_for_sizeof_t<_Tp>; 2287 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2288 const auto __infn 2289 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 2290 return __infn < __absn; 2291 #else 2292 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?" 2293 #endif 2294 } 2295 2296 // _S_isfinite {{{3 2297 template
2298 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2299 _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2300 { 2301 #if __FINITE_MATH_ONLY__ 2302 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 2303 _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV(); 2304 return __alltrue; 2305 #else 2306 // if all exponent bits are set, __x is either inf or NaN 2307 using _Ip = __int_for_sizeof_t<_Tp>; 2308 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2309 const auto __maxn 2310 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2311 return __absn <= __maxn; 2312 #endif 2313 } 2314 2315 // _S_isunordered {{{3 2316 template
<typename _Tp, size_t _Np>
2317 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2318 _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2319 { return __or(_S_isnan(__x), _S_isnan(__y)); }
2320
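These classifiers surface as the simd overloads of isnan(), isfinite(), signbit() and friends, each returning a simd_mask. A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
#include <limits>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<double, 4> x([](int i) {
    return i == 0 ? std::numeric_limits<double>::quiet_NaN() : -1.0 * i;
  });                                            // NaN, -1, -2, -3
  std::printf("NaNs: %d  finite: %d  negative: %d\n",
              stdx::popcount(stdx::isnan(x)),    // 1
              stdx::popcount(stdx::isfinite(x)), // 3
              stdx::popcount(stdx::signbit(x))); // 3 on typical targets where
                                                 // quiet_NaN() is positive
  return 0;
}
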
2321 // _S_signbit {{{3
2322 template <typename _Tp, size_t _Np>
2323 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2324 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2325 {
2326 using _Ip = __int_for_sizeof_t<_Tp>;
2327 return __vector_bitcast<_Ip>(__x) < 0;
2328 // Arithmetic right shift (SRA) would also work (instead of compare), but
2329 // 64-bit SRA isn't available on x86 before AVX512. And in general, 2330 // compares are more likely to be efficient than SRA.
2331 }
2332
2333 // _S_isinf {{{3
2334 template <typename _Tp, size_t _Np>
2335 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2336 _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2337 { 2338 #if __FINITE_MATH_ONLY__ 2339 return {}; // false 2340 #else 2341 return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x), 2342 __vector_broadcast<_Np>( 2343 __infinity_v<_Tp>)); 2344 // alternative: 2345 // compare to inf using the corresponding integer type 2346 /* 2347 return 2348 __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>( 2349 _S_abs(__x)._M_data) 2350 == 2351 __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>( 2352 __infinity_v<_Tp>))); 2353 */ 2354 #endif 2355 } 2356 2357 // _S_isnormal {{{3 2358 template
2359 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2360 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 2361 { 2362 using _Ip = __int_for_sizeof_t<_Tp>; 2363 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2364 const auto __minn 2365 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>)); 2366 #if __FINITE_MATH_ONLY__ 2367 return __absn >= __minn; 2368 #else 2369 const auto __maxn 2370 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2371 return __minn <= __absn && __absn <= __maxn; 2372 #endif 2373 } 2374 2375 // _S_fpclassify {{{3 2376 template
<typename _Tp, size_t _Np>
2377 _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2378 _S_fpclassify(_SimdWrapper<_Tp, _Np> __x) 2379 { 2380 using _I = __int_for_sizeof_t<_Tp>; 2381 const auto __xn 2382 = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x))); 2383 constexpr size_t _NI = sizeof(__xn) / sizeof(_I); 2384 _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn 2385 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>)); 2386 2387 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal 2388 = __vector_broadcast<_NI, _I>(FP_NORMAL); 2389 #if !__FINITE_MATH_ONLY__ 2390 _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn 2391 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>)); 2392 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan 2393 = __vector_broadcast<_NI, _I>(FP_NAN); 2394 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite 2395 = __vector_broadcast<_NI, _I>(FP_INFINITE); 2396 #endif 2397 #ifndef __FAST_MATH__ 2398 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal 2399 = __vector_broadcast<_NI, _I>(FP_SUBNORMAL); 2400 #endif 2401 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero 2402 = __vector_broadcast<_NI, _I>(FP_ZERO); 2403 2404 __vector_type_t<_I, _NI> 2405 __tmp = __xn < __minn 2406 #ifdef __FAST_MATH__ 2407 ? __fp_zero 2408 #else 2409 ? (__xn == 0 ? __fp_zero : __fp_subnormal) 2410 #endif 2411 #if __FINITE_MATH_ONLY__ 2412 : __fp_normal; 2413 #else 2414 : (__xn < __infn ? __fp_normal 2415 : (__xn == __infn ? __fp_infinite : __fp_nan)); 2416 #endif 2417 2418 if constexpr (sizeof(_I) == sizeof(int)) 2419 { 2420 using _FixedInt = __fixed_size_storage_t
<int, _Np>;
2421 const auto __as_int = __vector_bitcast<int>
(__tmp); 2422 if constexpr (_FixedInt::_S_tuple_size == 1) 2423 return {__as_int}; 2424 else if constexpr (_FixedInt::_S_tuple_size == 2 2425 && is_same_v< 2426 typename _FixedInt::_SecondType::_FirstAbi, 2427 simd_abi::scalar>) 2428 return {__extract<0, 2>(__as_int), __as_int[_Np - 1]}; 2429 else if constexpr (_FixedInt::_S_tuple_size == 2) 2430 return {__extract<0, 2>(__as_int), 2431 __auto_bitcast(__extract<1, 2>(__as_int))}; 2432 else 2433 __assert_unreachable<_Tp>(); 2434 } 2435 else if constexpr (_Np == 2 && sizeof(_I) == 8 2436 && __fixed_size_storage_t
<int, _Np>::_S_tuple_size == 2)
2437 {
2438 const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2439 return {int(__aslong[0]), {int(__aslong[1])}};
2440 }
2441 #if _GLIBCXX_SIMD_X86INTRIN
2442 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
2443 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2444 return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2445 __to_intrin(__hi128(__tmp)))};
2446 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
2447 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2448 return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2449 #endif // _GLIBCXX_SIMD_X86INTRIN
2450 else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2451 return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2452 [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2453 return __make_wrapper<int>
(__l...);
2454 })};
2455 else
2456 __assert_unreachable<_Tp>();
2457 }
2458
2459 // _S_increment & _S_decrement{{{2
2460 template <typename _Tp, size_t _Np>
2461 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2462 _S_increment(_SimdWrapper<_Tp, _Np>& __x)
2463 { __x = __x._M_data + 1; }
2464
2465 template <typename _Tp, size_t _Np>
2466 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2467 _S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2468 { __x = __x._M_data - 1; }
2469
2470 // smart_reference access {{{2
2471 template <typename _Tp, size_t _Np, typename _Up>
2472 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2473 _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2474 { __v._M_set(__i, static_cast<_Up&&>(__x)); }
2475
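_S_masked_assign and the masked compound operations below are what where() expressions lower to. A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<int, 8> v([](int i) { return i; });  // 0 1 2 3 4 5 6 7
  stdx::where(v % 2 == 0, v) = -1;    // masked assign:   -1 1 -1 3 -1 5 -1 7
  stdx::where(v > 0, v) += 10;        // masked cassign:  -1 11 -1 13 -1 15 -1 17
  return v[1] == 11 ? 0 : 1;
}
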
2476 // _S_masked_assign{{{2
2477 template <typename _Tp, typename _K, size_t _Np>
2478 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2479 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2480 __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2481 {
2482 if (__k._M_is_constprop_none_of())
2483 return;
2484 else if (__k._M_is_constprop_all_of())
2485 __lhs = __rhs;
2486 else
2487 __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2488 }
2489
2490 template <typename _Tp, typename _K, size_t _Np>
2491 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2492 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2493 __type_identity_t<_Tp> __rhs)
2494 {
2495 if (__k._M_is_constprop_none_of())
2496 return;
2497 else if (__k._M_is_constprop_all_of())
2498 __lhs = __vector_broadcast<_Np>(__rhs);
2499 else if (__builtin_constant_p(__rhs) && __rhs == 0)
2500 {
2501 if constexpr (!is_same_v<_K, bool>
) 2502 // the __andnot optimization only makes sense if __k._M_data is a 2503 // vector register 2504 __lhs._M_data 2505 = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data); 2506 else 2507 // for AVX512/__mmask, a _mm512_maskz_mov is best 2508 __lhs 2509 = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>()); 2510 } 2511 else 2512 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2513 _SimdWrapper<_Tp, _Np>( 2514 __vector_broadcast<_Np>(__rhs))); 2515 } 2516 2517 // _S_masked_cassign {{{2 2518 template
2519 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2520 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2521 _SimdWrapper<_Tp, _Np>& __lhs, 2522 const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs, 2523 _Op __op) 2524 { 2525 if (__k._M_is_constprop_none_of()) 2526 return; 2527 else if (__k._M_is_constprop_all_of()) 2528 __lhs = __op(_SuperImpl{}, __lhs, __rhs); 2529 else 2530 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2531 __op(_SuperImpl{}, __lhs, __rhs)); 2532 } 2533 2534 template
<typename _Op, typename _Tp, typename _K, size_t _Np>
2535 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2536 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2537 _SimdWrapper<_Tp, _Np>& __lhs,
2538 const __type_identity_t<_Tp> __rhs, _Op __op)
2539 { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2540
2541 // _S_masked_unary {{{2
2542 template <template <typename> class _Op, typename _Tp, typename _K,
2543 size_t _Np>
2544 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2545 _S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2546 const _SimdWrapper<_Tp, _Np> __v)
2547 {
2548 if (__k._M_is_constprop_none_of())
2549 return __v;
2550 auto __vv = _M_make_simd(__v);
2551 _Op<_Tp>
__op;
2552 if (__k._M_is_constprop_all_of())
2553 return __data(__op(__vv));
2554 else if constexpr (is_same_v<_Op<_Tp>
, __increment<_Tp>
>)
2555 {
2556 static_assert(not std::is_same_v<_K, bool>);
2557 if constexpr (is_integral_v<_Tp>)
2558 // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2559 return __v._M_data - __vector_bitcast<_Tp>(__k._M_data);
2560 else if constexpr (not __have_avx2)
2561 return __v._M_data
2562 + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2563 _K, _Tp(1)));
2564 // starting with AVX2 it is more efficient to blend after add
2565 }
2566 else if constexpr (is_same_v<_Op<_Tp>
, __decrement<_Tp>
>) 2567 { 2568 static_assert(not std::is_same_v<_K, bool>); 2569 if constexpr (is_integral_v<_Tp>) 2570 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2571 return __v._M_data + __vector_bitcast<_Tp>(__k._M_data); 2572 else if constexpr (not __have_avx2) 2573 return __v._M_data 2574 - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2575 _K, _Tp(1))); 2576 // starting with AVX2 it is more efficient to blend after sub 2577 } 2578 return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv))); 2579 } 2580 2581 //}}}2 2582 }; 2583 2584 // _MaskImplBuiltinMixin {{{1 2585 struct _MaskImplBuiltinMixin 2586 { 2587 template
<typename _Tp>
2588 using _TypeTag = _Tp*;
2589
2590 // _S_to_maskvector {{{
2591 template <typename _Up, size_t _ToN = 1>
2592 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2593 _S_to_maskvector(bool __x)
2594 {
2595 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2596 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2597 : __vector_type_t<_Up, _ToN>{};
2598 }
2599
2600 template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized,
2601 size_t _ToN = _UpN == 0 ? _Np : _UpN>
2602 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2603 _S_to_maskvector(_BitMask<_Np, _Sanitized> __x) 2604 { 2605 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2606 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2607 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2608 if constexpr (__i < _Np) 2609 return __x[__i] ? ~_Up() : _Up(); 2610 else 2611 return _Up(); 2612 }); 2613 } 2614 2615 template
2617 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2618 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 2619 { 2620 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2621 using _TW = _SimdWrapper<_Tp, _Np>; 2622 using _UW = _SimdWrapper<_Up, _ToN>; 2623 if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW)) 2624 return __wrapper_bitcast<_Up, _ToN>(__x); 2625 else if constexpr (is_same_v<_Tp, bool>) // bits -> vector 2626 return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data)); 2627 else 2628 { // vector -> vector 2629 /* 2630 [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data); 2631 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == 2632 16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr 2633 (sizeof(_Tp) == 4 && sizeof(_Up) == 2 2634 && sizeof(__y) == 16) 2635 return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y); 2636 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 2637 && sizeof(__y) == 16) 2638 return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y); 2639 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 2640 && sizeof(__y) == 16) 2641 return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, 2642 -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 && 2643 sizeof(_Up) == 1 2644 && sizeof(__y) == 16) 2645 return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 2646 -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 && 2647 sizeof(_Up) == 1 2648 && sizeof(__y) == 16) 2649 return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2650 -1, -1, -1, -1, -1>(__y); else 2651 */ 2652 { 2653 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2654 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2655 if constexpr (__i < _Np) 2656 return _Up(__x[__i.value]); 2657 else 2658 return _Up(); 2659 }); 2660 } 2661 } 2662 } 2663 2664 // }}} 2665 // _S_to_bits {{{ 2666 template
2667 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 2668 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 2669 { 2670 static_assert(!is_same_v<_Tp, bool>); 2671 static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong)); 2672 using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; 2673 const auto __bools 2674 = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1); 2675 _ULLong __r = 0; 2676 __execute_n_times<_Np>( 2677 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2678 __r |= _ULLong(__bools[__i.value]) << __i; 2679 }); 2680 return __r; 2681 } 2682 2683 // }}} 2684 }; 2685 2686 // _MaskImplBuiltin {{{1 2687 template
<typename _Abi, typename>
2688 struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2689 {
2690 using _MaskImplBuiltinMixin::_S_to_bits;
2691 using _MaskImplBuiltinMixin::_S_to_maskvector;
2692
2693 // member types {{{
2694 template <typename _Tp>
2695 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2696
2697 template <typename _Tp>
2698 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2699
2700 using _SuperImpl = typename _Abi::_MaskImpl;
2701 using _CommonImpl = typename _Abi::_CommonImpl;
2702
2703 template <typename _Tp>
2704 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2705
2706 // }}}
2707 // _S_broadcast {{{
2708 template <typename _Tp>
2709 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2710 _S_broadcast(bool __x)
2711 { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); }
2712
2713 // }}}
2714 // _S_load {{{
2715 template <typename _Tp>
2716 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2717 _S_load(const bool* __mem) 2718 { 2719 using _I = __int_for_sizeof_t<_Tp>; 2720 if (not __builtin_is_constant_evaluated()) 2721 if constexpr (sizeof(_Tp) == sizeof(bool)) 2722 { 2723 const auto __bools 2724 = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem); 2725 // bool is {0, 1}, everything else is UB 2726 return __bools > 0; 2727 } 2728 return __generate_vector<_I, _S_size<_Tp>>( 2729 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2730 return __mem[__i] ? ~_I() : _I(); 2731 }); 2732 } 2733 2734 // }}} 2735 // _S_convert {{{ 2736 template
<typename _Tp, size_t _Np, bool _Sanitized>
2737 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2738 _S_convert(_BitMask<_Np, _Sanitized> __x)
2739 {
2740 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2741 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2742 else
2743 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2744 _S_size<_Tp>>(
2745 __x._M_sanitized());
2746 }
2747
2748 template <typename _Tp, size_t _Np>
2749 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2750 _S_convert(_SimdWrapper<bool, _Np> __x)
2751 {
2752 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2753 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2754 else
2755 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2756 _S_size<_Tp>>(
2757 _BitMask<_Np>(__x._M_data)._M_sanitized());
2758 }
2759
2760 template <typename _Tp, typename _Up, size_t _Np>
2761 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2762 _S_convert(_SimdWrapper<_Up, _Np> __x)
2763 {
2764 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2765 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2766 _SuperImpl::_S_to_bits(__x));
2767 else
2768 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2769 _S_size<_Tp>>(__x);
2770 }
2771
2772 template <typename _Tp, typename _Up, typename _UAbi>
2773 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2774 _S_convert(simd_mask<_Up, _UAbi> __x)
2775 {
2776 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2777 {
2778 using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>
>; 2779 if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits 2780 return _R(__data(__x)); 2781 else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits 2782 return _R(__data(__x)); 2783 else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits 2784 return _R(__data(__x)._M_to_bits()); 2785 else // vector -> bits 2786 return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits()); 2787 } 2788 else 2789 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2790 _S_size<_Tp>>( 2791 __data(__x)); 2792 } 2793 2794 // }}} 2795 // _S_masked_load {{{2 2796 template
2797 static inline _SimdWrapper<_Tp, _Np> 2798 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 2799 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 2800 { 2801 // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity 2802 auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge); 2803 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask), 2804 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2805 __tmp._M_set(__i, -__mem[__i]); 2806 }); 2807 __merge = __wrapper_bitcast<_Tp>(__tmp); 2808 return __merge; 2809 } 2810 2811 // _S_store {{{2 2812 template
<typename _Tp, size_t _Np>
2813 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2814 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
2815 {
2816 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2817 __mem[__i] = __v[__i];
2818 });
2819 }
2820
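simd_mask I/O to plain bool arrays is what _S_store above and the masked variant below implement. A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<float, 4> x([](int i) { return i - 1.0f; }); // -1 0 1 2
  auto m = x >= 0.f;                        // false true true true
  bool flags[4];
  m.copy_to(flags, stdx::element_aligned);  // one bool per mask element
  return (!flags[0] && flags[3]) ? 0 : 1;
}
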
2821 // _S_masked_store {{{2
2822 template <typename _Tp, size_t _Np>
2823 static inline void
2824 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2825 const _SimdWrapper<_Tp, _Np> __k) noexcept
2826 {
2827 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
2828 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2829 __mem[__i] = __v[__i];
2830 });
2831 }
2832
2833 // _S_from_bitmask{{{2
2834 template <typename _Tp, size_t _Np>
2835 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2836 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 2837 { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); } 2838 2839 // logical and bitwise operators {{{2 2840 template