The C and C++ Include Header Files
/usr/include/c++/13/experimental/bits/simd_builtin.h
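Note: this is an internal libstdc++ detail header for the Parallelism TS v2 (std::experimental::simd). It is not meant to be included directly; user code includes <experimental/simd>, which pulls this file in. Below is a minimal consumer sketch, assuming GCC 13 with -std=c++17 or later (the variable names are illustrative only):

    // Minimal std::experimental::simd usage sketch (assumption: GCC 13, C++17).
    #include <experimental/simd>   // public entry point; includes bits/simd_builtin.h internally
    #include <iostream>

    namespace stdx = std::experimental;

    int main()
    {
      stdx::native_simd<float> a = 1.5f;                          // broadcast constructor
      stdx::native_simd<float> b([](int i) { return float(i); }); // generator constructor
      const auto c = a * b + 2.0f;                                // element-wise arithmetic
      std::cout << stdx::reduce(c) << '\n';                       // horizontal sum
      return 0;
    }

Compile with, e.g., `g++ -std=c++17 -O2 example.cpp`. The listing follows.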
$ cat /usr/include/c++/13/experimental/bits/simd_builtin.h
// Simd Abi specific implementations -*- C++ -*-

// Copyright (C) 2020-2023 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_

#if __cplusplus >= 201703L

#include <array>
#include <cmath>
#include <cstdlib>
_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _S_allbits{{{
template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
    = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V)>());

// }}}
// _S_signmask, _S_absmask{{{
template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
    = __xor(_V() + 1, _V() - 1);

template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
    = __andnot(_S_signmask<_V>, _S_allbits<_V>);

//}}}
// __vector_permute<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
          typename = __detail::__odr_helper>
  constexpr _Tp
  __vector_permute(_Tp __x)
  {
    static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
    return __make_vector<typename _TVT::value_type>(
             (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
  }

// }}}
// __vector_shuffle<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
          typename = __detail::__odr_helper>
  constexpr _Tp
  __vector_shuffle(_Tp __x, _Tp __y)
  {
    return _Tp{(_Indices == -1 ? 0
                : _Indices < _TVT::_S_full_size
                ? __x[_Indices]
                : __y[_Indices - _TVT::_S_full_size])...};
  }

// }}}
// __make_wrapper{{{
template <typename _Tp, typename... _Args>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
  __make_wrapper(const _Args&... __args)
  { return __make_vector<_Tp>(__args...); }

// }}}
// __wrapper_bitcast{{{
template <typename _Tp, size_t _Np,
          typename _Up, size_t _M = _Np * sizeof(_Tp) / sizeof(_Up)>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
  {
    static_assert(_Np > 1);
    return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
  }

// }}}
// __extract_part(_SimdWrapper<_Tp, _Np>) {{{
template <int _Index, int _Total, int _Combine = 1, typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
  conditional_t<_Np == _Total and _Combine == 1, _Tp, _SimdWrapper<_Tp, _Np / _Total * _Combine>>
  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
  {
    if constexpr (_Np == _Total and _Combine == 1)
      return __x[_Index];
    else if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
      return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
    else
      {
        constexpr size_t __values_per_part = _Np / _Total;
        constexpr size_t __values_to_skip = _Index * __values_per_part;
        constexpr size_t __return_size = __values_per_part * _Combine;
        using _R = __vector_type_t<_Tp, __return_size>;
        static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
                        <= sizeof(__x),
                      "out of bounds __extract_part");
        // the following assertion would ensure no "padding" to be read
        // static_assert(_Total >= _Index + _Combine, "_Total must be greater
        // than _Index");

        // static_assert(__return_size * _Total == _Np, "_Np must be divisible
        // by _Total");
        if (__x._M_is_constprop())
          return __generate_from_n_evaluations<__return_size, _R>(
                   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                     return __x[__values_to_skip + __i];
                   });
        if constexpr (_Index == 0 && _Total == 1)
          return __x;
        else if constexpr (_Index == 0)
          return __intrin_bitcast<_R>(__as_vector(__x));
        else
          return __vec_shuffle(__as_vector(__x), make_index_sequence<__bit_ceil(__return_size)>(),
                               [](size_t __i) {
                                 return __i + __values_to_skip;
                               });
      }
  }

// }}}
// __extract_part(_SimdWrapper<bool, _Np>) {{{
template <int _Index, int _Total, int _Combine = 1, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
  __extract_part(const _SimdWrapper<bool, _Np>
__x) 143 { 144 static_assert(_Combine == 1, "_Combine != 1 not implemented"); 145 static_assert(__have_avx512f && _Total >= 2 && _Index + _Combine <= _Total && _Index >= 0); 146 return __x._M_data >> (_Index * _Np / _Total); 147 } 148 149 // }}} 150 151 // __vector_convert {{{ 152 // implementation requires an index sequence 153 template
154 _GLIBCXX_SIMD_INTRINSIC constexpr _To 155 __vector_convert(_From __a, index_sequence<_I...>) 156 { 157 using _Tp = typename _VectorTraits<_To>::value_type; 158 return _To{static_cast<_Tp>(__a[_I])...}; 159 } 160 161 template
162 _GLIBCXX_SIMD_INTRINSIC constexpr _To 163 __vector_convert(_From __a, _From __b, index_sequence<_I...>) 164 { 165 using _Tp = typename _VectorTraits<_To>::value_type; 166 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...}; 167 } 168 169 template
170 _GLIBCXX_SIMD_INTRINSIC constexpr _To 171 __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>) 172 { 173 using _Tp = typename _VectorTraits<_To>::value_type; 174 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 175 static_cast<_Tp>(__c[_I])...}; 176 } 177 178 template
179 _GLIBCXX_SIMD_INTRINSIC constexpr _To 180 __vector_convert(_From __a, _From __b, _From __c, _From __d, 181 index_sequence<_I...>) 182 { 183 using _Tp = typename _VectorTraits<_To>::value_type; 184 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 185 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...}; 186 } 187 188 template
189 _GLIBCXX_SIMD_INTRINSIC constexpr _To 190 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 191 index_sequence<_I...>) 192 { 193 using _Tp = typename _VectorTraits<_To>::value_type; 194 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 195 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 196 static_cast<_Tp>(__e[_I])...}; 197 } 198 199 template
200 _GLIBCXX_SIMD_INTRINSIC constexpr _To 201 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 202 _From __f, index_sequence<_I...>) 203 { 204 using _Tp = typename _VectorTraits<_To>::value_type; 205 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 206 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 207 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...}; 208 } 209 210 template
211 _GLIBCXX_SIMD_INTRINSIC constexpr _To 212 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 213 _From __f, _From __g, index_sequence<_I...>) 214 { 215 using _Tp = typename _VectorTraits<_To>::value_type; 216 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 217 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 218 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 219 static_cast<_Tp>(__g[_I])...}; 220 } 221 222 template
223 _GLIBCXX_SIMD_INTRINSIC constexpr _To 224 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 225 _From __f, _From __g, _From __h, index_sequence<_I...>) 226 { 227 using _Tp = typename _VectorTraits<_To>::value_type; 228 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 229 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 230 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 231 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...}; 232 } 233 234 template
235 _GLIBCXX_SIMD_INTRINSIC constexpr _To 236 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 237 _From __f, _From __g, _From __h, _From __i, 238 index_sequence<_I...>) 239 { 240 using _Tp = typename _VectorTraits<_To>::value_type; 241 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 242 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 243 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 244 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 245 static_cast<_Tp>(__i[_I])...}; 246 } 247 248 template
249 _GLIBCXX_SIMD_INTRINSIC constexpr _To 250 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 251 _From __f, _From __g, _From __h, _From __i, _From __j, 252 index_sequence<_I...>) 253 { 254 using _Tp = typename _VectorTraits<_To>::value_type; 255 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 256 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 257 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 258 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 259 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...}; 260 } 261 262 template
263 _GLIBCXX_SIMD_INTRINSIC constexpr _To 264 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 265 _From __f, _From __g, _From __h, _From __i, _From __j, 266 _From __k, index_sequence<_I...>) 267 { 268 using _Tp = typename _VectorTraits<_To>::value_type; 269 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 270 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 271 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 272 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 273 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 274 static_cast<_Tp>(__k[_I])...}; 275 } 276 277 template
278 _GLIBCXX_SIMD_INTRINSIC constexpr _To 279 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 280 _From __f, _From __g, _From __h, _From __i, _From __j, 281 _From __k, _From __l, index_sequence<_I...>) 282 { 283 using _Tp = typename _VectorTraits<_To>::value_type; 284 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 285 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 286 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 287 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 288 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 289 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...}; 290 } 291 292 template
293 _GLIBCXX_SIMD_INTRINSIC constexpr _To 294 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 295 _From __f, _From __g, _From __h, _From __i, _From __j, 296 _From __k, _From __l, _From __m, index_sequence<_I...>) 297 { 298 using _Tp = typename _VectorTraits<_To>::value_type; 299 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 300 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 301 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 302 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 303 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 304 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 305 static_cast<_Tp>(__m[_I])...}; 306 } 307 308 template
309 _GLIBCXX_SIMD_INTRINSIC constexpr _To 310 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 311 _From __f, _From __g, _From __h, _From __i, _From __j, 312 _From __k, _From __l, _From __m, _From __n, 313 index_sequence<_I...>) 314 { 315 using _Tp = typename _VectorTraits<_To>::value_type; 316 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 317 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 318 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 319 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 320 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 321 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 322 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...}; 323 } 324 325 template
326 _GLIBCXX_SIMD_INTRINSIC constexpr _To 327 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 328 _From __f, _From __g, _From __h, _From __i, _From __j, 329 _From __k, _From __l, _From __m, _From __n, _From __o, 330 index_sequence<_I...>) 331 { 332 using _Tp = typename _VectorTraits<_To>::value_type; 333 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 334 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 335 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 336 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 337 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 338 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 339 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 340 static_cast<_Tp>(__o[_I])...}; 341 } 342 343 template
344 _GLIBCXX_SIMD_INTRINSIC constexpr _To 345 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 346 _From __f, _From __g, _From __h, _From __i, _From __j, 347 _From __k, _From __l, _From __m, _From __n, _From __o, 348 _From __p, index_sequence<_I...>) 349 { 350 using _Tp = typename _VectorTraits<_To>::value_type; 351 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 352 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 353 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 354 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 355 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 356 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 357 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 358 static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...}; 359 } 360 361 // Defer actual conversion to the overload that takes an index sequence. Note 362 // that this function adds zeros or drops values off the end if you don't ensure 363 // matching width. 364 template
365 _GLIBCXX_SIMD_INTRINSIC constexpr _To 366 __vector_convert(_SimdWrapper<_From, _FromSize>... __xs) 367 { 368 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 369 using _From0 = __first_of_pack_t<_From...>; 370 using _FW = _SimdWrapper<_From0, _FromSize>; 371 if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop())) 372 { 373 if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1)) 374 == 0) // power-of-two number of arguments 375 return __convert_x86<_To>(__as_vector(__xs)...); 376 else // append zeros and recurse until the above branch is taken 377 return __vector_convert<_To>(__xs..., _FW{}); 378 } 379 else 380 #endif 381 return __vector_convert<_To>( 382 __as_vector(__xs)..., 383 make_index_sequence<(sizeof...(__xs) == 1 ? std::min( 384 _VectorTraits<_To>::_S_full_size, int(_FromSize)) 385 : _FromSize)>()); 386 } 387 388 // }}} 389 // __convert function{{{ 390 template
391 _GLIBCXX_SIMD_INTRINSIC constexpr auto 392 __convert(_From __v0, _More... __vs) 393 { 394 static_assert((true && ... && is_same_v<_From, _More>) ); 395 if constexpr (__is_vectorizable_v<_From>) 396 { 397 using _V = typename _VectorTraits<_To>::type; 398 using _Tp = typename _VectorTraits<_To>::value_type; 399 return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...}; 400 } 401 else if constexpr (__is_vector_type_v<_From>) 402 return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...); 403 else // _SimdWrapper arguments 404 { 405 constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More)); 406 if constexpr (__is_vectorizable_v<_To>) 407 return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...); 408 else if constexpr (!__is_vector_type_v<_To>) 409 return _To(__convert
(__v0, __vs...)); 410 else 411 { 412 static_assert( 413 sizeof...(_More) == 0 414 || _VectorTraits<_To>::_S_full_size >= __input_size, 415 "__convert(...) requires the input to fit into the output"); 416 return __vector_convert<_To>(__v0, __vs...); 417 } 418 } 419 } 420 421 // }}} 422 // __convert_all{{{ 423 // Converts __v into array<_To, N>, where N is _NParts if non-zero or 424 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v). 425 // Note: this function may return less than all converted elements 426 template
> 432 _GLIBCXX_SIMD_INTRINSIC auto 433 __convert_all(_From __v) 434 { 435 if constexpr (is_arithmetic_v<_To> && _NParts != 1) 436 { 437 static_assert(_Offset < _FromVT::_S_full_size); 438 constexpr auto _Np 439 = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts; 440 return __generate_from_n_evaluations<_Np, array<_To, _Np>>( 441 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 442 return static_cast<_To>(__v[__i + _Offset]); 443 }); 444 } 445 else 446 { 447 static_assert(__is_vector_type_v<_To>); 448 using _ToVT = _VectorTraits<_To>; 449 if constexpr (__is_vector_type_v<_From>) 450 return __convert_all<_To, _NParts>(__as_wrapper(__v)); 451 else if constexpr (_NParts == 1) 452 { 453 static_assert(_Offset % _ToVT::_S_full_size == 0); 454 return array<_To, 1>{__vector_convert<_To>( 455 __extract_part<_Offset / _ToVT::_S_full_size, 456 __div_roundup(_FromVT::_S_partial_width, 457 _ToVT::_S_full_size)>(__v))}; 458 } 459 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 460 else if constexpr (!__have_sse4_1 && _Offset == 0 461 && is_integral_v
462 && sizeof(typename _FromVT::value_type) 463 < sizeof(typename _ToVT::value_type) 464 && !(sizeof(typename _FromVT::value_type) == 4 465 && is_same_v
)) 466 { 467 using _ToT = typename _ToVT::value_type; 468 using _FromT = typename _FromVT::value_type; 469 constexpr size_t _Np 470 = _NParts != 0 471 ? _NParts 472 : (_FromVT::_S_partial_width / _ToVT::_S_full_size); 473 using _R = array<_To, _Np>; 474 // __adjust modifies its input to have _Np (use _SizeConstant) 475 // entries so that no unnecessary intermediate conversions are 476 // requested and, more importantly, no intermediate conversions are 477 // missing 478 [[maybe_unused]] auto __adjust 479 = [](auto __n, 480 auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> { 481 return __vector_bitcast<_FromT, decltype(__n)::value>(__vv); 482 }; 483 [[maybe_unused]] const auto __vi = __to_intrin(__v); 484 auto&& __make_array 485 = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 486 if constexpr (_Np == 1) 487 return _R{__intrin_bitcast<_To>(__x0)}; 488 else 489 return _R{__intrin_bitcast<_To>(__x0), 490 __intrin_bitcast<_To>(__x1)}; 491 }; 492 493 if constexpr (_Np == 0) 494 return _R{}; 495 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2) 496 { 497 static_assert(is_integral_v<_FromT>); 498 static_assert(is_integral_v<_ToT>); 499 if constexpr (is_unsigned_v<_FromT>) 500 return __make_array(_mm_unpacklo_epi8(__vi, __m128i()), 501 _mm_unpackhi_epi8(__vi, __m128i())); 502 else 503 return __make_array( 504 _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8), 505 _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8)); 506 } 507 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4) 508 { 509 static_assert(is_integral_v<_FromT>); 510 if constexpr (is_floating_point_v<_ToT>) 511 { 512 const auto __ints 513 = __convert_all<__vector_type16_t
, _Np>( 514 __adjust(_SizeConstant<_Np * 4>(), __v)); 515 return __generate_from_n_evaluations<_Np, _R>( 516 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 517 return __vector_convert<_To>(__as_wrapper(__ints[__i])); 518 }); 519 } 520 else if constexpr (is_unsigned_v<_FromT>) 521 return __make_array(_mm_unpacklo_epi16(__vi, __m128i()), 522 _mm_unpackhi_epi16(__vi, __m128i())); 523 else 524 return __make_array( 525 _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16), 526 _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16)); 527 } 528 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 529 && is_integral_v<_FromT> && is_integral_v<_ToT>) 530 { 531 if constexpr (is_unsigned_v<_FromT>) 532 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 533 _mm_unpackhi_epi32(__vi, __m128i())); 534 else 535 return __make_array( 536 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 537 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 538 } 539 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 540 && is_integral_v<_FromT> && is_integral_v<_ToT>) 541 { 542 if constexpr (is_unsigned_v<_FromT>) 543 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 544 _mm_unpackhi_epi32(__vi, __m128i())); 545 else 546 return __make_array( 547 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 548 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 549 } 550 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4 551 && is_signed_v<_FromT>) 552 { 553 const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi), 554 _mm_unpackhi_epi8(__vi, __vi)}; 555 const __vector_type_t
__vvvv[4] = { 556 __vector_bitcast
(_mm_unpacklo_epi16(__vv[0], __vv[0])), 557 __vector_bitcast
(_mm_unpackhi_epi16(__vv[0], __vv[0])), 558 __vector_bitcast
(_mm_unpacklo_epi16(__vv[1], __vv[1])), 559 __vector_bitcast
(_mm_unpackhi_epi16(__vv[1], __vv[1]))}; 560 if constexpr (sizeof(_ToT) == 4) 561 return __generate_from_n_evaluations<_Np, _R>( 562 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 563 return __vector_convert<_To>( 564 _SimdWrapper
(__vvvv[__i] >> 24)); 565 }); 566 else if constexpr (is_integral_v<_ToT>) 567 return __generate_from_n_evaluations<_Np, _R>( 568 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 569 const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31); 570 const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24); 571 return __vector_bitcast<_ToT>( 572 __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits) 573 : _mm_unpackhi_epi32(__sx32, __signbits)); 574 }); 575 else 576 return __generate_from_n_evaluations<_Np, _R>( 577 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 578 const _SimdWrapper
__int4 = __vvvv[__i / 2] >> 24; 579 return __vector_convert<_To>( 580 __i % 2 == 0 ? __int4 581 : _SimdWrapper
( 582 _mm_unpackhi_epi64(__to_intrin(__int4), 583 __to_intrin(__int4)))); 584 }); 585 } 586 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4) 587 { 588 const auto __shorts = __convert_all<__vector_type16_t< 589 conditional_t
, short, unsigned short>>>( 590 __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v)); 591 return __generate_from_n_evaluations<_Np, _R>( 592 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 593 return __convert_all<_To>(__shorts[__i / 2])[__i % 2]; 594 }); 595 } 596 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8 597 && is_signed_v<_FromT> && is_integral_v<_ToT>) 598 { 599 const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi), 600 _mm_unpackhi_epi16(__vi, __vi)}; 601 const __vector_type16_t
__vvvv[4] 602 = {__vector_bitcast
( 603 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16), 604 _mm_srai_epi32(__vv[0], 31))), 605 __vector_bitcast
( 606 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16), 607 _mm_srai_epi32(__vv[0], 31))), 608 __vector_bitcast
( 609 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16), 610 _mm_srai_epi32(__vv[1], 31))), 611 __vector_bitcast
( 612 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16), 613 _mm_srai_epi32(__vv[1], 31)))}; 614 return __generate_from_n_evaluations<_Np, _R>( 615 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 616 return __vector_bitcast<_ToT>(__vvvv[__i]); 617 }); 618 } 619 else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8) 620 { 621 const auto __ints 622 = __convert_all<__vector_type16_t
|| is_floating_point_v<_ToT>, int, 624 unsigned int>>>( 625 __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v)); 626 return __generate_from_n_evaluations<_Np, _R>( 627 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 628 return __convert_all<_To>(__ints[__i / 2])[__i % 2]; 629 }); 630 } 631 else 632 __assert_unreachable<_To>(); 633 } 634 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 635 else if constexpr ((_FromVT::_S_partial_width - _Offset) 636 > _ToVT::_S_full_size) 637 { 638 /* 639 static_assert( 640 (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) == 641 0, 642 "__convert_all only supports power-of-2 number of elements. 643 Otherwise " "the return type cannot be array<_To, N>."); 644 */ 645 constexpr size_t _NTotal 646 = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size; 647 constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts; 648 static_assert( 649 _Np <= _NTotal 650 || (_Np == _NTotal + 1 651 && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size 652 > 0)); 653 using _R = array<_To, _Np>; 654 if constexpr (_Np == 1) 655 return _R{__vector_convert<_To>( 656 __extract_part<_Offset, _FromVT::_S_partial_width, 657 _ToVT::_S_full_size>(__v))}; 658 else 659 return __generate_from_n_evaluations<_Np, _R>( 660 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 661 auto __part 662 = __extract_part<__i * _ToVT::_S_full_size + _Offset, 663 _FromVT::_S_partial_width, 664 _ToVT::_S_full_size>(__v); 665 return __vector_convert<_To>(__part); 666 }); 667 } 668 else if constexpr (_Offset == 0) 669 return array<_To, 1>{__vector_convert<_To>(__v)}; 670 else 671 return array<_To, 1>{__vector_convert<_To>( 672 __extract_part<_Offset, _FromVT::_S_partial_width, 673 _FromVT::_S_partial_width - _Offset>(__v))}; 674 } 675 } 676 677 // }}} 678 679 // _GnuTraits {{{ 680 template
681 struct _GnuTraits 682 { 683 using _IsValid = true_type; 684 using _SimdImpl = typename _Abi::_SimdImpl; 685 using _MaskImpl = typename _Abi::_MaskImpl; 686 687 // simd and simd_mask member types {{{ 688 using _SimdMember = _SimdWrapper<_Tp, _Np>; 689 using _MaskMember = _SimdWrapper<_Mp, _Np>; 690 static constexpr size_t _S_simd_align = alignof(_SimdMember); 691 static constexpr size_t _S_mask_align = alignof(_MaskMember); 692 693 // }}} 694 // size metadata {{{ 695 static constexpr size_t _S_full_size = _SimdMember::_S_full_size; 696 static constexpr bool _S_is_partial = _SimdMember::_S_is_partial; 697 698 // }}} 699 // _SimdBase / base class for simd, providing extra conversions {{{ 700 struct _SimdBase2 701 { 702 _GLIBCXX_SIMD_ALWAYS_INLINE explicit 703 operator __intrinsic_type_t<_Tp, _Np>() const 704 { return __to_intrin(static_cast
*>(this)->_M_data); } 705 706 _GLIBCXX_SIMD_ALWAYS_INLINE explicit 707 operator __vector_type_t<_Tp, _Np>() const 708 { return __data(*static_cast
*>(this)); } 709 }; 710 711 struct _SimdBase1 712 { 713 _GLIBCXX_SIMD_ALWAYS_INLINE explicit 714 operator __intrinsic_type_t<_Tp, _Np>() const 715 { return __data(*static_cast
*>(this)); } 716 }; 717 718 using _SimdBase = conditional_t< 719 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 720 _SimdBase1, _SimdBase2>; 721 722 // }}} 723 // _MaskBase {{{ 724 struct _MaskBase2 725 { 726 _GLIBCXX_SIMD_ALWAYS_INLINE explicit 727 operator __intrinsic_type_t<_Tp, _Np>() const 728 { return static_cast
*>(this) ->_M_data.__intrin(); } 729 730 _GLIBCXX_SIMD_ALWAYS_INLINE explicit 731 operator __vector_type_t<_Tp, _Np>() const 732 { return static_cast
*>(this)->_M_data._M_data; } 733 }; 734 735 struct _MaskBase1 736 { 737 _GLIBCXX_SIMD_ALWAYS_INLINE explicit 738 operator __intrinsic_type_t<_Tp, _Np>() const 739 { return __data(*static_cast
*>(this)); } 740 }; 741 742 using _MaskBase = conditional_t< 743 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 744 _MaskBase1, _MaskBase2>; 745 746 // }}} 747 // _MaskCastType {{{ 748 // parameter type of one explicit simd_mask constructor 749 class _MaskCastType 750 { 751 using _Up = __intrinsic_type_t<_Tp, _Np>; 752 _Up _M_data; 753 754 public: 755 _GLIBCXX_SIMD_ALWAYS_INLINE 756 _MaskCastType(_Up __x) : _M_data(__x) {} 757 758 _GLIBCXX_SIMD_ALWAYS_INLINE 759 operator _MaskMember() const { return _M_data; } 760 }; 761 762 // }}} 763 // _SimdCastType {{{ 764 // parameter type of one explicit simd constructor 765 class _SimdCastType1 766 { 767 using _Ap = __intrinsic_type_t<_Tp, _Np>; 768 _SimdMember _M_data; 769 770 public: 771 _GLIBCXX_SIMD_ALWAYS_INLINE constexpr 772 _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 773 774 _GLIBCXX_SIMD_ALWAYS_INLINE constexpr 775 operator _SimdMember() const { return _M_data; } 776 }; 777 778 class _SimdCastType2 779 { 780 using _Ap = __intrinsic_type_t<_Tp, _Np>; 781 using _Bp = __vector_type_t<_Tp, _Np>; 782 _SimdMember _M_data; 783 784 public: 785 _GLIBCXX_SIMD_ALWAYS_INLINE constexpr 786 _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 787 788 _GLIBCXX_SIMD_ALWAYS_INLINE constexpr 789 _SimdCastType2(_Bp __b) : _M_data(__b) {} 790 791 _GLIBCXX_SIMD_ALWAYS_INLINE constexpr 792 operator _SimdMember() const { return _M_data; } 793 }; 794 795 using _SimdCastType = conditional_t< 796 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 797 _SimdCastType1, _SimdCastType2>; 798 //}}} 799 }; 800 801 // }}} 802 struct _CommonImplX86; 803 struct _CommonImplNeon; 804 struct _CommonImplBuiltin; 805 template
struct _SimdImplBuiltin; 806 template
struct _MaskImplBuiltin; 807 template
struct _SimdImplX86; 808 template
struct _MaskImplX86; 809 template
struct _SimdImplNeon; 810 template
struct _MaskImplNeon; 811 template
struct _SimdImplPpc; 812 template
struct _MaskImplPpc; 813 814 // simd_abi::_VecBuiltin {{{ 815 template
816 struct simd_abi::_VecBuiltin 817 { 818 template
819 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 820 821 // validity traits {{{ 822 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 823 824 template
825 struct _IsValidSizeFor 826 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 827 && _UsedBytes % sizeof(_Tp) == 0 828 && _UsedBytes <= __vectorized_sizeof<_Tp>() 829 && (!__have_avx512f || _UsedBytes <= 32))> {}; 830 831 template
832 struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>, 833 _IsValidSizeFor<_Tp>> {}; 834 835 template
836 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 837 838 // }}} 839 // _SimdImpl/_MaskImpl {{{ 840 #if _GLIBCXX_SIMD_X86INTRIN 841 using _CommonImpl = _CommonImplX86; 842 using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>; 843 using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>; 844 #elif _GLIBCXX_SIMD_HAVE_NEON 845 using _CommonImpl = _CommonImplNeon; 846 using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>; 847 using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>; 848 #else 849 using _CommonImpl = _CommonImplBuiltin; 850 #ifdef __ALTIVEC__ 851 using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>; 852 using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>; 853 #else 854 using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>; 855 using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>; 856 #endif 857 #endif 858 859 // }}} 860 // __traits {{{ 861 template
862 using _MaskValueType = __int_for_sizeof_t<_Tp>; 863 864 template
865 using __traits 866 = conditional_t<_S_is_valid_v<_Tp>, 867 _GnuTraits<_Tp, _MaskValueType<_Tp>, 868 _VecBuiltin<_UsedBytes>, _S_size<_Tp>>, 869 _InvalidTraits>; 870 871 //}}} 872 // size metadata {{{ 873 template
874 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 875 876 template
877 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 878 879 // }}} 880 // implicit masks {{{ 881 template
882 using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>; 883 884 template
885 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 886 _S_implicit_mask() 887 { 888 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 889 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 890 return ~_UV(); 891 else 892 { 893 constexpr auto __size = _S_size<_Tp>; 894 _GLIBCXX_SIMD_USE_CONSTEXPR auto __r 895 = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 896 { return __i < __size ? -1 : 0; }); 897 return __r; 898 } 899 } 900 901 template
902 _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>> 903 _S_implicit_mask_intrin() 904 { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); } 905 906 template
> 907 _GLIBCXX_SIMD_INTRINSIC static constexpr _TW 908 _S_masked(_TW __x) 909 { 910 using _Tp = typename _TVT::value_type; 911 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 912 return __x; 913 else 914 return __and(__as_vector(__x), 915 __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>())); 916 } 917 918 template
> 919 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 920 __make_padding_nonzero(_TW __x) 921 { 922 using _Tp = typename _TVT::value_type; 923 if constexpr (!_S_is_partial<_Tp>) 924 return __x; 925 else 926 { 927 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask 928 = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()); 929 if constexpr (is_integral_v<_Tp>) 930 return __or(__x, ~__implicit_mask); 931 else 932 { 933 _GLIBCXX_SIMD_USE_CONSTEXPR auto __one 934 = __andnot(__implicit_mask, 935 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))); 936 // it's not enough to return `x | 1_in_padding` because the 937 // padding in x might be inf or nan (independent of 938 // __FINITE_MATH_ONLY__, because it's about padding bits) 939 return __or(__and(__x, __implicit_mask), __one); 940 } 941 } 942 } 943 // }}} 944 }; 945 946 // }}} 947 // simd_abi::_VecBltnBtmsk {{{ 948 template
949 struct simd_abi::_VecBltnBtmsk 950 { 951 template
952 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 953 954 // validity traits {{{ 955 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 956 957 template
958 struct _IsValidSizeFor 959 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 960 && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64 961 && (_UsedBytes > 32 || __have_avx512vl))> {}; 962 963 // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4 the AVX512BW is also 964 // required. 965 template
966 struct _IsValid 967 : conjunction< 968 _IsValidAbiTag, __bool_constant<__have_avx512f>, 969 __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>, 970 __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>, 971 _IsValidSizeFor<_Tp>> {}; 972 973 template
974 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 975 976 // }}} 977 // simd/_MaskImpl {{{ 978 #if _GLIBCXX_SIMD_X86INTRIN 979 using _CommonImpl = _CommonImplX86; 980 using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>; 981 using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>; 982 #else 983 template
984 struct _MissingImpl; 985 986 using _CommonImpl = _MissingImpl<_UsedBytes>; 987 using _SimdImpl = _MissingImpl<_UsedBytes>; 988 using _MaskImpl = _MissingImpl<_UsedBytes>; 989 #endif 990 991 // }}} 992 // __traits {{{ 993 template
994 using _MaskMember = _SimdWrapper
>; 995 996 template
997 using __traits = conditional_t< 998 _S_is_valid_v<_Tp>, 999 _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>, 1000 _InvalidTraits>; 1001 1002 //}}} 1003 // size metadata {{{ 1004 template
1005 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 1006 template
1007 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 1008 1009 // }}} 1010 // implicit mask {{{ 1011 private: 1012 template
1013 using _ImplicitMask = _SimdWrapper
>; 1014 1015 public: 1016 template
1017 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np> 1018 __implicit_mask_n() 1019 { 1020 using _Tp = __bool_storage_member_type_t<_Np>; 1021 return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp(); 1022 } 1023 1024 template
1025 _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp> 1026 _S_implicit_mask() 1027 { return __implicit_mask_n<_S_size<_Tp>>(); } 1028 1029 template
1030 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>> 1031 _S_implicit_mask_intrin() 1032 { return __implicit_mask_n<_S_size<_Tp>>(); } 1033 1034 template
1035 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1036 _S_masked(_SimdWrapper<_Tp, _Np> __x) 1037 { 1038 if constexpr (is_same_v<_Tp, bool>) 1039 if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0) 1040 return _MaskImpl::_S_bit_and( 1041 __x, _SimdWrapper<_Tp, _Np>( 1042 __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1))); 1043 else 1044 return __x; 1045 else 1046 return _S_masked(__x._M_data); 1047 } 1048 1049 template
1050 _GLIBCXX_SIMD_INTRINSIC static constexpr _TV 1051 _S_masked(_TV __x) 1052 { 1053 using _Tp = typename _VectorTraits<_TV>::value_type; 1054 static_assert( 1055 !__is_bitmask_v<_TV>, 1056 "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't " 1057 "know the number of elements. Use _SimdWrapper
instead."); 1058 if constexpr (_S_is_partial<_Tp>) 1059 { 1060 constexpr size_t _Np = _S_size<_Tp>; 1061 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1062 _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(), 1063 _SimdWrapper<_Tp, _Np>(__x)); 1064 } 1065 else 1066 return __x; 1067 } 1068 1069 template
> 1070 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1071 __make_padding_nonzero(_TV __x) 1072 { 1073 using _Tp = typename _TVT::value_type; 1074 if constexpr (!_S_is_partial<_Tp>) 1075 return __x; 1076 else 1077 { 1078 constexpr size_t _Np = _S_size<_Tp>; 1079 if constexpr (is_integral_v
) 1080 return __x 1081 | __generate_vector<_Tp, _S_full_size<_Tp>>( 1082 [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp { 1083 if (__i < _Np) 1084 return 0; 1085 else 1086 return 1; 1087 }); 1088 else 1089 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1090 _S_implicit_mask<_Tp>(), 1091 _SimdWrapper<_Tp, _Np>( 1092 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))), 1093 _SimdWrapper<_Tp, _Np>(__x)) 1094 ._M_data; 1095 } 1096 } 1097 1098 // }}} 1099 }; 1100 1101 //}}} 1102 // _CommonImplBuiltin {{{ 1103 struct _CommonImplBuiltin 1104 { 1105 // _S_converts_via_decomposition{{{ 1106 // This lists all cases where a __vector_convert needs to fall back to 1107 // conversion of individual scalars (i.e. decompose the input vector into 1108 // scalars, convert, compose output vector). In those cases, _S_masked_load & 1109 // _S_masked_store prefer to use the _S_bit_iteration implementation. 1110 template
1111 static inline constexpr bool __converts_via_decomposition_v 1112 = sizeof(_From) != sizeof(_To); 1113 1114 // }}} 1115 // _S_load{{{ 1116 template
1117 _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np> 1118 _S_load(const void* __p) 1119 { 1120 static_assert(_Np > 1); 1121 static_assert(_Bytes % sizeof(_Tp) == 0); 1122 using _Rp = __vector_type_t<_Tp, _Np>; 1123 if constexpr (sizeof(_Rp) == _Bytes) 1124 { 1125 _Rp __r; 1126 __builtin_memcpy(&__r, __p, _Bytes); 1127 return __r; 1128 } 1129 else 1130 { 1131 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 1132 using _Up = conditional_t< 1133 is_integral_v<_Tp>, 1134 conditional_t<_Bytes % 4 == 0, 1135 conditional_t<_Bytes % 8 == 0, long long, int>, 1136 conditional_t<_Bytes % 2 == 0, short, signed char>>, 1137 conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp, 1138 double>>; 1139 using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>; 1140 if constexpr (sizeof(_V) != sizeof(_Rp)) 1141 { // on i386 with 4 < _Bytes <= 8 1142 _Rp __r{}; 1143 __builtin_memcpy(&__r, __p, _Bytes); 1144 return __r; 1145 } 1146 else 1147 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 1148 using _V = _Rp; 1149 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1150 { 1151 _V __r{}; 1152 static_assert(_Bytes <= sizeof(_V)); 1153 __builtin_memcpy(&__r, __p, _Bytes); 1154 return reinterpret_cast<_Rp>(__r); 1155 } 1156 } 1157 } 1158 1159 // }}} 1160 // _S_store {{{ 1161 template
1162 _GLIBCXX_SIMD_INTRINSIC static void 1163 _S_memcpy(char* __dst, const char* __src) 1164 { 1165 if constexpr (_Bytes > 0) 1166 { 1167 constexpr size_t _Ns = std::__bit_floor(_Bytes); 1168 __builtin_memcpy(__dst, __src, _Ns); 1169 _S_memcpy<_Bytes - _Ns>(__dst + _Ns, __src + _Ns); 1170 } 1171 } 1172 1173 template
1174 _GLIBCXX_SIMD_INTRINSIC static void 1175 _S_store(_TV __x, void* __addr) 1176 { 1177 constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes; 1178 static_assert(sizeof(__x) >= _Bytes); 1179 1180 #if !defined __clang__ && _GLIBCXX_SIMD_WORKAROUND_PR90424 1181 if constexpr (__is_vector_type_v<_TV>) 1182 _S_memcpy<_Bytes>(reinterpret_cast
(__addr), reinterpret_cast
(&__x)); 1183 else 1184 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1185 __builtin_memcpy(__addr, &__x, _Bytes); 1186 } 1187 1188 template
1189 _GLIBCXX_SIMD_INTRINSIC static void 1190 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) 1191 { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); } 1192 1193 // }}} 1194 // _S_store_bool_array(_BitMask) {{{ 1195 template
1196 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1197 _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem) 1198 { 1199 if constexpr (_Np == 1) 1200 __mem[0] = __x[0]; 1201 else if (__builtin_is_constant_evaluated()) 1202 { 1203 for (size_t __i = 0; __i < _Np; ++__i) 1204 __mem[__i] = __x[__i]; 1205 } 1206 else if constexpr (_Np == 2) 1207 { 1208 short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101; 1209 _S_store<_Np>(__bool2, __mem); 1210 } 1211 else if constexpr (_Np == 3) 1212 { 1213 int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101; 1214 _S_store<_Np>(__bool3, __mem); 1215 } 1216 else 1217 { 1218 __execute_n_times<__div_roundup(_Np, 4)>( 1219 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1220 constexpr int __offset = __i * 4; 1221 constexpr int __remaining = _Np - __offset; 1222 if constexpr (__remaining > 4 && __remaining <= 7) 1223 { 1224 const _ULLong __bool7 1225 = (__x.template _M_extract<__offset>()._M_to_bits() 1226 * 0x40810204081ULL) 1227 & 0x0101010101010101ULL; 1228 _S_store<__remaining>(__bool7, __mem + __offset); 1229 } 1230 else if constexpr (__remaining >= 4) 1231 { 1232 int __bits = __x.template _M_extract<__offset>()._M_to_bits(); 1233 if constexpr (__remaining > 7) 1234 __bits &= 0xf; 1235 const int __bool4 = (__bits * 0x204081) & 0x01010101; 1236 _S_store<4>(__bool4, __mem + __offset); 1237 } 1238 }); 1239 } 1240 } 1241 1242 // }}} 1243 // _S_blend{{{ 1244 template
1245 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1246 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, 1247 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) 1248 { return __k._M_data ? __at1._M_data : __at0._M_data; } 1249 1250 // }}} 1251 }; 1252 1253 // }}} 1254 // _SimdImplBuiltin {{{1 1255 template
1256 struct _SimdImplBuiltin 1257 { 1258 // member types {{{2 1259 template
1260 static constexpr size_t _S_max_store_size = 16; 1261 1262 using abi_type = _Abi; 1263 1264 template
1265 using _TypeTag = _Tp*; 1266 1267 template
1268 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 1269 1270 template
1271 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 1272 1273 template
1274 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; 1275 1276 template
1277 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; 1278 1279 using _CommonImpl = typename _Abi::_CommonImpl; 1280 using _SuperImpl = typename _Abi::_SimdImpl; 1281 using _MaskImpl = typename _Abi::_MaskImpl; 1282 1283 // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2 1284 template
1285 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1286 _M_make_simd(_SimdWrapper<_Tp, _Np> __x) 1287 { return {__private_init, __x}; } 1288 1289 template
1290 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1291 _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x) 1292 { return {__private_init, __vector_bitcast<_Tp>(__x)}; } 1293 1294 // _S_broadcast {{{2 1295 template
1296 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1297 _S_broadcast(_Tp __x) noexcept 1298 { return __vector_broadcast<_S_full_size<_Tp>>(__x); } 1299 1300 // _S_generator {{{2 1301 template
1302 inline static constexpr _SimdMember<_Tp> 1303 _S_generator(_Fp&& __gen, _TypeTag<_Tp>) 1304 { 1305 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1306 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1307 if constexpr (__i < _S_size<_Tp>) 1308 return __gen(__i); 1309 else 1310 return 0; 1311 }); 1312 } 1313 1314 // _S_load {{{2 1315 template
1316 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1317 _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept 1318 { 1319 constexpr size_t _Np = _S_size<_Tp>; 1320 constexpr size_t __max_load_size 1321 = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64 1322 : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32 1323 : 16; 1324 constexpr size_t __bytes_to_load = sizeof(_Up) * _Np; 1325 if (__builtin_is_constant_evaluated()) 1326 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1327 [&](auto __i) constexpr { 1328 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1329 }); 1330 else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up)) 1331 return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>( 1332 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1333 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1334 }); 1335 else if constexpr (is_same_v<_Up, _Tp>) 1336 return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>, 1337 _Np * sizeof(_Tp)>(__mem); 1338 else if constexpr (__bytes_to_load <= __max_load_size) 1339 return __convert<_SimdMember<_Tp>>( 1340 _CommonImpl::template _S_load<_Up, _Np>(__mem)); 1341 else if constexpr (__bytes_to_load % __max_load_size == 0) 1342 { 1343 constexpr size_t __n_loads = __bytes_to_load / __max_load_size; 1344 constexpr size_t __elements_per_load = _Np / __n_loads; 1345 return __call_with_n_evaluations<__n_loads>( 1346 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1347 return __convert<_SimdMember<_Tp>>(__uncvted...); 1348 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1349 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1350 __mem + __i * __elements_per_load); 1351 }); 1352 } 1353 else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0 1354 && __max_load_size > 16) 1355 { // e.g. int[] ->
with AVX2 1356 constexpr size_t __n_loads 1357 = __bytes_to_load / (__max_load_size / 2); 1358 constexpr size_t __elements_per_load = _Np / __n_loads; 1359 return __call_with_n_evaluations<__n_loads>( 1360 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1361 return __convert<_SimdMember<_Tp>>(__uncvted...); 1362 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1363 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1364 __mem + __i * __elements_per_load); 1365 }); 1366 } 1367 else // e.g. int[] ->
1368 return __call_with_subscripts( 1369 __mem, make_index_sequence<_Np>(), 1370 [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1371 return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...}; 1372 }); 1373 } 1374 1375 // _S_masked_load {{{2 1376 template
1377 static constexpr inline _SimdWrapper<_Tp, _Np> 1378 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 1379 const _Up* __mem) noexcept 1380 { 1381 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1382 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1383 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1384 }); 1385 return __merge; 1386 } 1387 1388 // _S_store {{{2 1389 template
1390 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1391 _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept 1392 { 1393 // TODO: converting int -> "smaller int" can be optimized with AVX512 1394 constexpr size_t _Np = _S_size<_Tp>; 1395 constexpr size_t __max_store_size 1396 = _SuperImpl::template _S_max_store_size<_Up>; 1397 if (__builtin_is_constant_evaluated()) 1398 { 1399 for (size_t __i = 0; __i < _Np; ++__i) 1400 __mem[__i] = __v[__i]; 1401 } 1402 else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up)) 1403 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1404 __mem[__i] = __v[__i]; 1405 }); 1406 else if constexpr (is_same_v<_Up, _Tp>) 1407 _CommonImpl::_S_store(__v, __mem); 1408 else if constexpr (sizeof(_Up) * _Np <= __max_store_size) 1409 _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)), 1410 __mem); 1411 else 1412 { 1413 constexpr size_t __vsize = __max_store_size / sizeof(_Up); 1414 // round up to convert the last partial vector as well: 1415 constexpr size_t __stores = __div_roundup(_Np, __vsize); 1416 constexpr size_t __full_stores = _Np / __vsize; 1417 using _V = __vector_type_t<_Up, __vsize>; 1418 const array<_V, __stores> __converted 1419 = __convert_all<_V, __stores>(__v); 1420 __execute_n_times<__full_stores>( 1421 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1422 _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); 1423 }); 1424 if constexpr (__full_stores < __stores) 1425 _CommonImpl::template _S_store<(_Np - __full_stores * __vsize) 1426 * sizeof(_Up)>( 1427 __converted[__full_stores], __mem + __full_stores * __vsize); 1428 } 1429 } 1430 1431 // _S_masked_store_nocvt {{{2 1432 template
1433 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1434 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) 1435 { 1436 _BitOps::_S_bit_iteration( 1437 _MaskImpl::_S_to_bits(__k), 1438 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1439 __mem[__i] = __v[__i]; 1440 }); 1441 } 1442 1443 // _S_masked_store {{{2 1444 template
, 1445 typename _Tp = typename _TVT::value_type, typename _Up> 1446 static constexpr inline void 1447 _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept 1448 { 1449 constexpr size_t _TV_size = _S_size<_Tp>; 1450 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1451 constexpr size_t __max_store_size 1452 = _SuperImpl::template _S_max_store_size<_Up>; 1453 if constexpr ( 1454 is_same_v< 1455 _Tp, 1456 _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up))) 1457 { 1458 // bitwise or no conversion, reinterpret: 1459 const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1460 if constexpr (__is_bitmask_v
) 1461 return _MaskMember<_Up>(__k._M_data); 1462 else 1463 return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k); 1464 }(); 1465 _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v), 1466 __mem, __kk); 1467 } 1468 else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up) 1469 && !_CommonImpl:: 1470 template __converts_via_decomposition_v< 1471 _Tp, _Up, __max_store_size>) 1472 { // conversion via decomposition is better handled via the 1473 // bit_iteration 1474 // fallback below 1475 constexpr size_t _UW_size 1476 = std::min(_TV_size, __max_store_size / sizeof(_Up)); 1477 static_assert(_UW_size <= _TV_size); 1478 using _UW = _SimdWrapper<_Up, _UW_size>; 1479 using _UV = __vector_type_t<_Up, _UW_size>; 1480 using _UAbi = simd_abi::deduce_t<_Up, _UW_size>; 1481 if constexpr (_UW_size == _TV_size) // one convert+store 1482 { 1483 const _UW __converted = __convert<_UW>(__v); 1484 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1485 __converted, __mem, 1486 _UAbi::_MaskImpl::template _S_convert< 1487 __int_for_sizeof_t<_Up>>(__k)); 1488 } 1489 else 1490 { 1491 static_assert(_UW_size * sizeof(_Up) == __max_store_size); 1492 constexpr size_t _NFullStores = _TV_size / _UW_size; 1493 constexpr size_t _NAllStores 1494 = __div_roundup(_TV_size, _UW_size); 1495 constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size; 1496 const array<_UV, _NAllStores> __converted 1497 = __convert_all<_UV, _NAllStores>(__v); 1498 __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1499 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1500 _UW(__converted[__i]), __mem + __i * _UW_size, 1501 _UAbi::_MaskImpl::template _S_convert< 1502 __int_for_sizeof_t<_Up>>( 1503 __extract_part<__i, _NParts>(__k.__as_full_vector()))); 1504 }); 1505 if constexpr (_NAllStores 1506 > _NFullStores) // one partial at the end 1507 _UAbi::_SimdImpl::_S_masked_store_nocvt( 1508 _UW(__converted[_NFullStores]), 1509 __mem + _NFullStores * _UW_size, 1510 _UAbi::_MaskImpl::template _S_convert< 1511 __int_for_sizeof_t<_Up>>( 1512 __extract_part<_NFullStores, _NParts>( 1513 __k.__as_full_vector()))); 1514 } 1515 } 1516 else 1517 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1518 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1519 __mem[__i] = static_cast<_Up>(__v[__i]); 1520 }); 1521 } 1522 1523 // _S_complement {{{2 1524 template
1525 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1526 _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept 1527 { 1528 if constexpr (is_floating_point_v<_Tp>) 1529 return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x)); 1530 else 1531 return ~__x._M_data; 1532 } 1533 1534 // _S_unary_minus {{{2 1535 template
1536 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1537 _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept 1538 { 1539 // GCC doesn't use the psign instructions, but pxor & psub seem to be 1540 // just as good a choice as pcmpeqd & psign. So meh. 1541 return -__x._M_data; 1542 } 1543 1544 // arithmetic operators {{{2 1545 template
1546 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1547 _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1548 { return __x._M_data + __y._M_data; } 1549 1550 template
1551 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1552 _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1553 { return __x._M_data - __y._M_data; } 1554 1555 template
1556 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1557 _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1558 { return __x._M_data * __y._M_data; } 1559 1560 template
1561 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1562 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1563 { 1564 // Note that division by 0 is always UB, so we must ensure we avoid the 1565 // case for partial registers 1566 if constexpr (!_Abi::template _S_is_partial<_Tp>) 1567 return __x._M_data / __y._M_data; 1568 else 1569 return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data); 1570 } 1571 1572 template
1573 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1574 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1575 { 1576 if constexpr (!_Abi::template _S_is_partial<_Tp>) 1577 return __x._M_data % __y._M_data; 1578 else 1579 return __as_vector(__x) 1580 % _Abi::__make_padding_nonzero(__as_vector(__y)); 1581 } 1582 1583 template
1584 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1585 _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1586 { return __and(__x, __y); } 1587 1588 template
1589 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1590 _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1591 { return __or(__x, __y); } 1592 1593 template
1594 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1595 _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1596 { return __xor(__x, __y); } 1597 1598 template
1599 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 1600 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1601 { return __x._M_data << __y._M_data; } 1602 1603 template
1604 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 1605 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1606 { return __x._M_data >> __y._M_data; } 1607 1608 template
1609 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1610 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y) 1611 { return __x._M_data << __y; } 1612 1613 template
1614 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1615 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y) 1616 { return __x._M_data >> __y; } 1617 1618 // compares {{{2 1619 // _S_equal_to {{{3 1620 template
1621 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1622 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1623 { return __x._M_data == __y._M_data; } 1624 1625 // _S_not_equal_to {{{3 1626 template
1627 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1628 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1629 { return __x._M_data != __y._M_data; } 1630 1631 // _S_less {{{3 1632 template
1633 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1634 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1635 { return __x._M_data < __y._M_data; } 1636 1637 // _S_less_equal {{{3 1638 template
1639 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1640 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1641 { return __x._M_data <= __y._M_data; } 1642 1643 // _S_negate {{{2 1644 template
1645 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1646 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept 1647 { return !__x._M_data; } 1648 1649 // _S_min, _S_max, _S_minmax {{{2 1650 template
<typename _Tp, size_t _Np>
1651 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr 1652 _SimdWrapper<_Tp, _Np> 1653 _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) 1654 { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; } 1655 1656 template
<typename _Tp, size_t _Np>
1657 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr 1658 _SimdWrapper<_Tp, _Np> 1659 _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) 1660 { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; } 1661 1662 template
<typename _Tp, size_t _Np>
1663 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr 1664 pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>> 1665 _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b) 1666 { 1667 return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data, 1668 __a._M_data < __b._M_data ? __b._M_data : __a._M_data}; 1669 } 1670 1671 // reductions {{{2 1672 template
<size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp, 1673 typename _BinaryOperation>
1674 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp 1675 _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>, 1676 simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1677 { 1678 using _V = __vector_type_t<_Tp, _Np / 2>; 1679 static_assert(sizeof(_V) <= sizeof(__x)); 1680 // _S_full_size is the size of the smallest native SIMD register that 1681 // can store _Np/2 elements: 1682 using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>; 1683 using _HalfSimd = __deduced_simd<_Tp, _Np / 2>; 1684 const auto __xx = __as_vector(__x); 1685 return _HalfSimd::abi_type::_SimdImpl::_S_reduce( 1686 static_cast<_HalfSimd>(__as_vector(__binary_op( 1687 static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)), 1688 static_cast<_FullSimd>(__intrin_bitcast<_V>( 1689 __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>( 1690 __xx)))))), 1691 __binary_op); 1692 } 1693 1694 template
<typename _Tp, typename _BinaryOperation>
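// Usage sketch (illustrative, not part of the header): _S_reduce_partial above
// and _S_reduce below implement the tree reduction behind the free functions
// std::experimental::reduce, hmin and hmax.
#include <experimental/simd>
#include <functional>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  const stdx::native_simd<int> v([](int i) { return i + 1; }); // 1 2 3 ...
  const int sum  = stdx::reduce(v);                            // default op is std::plus<>
  const int prod = stdx::reduce(v, std::multiplies<>());
  std::printf("sum %d, product %d, min %d, max %d\n",
              sum, prod, stdx::hmin(v), stdx::hmax(v));
}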
1695 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp 1696 _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1697 { 1698 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 1699 if constexpr (_Np == 1) 1700 return __x[0]; 1701 else if constexpr (_Np == 2) 1702 return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1703 simd<_Tp, simd_abi::scalar>(__x[1]))[0]; 1704 else if (__builtin_is_constant_evaluated()) 1705 { 1706 simd<_Tp, simd_abi::scalar> __acc = __x[0]; 1707 for (size_t __i = 1; __i < _Np; ++__i) 1708 __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i])); 1709 return __acc[0]; 1710 } 1711 else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{ 1712 { 1713 [[maybe_unused]] constexpr auto __full_size 1714 = _Abi::template _S_full_size<_Tp>; 1715 if constexpr (_Np == 3) 1716 return __binary_op( 1717 __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1718 simd<_Tp, simd_abi::scalar>(__x[1])), 1719 simd<_Tp, simd_abi::scalar>(__x[2]))[0]; 1720 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1721 plus<>>) 1722 { 1723 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1724 return _Ap::_SimdImpl::_S_reduce( 1725 simd<_Tp, _Ap>(__private_init, 1726 _Abi::_S_masked(__as_vector(__x))), 1727 __binary_op); 1728 } 1729 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1730 multiplies<>>) 1731 { 1732 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1733 using _TW = _SimdWrapper<_Tp, __full_size>; 1734 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full 1735 = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector(); 1736 _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one 1737 = __vector_broadcast<__full_size>(_Tp(1)); 1738 const _TW __x_full = __data(__x).__as_full_vector(); 1739 const _TW __x_padded_with_ones 1740 = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one, 1741 __x_full); 1742 return _Ap::_SimdImpl::_S_reduce( 1743 simd<_Tp, _Ap>(__private_init, __x_padded_with_ones), 1744 __binary_op); 1745 } 1746 else if constexpr (_Np & 1) 1747 { 1748 using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>; 1749 return __binary_op( 1750 simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce( 1751 simd<_Tp, _Ap>( 1752 __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>( 1753 __as_vector(__x))), 1754 __binary_op)), 1755 simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0]; 1756 } 1757 else 1758 return _S_reduce_partial<_Np>( 1759 make_index_sequence<_Np / 2>(), 1760 make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op); 1761 } //}}} 1762 else if constexpr (sizeof(__x) == 16) //{{{ 1763 { 1764 if constexpr (_Np == 16) 1765 { 1766 const auto __y = __data(__x); 1767 __x = __binary_op( 1768 _M_make_simd<_Tp, _Np>( 1769 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1770 7, 7>(__y)), 1771 _M_make_simd<_Tp, _Np>( 1772 __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 1773 14, 14, 15, 15>(__y))); 1774 } 1775 if constexpr (_Np >= 8) 1776 { 1777 const auto __y = __vector_bitcast
<short>
(__data(__x)); 1778 __x = __binary_op( 1779 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1780 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))), 1781 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1782 __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y)))); 1783 } 1784 if constexpr (_Np >= 4) 1785 { 1786 using _Up = conditional_t
<is_floating_point_v<_Tp>
, float, int>; 1787 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1788 __x = __binary_op(__x, 1789 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1790 __vector_permute<3, 2, 1, 0>(__y)))); 1791 } 1792 using _Up = conditional_t
<is_floating_point_v<_Tp>
, double, _LLong>; 1793 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1794 __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1795 __vector_permute<1, 1>(__y)))); 1796 return __x[0]; 1797 } //}}} 1798 else 1799 { 1800 static_assert(sizeof(__x) > __min_vector_size<_Tp>); 1801 static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2 1802 using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>; 1803 using _V = simd<_Tp, _Ap>; 1804 return _Ap::_SimdImpl::_S_reduce( 1805 __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))), 1806 _V(__private_init, 1807 __extract<1, 2>(__as_vector(__x)))), 1808 static_cast<_BinaryOperation&&>(__binary_op)); 1809 } 1810 } 1811 1812 // math {{{2 1813 // frexp, modf and copysign implemented in simd_math.h 1814 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \ 1815 template
<typename _Tp, typename... _More>
\ 1816 static _Tp \ 1817 _S_##__name(const _Tp& __x, const _More&... __more) \ 1818 { \ 1819 return __generate_vector<_Tp>( \ 1820 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1821 return __name(__x[__i], __more[__i]...); \ 1822 }); \ 1823 } 1824 1825 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \ 1826 template
<typename _Tp, typename... _More>
\ 1827 static typename _Tp::mask_type \ 1828 _S_##__name(const _Tp& __x, const _More&... __more) \ 1829 { \ 1830 return __generate_vector<_Tp>( \ 1831 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1832 return __name(__x[__i], __more[__i]...); \ 1833 }); \ 1834 } 1835 1836 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \ 1837 template
<typename _Tp, typename... _More>
\ 1838 static auto \ 1839 _S_##__name(const _Tp& __x, const _More&... __more) \ 1840 { \ 1841 return __fixed_size_storage_t<_RetTp, \ 1842 _VectorTraits<_Tp>::_S_partial_width>:: \ 1843 _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1844 return __meta._S_generator( \ 1845 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1846 return __name(__x[__meta._S_offset + __i], \ 1847 __more[__meta._S_offset + __i]...); \ 1848 }, \ 1849 static_cast<_RetTp*>(nullptr)); \ 1850 }); \ 1851 } 1852 1853 _GLIBCXX_SIMD_MATH_FALLBACK(acos) 1854 _GLIBCXX_SIMD_MATH_FALLBACK(asin) 1855 _GLIBCXX_SIMD_MATH_FALLBACK(atan) 1856 _GLIBCXX_SIMD_MATH_FALLBACK(atan2) 1857 _GLIBCXX_SIMD_MATH_FALLBACK(cos) 1858 _GLIBCXX_SIMD_MATH_FALLBACK(sin) 1859 _GLIBCXX_SIMD_MATH_FALLBACK(tan) 1860 _GLIBCXX_SIMD_MATH_FALLBACK(acosh) 1861 _GLIBCXX_SIMD_MATH_FALLBACK(asinh) 1862 _GLIBCXX_SIMD_MATH_FALLBACK(atanh) 1863 _GLIBCXX_SIMD_MATH_FALLBACK(cosh) 1864 _GLIBCXX_SIMD_MATH_FALLBACK(sinh) 1865 _GLIBCXX_SIMD_MATH_FALLBACK(tanh) 1866 _GLIBCXX_SIMD_MATH_FALLBACK(exp) 1867 _GLIBCXX_SIMD_MATH_FALLBACK(exp2) 1868 _GLIBCXX_SIMD_MATH_FALLBACK(expm1) 1869 _GLIBCXX_SIMD_MATH_FALLBACK(ldexp) 1870 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) 1871 _GLIBCXX_SIMD_MATH_FALLBACK(log) 1872 _GLIBCXX_SIMD_MATH_FALLBACK(log10) 1873 _GLIBCXX_SIMD_MATH_FALLBACK(log1p) 1874 _GLIBCXX_SIMD_MATH_FALLBACK(log2) 1875 _GLIBCXX_SIMD_MATH_FALLBACK(logb) 1876 1877 // modf implemented in simd_math.h 1878 _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) 1879 _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) 1880 _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) 1881 _GLIBCXX_SIMD_MATH_FALLBACK(fabs) 1882 _GLIBCXX_SIMD_MATH_FALLBACK(pow) 1883 _GLIBCXX_SIMD_MATH_FALLBACK(sqrt) 1884 _GLIBCXX_SIMD_MATH_FALLBACK(erf) 1885 _GLIBCXX_SIMD_MATH_FALLBACK(erfc) 1886 _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) 1887 _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) 1888 1889 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) 1890 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) 1891 1892 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) 1893 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) 1894 1895 _GLIBCXX_SIMD_MATH_FALLBACK(fmod) 1896 _GLIBCXX_SIMD_MATH_FALLBACK(remainder) 1897 1898 template
<typename _Tp, typename _TVT = _VectorTraits<_Tp>
> 1899 static _Tp 1900 _S_remquo(const _Tp __x, const _Tp __y, 1901 __fixed_size_storage_t
<int, _TVT::_S_partial_width>
* __z) 1902 { 1903 return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1904 int __tmp; 1905 auto __r = remquo(__x[__i], __y[__i], &__tmp); 1906 __z->_M_set(__i, __tmp); 1907 return __r; 1908 }); 1909 } 1910 1911 // copysign in simd_math.h 1912 _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) 1913 _GLIBCXX_SIMD_MATH_FALLBACK(fdim) 1914 _GLIBCXX_SIMD_MATH_FALLBACK(fmax) 1915 _GLIBCXX_SIMD_MATH_FALLBACK(fmin) 1916 _GLIBCXX_SIMD_MATH_FALLBACK(fma) 1917 1918 template
<typename _Tp, size_t _Np>
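// Usage sketch (illustrative, not part of the header): the
// _GLIBCXX_SIMD_MATH_FALLBACK* macros above expand to element-wise scalar
// loops for <cmath> functions that have no dedicated vector implementation;
// derived ABI-specific Impl classes override the ones they can do better.
// From user code they are reached through the overloads in <experimental/simd>.
#include <experimental/simd>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  const stdx::native_simd<double> x([](int i) { return 0.1 * i; });
  const auto y = stdx::exp(x);        // element-wise exp
  const auto z = stdx::fma(x, x, y);  // element-wise fused multiply-add
  for (int i = 0; i < int(x.size()); ++i)
    std::printf("exp(%.1f) = %f, fma = %f\n", x[i], y[i], z[i]);
}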
1919 static constexpr _MaskMember<_Tp> 1920 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, 1921 _SimdWrapper<_Tp, _Np> __y) noexcept 1922 { 1923 using _Ip = __int_for_sizeof_t<_Tp>; 1924 const auto __xn = __vector_bitcast<_Ip>(__x); 1925 const auto __yn = __vector_bitcast<_Ip>(__y); 1926 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 1927 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 1928 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 1929 __xp > __yp); 1930 } 1931 1932 template
<typename _Tp, size_t _Np>
1933 static constexpr _MaskMember<_Tp> 1934 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, 1935 _SimdWrapper<_Tp, _Np> __y) noexcept 1936 { 1937 using _Ip = __int_for_sizeof_t<_Tp>; 1938 const auto __xn = __vector_bitcast<_Ip>(__x); 1939 const auto __yn = __vector_bitcast<_Ip>(__y); 1940 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 1941 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 1942 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 1943 __xp >= __yp); 1944 } 1945 1946 template
<typename _Tp, size_t _Np>
1947 static constexpr _MaskMember<_Tp> 1948 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept 1949 { 1950 using _Ip = __int_for_sizeof_t<_Tp>; 1951 const auto __xn = __vector_bitcast<_Ip>(__x); 1952 const auto __yn = __vector_bitcast<_Ip>(__y); 1953 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 1954 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 1955 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 1956 __xp < __yp); 1957 } 1958 1959 template
<typename _Tp, size_t _Np>
1960 static constexpr _MaskMember<_Tp> 1961 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, 1962 _SimdWrapper<_Tp, _Np> __y) noexcept 1963 { 1964 using _Ip = __int_for_sizeof_t<_Tp>; 1965 const auto __xn = __vector_bitcast<_Ip>(__x); 1966 const auto __yn = __vector_bitcast<_Ip>(__y); 1967 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 1968 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 1969 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 1970 __xp <= __yp); 1971 } 1972 1973 template
<typename _Tp, size_t _Np>
1974 static constexpr _MaskMember<_Tp> 1975 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, 1976 _SimdWrapper<_Tp, _Np> __y) noexcept 1977 { 1978 return __andnot(_SuperImpl::_S_isunordered(__x, __y), 1979 _SuperImpl::_S_not_equal_to(__x, __y)); 1980 } 1981 1982 #undef _GLIBCXX_SIMD_MATH_FALLBACK 1983 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET 1984 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET 1985 // _S_abs {{{3 1986 template
<typename _Tp, size_t _Np>
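// Usage sketch (illustrative, not part of the header): _S_isgreater ..
// _S_islessgreater above mirror the <cmath> comparison macros: a lane with a
// NaN operand simply compares false instead of raising FE_INVALID, which is
// why they are built on top of _S_isunordered.
#include <experimental/simd>
#include <cmath>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  const stdx::native_simd<float> a = 1.0f;
  stdx::native_simd<float> b = 2.0f;
  b[0] = std::nanf("");                         // poison one lane
  const auto unordered = stdx::isunordered(a, b);
  const auto greater   = stdx::isgreater(b, a); // false in the NaN lane, no FP exception
  std::printf("unordered lanes: %d, greater lanes: %d\n",
              stdx::popcount(unordered), stdx::popcount(greater));
}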
1987 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 1988 _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept 1989 { 1990 // if (__builtin_is_constant_evaluated()) 1991 // { 1992 // return __x._M_data < 0 ? -__x._M_data : __x._M_data; 1993 // } 1994 if constexpr (is_floating_point_v<_Tp>) 1995 // `v < 0 ? -v : v` cannot compile to the efficient implementation of 1996 // masking the signbit off because it must consider v == -0 1997 1998 // ~(-0.) & v would be easy, but breaks with fno-signed-zeros 1999 return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data); 2000 else 2001 return __x._M_data < 0 ? -__x._M_data : __x._M_data; 2002 } 2003 2004 // }}}3 2005 // _S_plus_minus {{{ 2006 // Returns __x + __y - __y without -fassociative-math optimizing to __x. 2007 // - _TV must be __vector_type_t
<floating-point type, N>
. 2008 // - _UV must be _TV or floating-point type. 2009 template
<typename _TV, typename _UV>
2010 _GLIBCXX_SIMD_INTRINSIC static constexpr _TV 2011 _S_plus_minus(_TV __x, _UV __y) noexcept 2012 { 2013 #if defined __i386__ && !defined __SSE_MATH__ 2014 if constexpr (sizeof(__x) == 8) 2015 { // operations on __x would use the FPU 2016 static_assert(is_same_v<_TV, __vector_type_t
<float, 2>
>); 2017 const auto __x4 = __vector_bitcast
<float, 4>
(__x); 2018 if constexpr (is_same_v<_TV, _UV>) 2019 return __vector_bitcast
<float, 2>
( 2020 _S_plus_minus(__x4, __vector_bitcast
<float, 4>
(__y))); 2021 else 2022 return __vector_bitcast
<float, 2>
(_S_plus_minus(__x4, __y)); 2023 } 2024 #endif 2025 #if !defined __clang__ && __GCC_IEC_559 == 0 2026 if (__builtin_is_constant_evaluated() 2027 || (__builtin_constant_p(__x) && __builtin_constant_p(__y))) 2028 return (__x + __y) - __y; 2029 else 2030 return [&] { 2031 __x += __y; 2032 if constexpr(__have_sse) 2033 { 2034 if constexpr (sizeof(__x) >= 16) 2035 asm("" : "+x"(__x)); 2036 else if constexpr (is_same_v<__vector_type_t
<float, 2>
, _TV>) 2037 asm("" : "+x"(__x[0]), "+x"(__x[1])); 2038 else 2039 __assert_unreachable<_TV>(); 2040 } 2041 else if constexpr(__have_neon) 2042 asm("" : "+w"(__x)); 2043 else if constexpr (__have_power_vmx) 2044 { 2045 if constexpr (is_same_v<__vector_type_t
<float, 2>
, _TV>) 2046 asm("" : "+fgr"(__x[0]), "+fgr"(__x[1])); 2047 else 2048 asm("" : "+v"(__x)); 2049 } 2050 else 2051 asm("" : "+g"(__x)); 2052 return __x - __y; 2053 }(); 2054 #else 2055 return (__x + __y) - __y; 2056 #endif 2057 } 2058 2059 // }}} 2060 // _S_nearbyint {{{3 2061 template
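// Illustration (not part of the header): _S_plus_minus above computes
// (x + y) - y behind an empty asm barrier so that -fassociative-math cannot
// fold it back to x, and _S_nearbyint below uses it to round by adding and
// subtracting 2^(mantissa digits - 1). A scalar sketch of the same "magic
// number" trick; nearbyint_by_shift is an illustrative name, not a library
// function, and it assumes round-to-nearest and |x| below the magic constant.
#include <cfloat>
#include <cmath>
#include <cstdio>

static float nearbyint_by_shift(float x)
{
  const float shifter = 1 << (FLT_MANT_DIG - 1);   // 2^23: ulp of this float is exactly 1
  const float magic = std::copysign(shifter, x);
  volatile float tmp = x + magic;                  // volatile plays the role of the asm barrier
  return tmp - magic;
}

int main()
{
  for (float x : {0.5f, 1.5f, 2.5f, -3.7f})
    std::printf("nearbyint_by_shift(%+.2f) = %+.2f (std::nearbyint: %+.2f)\n",
                x, nearbyint_by_shift(x), std::nearbyint(x));
}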
<typename _Tp, typename _TVT = _VectorTraits<_Tp>
> 2062 _GLIBCXX_SIMD_INTRINSIC static _Tp 2063 _S_nearbyint(_Tp __x_) noexcept 2064 { 2065 using value_type = typename _TVT::value_type; 2066 using _V = typename _TVT::type; 2067 const _V __x = __x_; 2068 const _V __absx = __and(__x, _S_absmask<_V>); 2069 static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v
<value_type>
); 2070 _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs 2071 = _V() + (1ull << (__digits_v
<value_type>
- 1)); 2072 const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs); 2073 const _V __shifted = _S_plus_minus(__x, __shifter); 2074 return __absx < __shifter_abs ? __shifted : __x; 2075 } 2076 2077 // _S_rint {{{3 2078 template
<typename _Tp, typename _TVT = _VectorTraits<_Tp>
> 2079 _GLIBCXX_SIMD_INTRINSIC static _Tp 2080 _S_rint(_Tp __x) noexcept 2081 { return _SuperImpl::_S_nearbyint(__x); } 2082 2083 // _S_trunc {{{3 2084 template
<typename _Tp, size_t _Np>
2085 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2086 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2087 { 2088 using _V = __vector_type_t<_Tp, _Np>; 2089 const _V __absx = __and(__x._M_data, _S_absmask<_V>); 2090 static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>); 2091 constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1); 2092 _V __truncated = _S_plus_minus(__absx, __shifter); 2093 __truncated -= __truncated > __absx ? _V() + 1 : _V(); 2094 return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated) 2095 : __x._M_data; 2096 } 2097 2098 // _S_round {{{3 2099 template
<typename _Tp, size_t _Np>
2100 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2101 _S_round(_SimdWrapper<_Tp, _Np> __x) 2102 { 2103 const auto __abs_x = _SuperImpl::_S_abs(__x); 2104 const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data; 2105 const auto __r_abs // round(abs(x)) = 2106 = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0); 2107 return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs); 2108 } 2109 2110 // _S_floor {{{3 2111 template
<typename _Tp, size_t _Np>
2112 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2113 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2114 { 2115 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2116 const auto __negative_input 2117 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2118 const auto __mask 2119 = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2120 return __or(__andnot(__mask, __y), 2121 __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1))); 2122 } 2123 2124 // _S_ceil {{{3 2125 template
<typename _Tp, size_t _Np>
2126 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2127 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2128 { 2129 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2130 const auto __negative_input 2131 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2132 const auto __inv_mask 2133 = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2134 return __or(__and(__inv_mask, __y), 2135 __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1))); 2136 } 2137 2138 // _S_isnan {{{3 2139 template
<typename _Tp, size_t _Np>
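// Usage sketch (illustrative, not part of the header): _S_trunc, _S_round,
// _S_floor and _S_ceil above are branch-free compositions of the shift trick
// plus a +/-1 correction where truncation moved the wrong way. The public
// entry points are the simd overloads of floor/ceil/round/trunc.
#include <experimental/simd>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  const stdx::native_simd<float> x([](int i) { return (i - 2) + 0.5f; }); // -1.5 -0.5 0.5 ...
  const auto f = stdx::floor(x);
  const auto c = stdx::ceil(x);
  const auto r = stdx::round(x);  // halfway cases away from zero
  const auto t = stdx::trunc(x);
  for (int i = 0; i < int(x.size()); ++i)
    std::printf("x=%+.1f floor=%+.1f ceil=%+.1f round=%+.1f trunc=%+.1f\n",
                x[i], f[i], c[i], r[i], t[i]);
}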
2140 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2141 _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2142 { 2143 #if __FINITE_MATH_ONLY__ 2144 return {}; // false 2145 #elif !defined __SUPPORT_SNAN__ 2146 return ~(__x._M_data == __x._M_data); 2147 #elif defined __STDC_IEC_559__ 2148 using _Ip = __int_for_sizeof_t<_Tp>; 2149 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2150 const auto __infn 2151 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 2152 return __infn < __absn; 2153 #else 2154 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?" 2155 #endif 2156 } 2157 2158 // _S_isfinite {{{3 2159 template
<typename _Tp, size_t _Np>
2160 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2161 _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2162 { 2163 #if __FINITE_MATH_ONLY__ 2164 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 2165 _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV(); 2166 return __alltrue; 2167 #else 2168 // if all exponent bits are set, __x is either inf or NaN 2169 using _Ip = __int_for_sizeof_t<_Tp>; 2170 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2171 const auto __maxn 2172 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2173 return __absn <= __maxn; 2174 #endif 2175 } 2176 2177 // _S_isunordered {{{3 2178 template
<typename _Tp, size_t _Np>
2179 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2180 _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 2181 { return __or(_S_isnan(__x), _S_isnan(__y)); } 2182 2183 // _S_signbit {{{3 2184 template
<typename _Tp, size_t _Np>
2185 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2186 _S_signbit(_SimdWrapper<_Tp, _Np> __x) 2187 { 2188 using _Ip = __int_for_sizeof_t<_Tp>; 2189 return __vector_bitcast<_Ip>(__x) < 0; 2190 // Arithmetic right shift (SRA) would also work (instead of compare), but 2191 // 64-bit SRA isn't available on x86 before AVX512. And in general, 2192 // compares are more likely to be efficient than SRA. 2193 } 2194 2195 // _S_isinf {{{3 2196 template
<typename _Tp, size_t _Np>
2197 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2198 _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2199 { 2200 #if __FINITE_MATH_ONLY__ 2201 return {}; // false 2202 #else 2203 return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x), 2204 __vector_broadcast<_Np>( 2205 __infinity_v<_Tp>)); 2206 // alternative: 2207 // compare to inf using the corresponding integer type 2208 /* 2209 return 2210 __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>( 2211 _S_abs(__x)._M_data) 2212 == 2213 __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>( 2214 __infinity_v<_Tp>))); 2215 */ 2216 #endif 2217 } 2218 2219 // _S_isnormal {{{3 2220 template
<typename _Tp, size_t _Np>
2221 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2222 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 2223 { 2224 using _Ip = __int_for_sizeof_t<_Tp>; 2225 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2226 const auto __minn 2227 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>)); 2228 #if __FINITE_MATH_ONLY__ 2229 return __absn >= __minn; 2230 #else 2231 const auto __maxn 2232 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2233 return __minn <= __absn && __absn <= __maxn; 2234 #endif 2235 } 2236 2237 // _S_fpclassify {{{3 2238 template
<typename _Tp, size_t _Np>
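// Usage sketch (illustrative, not part of the header): _S_isnan, _S_isfinite,
// _S_isinf and _S_isnormal above avoid floating-point compares where possible
// by testing the absolute value reinterpreted as an integer against the bit
// patterns of infinity and the normal range, and _S_fpclassify below packs the
// FP_* categories into a fixed_size simd of int. The public functions return
// a simd_mask (or fixed_size_simd<int, N> for fpclassify).
#include <experimental/simd>
#include <limits>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  stdx::native_simd<double> x = 1.0;
  x[0] = std::numeric_limits<double>::quiet_NaN();
  if (x.size() > 1)
    x[1] = std::numeric_limits<double>::infinity();
  std::printf("nan lanes: %d, infinite lanes: %d, all finite? %s\n",
              stdx::popcount(stdx::isnan(x)),
              stdx::popcount(stdx::isinf(x)),
              stdx::all_of(stdx::isfinite(x)) ? "yes" : "no");
}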
2239 _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t
<int, _Np>
2240 _S_fpclassify(_SimdWrapper<_Tp, _Np> __x) 2241 { 2242 using _I = __int_for_sizeof_t<_Tp>; 2243 const auto __xn 2244 = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x))); 2245 constexpr size_t _NI = sizeof(__xn) / sizeof(_I); 2246 _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn 2247 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>)); 2248 2249 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal 2250 = __vector_broadcast<_NI, _I>(FP_NORMAL); 2251 #if !__FINITE_MATH_ONLY__ 2252 _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn 2253 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>)); 2254 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan 2255 = __vector_broadcast<_NI, _I>(FP_NAN); 2256 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite 2257 = __vector_broadcast<_NI, _I>(FP_INFINITE); 2258 #endif 2259 #ifndef __FAST_MATH__ 2260 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal 2261 = __vector_broadcast<_NI, _I>(FP_SUBNORMAL); 2262 #endif 2263 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero 2264 = __vector_broadcast<_NI, _I>(FP_ZERO); 2265 2266 __vector_type_t<_I, _NI> 2267 __tmp = __xn < __minn 2268 #ifdef __FAST_MATH__ 2269 ? __fp_zero 2270 #else 2271 ? (__xn == 0 ? __fp_zero : __fp_subnormal) 2272 #endif 2273 #if __FINITE_MATH_ONLY__ 2274 : __fp_normal; 2275 #else 2276 : (__xn < __infn ? __fp_normal 2277 : (__xn == __infn ? __fp_infinite : __fp_nan)); 2278 #endif 2279 2280 if constexpr (sizeof(_I) == sizeof(int)) 2281 { 2282 using _FixedInt = __fixed_size_storage_t
<int, _Np>
; 2283 const auto __as_int = __vector_bitcast
<int>
(__tmp); 2284 if constexpr (_FixedInt::_S_tuple_size == 1) 2285 return {__as_int}; 2286 else if constexpr (_FixedInt::_S_tuple_size == 2 2287 && is_same_v< 2288 typename _FixedInt::_SecondType::_FirstAbi, 2289 simd_abi::scalar>) 2290 return {__extract<0, 2>(__as_int), __as_int[_Np - 1]}; 2291 else if constexpr (_FixedInt::_S_tuple_size == 2) 2292 return {__extract<0, 2>(__as_int), 2293 __auto_bitcast(__extract<1, 2>(__as_int))}; 2294 else 2295 __assert_unreachable<_Tp>(); 2296 } 2297 else if constexpr (_Np == 2 && sizeof(_I) == 8 2298 && __fixed_size_storage_t
<int, _Np>
::_S_tuple_size == 2) 2299 { 2300 const auto __aslong = __vector_bitcast<_LLong>(__tmp); 2301 return {int(__aslong[0]), {int(__aslong[1])}}; 2302 } 2303 #if _GLIBCXX_SIMD_X86INTRIN 2304 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32 2305 && __fixed_size_storage_t
<int, _Np>
::_S_tuple_size == 1) 2306 return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)), 2307 __to_intrin(__hi128(__tmp)))}; 2308 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64 2309 && __fixed_size_storage_t
<int, _Np>
::_S_tuple_size == 1) 2310 return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))}; 2311 #endif // _GLIBCXX_SIMD_X86INTRIN 2312 else if constexpr (__fixed_size_storage_t
<int, _Np>
::_S_tuple_size == 1) 2313 return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp), 2314 [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2315 return __make_wrapper
<int>
(__l...); 2316 })}; 2317 else 2318 __assert_unreachable<_Tp>(); 2319 } 2320 2321 // _S_increment & _S_decrement{{{2 2322 template
<typename _Tp, size_t _Np>
2323 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2324 _S_increment(_SimdWrapper<_Tp, _Np>& __x) 2325 { __x = __x._M_data + 1; } 2326 2327 template
<typename _Tp, size_t _Np>
2328 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2329 _S_decrement(_SimdWrapper<_Tp, _Np>& __x) 2330 { __x = __x._M_data - 1; } 2331 2332 // smart_reference access {{{2 2333 template
<typename _Tp, size_t _Np, typename _Up>
2334 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2335 _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept 2336 { __v._M_set(__i, static_cast<_Up&&>(__x)); } 2337 2338 // _S_masked_assign{{{2 2339 template
<typename _Tp, typename _K, size_t _Np>
2340 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2341 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2342 __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs) 2343 { 2344 if (__k._M_is_constprop_none_of()) 2345 return; 2346 else if (__k._M_is_constprop_all_of()) 2347 __lhs = __rhs; 2348 else 2349 __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); 2350 } 2351 2352 template
<typename _Tp, typename _K, size_t _Np>
2353 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2354 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2355 __type_identity_t<_Tp> __rhs) 2356 { 2357 if (__k._M_is_constprop_none_of()) 2358 return; 2359 else if (__k._M_is_constprop_all_of()) 2360 __lhs = __vector_broadcast<_Np>(__rhs); 2361 else if (__builtin_constant_p(__rhs) && __rhs == 0) 2362 { 2363 if constexpr (!is_same_v
<bool, _K>
) 2364 // the __andnot optimization only makes sense if __k._M_data is a 2365 // vector register 2366 __lhs._M_data 2367 = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data); 2368 else 2369 // for AVX512/__mmask, a _mm512_maskz_mov is best 2370 __lhs 2371 = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>()); 2372 } 2373 else 2374 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2375 _SimdWrapper<_Tp, _Np>( 2376 __vector_broadcast<_Np>(__rhs))); 2377 } 2378 2379 // _S_masked_cassign {{{2 2380 template
<typename _Op, typename _Tp, typename _K, size_t _Np>
2381 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2382 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2383 _SimdWrapper<_Tp, _Np>& __lhs, 2384 const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs, 2385 _Op __op) 2386 { 2387 if (__k._M_is_constprop_none_of()) 2388 return; 2389 else if (__k._M_is_constprop_all_of()) 2390 __lhs = __op(_SuperImpl{}, __lhs, __rhs); 2391 else 2392 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2393 __op(_SuperImpl{}, __lhs, __rhs)); 2394 } 2395 2396 template
<typename _Op, typename _Tp, typename _K, size_t _Np>
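// Usage sketch (illustrative, not part of the header): _S_masked_assign and
// _S_masked_cassign above implement the write-back of where() expressions;
// lanes with a false mask keep their old value, and constant-propagated
// all-true/all-false masks collapse to a plain assignment or a no-op.
#include <experimental/simd>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  stdx::native_simd<int> v([](int i) { return i; });  // 0 1 2 3 ...
  const auto odd = (v & 1) == 1;                      // mask of odd lanes
  stdx::where(odd, v) += 100;                         // only odd lanes are updated
  stdx::where(!odd, v) = -1;                          // even lanes overwritten
  for (int i = 0; i < int(v.size()); ++i)
    std::printf("%d ", int(v[i]));
  std::printf("\n");
}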
2397 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2398 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2399 _SimdWrapper<_Tp, _Np>& __lhs, 2400 const __type_identity_t<_Tp> __rhs, _Op __op) 2401 { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); } 2402 2403 // _S_masked_unary {{{2 2404 template
<template <typename>
class _Op, typename _Tp, typename _K, 2405 size_t _Np> 2406 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2407 _S_masked_unary(const _SimdWrapper<_K, _Np> __k, 2408 const _SimdWrapper<_Tp, _Np> __v) 2409 { 2410 if (__k._M_is_constprop_none_of()) 2411 return __v; 2412 auto __vv = _M_make_simd(__v); 2413 _Op
<decltype(__vv)>
__op; 2414 if (__k._M_is_constprop_all_of()) 2415 return __data(__op(__vv)); 2416 else if constexpr (is_same_v<_Op
<decltype(__vv)>
, __increment
<decltype(__vv)>
>) 2417 { 2418 static_assert(not std::is_same_v<_K, bool>); 2419 if constexpr (is_integral_v<_Tp>) 2420 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2421 return __v._M_data - __vector_bitcast<_Tp>(__k._M_data); 2422 else if constexpr (not __have_avx2) 2423 return __v._M_data 2424 + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2425 _K, _Tp(1))); 2426 // starting with AVX2 it is more efficient to blend after add 2427 } 2428 else if constexpr (is_same_v<_Op
<decltype(__vv)>
, __decrement
<decltype(__vv)>
>) 2429 { 2430 static_assert(not std::is_same_v<_K, bool>); 2431 if constexpr (is_integral_v<_Tp>) 2432 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2433 return __v._M_data + __vector_bitcast<_Tp>(__k._M_data); 2434 else if constexpr (not __have_avx2) 2435 return __v._M_data 2436 - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2437 _K, _Tp(1))); 2438 // starting with AVX2 it is more efficient to blend after sub 2439 } 2440 return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv))); 2441 } 2442 2443 //}}}2 2444 }; 2445 2446 // _MaskImplBuiltinMixin {{{1 2447 struct _MaskImplBuiltinMixin 2448 { 2449 template
<typename _Tp>
2450 using _TypeTag = _Tp*; 2451 2452 // _S_to_maskvector {{{ 2453 template
<typename _Up, size_t _ToN = 1>
2454 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2455 _S_to_maskvector(bool __x) 2456 { 2457 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2458 return __x ? __vector_type_t<_Up, _ToN>{~_Up()} 2459 : __vector_type_t<_Up, _ToN>{}; 2460 } 2461 2462 template
<typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized, 2463 size_t _ToN = _UpN == 0 ? _Np : _UpN>
2464 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2465 _S_to_maskvector(_BitMask<_Np, _Sanitized> __x) 2466 { 2467 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2468 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2469 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2470 if constexpr (__i < _Np) 2471 return __x[__i] ? ~_Up() : _Up(); 2472 else 2473 return _Up(); 2474 }); 2475 } 2476 2477 template
<typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np, 2478 size_t _ToN = _UpN == 0 ? _Np : _UpN>
2479 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2480 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 2481 { 2482 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2483 using _TW = _SimdWrapper<_Tp, _Np>; 2484 using _UW = _SimdWrapper<_Up, _ToN>; 2485 if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW)) 2486 return __wrapper_bitcast<_Up, _ToN>(__x); 2487 else if constexpr (is_same_v<_Tp, bool>) // bits -> vector 2488 return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data)); 2489 else 2490 { // vector -> vector 2491 /* 2492 [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data); 2493 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == 2494 16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr 2495 (sizeof(_Tp) == 4 && sizeof(_Up) == 2 2496 && sizeof(__y) == 16) 2497 return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y); 2498 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 2499 && sizeof(__y) == 16) 2500 return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y); 2501 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 2502 && sizeof(__y) == 16) 2503 return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, 2504 -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 && 2505 sizeof(_Up) == 1 2506 && sizeof(__y) == 16) 2507 return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 2508 -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 && 2509 sizeof(_Up) == 1 2510 && sizeof(__y) == 16) 2511 return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2512 -1, -1, -1, -1, -1>(__y); else 2513 */ 2514 { 2515 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2516 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2517 if constexpr (__i < _Np) 2518 return _Up(__x[__i.value]); 2519 else 2520 return _Up(); 2521 }); 2522 } 2523 } 2524 } 2525 2526 // }}} 2527 // _S_to_bits {{{ 2528 template
<typename _Tp, size_t _Np>
2529 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 2530 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 2531 { 2532 static_assert(!is_same_v<_Tp, bool>); 2533 static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong)); 2534 using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; 2535 const auto __bools 2536 = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1); 2537 _ULLong __r = 0; 2538 __execute_n_times<_Np>( 2539 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2540 __r |= _ULLong(__bools[__i.value]) << __i; 2541 }); 2542 return __r; 2543 } 2544 2545 // }}} 2546 }; 2547 2548 // _MaskImplBuiltin {{{1 2549 template
<typename _Abi, typename>
2550 struct _MaskImplBuiltin : _MaskImplBuiltinMixin 2551 { 2552 using _MaskImplBuiltinMixin::_S_to_bits; 2553 using _MaskImplBuiltinMixin::_S_to_maskvector; 2554 2555 // member types {{{ 2556 template
<typename _Tp>
2557 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 2558 2559 template
<typename _Tp>
2560 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 2561 2562 using _SuperImpl = typename _Abi::_MaskImpl; 2563 using _CommonImpl = typename _Abi::_CommonImpl; 2564 2565 template
<typename _Tp>
2566 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>; 2567 2568 // }}} 2569 // _S_broadcast {{{ 2570 template
<typename _Tp>
2571 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2572 _S_broadcast(bool __x) 2573 { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); } 2574 2575 // }}} 2576 // _S_load {{{ 2577 template
<typename _Tp>
2578 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2579 _S_load(const bool* __mem) 2580 { 2581 using _I = __int_for_sizeof_t<_Tp>; 2582 if (not __builtin_is_constant_evaluated()) 2583 if constexpr (sizeof(_Tp) == sizeof(bool)) 2584 { 2585 const auto __bools 2586 = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem); 2587 // bool is {0, 1}, everything else is UB 2588 return __bools > 0; 2589 } 2590 return __generate_vector<_I, _S_size<_Tp>>( 2591 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2592 return __mem[__i] ? ~_I() : _I(); 2593 }); 2594 } 2595 2596 // }}} 2597 // _S_convert {{{ 2598 template
<typename _Tp, size_t _Np, bool _Sanitized>
2599 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2600 _S_convert(_BitMask<_Np, _Sanitized> __x) 2601 { 2602 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2603 return _SimdWrapper
<bool, simd_size_v<_Tp, _Abi>
>(__x._M_to_bits()); 2604 else 2605 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2606 _S_size<_Tp>>( 2607 __x._M_sanitized()); 2608 } 2609 2610 template
<typename _Tp, size_t _Np>
2611 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2612 _S_convert(_SimdWrapper
<bool, _Np>
__x) 2613 { 2614 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2615 return _SimdWrapper
<bool, simd_size_v<_Tp, _Abi>
>(__x._M_data); 2616 else 2617 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2618 _S_size<_Tp>>( 2619 _BitMask<_Np>(__x._M_data)._M_sanitized()); 2620 } 2621 2622 template
<typename _Tp, typename _Up, size_t _Np>
2623 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2624 _S_convert(_SimdWrapper<_Up, _Np> __x) 2625 { 2626 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2627 return _SimdWrapper
<bool, simd_size_v<_Tp, _Abi>
>( 2628 _SuperImpl::_S_to_bits(__x)); 2629 else 2630 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2631 _S_size<_Tp>>(__x); 2632 } 2633 2634 template
<typename _Tp, typename _Up, typename _UAbi>
2635 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 2636 _S_convert(simd_mask<_Up, _UAbi> __x) 2637 { 2638 if constexpr (__is_builtin_bitmask_abi<_Abi>()) 2639 { 2640 using _R = _SimdWrapper
<bool, simd_size_v<_Tp, _Abi>
>; 2641 if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits 2642 return _R(__data(__x)); 2643 else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits 2644 return _R(__data(__x)); 2645 else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits 2646 return _R(__data(__x)._M_to_bits()); 2647 else // vector -> bits 2648 return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits()); 2649 } 2650 else 2651 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2652 _S_size<_Tp>>( 2653 __data(__x)); 2654 } 2655 2656 // }}} 2657 // _S_masked_load {{{2 2658 template
<typename _Tp, size_t _Np>
2659 static inline _SimdWrapper<_Tp, _Np> 2660 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 2661 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 2662 { 2663 // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity 2664 auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge); 2665 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask), 2666 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2667 __tmp._M_set(__i, -__mem[__i]); 2668 }); 2669 __merge = __wrapper_bitcast<_Tp>(__tmp); 2670 return __merge; 2671 } 2672 2673 // _S_store {{{2 2674 template
<typename _Tp, size_t _Np>
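// Usage sketch (illustrative, not part of the header): _S_masked_load above
// and _S_store/_S_masked_store below move simd_mask values to and from plain
// bool arrays, one bool per lane, via copy_from/copy_to.
#include <experimental/simd>
#include <array>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  using mask_type = stdx::native_simd_mask<float>;
  std::array<bool, mask_type::size()> in{};        // all false
  in[0] = true;
  mask_type m;
  m.copy_from(in.data(), stdx::element_aligned);   // bool[] -> simd_mask
  std::array<bool, mask_type::size()> out{};
  m.copy_to(out.data(), stdx::element_aligned);    // simd_mask -> bool[]
  std::printf("lanes set: %d, first set lane: %d, round-trip ok: %d\n",
              stdx::popcount(m), stdx::find_first_set(m), int(out[0]));
}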
2675 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2676 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept 2677 { 2678 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2679 __mem[__i] = __v[__i]; 2680 }); 2681 } 2682 2683 // _S_masked_store {{{2 2684 template
<typename _Tp, size_t _Np>
2685 static inline void 2686 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem, 2687 const _SimdWrapper<_Tp, _Np> __k) noexcept 2688 { 2689 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k), 2690 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2691 __mem[__i] = __v[__i]; 2692 }); 2693 } 2694 2695 // _S_from_bitmask{{{2 2696 template
<typename _Tp, size_t _Np>
2697 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2698 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 2699 { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); } 2700 2701 // logical and bitwise operators {{{2 2702 template
<typename _Tp, size_t _Np>
2703 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2704 _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2705 { return __and(__x._M_data, __y._M_data); } 2706 2707 template
<typename _Tp, size_t _Np>
2708 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2709 _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2710 { return __or(__x._M_data, __y._M_data); } 2711 2712 template
<typename _Tp, size_t _Np>
2713 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2714 _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x) 2715 { 2716 if constexpr (_Abi::template _S_is_partial<_Tp>) 2717 return __andnot(__x, __wrapper_bitcast<_Tp>( 2718 _Abi::template _S_implicit_mask<_Tp>())); 2719 else 2720 return __not(__x._M_data); 2721 } 2722 2723 template
<typename _Tp, size_t _Np>
2724 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2725 _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2726 { return __and(__x._M_data, __y._M_data); } 2727 2728 template
<typename _Tp, size_t _Np>
2729 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2730 _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2731 { return __or(__x._M_data, __y._M_data); } 2732 2733 template
<typename _Tp, size_t _Np>
2734 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 2735 _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y) 2736 { return __xor(__x._M_data, __y._M_data); } 2737 2738 // smart_reference access {{{2 2739 template
<typename _Tp, size_t _Np>
2740 static constexpr void 2741 _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept 2742 { 2743 if constexpr (is_same_v<_Tp, bool>) 2744 __k._M_set(__i, __x); 2745 else 2746 { 2747 static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>); 2748 if (__builtin_is_constant_evaluated()) 2749 { 2750 __k = __generate_from_n_evaluations<_Np, 2751 __vector_type_t<_Tp, _Np>>( 2752 [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2753 if (__i == static_cast
<int>
(__j)) 2754 return _Tp(-__x); 2755 else 2756 return __k[+__j]; 2757 }); 2758 } 2759 else 2760 __k._M_data[__i] = -__x; 2761 } 2762 } 2763 2764 // _S_masked_assign{{{2 2765 template
<typename _Tp, size_t _Np>
2766 _GLIBCXX_SIMD_INTRINSIC static void 2767 _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, 2768 __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs) 2769 { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); } 2770 2771 template
<typename _Tp, size_t _Np>
2772 _GLIBCXX_SIMD_INTRINSIC static void 2773 _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs) 2774 { 2775 if (__builtin_constant_p(__rhs)) 2776 { 2777 if (__rhs == false) 2778 __lhs = __andnot(__k, __lhs); 2779 else 2780 __lhs = __or(__k, __lhs); 2781 return; 2782 } 2783 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2784 __data(simd_mask<_Tp, _Abi>(__rhs))); 2785 } 2786 2787 //}}}2 2788 // _S_all_of {{{ 2789 template
<typename _Tp>
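// Usage sketch (illustrative, not part of the header): _S_all_of below and the
// other mask reductions that follow it fold a vector mask into a single bool;
// the public entry points are all_of, any_of, none_of, some_of and popcount.
#include <experimental/simd>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  const stdx::native_simd<int> v([](int i) { return i; });
  const auto nonneg = v >= 0;
  const auto big    = v > 1000;
  std::printf("all non-negative? %d  any big? %d  none big? %d\n",
              int(stdx::all_of(nonneg)), int(stdx::any_of(big)), int(stdx::none_of(big)));
}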
2790 _GLIBCXX_SIMD_INTRINSIC static bool 2791 _S_all_of(simd_mask<_Tp, _Abi> __k) 2792 { 2793 return __call_with_subscripts( 2794 __data(__k), make_index_sequence<_S_size<_Tp>>(), 2795 [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 2796 { return (... && !(__ent == 0)); }); 2797 } 2798 2799 // }}} 2800 // _S_any_of {{{ 2801 template
<typename _Tp>