The C and C++ Include Header Files
/usr/include/c++/11/experimental/bits/simd_neon.h
$ cat /usr/include/c++/11/experimental/bits/simd_neon.h

// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi, typename>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        if (not __builtin_is_constant_evaluated())
          {
            constexpr size_t _Np = __x.size();
            if constexpr (sizeof(__x) == 16 && _Np >= 4
                            && !_Abi::template _S_is_partial<_Tp>)
              {
                const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
                const auto __y = __binary_op(__halves[0], __halves[1]);
                return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
                  __y, static_cast<_BinaryOperation&&>(__binary_op));
              }
            else if constexpr (_Np == 8)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(__x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(__x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(__x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 4)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<1, 0, 3, 2>(__x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<3, 2, 1, 0>(__x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 2)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<1, 0>(__x._M_data)));
                return __x[0];
              }
          }
        return _Base::_S_reduce(__x, static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW
      _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}

// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(
                        __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                          __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                  __zero),
                         __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};

// }}}
// _MaskImplNeon {{{
template <typename _Abi, typename>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                                | ~__vector_bitcast<char>(
                                    _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
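This header is an internal part of libstdc++'s std::experimental::simd (Parallelism TS 2) implementation. User code does not include it directly; it is pulled in through <experimental/simd> when compiling for ARM/AArch64 with NEON enabled. The following is a minimal usage sketch (not part of the header above) of the public API whose NEON code paths live in this file; the file name and data values are illustrative only.

// example.cc -- compile with e.g. g++-11 -std=c++17 -O2 on an AArch64 target
#include <experimental/simd>
#include <cstddef>
#include <cstdio>

namespace stdx = std::experimental;

int main()
{
  // native_simd<float> maps onto one NEON register (float32x4_t on AArch64).
  alignas(stdx::memory_alignment_v<stdx::native_simd<float>>)
    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};

  float sum = 0;
  for (std::size_t i = 0; i < 8; i += stdx::native_simd<float>::size())
    {
      stdx::native_simd<float> v;
      v.copy_from(data + i, stdx::vector_aligned);
      // On NEON targets these calls are ultimately handled by
      // _SimdImplNeon::_S_sqrt and _SimdImplNeon::_S_reduce from the header above.
      sum += stdx::reduce(stdx::sqrt(v));
    }
  std::printf("sum of square roots: %f\n", sum);
}

The mask-to-bitmask compaction in _MaskImplNeonMixin::_S_to_bits is the most NEON-specific technique in the file. The standalone sketch below (hypothetical helper name, AArch64-only intrinsics) restates the idea for 16-bit mask lanes: AND each all-ones/all-zeros lane with the bit 1 << lane, then horizontally add the lanes into a scalar bitmask.

#include <arm_neon.h>

// Each lane of mask is either 0xffff (true) or 0x0000 (false).
static inline unsigned
mask16x8_to_bits(uint16x8_t mask)
{
  const int16_t bit_values[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const int16x8_t selected
    = vandq_s16(vreinterpretq_s16_u16(mask), vld1q_s16(bit_values));
  return static_cast<unsigned>(vaddvq_s16(selected)); // sum of the kept bits
}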