The C and C++ Include Header Files
/usr/include/c++/11/experimental/bits/simd_neon.h
$ cat /usr/include/c++/11/experimental/bits/simd_neon.h

// Simd NEON specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_

#if __cplusplus >= 201703L

#if !_GLIBCXX_SIMD_HAVE_NEON
#error "simd_neon.h may only be included when NEON on ARM is available"
#endif

_GLIBCXX_SIMD_BEGIN_NAMESPACE

// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  using _CommonImplBuiltin::_S_store;

  // }}}
};

// }}}
// _SimdImplNeon {{{
template <typename _Abi, typename>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                     const _Up* __mem) noexcept
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
        });
        return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                            _MaskMember<_Tp> __k)
      {
        __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
          if (__k[__i] != 0)
            __mem[__i] = __v[__i];
        });
      }

    // }}}
    // _S_reduce {{{
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
        if (not __builtin_is_constant_evaluated())
          {
            constexpr size_t _Np = __x.size();
            if constexpr (sizeof(__x) == 16 && _Np >= 4
                            && !_Abi::template _S_is_partial<_Tp>)
              {
                const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
                const auto __y = __binary_op(__halves[0], __halves[1]);
                return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
                  __y, static_cast<_BinaryOperation&&>(__binary_op));
              }
            else if constexpr (_Np == 8)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(__x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(__x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(__x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 4)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<1, 0, 3, 2>(__x._M_data)));
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<3, 2, 1, 0>(__x._M_data)));
                return __x[0];
              }
            else if constexpr (_Np == 2)
              {
                __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                        __vector_permute<1, 0>(__x._M_data)));
                return __x[0];
              }
          }
        return _Base::_S_reduce(__x, static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_sqrt(_Tp __x)
      {
        if constexpr (__have_neon_a64)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vsqrt_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vsqrtq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vsqrt_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vsqrtq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW
      _S_trunc(_TW __x)
      {
        using _Tp = typename _TVT::value_type;
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrnd_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrnd_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else if constexpr (is_same_v<_Tp, float>)
          {
            auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(__x) == 16)
              __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
            else
              __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
            return _Base::_S_abs(__x)._M_data < 0x1p23f
                     ? __vector_bitcast<float>(__intrin)
                     : __x._M_data;
          }
        else
          return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
              return vrnda_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
              return vrndaq_f32(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
              return vrnda_f64(__intrin);
            else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
              return vrndaq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_floor(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndm_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndmq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndm_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndmq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_ceil(_Tp __x)
      {
        if constexpr (__have_neon_a32)
          {
            const auto __intrin = __to_intrin(__x);
            if constexpr (_TVT::template _S_is<float, 2>)
              return vrndp_f32(__intrin);
            else if constexpr (_TVT::template _S_is<float, 4>)
              return vrndpq_f32(__intrin);
            else if constexpr (_TVT::template _S_is<double, 1>)
              return vrndp_f64(__intrin);
            else if constexpr (_TVT::template _S_is<double, 2>)
              return vrndpq_f64(__intrin);
            else
              __assert_unreachable<_Tp>();
          }
        else
          return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}

// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
        return _Base::_S_to_bits(__x);

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
        {
          auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
          [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(
                        __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return __vector_bitcast<_UShort>(
                vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                          __zero))[0];
#else
              return __vector_bitcast<_UShort>(
                vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                  __zero),
                         __zero))[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s16(__asint);
#else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
                __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddvq_s32(__asint);
#else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 8)
            return (__asint[0] & 1) | (__asint[1] & 2);
          else
            __assert_unreachable<_Tp>();
        }
      else if constexpr (sizeof(__x) == 8)
        {
          auto __asint = __vector_bitcast<_I>(__x);
          [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
          if constexpr (sizeof(_Tp) == 1)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s8(__asint);
#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
              constexpr auto __bitsel
                = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                    [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
                      return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                    });
              __asint &= __bitsel;
#ifdef __aarch64__
              return vaddv_s16(__asint);
#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
              return vaddv_s32(__asint);
#else
              return vpadd_s32(__asint, __zero)[0];
#endif
            }
          else
            __assert_unreachable<_Tp>();
        }
      else
        return _Base::_S_to_bits(__x);
    }
};

// }}}
// _MaskImplNeon {{{
template <typename _Abi, typename>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return __x[0] + __x[1] == -2;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk
          = __vector_bitcast<char>(__k._M_data)
              | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) != 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool
      _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
        const auto __kk = _Abi::_S_masked(__k._M_data);
        if constexpr (sizeof(__k) == 16)
          {
            const auto __x = __vector_bitcast<long long>(__kk);
            return (__x[0] | __x[1]) == 0;
          }
        else if constexpr (sizeof(__k) <= 8)
          return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
        else
          __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(__k) <= 8)
          {
            const auto __kk = __vector_bitcast<char>(__k._M_data)
                                | ~__vector_bitcast<char>(
                                    _Abi::template _S_implicit_mask<_Tp>());
            using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
            return __bit_cast<_Up>(__kk) + 1 > 1;
          }
        else
          return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
        if constexpr (sizeof(_Tp) == 1)
          {
            const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
            int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
            return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
                             int8x8_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            const auto __s16 = __vector_bitcast<short>(__k._M_data);
            int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
            return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            const auto __s32 = __vector_bitcast<int>(__k._M_data);
            int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
            return -vpadd_s32(__tmp, int32x2_t())[0];
          }
        else if constexpr (sizeof(_Tp) == 8)
          {
            static_assert(sizeof(__k) == 16);
            const auto __s64 = __vector_bitcast<long long>(__k._M_data);
            return -(__s64[0] + __s64[1]);
          }
      }

    // }}}
    // _S_find_first_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
        // TODO: the _Base implementation is not optimal for NEON
        return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}

_GLIBCXX_SIMD_END_NAMESPACE
#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
// vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
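This header is an internal part of libstdc++'s std::experimental::simd (Parallelism TS 2) implementation. User code does not include it directly; it is pulled in through <experimental/simd> when compiling for ARM/AArch64 with NEON enabled. The following is a minimal usage sketch (not part of the header above) of the public API whose NEON code paths live in this file; the file name and data values are illustrative only.

// example.cc -- compile with e.g. g++-11 -std=c++17 -O2 on an AArch64 target
#include <experimental/simd>
#include <cstddef>
#include <cstdio>

namespace stdx = std::experimental;

int main()
{
  // native_simd<float> maps onto one NEON register (float32x4_t on AArch64).
  alignas(stdx::memory_alignment_v<stdx::native_simd<float>>)
    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};

  float sum = 0;
  for (std::size_t i = 0; i < 8; i += stdx::native_simd<float>::size())
    {
      stdx::native_simd<float> v;
      v.copy_from(data + i, stdx::vector_aligned);
      // On NEON targets these calls are ultimately handled by
      // _SimdImplNeon::_S_sqrt and _SimdImplNeon::_S_reduce from the header above.
      sum += stdx::reduce(stdx::sqrt(v));
    }
  std::printf("sum of square roots: %f\n", sum);
}

The mask-to-bitmask compaction in _MaskImplNeonMixin::_S_to_bits is the most NEON-specific technique in the file. The standalone sketch below (hypothetical helper name, AArch64-only intrinsics) restates the idea for 16-bit mask lanes: AND each all-ones/all-zeros lane with the bit 1 << lane, then horizontally add the lanes into a scalar bitmask.

#include <arm_neon.h>

// Each lane of mask is either 0xffff (true) or 0x0000 (false).
static inline unsigned
mask16x8_to_bits(uint16x8_t mask)
{
  const int16_t bit_values[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const int16x8_t selected
    = vandq_s16(vreinterpretq_s16_u16(mask), vld1q_s16(bit_values));
  return static_cast<unsigned>(vaddvq_s16(selected)); // sum of the kept bits
}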