The C and C++ Include Header Files
/usr/include/c++/11/experimental/bits/simd_builtin.h
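This is an internal implementation header of libstdc++'s std::experimental::simd support (Parallelism TS v2). It is pulled in by <experimental/simd> and is not meant to be included directly. For orientation, the short program below is a minimal sketch (not taken from the header) of the public API whose ABI-specific pieces (simd_abi::_VecBuiltin, simd_abi::_VecBltnBtmsk, _SimdImplBuiltin and related classes) are implemented here; it assumes GCC 11 or later compiled with -std=c++17.

// sketch.cpp: element-wise arithmetic through the public simd API
#include <experimental/simd>
#include <cstdio>

namespace stdx = std::experimental;

int main()
{
  // native_simd<float> picks an ABI tag such as simd_abi::_VecBuiltin or
  // simd_abi::_VecBltnBtmsk, whose operations are defined in this header.
  stdx::native_simd<float> a([](int i) { return float(i); }); // 0, 1, 2, ...
  stdx::native_simd<float> b = 2.0f;                          // broadcast
  const auto c = a * b + 1.0f;  // element-wise ops, cf. _S_multiplies/_S_plus below
  for (std::size_t i = 0; i < c.size(); ++i)
    std::printf("%g ", c[i]);
  std::printf("\n");
}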
$ cat /usr/include/c++/11/experimental/bits/simd_builtin.h
// Simd Abi specific implementations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
#define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_

#if __cplusplus >= 201703L

#include <array>
#include <cmath>
#include <cstdlib>

_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _S_allbits{{{
template <typename _V>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
    = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V)>());

// }}}
// _S_signmask, _S_absmask{{{
template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
    = __xor(_V() + 1, _V() - 1);

template <typename _V, typename = _VectorTraits<_V>>
  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
    = __andnot(_S_signmask<_V>, _S_allbits<_V>);

//}}}
// __vector_permute<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  constexpr _Tp
  __vector_permute(_Tp __x)
  {
    static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
    return __make_vector<typename _TVT::value_type>(
             (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
  }

// }}}
// __vector_shuffle<Indices...>{{{
// Index == -1 requests zeroing of the output element
template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  constexpr _Tp
  __vector_shuffle(_Tp __x, _Tp __y)
  {
    return _Tp{(_Indices == -1 ? 0
                : _Indices < _TVT::_S_full_size
                  ? __x[_Indices]
                  : __y[_Indices - _TVT::_S_full_size])...};
  }

// }}}
// __make_wrapper{{{
template <typename _Tp, typename... _Args>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
  __make_wrapper(const _Args&... __args)
  { return __make_vector<_Tp>(__args...); }

// }}}
// __wrapper_bitcast{{{
template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
          size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
  {
    static_assert(_Np > 1);
    return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
  }

// }}}
// __shift_elements_right{{{
// if (__shift % 2ⁿ == 0) => the low n Bytes are correct
template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>
> 97 _GLIBCXX_SIMD_INTRINSIC _Tp 98 __shift_elements_right(_Tp __v) 99 { 100 [[maybe_unused]] const auto __iv = __to_intrin(__v); 101 static_assert(__shift <= sizeof(_Tp)); 102 if constexpr (__shift == 0) 103 return __v; 104 else if constexpr (__shift == sizeof(_Tp)) 105 return _Tp(); 106 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 107 else if constexpr (__have_sse && __shift == 8 108 && _TVT::template _S_is
) 109 return _mm_movehl_ps(__iv, __iv); 110 else if constexpr (__have_sse2 && __shift == 8 111 && _TVT::template _S_is
) 112 return _mm_unpackhi_pd(__iv, __iv); 113 else if constexpr (__have_sse2 && sizeof(_Tp) == 16) 114 return reinterpret_cast
( 115 _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift)); 116 else if constexpr (__shift == 16 && sizeof(_Tp) == 32) 117 { 118 /*if constexpr (__have_avx && _TVT::template _S_is
) 119 return _mm256_permute2f128_pd(__iv, __iv, 0x81); 120 else if constexpr (__have_avx && _TVT::template _S_is
) 121 return _mm256_permute2f128_ps(__iv, __iv, 0x81); 122 else if constexpr (__have_avx) 123 return reinterpret_cast
( 124 _mm256_permute2f128_si256(__iv, __iv, 0x81)); 125 else*/ 126 return __zero_extend(__hi128(__v)); 127 } 128 else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16) 129 { 130 const auto __vll = __vector_bitcast<_LLong>(__v); 131 return reinterpret_cast
( 132 _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81), 133 __vll, __shift)); 134 } 135 else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16) 136 { 137 const auto __vll = __vector_bitcast<_LLong>(__v); 138 return reinterpret_cast
( 139 __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift), 140 _mm_srli_si128(__hi128(__vll), __shift))); 141 } 142 else if constexpr (sizeof(_Tp) == 32 && __shift > 16) 143 return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v))); 144 else if constexpr (sizeof(_Tp) == 64 && __shift == 32) 145 return __zero_extend(__hi256(__v)); 146 else if constexpr (__have_avx512f && sizeof(_Tp) == 64) 147 { 148 if constexpr (__shift >= 48) 149 return __zero_extend( 150 __shift_elements_right<__shift - 48>(__extract<3, 4>(__v))); 151 else if constexpr (__shift >= 32) 152 return __zero_extend( 153 __shift_elements_right<__shift - 32>(__hi256(__v))); 154 else if constexpr (__shift % 8 == 0) 155 return reinterpret_cast
( 156 _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v), 157 __shift / 8)); 158 else if constexpr (__shift % 4 == 0) 159 return reinterpret_cast
( 160 _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v), 161 __shift / 4)); 162 else if constexpr (__have_avx512bw && __shift < 16) 163 { 164 const auto __vll = __vector_bitcast<_LLong>(__v); 165 return reinterpret_cast
( 166 _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9), 167 __vll, __shift)); 168 } 169 else if constexpr (__have_avx512bw && __shift < 32) 170 { 171 const auto __vll = __vector_bitcast<_LLong>(__v); 172 return reinterpret_cast
( 173 _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee), 174 _mm512_shuffle_i32x4(__vll, __vll, 0xf9), 175 __shift - 16)); 176 } 177 else 178 __assert_unreachable<_Tp>(); 179 } 180 /* 181 } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64) 182 return __auto_bitcast(__extract<__shift / 16, 4>(__v)); 183 */ 184 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 185 else 186 { 187 constexpr int __chunksize = __shift % 8 == 0 ? 8 188 : __shift % 4 == 0 ? 4 189 : __shift % 2 == 0 ? 2 190 : 1; 191 auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v); 192 using _Up = decltype(__w); 193 return __intrin_bitcast<_Tp>( 194 __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>( 195 [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 196 return _Up{__chunks...}; 197 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 198 return __w[__shift / __chunksize + __i]; 199 })); 200 } 201 } 202 203 // }}} 204 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{ 205 template
206 _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr 207 _SimdWrapper<_Tp, _Np / _Total * _Combine> 208 __extract_part(const _SimdWrapper<_Tp, _Np> __x) 209 { 210 if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0) 211 return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x); 212 else 213 { 214 constexpr size_t __values_per_part = _Np / _Total; 215 constexpr size_t __values_to_skip = _Index * __values_per_part; 216 constexpr size_t __return_size = __values_per_part * _Combine; 217 using _R = __vector_type_t<_Tp, __return_size>; 218 static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp) 219 <= sizeof(__x), 220 "out of bounds __extract_part"); 221 // the following assertion would ensure no "padding" to be read 222 // static_assert(_Total >= _Index + _Combine, "_Total must be greater 223 // than _Index"); 224 225 // static_assert(__return_size * _Total == _Np, "_Np must be divisible 226 // by _Total"); 227 if (__x._M_is_constprop()) 228 return __generate_from_n_evaluations<__return_size, _R>( 229 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 230 return __x[__values_to_skip + __i]; 231 }); 232 if constexpr (_Index == 0 && _Total == 1) 233 return __x; 234 else if constexpr (_Index == 0) 235 return __intrin_bitcast<_R>(__as_vector(__x)); 236 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 237 else if constexpr (sizeof(__x) == 32 238 && __return_size * sizeof(_Tp) <= 16) 239 { 240 constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp); 241 if constexpr (__bytes_to_skip == 16) 242 return __vector_bitcast<_Tp, __return_size>( 243 __hi128(__as_vector(__x))); 244 else 245 return __vector_bitcast<_Tp, __return_size>( 246 _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)), 247 __lo128(__vector_bitcast<_LLong>(__x)), 248 __bytes_to_skip)); 249 } 250 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 251 else if constexpr (_Index > 0 252 && (__values_to_skip % __return_size != 0 253 || sizeof(_R) >= 8) 254 && (__values_to_skip + __return_size) * sizeof(_Tp) 255 <= 64 256 && sizeof(__x) >= 16) 257 return __intrin_bitcast<_R>( 258 __shift_elements_right<__values_to_skip * sizeof(_Tp)>( 259 __as_vector(__x))); 260 else 261 { 262 _R __r = {}; 263 __builtin_memcpy(&__r, 264 reinterpret_cast
(&__x) 265 + sizeof(_Tp) * __values_to_skip, 266 __return_size * sizeof(_Tp)); 267 return __r; 268 } 269 } 270 } 271 272 // }}} 273 // __extract_part(_SimdWrapper
) {{{ 274 template
275 _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper
276 __extract_part(const _SimdWrapper
__x) 277 { 278 static_assert(_Combine == 1, "_Combine != 1 not implemented"); 279 static_assert(__have_avx512f && _Np == _Np); 280 static_assert(_Total >= 2 && _Index + _Combine <= _Total && _Index >= 0); 281 return __x._M_data >> (_Index * _Np / _Total); 282 } 283 284 // }}} 285 286 // __vector_convert {{{ 287 // implementation requires an index sequence 288 template
289 _GLIBCXX_SIMD_INTRINSIC constexpr _To 290 __vector_convert(_From __a, index_sequence<_I...>) 291 { 292 using _Tp = typename _VectorTraits<_To>::value_type; 293 return _To{static_cast<_Tp>(__a[_I])...}; 294 } 295 296 template
297 _GLIBCXX_SIMD_INTRINSIC constexpr _To 298 __vector_convert(_From __a, _From __b, index_sequence<_I...>) 299 { 300 using _Tp = typename _VectorTraits<_To>::value_type; 301 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...}; 302 } 303 304 template
305 _GLIBCXX_SIMD_INTRINSIC constexpr _To 306 __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>) 307 { 308 using _Tp = typename _VectorTraits<_To>::value_type; 309 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 310 static_cast<_Tp>(__c[_I])...}; 311 } 312 313 template
314 _GLIBCXX_SIMD_INTRINSIC constexpr _To 315 __vector_convert(_From __a, _From __b, _From __c, _From __d, 316 index_sequence<_I...>) 317 { 318 using _Tp = typename _VectorTraits<_To>::value_type; 319 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 320 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...}; 321 } 322 323 template
324 _GLIBCXX_SIMD_INTRINSIC constexpr _To 325 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 326 index_sequence<_I...>) 327 { 328 using _Tp = typename _VectorTraits<_To>::value_type; 329 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 330 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 331 static_cast<_Tp>(__e[_I])...}; 332 } 333 334 template
335 _GLIBCXX_SIMD_INTRINSIC constexpr _To 336 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 337 _From __f, index_sequence<_I...>) 338 { 339 using _Tp = typename _VectorTraits<_To>::value_type; 340 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 341 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 342 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...}; 343 } 344 345 template
346 _GLIBCXX_SIMD_INTRINSIC constexpr _To 347 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 348 _From __f, _From __g, index_sequence<_I...>) 349 { 350 using _Tp = typename _VectorTraits<_To>::value_type; 351 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 352 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 353 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 354 static_cast<_Tp>(__g[_I])...}; 355 } 356 357 template
358 _GLIBCXX_SIMD_INTRINSIC constexpr _To 359 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 360 _From __f, _From __g, _From __h, index_sequence<_I...>) 361 { 362 using _Tp = typename _VectorTraits<_To>::value_type; 363 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 364 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 365 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 366 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...}; 367 } 368 369 template
370 _GLIBCXX_SIMD_INTRINSIC constexpr _To 371 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 372 _From __f, _From __g, _From __h, _From __i, 373 index_sequence<_I...>) 374 { 375 using _Tp = typename _VectorTraits<_To>::value_type; 376 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 377 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 378 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 379 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 380 static_cast<_Tp>(__i[_I])...}; 381 } 382 383 template
384 _GLIBCXX_SIMD_INTRINSIC constexpr _To 385 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 386 _From __f, _From __g, _From __h, _From __i, _From __j, 387 index_sequence<_I...>) 388 { 389 using _Tp = typename _VectorTraits<_To>::value_type; 390 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 391 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 392 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 393 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 394 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...}; 395 } 396 397 template
398 _GLIBCXX_SIMD_INTRINSIC constexpr _To 399 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 400 _From __f, _From __g, _From __h, _From __i, _From __j, 401 _From __k, index_sequence<_I...>) 402 { 403 using _Tp = typename _VectorTraits<_To>::value_type; 404 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 405 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 406 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 407 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 408 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 409 static_cast<_Tp>(__k[_I])...}; 410 } 411 412 template
413 _GLIBCXX_SIMD_INTRINSIC constexpr _To 414 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 415 _From __f, _From __g, _From __h, _From __i, _From __j, 416 _From __k, _From __l, index_sequence<_I...>) 417 { 418 using _Tp = typename _VectorTraits<_To>::value_type; 419 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 420 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 421 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 422 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 423 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 424 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...}; 425 } 426 427 template
428 _GLIBCXX_SIMD_INTRINSIC constexpr _To 429 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 430 _From __f, _From __g, _From __h, _From __i, _From __j, 431 _From __k, _From __l, _From __m, index_sequence<_I...>) 432 { 433 using _Tp = typename _VectorTraits<_To>::value_type; 434 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 435 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 436 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 437 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 438 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 439 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 440 static_cast<_Tp>(__m[_I])...}; 441 } 442 443 template
444 _GLIBCXX_SIMD_INTRINSIC constexpr _To 445 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 446 _From __f, _From __g, _From __h, _From __i, _From __j, 447 _From __k, _From __l, _From __m, _From __n, 448 index_sequence<_I...>) 449 { 450 using _Tp = typename _VectorTraits<_To>::value_type; 451 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 452 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 453 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 454 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 455 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 456 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 457 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...}; 458 } 459 460 template
461 _GLIBCXX_SIMD_INTRINSIC constexpr _To 462 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 463 _From __f, _From __g, _From __h, _From __i, _From __j, 464 _From __k, _From __l, _From __m, _From __n, _From __o, 465 index_sequence<_I...>) 466 { 467 using _Tp = typename _VectorTraits<_To>::value_type; 468 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 469 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 470 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 471 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 472 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 473 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 474 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 475 static_cast<_Tp>(__o[_I])...}; 476 } 477 478 template
479 _GLIBCXX_SIMD_INTRINSIC constexpr _To 480 __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e, 481 _From __f, _From __g, _From __h, _From __i, _From __j, 482 _From __k, _From __l, _From __m, _From __n, _From __o, 483 _From __p, index_sequence<_I...>) 484 { 485 using _Tp = typename _VectorTraits<_To>::value_type; 486 return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])..., 487 static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])..., 488 static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])..., 489 static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])..., 490 static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])..., 491 static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])..., 492 static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])..., 493 static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...}; 494 } 495 496 // Defer actual conversion to the overload that takes an index sequence. Note 497 // that this function adds zeros or drops values off the end if you don't ensure 498 // matching width. 499 template
500 _GLIBCXX_SIMD_INTRINSIC constexpr _To 501 __vector_convert(_SimdWrapper<_From, _FromSize>... __xs) 502 { 503 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048 504 using _From0 = __first_of_pack_t<_From...>; 505 using _FW = _SimdWrapper<_From0, _FromSize>; 506 if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop())) 507 { 508 if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1)) 509 == 0) // power-of-two number of arguments 510 return __convert_x86<_To>(__as_vector(__xs)...); 511 else // append zeros and recurse until the above branch is taken 512 return __vector_convert<_To>(__xs..., _FW{}); 513 } 514 else 515 #endif 516 return __vector_convert<_To>( 517 __as_vector(__xs)..., 518 make_index_sequence<(sizeof...(__xs) == 1 ? std::min( 519 _VectorTraits<_To>::_S_full_size, int(_FromSize)) 520 : _FromSize)>()); 521 } 522 523 // }}} 524 // __convert function{{{ 525 template
526 _GLIBCXX_SIMD_INTRINSIC constexpr auto 527 __convert(_From __v0, _More... __vs) 528 { 529 static_assert((true && ... && is_same_v<_From, _More>) ); 530 if constexpr (__is_vectorizable_v<_From>) 531 { 532 using _V = typename _VectorTraits<_To>::type; 533 using _Tp = typename _VectorTraits<_To>::value_type; 534 return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...}; 535 } 536 else if constexpr (__is_vector_type_v<_From>) 537 return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...); 538 else // _SimdWrapper arguments 539 { 540 constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More)); 541 if constexpr (__is_vectorizable_v<_To>) 542 return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...); 543 else if constexpr (!__is_vector_type_v<_To>) 544 return _To(__convert
(__v0, __vs...)); 545 else 546 { 547 static_assert( 548 sizeof...(_More) == 0 549 || _VectorTraits<_To>::_S_full_size >= __input_size, 550 "__convert(...) requires the input to fit into the output"); 551 return __vector_convert<_To>(__v0, __vs...); 552 } 553 } 554 } 555 556 // }}} 557 // __convert_all{{{ 558 // Converts __v into array<_To, N>, where N is _NParts if non-zero or 559 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v). 560 // Note: this function may return less than all converted elements 561 template
> 567 _GLIBCXX_SIMD_INTRINSIC auto 568 __convert_all(_From __v) 569 { 570 if constexpr (is_arithmetic_v<_To> && _NParts != 1) 571 { 572 static_assert(_Offset < _FromVT::_S_full_size); 573 constexpr auto _Np 574 = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts; 575 return __generate_from_n_evaluations<_Np, array<_To, _Np>>( 576 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 577 return static_cast<_To>(__v[__i + _Offset]); 578 }); 579 } 580 else 581 { 582 static_assert(__is_vector_type_v<_To>); 583 using _ToVT = _VectorTraits<_To>; 584 if constexpr (__is_vector_type_v<_From>) 585 return __convert_all<_To, _NParts>(__as_wrapper(__v)); 586 else if constexpr (_NParts == 1) 587 { 588 static_assert(_Offset % _ToVT::_S_full_size == 0); 589 return array<_To, 1>{__vector_convert<_To>( 590 __extract_part<_Offset / _ToVT::_S_full_size, 591 __div_roundup(_FromVT::_S_partial_width, 592 _ToVT::_S_full_size)>(__v))}; 593 } 594 #if _GLIBCXX_SIMD_X86INTRIN // {{{ 595 else if constexpr (!__have_sse4_1 && _Offset == 0 596 && is_integral_v
597 && sizeof(typename _FromVT::value_type) 598 < sizeof(typename _ToVT::value_type) 599 && !(sizeof(typename _FromVT::value_type) == 4 600 && is_same_v
)) 601 { 602 using _ToT = typename _ToVT::value_type; 603 using _FromT = typename _FromVT::value_type; 604 constexpr size_t _Np 605 = _NParts != 0 606 ? _NParts 607 : (_FromVT::_S_partial_width / _ToVT::_S_full_size); 608 using _R = array<_To, _Np>; 609 // __adjust modifies its input to have _Np (use _SizeConstant) 610 // entries so that no unnecessary intermediate conversions are 611 // requested and, more importantly, no intermediate conversions are 612 // missing 613 [[maybe_unused]] auto __adjust 614 = [](auto __n, 615 auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> { 616 return __vector_bitcast<_FromT, decltype(__n)::value>(__vv); 617 }; 618 [[maybe_unused]] const auto __vi = __to_intrin(__v); 619 auto&& __make_array 620 = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 621 if constexpr (_Np == 1) 622 return _R{__intrin_bitcast<_To>(__x0)}; 623 else 624 return _R{__intrin_bitcast<_To>(__x0), 625 __intrin_bitcast<_To>(__x1)}; 626 }; 627 628 if constexpr (_Np == 0) 629 return _R{}; 630 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2) 631 { 632 static_assert(is_integral_v<_FromT>); 633 static_assert(is_integral_v<_ToT>); 634 if constexpr (is_unsigned_v<_FromT>) 635 return __make_array(_mm_unpacklo_epi8(__vi, __m128i()), 636 _mm_unpackhi_epi8(__vi, __m128i())); 637 else 638 return __make_array( 639 _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8), 640 _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8)); 641 } 642 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4) 643 { 644 static_assert(is_integral_v<_FromT>); 645 if constexpr (is_floating_point_v<_ToT>) 646 { 647 const auto __ints 648 = __convert_all<__vector_type16_t
, _Np>( 649 __adjust(_SizeConstant<_Np * 4>(), __v)); 650 return __generate_from_n_evaluations<_Np, _R>( 651 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 652 return __vector_convert<_To>(__as_wrapper(__ints[__i])); 653 }); 654 } 655 else if constexpr (is_unsigned_v<_FromT>) 656 return __make_array(_mm_unpacklo_epi16(__vi, __m128i()), 657 _mm_unpackhi_epi16(__vi, __m128i())); 658 else 659 return __make_array( 660 _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16), 661 _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16)); 662 } 663 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 664 && is_integral_v<_FromT> && is_integral_v<_ToT>) 665 { 666 if constexpr (is_unsigned_v<_FromT>) 667 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 668 _mm_unpackhi_epi32(__vi, __m128i())); 669 else 670 return __make_array( 671 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 672 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 673 } 674 else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8 675 && is_integral_v<_FromT> && is_integral_v<_ToT>) 676 { 677 if constexpr (is_unsigned_v<_FromT>) 678 return __make_array(_mm_unpacklo_epi32(__vi, __m128i()), 679 _mm_unpackhi_epi32(__vi, __m128i())); 680 else 681 return __make_array( 682 _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)), 683 _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31))); 684 } 685 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4 686 && is_signed_v<_FromT>) 687 { 688 const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi), 689 _mm_unpackhi_epi8(__vi, __vi)}; 690 const __vector_type_t
__vvvv[4] = { 691 __vector_bitcast
(_mm_unpacklo_epi16(__vv[0], __vv[0])), 692 __vector_bitcast
(_mm_unpackhi_epi16(__vv[0], __vv[0])), 693 __vector_bitcast
(_mm_unpacklo_epi16(__vv[1], __vv[1])), 694 __vector_bitcast
(_mm_unpackhi_epi16(__vv[1], __vv[1]))}; 695 if constexpr (sizeof(_ToT) == 4) 696 return __generate_from_n_evaluations<_Np, _R>( 697 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 698 return __vector_convert<_To>( 699 _SimdWrapper
(__vvvv[__i] >> 24)); 700 }); 701 else if constexpr (is_integral_v<_ToT>) 702 return __generate_from_n_evaluations<_Np, _R>( 703 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 704 const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31); 705 const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24); 706 return __vector_bitcast<_ToT>( 707 __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits) 708 : _mm_unpackhi_epi32(__sx32, __signbits)); 709 }); 710 else 711 return __generate_from_n_evaluations<_Np, _R>( 712 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 713 const _SimdWrapper
__int4 = __vvvv[__i / 2] >> 24; 714 return __vector_convert<_To>( 715 __i % 2 == 0 ? __int4 716 : _SimdWrapper
( 717 _mm_unpackhi_epi64(__to_intrin(__int4), 718 __to_intrin(__int4)))); 719 }); 720 } 721 else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4) 722 { 723 const auto __shorts = __convert_all<__vector_type16_t< 724 conditional_t
, short, unsigned short>>>( 725 __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v)); 726 return __generate_from_n_evaluations<_Np, _R>( 727 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 728 return __convert_all<_To>(__shorts[__i / 2])[__i % 2]; 729 }); 730 } 731 else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8 732 && is_signed_v<_FromT> && is_integral_v<_ToT>) 733 { 734 const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi), 735 _mm_unpackhi_epi16(__vi, __vi)}; 736 const __vector_type16_t
__vvvv[4] 737 = {__vector_bitcast
( 738 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16), 739 _mm_srai_epi32(__vv[0], 31))), 740 __vector_bitcast
( 741 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16), 742 _mm_srai_epi32(__vv[0], 31))), 743 __vector_bitcast
( 744 _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16), 745 _mm_srai_epi32(__vv[1], 31))), 746 __vector_bitcast
( 747 _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16), 748 _mm_srai_epi32(__vv[1], 31)))}; 749 return __generate_from_n_evaluations<_Np, _R>( 750 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 751 return __vector_bitcast<_ToT>(__vvvv[__i]); 752 }); 753 } 754 else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8) 755 { 756 const auto __ints 757 = __convert_all<__vector_type16_t
|| is_floating_point_v<_ToT>, int, 759 unsigned int>>>( 760 __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v)); 761 return __generate_from_n_evaluations<_Np, _R>( 762 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 763 return __convert_all<_To>(__ints[__i / 2])[__i % 2]; 764 }); 765 } 766 else 767 __assert_unreachable<_To>(); 768 } 769 #endif // _GLIBCXX_SIMD_X86INTRIN }}} 770 else if constexpr ((_FromVT::_S_partial_width - _Offset) 771 > _ToVT::_S_full_size) 772 { 773 /* 774 static_assert( 775 (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) == 776 0, 777 "__convert_all only supports power-of-2 number of elements. 778 Otherwise " "the return type cannot be array<_To, N>."); 779 */ 780 constexpr size_t _NTotal 781 = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size; 782 constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts; 783 static_assert( 784 _Np <= _NTotal 785 || (_Np == _NTotal + 1 786 && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size 787 > 0)); 788 using _R = array<_To, _Np>; 789 if constexpr (_Np == 1) 790 return _R{__vector_convert<_To>( 791 __extract_part<_Offset, _FromVT::_S_partial_width, 792 _ToVT::_S_full_size>(__v))}; 793 else 794 return __generate_from_n_evaluations<_Np, _R>( 795 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 796 auto __part 797 = __extract_part<__i * _ToVT::_S_full_size + _Offset, 798 _FromVT::_S_partial_width, 799 _ToVT::_S_full_size>(__v); 800 return __vector_convert<_To>(__part); 801 }); 802 } 803 else if constexpr (_Offset == 0) 804 return array<_To, 1>{__vector_convert<_To>(__v)}; 805 else 806 return array<_To, 1>{__vector_convert<_To>( 807 __extract_part<_Offset, _FromVT::_S_partial_width, 808 _FromVT::_S_partial_width - _Offset>(__v))}; 809 } 810 } 811 812 // }}} 813 814 // _GnuTraits {{{ 815 template
816 struct _GnuTraits 817 { 818 using _IsValid = true_type; 819 using _SimdImpl = typename _Abi::_SimdImpl; 820 using _MaskImpl = typename _Abi::_MaskImpl; 821 822 // simd and simd_mask member types {{{ 823 using _SimdMember = _SimdWrapper<_Tp, _Np>; 824 using _MaskMember = _SimdWrapper<_Mp, _Np>; 825 static constexpr size_t _S_simd_align = alignof(_SimdMember); 826 static constexpr size_t _S_mask_align = alignof(_MaskMember); 827 828 // }}} 829 // size metadata {{{ 830 static constexpr size_t _S_full_size = _SimdMember::_S_full_size; 831 static constexpr bool _S_is_partial = _SimdMember::_S_is_partial; 832 833 // }}} 834 // _SimdBase / base class for simd, providing extra conversions {{{ 835 struct _SimdBase2 836 { 837 explicit 838 operator __intrinsic_type_t<_Tp, _Np>() const 839 { return __to_intrin(static_cast
*>(this)->_M_data); } 840 841 explicit 842 operator __vector_type_t<_Tp, _Np>() const 843 { return static_cast
*>(this)->_M_data.__builtin(); } 844 }; 845 846 struct _SimdBase1 847 { 848 explicit 849 operator __intrinsic_type_t<_Tp, _Np>() const 850 { return __data(*static_cast
*>(this)); } 851 }; 852 853 using _SimdBase = conditional_t< 854 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 855 _SimdBase1, _SimdBase2>; 856 857 // }}} 858 // _MaskBase {{{ 859 struct _MaskBase2 860 { 861 explicit 862 operator __intrinsic_type_t<_Tp, _Np>() const 863 { return static_cast
*>(this) ->_M_data.__intrin(); } 864 865 explicit 866 operator __vector_type_t<_Tp, _Np>() const 867 { return static_cast
*>(this)->_M_data._M_data; } 868 }; 869 870 struct _MaskBase1 871 { 872 explicit 873 operator __intrinsic_type_t<_Tp, _Np>() const 874 { return __data(*static_cast
*>(this)); } 875 }; 876 877 using _MaskBase = conditional_t< 878 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 879 _MaskBase1, _MaskBase2>; 880 881 // }}} 882 // _MaskCastType {{{ 883 // parameter type of one explicit simd_mask constructor 884 class _MaskCastType 885 { 886 using _Up = __intrinsic_type_t<_Tp, _Np>; 887 _Up _M_data; 888 889 public: 890 _MaskCastType(_Up __x) : _M_data(__x) {} 891 892 operator _MaskMember() const { return _M_data; } 893 }; 894 895 // }}} 896 // _SimdCastType {{{ 897 // parameter type of one explicit simd constructor 898 class _SimdCastType1 899 { 900 using _Ap = __intrinsic_type_t<_Tp, _Np>; 901 _SimdMember _M_data; 902 903 public: 904 constexpr 905 _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 906 907 constexpr 908 operator _SimdMember() const { return _M_data; } 909 }; 910 911 class _SimdCastType2 912 { 913 using _Ap = __intrinsic_type_t<_Tp, _Np>; 914 using _Bp = __vector_type_t<_Tp, _Np>; 915 _SimdMember _M_data; 916 917 public: 918 constexpr 919 _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {} 920 921 constexpr 922 _SimdCastType2(_Bp __b) : _M_data(__b) {} 923 924 constexpr 925 operator _SimdMember() const { return _M_data; } 926 }; 927 928 using _SimdCastType = conditional_t< 929 is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value, 930 _SimdCastType1, _SimdCastType2>; 931 //}}} 932 }; 933 934 // }}} 935 struct _CommonImplX86; 936 struct _CommonImplNeon; 937 struct _CommonImplBuiltin; 938 template
<typename _Abi> struct _SimdImplBuiltin;
template <typename _Abi> struct _MaskImplBuiltin;
template <typename _Abi> struct _SimdImplX86;
template <typename _Abi> struct _MaskImplX86;
template <typename _Abi> struct _SimdImplNeon;
template <typename _Abi> struct _MaskImplNeon;
template <typename _Abi> struct _SimdImplPpc;
template <typename _Abi> struct _MaskImplPpc;

// simd_abi::_VecBuiltin {{{
template <int _UsedBytes>
  struct simd_abi::_VecBuiltin
  {
    template <typename _Tp>
952 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 953 954 // validity traits {{{ 955 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 956 957 template
958 struct _IsValidSizeFor 959 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 960 && _UsedBytes % sizeof(_Tp) == 0 961 && _UsedBytes <= __vectorized_sizeof<_Tp>() 962 && (!__have_avx512f || _UsedBytes <= 32))> {}; 963 964 template
965 struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>, 966 _IsValidSizeFor<_Tp>> {}; 967 968 template
969 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 970 971 // }}} 972 // _SimdImpl/_MaskImpl {{{ 973 #if _GLIBCXX_SIMD_X86INTRIN 974 using _CommonImpl = _CommonImplX86; 975 using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>; 976 using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>; 977 #elif _GLIBCXX_SIMD_HAVE_NEON 978 using _CommonImpl = _CommonImplNeon; 979 using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>; 980 using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>; 981 #else 982 using _CommonImpl = _CommonImplBuiltin; 983 #ifdef __ALTIVEC__ 984 using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>; 985 using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>; 986 #else 987 using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>; 988 using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>; 989 #endif 990 #endif 991 992 // }}} 993 // __traits {{{ 994 template
995 using _MaskValueType = __int_for_sizeof_t<_Tp>; 996 997 template
998 using __traits 999 = conditional_t<_S_is_valid_v<_Tp>, 1000 _GnuTraits<_Tp, _MaskValueType<_Tp>, 1001 _VecBuiltin<_UsedBytes>, _S_size<_Tp>>, 1002 _InvalidTraits>; 1003 1004 //}}} 1005 // size metadata {{{ 1006 template
1007 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 1008 1009 template
1010 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 1011 1012 // }}} 1013 // implicit masks {{{ 1014 template
1015 using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>; 1016 1017 template
1018 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 1019 _S_implicit_mask() 1020 { 1021 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 1022 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 1023 return ~_UV(); 1024 else 1025 { 1026 constexpr auto __size = _S_size<_Tp>; 1027 _GLIBCXX_SIMD_USE_CONSTEXPR auto __r 1028 = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA 1029 { return __i < __size ? -1 : 0; }); 1030 return __r; 1031 } 1032 } 1033 1034 template
1035 _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>> 1036 _S_implicit_mask_intrin() 1037 { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); } 1038 1039 template
> 1040 _GLIBCXX_SIMD_INTRINSIC static constexpr _TW 1041 _S_masked(_TW __x) 1042 { 1043 using _Tp = typename _TVT::value_type; 1044 if constexpr (!_MaskMember<_Tp>::_S_is_partial) 1045 return __x; 1046 else 1047 return __and(__as_vector(__x), 1048 __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>())); 1049 } 1050 1051 template
> 1052 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1053 __make_padding_nonzero(_TW __x) 1054 { 1055 using _Tp = typename _TVT::value_type; 1056 if constexpr (!_S_is_partial<_Tp>) 1057 return __x; 1058 else 1059 { 1060 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask 1061 = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()); 1062 if constexpr (is_integral_v<_Tp>) 1063 return __or(__x, ~__implicit_mask); 1064 else 1065 { 1066 _GLIBCXX_SIMD_USE_CONSTEXPR auto __one 1067 = __andnot(__implicit_mask, 1068 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))); 1069 // it's not enough to return `x | 1_in_padding` because the 1070 // padding in x might be inf or nan (independent of 1071 // __FINITE_MATH_ONLY__, because it's about padding bits) 1072 return __or(__and(__x, __implicit_mask), __one); 1073 } 1074 } 1075 } 1076 // }}} 1077 }; 1078 1079 // }}} 1080 // simd_abi::_VecBltnBtmsk {{{ 1081 template
1082 struct simd_abi::_VecBltnBtmsk 1083 { 1084 template
1085 static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); 1086 1087 // validity traits {{{ 1088 struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {}; 1089 1090 template
1091 struct _IsValidSizeFor 1092 : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 1093 && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64 1094 && (_UsedBytes > 32 || __have_avx512vl))> {}; 1095 1096 // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4 the AVX512BW is also 1097 // required. 1098 template
1099 struct _IsValid 1100 : conjunction< 1101 _IsValidAbiTag, __bool_constant<__have_avx512f>, 1102 __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>, 1103 __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>, 1104 _IsValidSizeFor<_Tp>> {}; 1105 1106 template
1107 static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; 1108 1109 // }}} 1110 // simd/_MaskImpl {{{ 1111 #if _GLIBCXX_SIMD_X86INTRIN 1112 using _CommonImpl = _CommonImplX86; 1113 using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>; 1114 using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>; 1115 #else 1116 template
1117 struct _MissingImpl; 1118 1119 using _CommonImpl = _MissingImpl<_UsedBytes>; 1120 using _SimdImpl = _MissingImpl<_UsedBytes>; 1121 using _MaskImpl = _MissingImpl<_UsedBytes>; 1122 #endif 1123 1124 // }}} 1125 // __traits {{{ 1126 template
1127 using _MaskMember = _SimdWrapper
>; 1128 1129 template
1130 using __traits = conditional_t< 1131 _S_is_valid_v<_Tp>, 1132 _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>, 1133 _InvalidTraits>; 1134 1135 //}}} 1136 // size metadata {{{ 1137 template
1138 static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; 1139 template
1140 static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; 1141 1142 // }}} 1143 // implicit mask {{{ 1144 private: 1145 template
1146 using _ImplicitMask = _SimdWrapper
>; 1147 1148 public: 1149 template
1150 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np> 1151 __implicit_mask_n() 1152 { 1153 using _Tp = __bool_storage_member_type_t<_Np>; 1154 return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp(); 1155 } 1156 1157 template
1158 _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp> 1159 _S_implicit_mask() 1160 { return __implicit_mask_n<_S_size<_Tp>>(); } 1161 1162 template
1163 _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>> 1164 _S_implicit_mask_intrin() 1165 { return __implicit_mask_n<_S_size<_Tp>>(); } 1166 1167 template
1168 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1169 _S_masked(_SimdWrapper<_Tp, _Np> __x) 1170 { 1171 if constexpr (is_same_v<_Tp, bool>) 1172 if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0) 1173 return _MaskImpl::_S_bit_and( 1174 __x, _SimdWrapper<_Tp, _Np>( 1175 __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1))); 1176 else 1177 return __x; 1178 else 1179 return _S_masked(__x._M_data); 1180 } 1181 1182 template
1183 _GLIBCXX_SIMD_INTRINSIC static constexpr _TV 1184 _S_masked(_TV __x) 1185 { 1186 using _Tp = typename _VectorTraits<_TV>::value_type; 1187 static_assert( 1188 !__is_bitmask_v<_TV>, 1189 "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't " 1190 "know the number of elements. Use _SimdWrapper
instead."); 1191 if constexpr (_S_is_partial<_Tp>) 1192 { 1193 constexpr size_t _Np = _S_size<_Tp>; 1194 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1195 _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(), 1196 _SimdWrapper<_Tp, _Np>(__x)); 1197 } 1198 else 1199 return __x; 1200 } 1201 1202 template
> 1203 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1204 __make_padding_nonzero(_TV __x) 1205 { 1206 using _Tp = typename _TVT::value_type; 1207 if constexpr (!_S_is_partial<_Tp>) 1208 return __x; 1209 else 1210 { 1211 constexpr size_t _Np = _S_size<_Tp>; 1212 if constexpr (is_integral_v
) 1213 return __x 1214 | __generate_vector<_Tp, _S_full_size<_Tp>>( 1215 [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp { 1216 if (__i < _Np) 1217 return 0; 1218 else 1219 return 1; 1220 }); 1221 else 1222 return __make_dependent_t<_TV, _CommonImpl>::_S_blend( 1223 _S_implicit_mask<_Tp>(), 1224 _SimdWrapper<_Tp, _Np>( 1225 __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))), 1226 _SimdWrapper<_Tp, _Np>(__x)) 1227 ._M_data; 1228 } 1229 } 1230 1231 // }}} 1232 }; 1233 1234 //}}} 1235 // _CommonImplBuiltin {{{ 1236 struct _CommonImplBuiltin 1237 { 1238 // _S_converts_via_decomposition{{{ 1239 // This lists all cases where a __vector_convert needs to fall back to 1240 // conversion of individual scalars (i.e. decompose the input vector into 1241 // scalars, convert, compose output vector). In those cases, _S_masked_load & 1242 // _S_masked_store prefer to use the _S_bit_iteration implementation. 1243 template
1244 static inline constexpr bool __converts_via_decomposition_v 1245 = sizeof(_From) != sizeof(_To); 1246 1247 // }}} 1248 // _S_load{{{ 1249 template
1250 _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np> 1251 _S_load(const void* __p) 1252 { 1253 static_assert(_Np > 1); 1254 static_assert(_Bytes % sizeof(_Tp) == 0); 1255 using _Rp = __vector_type_t<_Tp, _Np>; 1256 if constexpr (sizeof(_Rp) == _Bytes) 1257 { 1258 _Rp __r; 1259 __builtin_memcpy(&__r, __p, _Bytes); 1260 return __r; 1261 } 1262 else 1263 { 1264 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 1265 using _Up = conditional_t< 1266 is_integral_v<_Tp>, 1267 conditional_t<_Bytes % 4 == 0, 1268 conditional_t<_Bytes % 8 == 0, long long, int>, 1269 conditional_t<_Bytes % 2 == 0, short, signed char>>, 1270 conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp, 1271 double>>; 1272 using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>; 1273 if constexpr (sizeof(_V) != sizeof(_Rp)) 1274 { // on i386 with 4 < _Bytes <= 8 1275 _Rp __r{}; 1276 __builtin_memcpy(&__r, __p, _Bytes); 1277 return __r; 1278 } 1279 else 1280 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 1281 using _V = _Rp; 1282 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1283 { 1284 _V __r{}; 1285 static_assert(_Bytes <= sizeof(_V)); 1286 __builtin_memcpy(&__r, __p, _Bytes); 1287 return reinterpret_cast<_Rp>(__r); 1288 } 1289 } 1290 } 1291 1292 // }}} 1293 // _S_store {{{ 1294 template
1295 _GLIBCXX_SIMD_INTRINSIC static void 1296 _S_store(_TV __x, void* __addr) 1297 { 1298 constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes; 1299 static_assert(sizeof(__x) >= _Bytes); 1300 1301 if constexpr (__is_vector_type_v<_TV>) 1302 { 1303 using _Tp = typename _VectorTraits<_TV>::value_type; 1304 constexpr size_t _Np = _Bytes / sizeof(_Tp); 1305 static_assert(_Np * sizeof(_Tp) == _Bytes); 1306 1307 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424 1308 using _Up = conditional_t< 1309 (is_integral_v<_Tp> || _Bytes < 4), 1310 conditional_t<(sizeof(__x) > sizeof(long long)), long long, _Tp>, 1311 float>; 1312 const auto __v = __vector_bitcast<_Up>(__x); 1313 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424 1314 const __vector_type_t<_Tp, _Np> __v = __x; 1315 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424 1316 1317 if constexpr ((_Bytes & (_Bytes - 1)) != 0) 1318 { 1319 constexpr size_t _MoreBytes = std::__bit_ceil(_Bytes); 1320 alignas(decltype(__v)) char __tmp[_MoreBytes]; 1321 __builtin_memcpy(__tmp, &__v, _MoreBytes); 1322 __builtin_memcpy(__addr, __tmp, _Bytes); 1323 } 1324 else 1325 __builtin_memcpy(__addr, &__v, _Bytes); 1326 } 1327 else 1328 __builtin_memcpy(__addr, &__x, _Bytes); 1329 } 1330 1331 template
1332 _GLIBCXX_SIMD_INTRINSIC static void 1333 _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr) 1334 { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); } 1335 1336 // }}} 1337 // _S_store_bool_array(_BitMask) {{{ 1338 template
1339 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1340 _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem) 1341 { 1342 if constexpr (_Np == 1) 1343 __mem[0] = __x[0]; 1344 else if (__builtin_is_constant_evaluated()) 1345 { 1346 for (size_t __i = 0; __i < _Np; ++__i) 1347 __mem[__i] = __x[__i]; 1348 } 1349 else if constexpr (_Np == 2) 1350 { 1351 short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101; 1352 _S_store<_Np>(__bool2, __mem); 1353 } 1354 else if constexpr (_Np == 3) 1355 { 1356 int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101; 1357 _S_store<_Np>(__bool3, __mem); 1358 } 1359 else 1360 { 1361 __execute_n_times<__div_roundup(_Np, 4)>( 1362 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1363 constexpr int __offset = __i * 4; 1364 constexpr int __remaining = _Np - __offset; 1365 if constexpr (__remaining > 4 && __remaining <= 7) 1366 { 1367 const _ULLong __bool7 1368 = (__x.template _M_extract<__offset>()._M_to_bits() 1369 * 0x40810204081ULL) 1370 & 0x0101010101010101ULL; 1371 _S_store<__remaining>(__bool7, __mem + __offset); 1372 } 1373 else if constexpr (__remaining >= 4) 1374 { 1375 int __bits = __x.template _M_extract<__offset>()._M_to_bits(); 1376 if constexpr (__remaining > 7) 1377 __bits &= 0xf; 1378 const int __bool4 = (__bits * 0x204081) & 0x01010101; 1379 _S_store<4>(__bool4, __mem + __offset); 1380 } 1381 }); 1382 } 1383 } 1384 1385 // }}} 1386 // _S_blend{{{ 1387 template
1388 _GLIBCXX_SIMD_INTRINSIC static constexpr auto 1389 _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k, 1390 _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1) 1391 { return __k._M_data ? __at1._M_data : __at0._M_data; } 1392 1393 // }}} 1394 }; 1395 1396 // }}} 1397 // _SimdImplBuiltin {{{1 1398 template
1399 struct _SimdImplBuiltin 1400 { 1401 // member types {{{2 1402 template
1403 static constexpr size_t _S_max_store_size = 16; 1404 1405 using abi_type = _Abi; 1406 1407 template
1408 using _TypeTag = _Tp*; 1409 1410 template
1411 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; 1412 1413 template
1414 using _MaskMember = typename _Abi::template _MaskMember<_Tp>; 1415 1416 template
1417 static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; 1418 1419 template
1420 static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; 1421 1422 using _CommonImpl = typename _Abi::_CommonImpl; 1423 using _SuperImpl = typename _Abi::_SimdImpl; 1424 using _MaskImpl = typename _Abi::_MaskImpl; 1425 1426 // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2 1427 template
1428 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1429 _M_make_simd(_SimdWrapper<_Tp, _Np> __x) 1430 { return {__private_init, __x}; } 1431 1432 template
1433 _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi> 1434 _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x) 1435 { return {__private_init, __vector_bitcast<_Tp>(__x)}; } 1436 1437 // _S_broadcast {{{2 1438 template
1439 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1440 _S_broadcast(_Tp __x) noexcept 1441 { return __vector_broadcast<_S_full_size<_Tp>>(__x); } 1442 1443 // _S_generator {{{2 1444 template
1445 inline static constexpr _SimdMember<_Tp> 1446 _S_generator(_Fp&& __gen, _TypeTag<_Tp>) 1447 { 1448 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1449 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1450 if constexpr (__i < _S_size<_Tp>) 1451 return __gen(__i); 1452 else 1453 return 0; 1454 }); 1455 } 1456 1457 // _S_load {{{2 1458 template
1459 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> 1460 _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept 1461 { 1462 constexpr size_t _Np = _S_size<_Tp>; 1463 constexpr size_t __max_load_size 1464 = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64 1465 : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32 1466 : 16; 1467 constexpr size_t __bytes_to_load = sizeof(_Up) * _Np; 1468 if (__builtin_is_constant_evaluated()) 1469 return __generate_vector<_Tp, _S_full_size<_Tp>>( 1470 [&](auto __i) constexpr { 1471 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1472 }); 1473 else if constexpr (sizeof(_Up) > 8) 1474 return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>( 1475 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1476 return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0); 1477 }); 1478 else if constexpr (is_same_v<_Up, _Tp>) 1479 return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>, 1480 _Np * sizeof(_Tp)>(__mem); 1481 else if constexpr (__bytes_to_load <= __max_load_size) 1482 return __convert<_SimdMember<_Tp>>( 1483 _CommonImpl::template _S_load<_Up, _Np>(__mem)); 1484 else if constexpr (__bytes_to_load % __max_load_size == 0) 1485 { 1486 constexpr size_t __n_loads = __bytes_to_load / __max_load_size; 1487 constexpr size_t __elements_per_load = _Np / __n_loads; 1488 return __call_with_n_evaluations<__n_loads>( 1489 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1490 return __convert<_SimdMember<_Tp>>(__uncvted...); 1491 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1492 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1493 __mem + __i * __elements_per_load); 1494 }); 1495 } 1496 else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0 1497 && __max_load_size > 16) 1498 { // e.g. int[] ->
with AVX2 1499 constexpr size_t __n_loads 1500 = __bytes_to_load / (__max_load_size / 2); 1501 constexpr size_t __elements_per_load = _Np / __n_loads; 1502 return __call_with_n_evaluations<__n_loads>( 1503 [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1504 return __convert<_SimdMember<_Tp>>(__uncvted...); 1505 }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1506 return _CommonImpl::template _S_load<_Up, __elements_per_load>( 1507 __mem + __i * __elements_per_load); 1508 }); 1509 } 1510 else // e.g. int[] ->
1511 return __call_with_subscripts( 1512 __mem, make_index_sequence<_Np>(), 1513 [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1514 return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...}; 1515 }); 1516 } 1517 1518 // _S_masked_load {{{2 1519 template
1520 static constexpr inline _SimdWrapper<_Tp, _Np> 1521 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, 1522 const _Up* __mem) noexcept 1523 { 1524 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1525 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1526 __merge._M_set(__i, static_cast<_Tp>(__mem[__i])); 1527 }); 1528 return __merge; 1529 } 1530 1531 // _S_store {{{2 1532 template
1533 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1534 _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept 1535 { 1536 // TODO: converting int -> "smaller int" can be optimized with AVX512 1537 constexpr size_t _Np = _S_size<_Tp>; 1538 constexpr size_t __max_store_size 1539 = _SuperImpl::template _S_max_store_size<_Up>; 1540 if (__builtin_is_constant_evaluated()) 1541 { 1542 for (size_t __i = 0; __i < _Np; ++__i) 1543 __mem[__i] = __v[__i]; 1544 } 1545 else if constexpr (sizeof(_Up) > 8) 1546 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1547 __mem[__i] = __v[__i]; 1548 }); 1549 else if constexpr (is_same_v<_Up, _Tp>) 1550 _CommonImpl::_S_store(__v, __mem); 1551 else if constexpr (sizeof(_Up) * _Np <= __max_store_size) 1552 _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)), 1553 __mem); 1554 else 1555 { 1556 constexpr size_t __vsize = __max_store_size / sizeof(_Up); 1557 // round up to convert the last partial vector as well: 1558 constexpr size_t __stores = __div_roundup(_Np, __vsize); 1559 constexpr size_t __full_stores = _Np / __vsize; 1560 using _V = __vector_type_t<_Up, __vsize>; 1561 const array<_V, __stores> __converted 1562 = __convert_all<_V, __stores>(__v); 1563 __execute_n_times<__full_stores>( 1564 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1565 _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize); 1566 }); 1567 if constexpr (__full_stores < __stores) 1568 _CommonImpl::template _S_store<(_Np - __full_stores * __vsize) 1569 * sizeof(_Up)>( 1570 __converted[__full_stores], __mem + __full_stores * __vsize); 1571 } 1572 } 1573 1574 // _S_masked_store_nocvt {{{2 1575 template
1576 _GLIBCXX_SIMD_INTRINSIC static constexpr void 1577 _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k) 1578 { 1579 _BitOps::_S_bit_iteration( 1580 _MaskImpl::_S_to_bits(__k), 1581 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1582 __mem[__i] = __v[__i]; 1583 }); 1584 } 1585 1586 // _S_masked_store {{{2 1587 template
, 1588 typename _Tp = typename _TVT::value_type, typename _Up> 1589 static constexpr inline void 1590 _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept 1591 { 1592 constexpr size_t _TV_size = _S_size<_Tp>; 1593 [[maybe_unused]] const auto __vi = __to_intrin(__v); 1594 constexpr size_t __max_store_size 1595 = _SuperImpl::template _S_max_store_size<_Up>; 1596 if constexpr ( 1597 is_same_v< 1598 _Tp, 1599 _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up))) 1600 { 1601 // bitwise or no conversion, reinterpret: 1602 const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1603 if constexpr (__is_bitmask_v
) 1604 return _MaskMember<_Up>(__k._M_data); 1605 else 1606 return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k); 1607 }(); 1608 _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v), 1609 __mem, __kk); 1610 } 1611 else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up) 1612 && !_CommonImpl:: 1613 template __converts_via_decomposition_v< 1614 _Tp, _Up, __max_store_size>) 1615 { // conversion via decomposition is better handled via the 1616 // bit_iteration 1617 // fallback below 1618 constexpr size_t _UW_size 1619 = std::min(_TV_size, __max_store_size / sizeof(_Up)); 1620 static_assert(_UW_size <= _TV_size); 1621 using _UW = _SimdWrapper<_Up, _UW_size>; 1622 using _UV = __vector_type_t<_Up, _UW_size>; 1623 using _UAbi = simd_abi::deduce_t<_Up, _UW_size>; 1624 if constexpr (_UW_size == _TV_size) // one convert+store 1625 { 1626 const _UW __converted = __convert<_UW>(__v); 1627 _SuperImpl::_S_masked_store_nocvt( 1628 __converted, __mem, 1629 _UAbi::_MaskImpl::template _S_convert< 1630 __int_for_sizeof_t<_Up>>(__k)); 1631 } 1632 else 1633 { 1634 static_assert(_UW_size * sizeof(_Up) == __max_store_size); 1635 constexpr size_t _NFullStores = _TV_size / _UW_size; 1636 constexpr size_t _NAllStores 1637 = __div_roundup(_TV_size, _UW_size); 1638 constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size; 1639 const array<_UV, _NAllStores> __converted 1640 = __convert_all<_UV, _NAllStores>(__v); 1641 __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1642 _SuperImpl::_S_masked_store_nocvt( 1643 _UW(__converted[__i]), __mem + __i * _UW_size, 1644 _UAbi::_MaskImpl::template _S_convert< 1645 __int_for_sizeof_t<_Up>>( 1646 __extract_part<__i, _NParts>(__k.__as_full_vector()))); 1647 }); 1648 if constexpr (_NAllStores 1649 > _NFullStores) // one partial at the end 1650 _SuperImpl::_S_masked_store_nocvt( 1651 _UW(__converted[_NFullStores]), 1652 __mem + _NFullStores * _UW_size, 1653 _UAbi::_MaskImpl::template _S_convert< 1654 __int_for_sizeof_t<_Up>>( 1655 __extract_part<_NFullStores, _NParts>( 1656 __k.__as_full_vector()))); 1657 } 1658 } 1659 else 1660 _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k), 1661 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 1662 __mem[__i] = static_cast<_Up>(__v[__i]); 1663 }); 1664 } 1665 1666 // _S_complement {{{2 1667 template
<typename _Tp, size_t _Np>
1668 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1669 _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1670 { return ~__x._M_data; }
1671
1672 // _S_unary_minus {{{2
1673 template <typename _Tp, size_t _Np>
1674 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1675 _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1676 {
1677 // GCC doesn't use the psign instructions, but pxor & psub seem to be
1678 // just as good a choice as pcmpeqd & psign. So meh.
1679 return -__x._M_data;
1680 }
1681
1682 // arithmetic operators {{{2
1683 template <typename _Tp, size_t _Np>
1684 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1685 _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1686 { return __x._M_data + __y._M_data; }
1687
1688 template <typename _Tp, size_t _Np>
1689 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1690 _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1691 { return __x._M_data - __y._M_data; }
1692
1693 template <typename _Tp, size_t _Np>
1694 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1695 _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1696 { return __x._M_data * __y._M_data; }
1697
1698 template <typename _Tp, size_t _Np>
1699 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1700 _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1701 {
1702 // Note that division by 0 is always UB, so we must ensure we avoid the
1703 // case for partial registers
1704 if constexpr (!_Abi::template _S_is_partial<_Tp>)
1705 return __x._M_data / __y._M_data;
1706 else
1707 return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1708 }
1709
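A user-level illustration of why _S_divides guards the padding lanes: with an odd element count such as fixed_size_simd<int, 3>, the backing register has unused lanes, and the implementation substitutes a nonzero divisor there so the hardware division never divides by zero. A hedged sketch, assuming GCC 11+ with <experimental/simd>:

#include <experimental/simd>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<int, 3> num([](int i) { return 10 * (i + 1); }); // 10 20 30
  stdx::fixed_size_simd<int, 3> den([](int i) { return i + 1; });        // 1 2 3
  auto q = num / den;   // {10, 10, 10}; the unused padding lane is never divided by 0
  return q[0] == 10 ? 0 : 1;
}
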
1710 template <typename _Tp, size_t _Np>
1711 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1712 _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1713 {
1714 if constexpr (!_Abi::template _S_is_partial<_Tp>)
1715 return __x._M_data % __y._M_data;
1716 else
1717 return __as_vector(__x)
1718 % _Abi::__make_padding_nonzero(__as_vector(__y));
1719 }
1720
1721 template <typename _Tp, size_t _Np>
1722 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1723 _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1724 { return __and(__x, __y); } 1725 1726 template
1727 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1728 _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1729 { return __or(__x, __y); } 1730 1731 template
1732 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1733 _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1734 { return __xor(__x, __y); } 1735 1736 template
1737 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 1738 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1739 { return __x._M_data << __y._M_data; } 1740 1741 template
1742 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 1743 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) 1744 { return __x._M_data >> __y._M_data; } 1745 1746 template
1747 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np> 1748 _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y) 1749 { return __x._M_data << __y; } 1750 1751 template
<typename _Tp, size_t _Np>
1752 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1753 _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1754 { return __x._M_data >> __y; }
1755
1756 // compares {{{2
1757 // _S_equal_to {{{3
1758 template <typename _Tp, size_t _Np>
1759 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1760 _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1761 { return __x._M_data == __y._M_data; }
1762
1763 // _S_not_equal_to {{{3
1764 template <typename _Tp, size_t _Np>
1765 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1766 _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1767 { return __x._M_data != __y._M_data; }
1768
1769 // _S_less {{{3
1770 template <typename _Tp, size_t _Np>
1771 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1772 _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1773 { return __x._M_data < __y._M_data; }
1774
1775 // _S_less_equal {{{3
1776 template <typename _Tp, size_t _Np>
1777 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1778 _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1779 { return __x._M_data <= __y._M_data; }
1780
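The element-wise compares above produce simd_mask values; user code typically consumes them with the mask reductions from <experimental/simd>. A small sketch (assuming GCC 11+), not part of this header:

#include <experimental/simd>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<float, 4> x([](int i) { return i - 1.5f; }); // -1.5 -0.5 0.5 1.5
  auto negative = x < 0.f;                      // simd_mask
  std::printf("any: %d  all: %d  count: %d\n",
              int(stdx::any_of(negative)),      // 1
              int(stdx::all_of(negative)),      // 0
              stdx::popcount(negative));        // 2
  return 0;
}
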
1781 // _S_negate {{{2
1782 template <typename _Tp, size_t _Np>
1783 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1784 _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1785 { return !__x._M_data; }
1786
1787 // _S_min, _S_max, _S_minmax {{{2
1788 template <typename _Tp, size_t _Np>
1789 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1790 _SimdWrapper<_Tp, _Np>
1791 _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1792 { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1793
1794 template <typename _Tp, size_t _Np>
1795 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1796 _SimdWrapper<_Tp, _Np>
1797 _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1798 { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1799
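_S_min/_S_max/_S_minmax back the element-wise min(), max(), minmax() and clamp() overloads for simd, and _S_reduce further below backs reduce(). A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
#include <functional>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<float, 4> a([](int i) { return float(i); }); // 0 1 2 3
  stdx::fixed_size_simd<float, 4> b(2.5f);                           // broadcast
  auto lo = stdx::min(a, b);                                         // 0 1 2 2.5
  auto hi = stdx::max(a, b);                                         // 2.5 2.5 2.5 3
  auto cl = stdx::clamp(a, stdx::fixed_size_simd<float, 4>(1.f), b); // 1 1 2 2.5
  float sum = stdx::reduce(a, std::plus<>());                        // 6
  return (lo[0] == 0.f && hi[3] == 3.f && cl[0] == 1.f && sum == 6.f) ? 0 : 1;
}
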
1800 template <typename _Tp, size_t _Np>
1801 _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1802 pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1803 _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1804 {
1805 return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1806 __a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1807 }
1808
1809 // reductions {{{2
1810 template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1811 typename _BinaryOperation>
1812 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp 1813 _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>, 1814 simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1815 { 1816 using _V = __vector_type_t<_Tp, _Np / 2>; 1817 static_assert(sizeof(_V) <= sizeof(__x)); 1818 // _S_full_size is the size of the smallest native SIMD register that 1819 // can store _Np/2 elements: 1820 using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>; 1821 using _HalfSimd = __deduced_simd<_Tp, _Np / 2>; 1822 const auto __xx = __as_vector(__x); 1823 return _HalfSimd::abi_type::_SimdImpl::_S_reduce( 1824 static_cast<_HalfSimd>(__as_vector(__binary_op( 1825 static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)), 1826 static_cast<_FullSimd>(__intrin_bitcast<_V>( 1827 __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>( 1828 __xx)))))), 1829 __binary_op); 1830 } 1831 1832 template
1833 _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp 1834 _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) 1835 { 1836 constexpr size_t _Np = simd_size_v<_Tp, _Abi>; 1837 if constexpr (_Np == 1) 1838 return __x[0]; 1839 else if constexpr (_Np == 2) 1840 return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1841 simd<_Tp, simd_abi::scalar>(__x[1]))[0]; 1842 else if (__builtin_is_constant_evaluated()) 1843 { 1844 simd<_Tp, simd_abi::scalar> __acc = __x[0]; 1845 for (size_t __i = 1; __i < _Np; ++__i) 1846 __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i])); 1847 return __acc[0]; 1848 } 1849 else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{ 1850 { 1851 [[maybe_unused]] constexpr auto __full_size 1852 = _Abi::template _S_full_size<_Tp>; 1853 if constexpr (_Np == 3) 1854 return __binary_op( 1855 __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]), 1856 simd<_Tp, simd_abi::scalar>(__x[1])), 1857 simd<_Tp, simd_abi::scalar>(__x[2]))[0]; 1858 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1859 plus<>>) 1860 { 1861 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1862 return _Ap::_SimdImpl::_S_reduce( 1863 simd<_Tp, _Ap>(__private_init, 1864 _Abi::_S_masked(__as_vector(__x))), 1865 __binary_op); 1866 } 1867 else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, 1868 multiplies<>>) 1869 { 1870 using _Ap = simd_abi::deduce_t<_Tp, __full_size>; 1871 using _TW = _SimdWrapper<_Tp, __full_size>; 1872 _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full 1873 = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector(); 1874 _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one 1875 = __vector_broadcast<__full_size>(_Tp(1)); 1876 const _TW __x_full = __data(__x).__as_full_vector(); 1877 const _TW __x_padded_with_ones 1878 = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one, 1879 __x_full); 1880 return _Ap::_SimdImpl::_S_reduce( 1881 simd<_Tp, _Ap>(__private_init, __x_padded_with_ones), 1882 __binary_op); 1883 } 1884 else if constexpr (_Np & 1) 1885 { 1886 using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>; 1887 return __binary_op( 1888 simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce( 1889 simd<_Tp, _Ap>( 1890 __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>( 1891 __as_vector(__x))), 1892 __binary_op)), 1893 simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0]; 1894 } 1895 else 1896 return _S_reduce_partial<_Np>( 1897 make_index_sequence<_Np / 2>(), 1898 make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op); 1899 } //}}} 1900 else if constexpr (sizeof(__x) == 16) //{{{ 1901 { 1902 if constexpr (_Np == 16) 1903 { 1904 const auto __y = __data(__x); 1905 __x = __binary_op( 1906 _M_make_simd<_Tp, _Np>( 1907 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1908 7, 7>(__y)), 1909 _M_make_simd<_Tp, _Np>( 1910 __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 1911 14, 14, 15, 15>(__y))); 1912 } 1913 if constexpr (_Np >= 8) 1914 { 1915 const auto __y = __vector_bitcast
<short>(__data(__x));
1916 __x = __binary_op(
1917 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1918 __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))),
1919 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1920 __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y))));
1921 }
1922 if constexpr (_Np >= 4)
1923 {
1924 using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1925 const auto __y = __vector_bitcast<_Up>(__data(__x));
1926 __x = __binary_op(__x,
1927 _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1928 __vector_permute<3, 2, 1, 0>(__y))));
1929 }
1930 using _Up = conditional_t<is_floating_point_v<_Tp>
, double, _LLong>; 1931 const auto __y = __vector_bitcast<_Up>(__data(__x)); 1932 __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>( 1933 __vector_permute<1, 1>(__y)))); 1934 return __x[0]; 1935 } //}}} 1936 else 1937 { 1938 static_assert(sizeof(__x) > __min_vector_size<_Tp>); 1939 static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2 1940 using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>; 1941 using _V = simd<_Tp, _Ap>; 1942 return _Ap::_SimdImpl::_S_reduce( 1943 __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))), 1944 _V(__private_init, 1945 __extract<1, 2>(__as_vector(__x)))), 1946 static_cast<_BinaryOperation&&>(__binary_op)); 1947 } 1948 } 1949 1950 // math {{{2 1951 // frexp, modf and copysign implemented in simd_math.h 1952 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \ 1953 template
<typename _Tp, typename... _More> \
1954 static _Tp \
1955 _S_##__name(const _Tp& __x, const _More&... __more) \
1956 { \
1957 return __generate_vector<_Tp>( \
1958 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1959 return __name(__x[__i], __more[__i]...); \
1960 }); \
1961 }
1962
1963 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
1964 template <typename _Tp, typename... _More> \
1965 static typename _Tp::mask_type \
1966 _S_##__name(const _Tp& __x, const _More&... __more) \
1967 { \
1968 return __generate_vector<_Tp>( \
1969 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1970 return __name(__x[__i], __more[__i]...); \
1971 }); \
1972 }
1973
1974 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
1975 template <typename _Tp, typename... _More>
\ 1976 static auto \ 1977 _S_##__name(const _Tp& __x, const _More&... __more) \ 1978 { \ 1979 return __fixed_size_storage_t<_RetTp, \ 1980 _VectorTraits<_Tp>::_S_partial_width>:: \ 1981 _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1982 return __meta._S_generator( \ 1983 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ 1984 return __name(__x[__meta._S_offset + __i], \ 1985 __more[__meta._S_offset + __i]...); \ 1986 }, \ 1987 static_cast<_RetTp*>(nullptr)); \ 1988 }); \ 1989 } 1990 1991 _GLIBCXX_SIMD_MATH_FALLBACK(acos) 1992 _GLIBCXX_SIMD_MATH_FALLBACK(asin) 1993 _GLIBCXX_SIMD_MATH_FALLBACK(atan) 1994 _GLIBCXX_SIMD_MATH_FALLBACK(atan2) 1995 _GLIBCXX_SIMD_MATH_FALLBACK(cos) 1996 _GLIBCXX_SIMD_MATH_FALLBACK(sin) 1997 _GLIBCXX_SIMD_MATH_FALLBACK(tan) 1998 _GLIBCXX_SIMD_MATH_FALLBACK(acosh) 1999 _GLIBCXX_SIMD_MATH_FALLBACK(asinh) 2000 _GLIBCXX_SIMD_MATH_FALLBACK(atanh) 2001 _GLIBCXX_SIMD_MATH_FALLBACK(cosh) 2002 _GLIBCXX_SIMD_MATH_FALLBACK(sinh) 2003 _GLIBCXX_SIMD_MATH_FALLBACK(tanh) 2004 _GLIBCXX_SIMD_MATH_FALLBACK(exp) 2005 _GLIBCXX_SIMD_MATH_FALLBACK(exp2) 2006 _GLIBCXX_SIMD_MATH_FALLBACK(expm1) 2007 _GLIBCXX_SIMD_MATH_FALLBACK(ldexp) 2008 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) 2009 _GLIBCXX_SIMD_MATH_FALLBACK(log) 2010 _GLIBCXX_SIMD_MATH_FALLBACK(log10) 2011 _GLIBCXX_SIMD_MATH_FALLBACK(log1p) 2012 _GLIBCXX_SIMD_MATH_FALLBACK(log2) 2013 _GLIBCXX_SIMD_MATH_FALLBACK(logb) 2014 2015 // modf implemented in simd_math.h 2016 _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) 2017 _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) 2018 _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) 2019 _GLIBCXX_SIMD_MATH_FALLBACK(fabs) 2020 _GLIBCXX_SIMD_MATH_FALLBACK(pow) 2021 _GLIBCXX_SIMD_MATH_FALLBACK(sqrt) 2022 _GLIBCXX_SIMD_MATH_FALLBACK(erf) 2023 _GLIBCXX_SIMD_MATH_FALLBACK(erfc) 2024 _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) 2025 _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) 2026 2027 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) 2028 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) 2029 2030 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) 2031 _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) 2032 2033 _GLIBCXX_SIMD_MATH_FALLBACK(fmod) 2034 _GLIBCXX_SIMD_MATH_FALLBACK(remainder) 2035 2036 template
<typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2037 static _Tp
2038 _S_remquo(const _Tp __x, const _Tp __y,
2039 __fixed_size_storage_t<int, _TVT::_S_partial_width>
* __z) 2040 { 2041 return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2042 int __tmp; 2043 auto __r = remquo(__x[__i], __y[__i], &__tmp); 2044 __z->_M_set(__i, __tmp); 2045 return __r; 2046 }); 2047 } 2048 2049 // copysign in simd_math.h 2050 _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) 2051 _GLIBCXX_SIMD_MATH_FALLBACK(fdim) 2052 _GLIBCXX_SIMD_MATH_FALLBACK(fmax) 2053 _GLIBCXX_SIMD_MATH_FALLBACK(fmin) 2054 _GLIBCXX_SIMD_MATH_FALLBACK(fma) 2055 2056 template
2057 static constexpr _MaskMember<_Tp> 2058 _S_isgreater(_SimdWrapper<_Tp, _Np> __x, 2059 _SimdWrapper<_Tp, _Np> __y) noexcept 2060 { 2061 using _Ip = __int_for_sizeof_t<_Tp>; 2062 const auto __xn = __vector_bitcast<_Ip>(__x); 2063 const auto __yn = __vector_bitcast<_Ip>(__y); 2064 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2065 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2066 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2067 __xp > __yp); 2068 } 2069 2070 template
2071 static constexpr _MaskMember<_Tp> 2072 _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x, 2073 _SimdWrapper<_Tp, _Np> __y) noexcept 2074 { 2075 using _Ip = __int_for_sizeof_t<_Tp>; 2076 const auto __xn = __vector_bitcast<_Ip>(__x); 2077 const auto __yn = __vector_bitcast<_Ip>(__y); 2078 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2079 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2080 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2081 __xp >= __yp); 2082 } 2083 2084 template
2085 static constexpr _MaskMember<_Tp> 2086 _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept 2087 { 2088 using _Ip = __int_for_sizeof_t<_Tp>; 2089 const auto __xn = __vector_bitcast<_Ip>(__x); 2090 const auto __yn = __vector_bitcast<_Ip>(__y); 2091 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2092 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2093 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2094 __xp < __yp); 2095 } 2096 2097 template
2098 static constexpr _MaskMember<_Tp> 2099 _S_islessequal(_SimdWrapper<_Tp, _Np> __x, 2100 _SimdWrapper<_Tp, _Np> __y) noexcept 2101 { 2102 using _Ip = __int_for_sizeof_t<_Tp>; 2103 const auto __xn = __vector_bitcast<_Ip>(__x); 2104 const auto __yn = __vector_bitcast<_Ip>(__y); 2105 const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn; 2106 const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn; 2107 return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data, 2108 __xp <= __yp); 2109 } 2110 2111 template
2112 static constexpr _MaskMember<_Tp> 2113 _S_islessgreater(_SimdWrapper<_Tp, _Np> __x, 2114 _SimdWrapper<_Tp, _Np> __y) noexcept 2115 { 2116 return __andnot(_SuperImpl::_S_isunordered(__x, __y), 2117 _SuperImpl::_S_not_equal_to(__x, __y)); 2118 } 2119 2120 #undef _GLIBCXX_SIMD_MATH_FALLBACK 2121 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET 2122 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET 2123 // _S_abs {{{3 2124 template
2125 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2126 _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept 2127 { 2128 // if (__builtin_is_constant_evaluated()) 2129 // { 2130 // return __x._M_data < 0 ? -__x._M_data : __x._M_data; 2131 // } 2132 if constexpr (is_floating_point_v<_Tp>) 2133 // `v < 0 ? -v : v` cannot compile to the efficient implementation of 2134 // masking the signbit off because it must consider v == -0 2135 2136 // ~(-0.) & v would be easy, but breaks with fno-signed-zeros 2137 return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data); 2138 else 2139 return __x._M_data < 0 ? -__x._M_data : __x._M_data; 2140 } 2141 2142 // }}}3 2143 // _S_plus_minus {{{ 2144 // Returns __x + __y - __y without -fassociative-math optimizing to __x. 2145 // - _TV must be __vector_type_t
<floating-point type, N>.
2146 // - _UV must be _TV or floating-point type.
2147 template <typename _TV, typename _UV>
2148 _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
2149 _S_plus_minus(_TV __x, _UV __y) noexcept
2150 {
2151 #if defined __i386__ && !defined __SSE_MATH__
2152 if constexpr (sizeof(__x) == 8)
2153 { // operations on __x would use the FPU
2154 static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
2155 const auto __x4 = __vector_bitcast<float, 4>(__x);
2156 if constexpr (is_same_v<_TV, _UV>)
2157 return __vector_bitcast<float, 2>(
2158 _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
2159 else
2160 return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
2161 }
2162 #endif
2163 #if !defined __clang__ && __GCC_IEC_559 == 0
2164 if (__builtin_is_constant_evaluated()
2165 || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2166 return (__x + __y) - __y;
2167 else
2168 return [&] {
2169 __x += __y;
2170 if constexpr(__have_sse)
2171 {
2172 if constexpr (sizeof(__x) >= 16)
2173 asm("" : "+x"(__x));
2174 else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2175 asm("" : "+x"(__x[0]), "+x"(__x[1]));
2176 else
2177 __assert_unreachable<_TV>();
2178 }
2179 else if constexpr(__have_neon)
2180 asm("" : "+w"(__x));
2181 else if constexpr (__have_power_vmx)
2182 {
2183 if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2184 asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
2185 else
2186 asm("" : "+v"(__x));
2187 }
2188 else
2189 asm("" : "+g"(__x));
2190 return __x - __y;
2191 }();
2192 #else
2193 return (__x + __y) - __y;
2194 #endif
2195 }
2196
2197 // }}}
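The empty asm statements above only act as optimization barriers: they keep GCC from folding __x + __y - __y back to __x under -fassociative-math, which would defeat the "magic shifter" rounding used by _S_nearbyint and _S_trunc below. A scalar illustration of the same idea (volatile stands in for the barrier here); this is a sketch, not code from this header:

#include <cstdio>

int main()
{
  volatile double shifter = 4503599627370496.0; // 2^52; volatile keeps the
                                                // compiler from folding (x+s)-s to x
  double x = 2.5;
  double r = (x + shifter) - shifter;           // 2.0 under round-to-nearest-even;
                                                // valid for |x| < 2^52
  std::printf("%g -> %g\n", x, r);
  return 0;
}
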
2198 // _S_nearbyint {{{3
2199 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2200 _GLIBCXX_SIMD_INTRINSIC static _Tp
2201 _S_nearbyint(_Tp __x_) noexcept
2202 {
2203 using value_type = typename _TVT::value_type;
2204 using _V = typename _TVT::type;
2205 const _V __x = __x_;
2206 const _V __absx = __and(__x, _S_absmask<_V>);
2207 static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
2208 _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2209 = _V() + (1ull << (__digits_v<value_type> - 1));
2210 const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2211 const _V __shifted = _S_plus_minus(__x, __shifter);
2212 return __absx < __shifter_abs ? __shifted : __x;
2213 }
2214
2215 // _S_rint {{{3
2216 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>
>
2217 _GLIBCXX_SIMD_INTRINSIC static _Tp
2218 _S_rint(_Tp __x) noexcept
2219 { return _SuperImpl::_S_nearbyint(__x); }
2220
2221 // _S_trunc {{{3
2222 template <typename _Tp, size_t _Np>
2223 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2224 _S_trunc(_SimdWrapper<_Tp, _Np> __x) 2225 { 2226 using _V = __vector_type_t<_Tp, _Np>; 2227 const _V __absx = __and(__x._M_data, _S_absmask<_V>); 2228 static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>); 2229 constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1); 2230 _V __truncated = _S_plus_minus(__absx, __shifter); 2231 __truncated -= __truncated > __absx ? _V() + 1 : _V(); 2232 return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated) 2233 : __x._M_data; 2234 } 2235 2236 // _S_round {{{3 2237 template
2238 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2239 _S_round(_SimdWrapper<_Tp, _Np> __x) 2240 { 2241 const auto __abs_x = _SuperImpl::_S_abs(__x); 2242 const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data; 2243 const auto __r_abs // round(abs(x)) = 2244 = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0); 2245 return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs); 2246 } 2247 2248 // _S_floor {{{3 2249 template
2250 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2251 _S_floor(_SimdWrapper<_Tp, _Np> __x) 2252 { 2253 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2254 const auto __negative_input 2255 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2256 const auto __mask 2257 = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2258 return __or(__andnot(__mask, __y), 2259 __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1))); 2260 } 2261 2262 // _S_ceil {{{3 2263 template
2264 _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np> 2265 _S_ceil(_SimdWrapper<_Tp, _Np> __x) 2266 { 2267 const auto __y = _SuperImpl::_S_trunc(__x)._M_data; 2268 const auto __negative_input 2269 = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0)); 2270 const auto __inv_mask 2271 = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input); 2272 return __or(__and(__inv_mask, __y), 2273 __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1))); 2274 } 2275 2276 // _S_isnan {{{3 2277 template
2278 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2279 _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2280 { 2281 #if __FINITE_MATH_ONLY__ 2282 return {}; // false 2283 #elif !defined __SUPPORT_SNAN__ 2284 return ~(__x._M_data == __x._M_data); 2285 #elif defined __STDC_IEC_559__ 2286 using _Ip = __int_for_sizeof_t<_Tp>; 2287 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2288 const auto __infn 2289 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>)); 2290 return __infn < __absn; 2291 #else 2292 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?" 2293 #endif 2294 } 2295 2296 // _S_isfinite {{{3 2297 template
2298 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2299 _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2300 { 2301 #if __FINITE_MATH_ONLY__ 2302 using _UV = typename _MaskMember<_Tp>::_BuiltinType; 2303 _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV(); 2304 return __alltrue; 2305 #else 2306 // if all exponent bits are set, __x is either inf or NaN 2307 using _Ip = __int_for_sizeof_t<_Tp>; 2308 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2309 const auto __maxn 2310 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2311 return __absn <= __maxn; 2312 #endif 2313 } 2314 2315 // _S_isunordered {{{3 2316 template
<typename _Tp, size_t _Np>
2317 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2318 _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2319 { return __or(_S_isnan(__x), _S_isnan(__y)); }
2320
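These classifiers surface as the simd overloads of isnan(), isfinite(), signbit() and friends, each returning a simd_mask. A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
#include <limits>
#include <cstdio>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<double, 4> x([](int i) {
    return i == 0 ? std::numeric_limits<double>::quiet_NaN() : -1.0 * i;
  });                                            // NaN, -1, -2, -3
  std::printf("NaNs: %d  finite: %d  negative: %d\n",
              stdx::popcount(stdx::isnan(x)),    // 1
              stdx::popcount(stdx::isfinite(x)), // 3
              stdx::popcount(stdx::signbit(x))); // 3 on typical targets where
                                                 // quiet_NaN() is positive
  return 0;
}
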
2321 // _S_signbit {{{3
2322 template <typename _Tp, size_t _Np>
2323 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2324 _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2325 {
2326 using _Ip = __int_for_sizeof_t<_Tp>;
2327 return __vector_bitcast<_Ip>(__x) < 0;
2328 // Arithmetic right shift (SRA) would also work (instead of compare), but
2329 // 64-bit SRA isn't available on x86 before AVX512. And in general, 2330 // compares are more likely to be efficient than SRA.
2331 }
2332
2333 // _S_isinf {{{3
2334 template <typename _Tp, size_t _Np>
2335 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2336 _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x) 2337 { 2338 #if __FINITE_MATH_ONLY__ 2339 return {}; // false 2340 #else 2341 return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x), 2342 __vector_broadcast<_Np>( 2343 __infinity_v<_Tp>)); 2344 // alternative: 2345 // compare to inf using the corresponding integer type 2346 /* 2347 return 2348 __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>( 2349 _S_abs(__x)._M_data) 2350 == 2351 __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>( 2352 __infinity_v<_Tp>))); 2353 */ 2354 #endif 2355 } 2356 2357 // _S_isnormal {{{3 2358 template
2359 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2360 _S_isnormal(_SimdWrapper<_Tp, _Np> __x) 2361 { 2362 using _Ip = __int_for_sizeof_t<_Tp>; 2363 const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x)); 2364 const auto __minn 2365 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>)); 2366 #if __FINITE_MATH_ONLY__ 2367 return __absn >= __minn; 2368 #else 2369 const auto __maxn 2370 = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>)); 2371 return __minn <= __absn && __absn <= __maxn; 2372 #endif 2373 } 2374 2375 // _S_fpclassify {{{3 2376 template
<typename _Tp, size_t _Np>
2377 _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2378 _S_fpclassify(_SimdWrapper<_Tp, _Np> __x) 2379 { 2380 using _I = __int_for_sizeof_t<_Tp>; 2381 const auto __xn 2382 = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x))); 2383 constexpr size_t _NI = sizeof(__xn) / sizeof(_I); 2384 _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn 2385 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>)); 2386 2387 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal 2388 = __vector_broadcast<_NI, _I>(FP_NORMAL); 2389 #if !__FINITE_MATH_ONLY__ 2390 _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn 2391 = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>)); 2392 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan 2393 = __vector_broadcast<_NI, _I>(FP_NAN); 2394 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite 2395 = __vector_broadcast<_NI, _I>(FP_INFINITE); 2396 #endif 2397 #ifndef __FAST_MATH__ 2398 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal 2399 = __vector_broadcast<_NI, _I>(FP_SUBNORMAL); 2400 #endif 2401 _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero 2402 = __vector_broadcast<_NI, _I>(FP_ZERO); 2403 2404 __vector_type_t<_I, _NI> 2405 __tmp = __xn < __minn 2406 #ifdef __FAST_MATH__ 2407 ? __fp_zero 2408 #else 2409 ? (__xn == 0 ? __fp_zero : __fp_subnormal) 2410 #endif 2411 #if __FINITE_MATH_ONLY__ 2412 : __fp_normal; 2413 #else 2414 : (__xn < __infn ? __fp_normal 2415 : (__xn == __infn ? __fp_infinite : __fp_nan)); 2416 #endif 2417 2418 if constexpr (sizeof(_I) == sizeof(int)) 2419 { 2420 using _FixedInt = __fixed_size_storage_t
<int, _Np>;
2421 const auto __as_int = __vector_bitcast<int>
(__tmp); 2422 if constexpr (_FixedInt::_S_tuple_size == 1) 2423 return {__as_int}; 2424 else if constexpr (_FixedInt::_S_tuple_size == 2 2425 && is_same_v< 2426 typename _FixedInt::_SecondType::_FirstAbi, 2427 simd_abi::scalar>) 2428 return {__extract<0, 2>(__as_int), __as_int[_Np - 1]}; 2429 else if constexpr (_FixedInt::_S_tuple_size == 2) 2430 return {__extract<0, 2>(__as_int), 2431 __auto_bitcast(__extract<1, 2>(__as_int))}; 2432 else 2433 __assert_unreachable<_Tp>(); 2434 } 2435 else if constexpr (_Np == 2 && sizeof(_I) == 8 2436 && __fixed_size_storage_t
<int, _Np>::_S_tuple_size == 2)
2437 {
2438 const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2439 return {int(__aslong[0]), {int(__aslong[1])}};
2440 }
2441 #if _GLIBCXX_SIMD_X86INTRIN
2442 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
2443 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2444 return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2445 __to_intrin(__hi128(__tmp)))};
2446 else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
2447 && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2448 return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2449 #endif // _GLIBCXX_SIMD_X86INTRIN
2450 else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2451 return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2452 [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2453 return __make_wrapper<int>
(__l...);
2454 })};
2455 else
2456 __assert_unreachable<_Tp>();
2457 }
2458
2459 // _S_increment & _S_decrement{{{2
2460 template <typename _Tp, size_t _Np>
2461 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2462 _S_increment(_SimdWrapper<_Tp, _Np>& __x)
2463 { __x = __x._M_data + 1; }
2464
2465 template <typename _Tp, size_t _Np>
2466 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2467 _S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2468 { __x = __x._M_data - 1; }
2469
2470 // smart_reference access {{{2
2471 template <typename _Tp, size_t _Np, typename _Up>
2472 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2473 _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2474 { __v._M_set(__i, static_cast<_Up&&>(__x)); }
2475
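_S_masked_assign and the masked compound operations below are what where() expressions lower to. A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<int, 8> v([](int i) { return i; });  // 0 1 2 3 4 5 6 7
  stdx::where(v % 2 == 0, v) = -1;    // masked assign:   -1 1 -1 3 -1 5 -1 7
  stdx::where(v > 0, v) += 10;        // masked cassign:  -1 11 -1 13 -1 15 -1 17
  return v[1] == 11 ? 0 : 1;
}
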
2476 // _S_masked_assign{{{2
2477 template <typename _Tp, typename _K, size_t _Np>
2478 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2479 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2480 __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2481 {
2482 if (__k._M_is_constprop_none_of())
2483 return;
2484 else if (__k._M_is_constprop_all_of())
2485 __lhs = __rhs;
2486 else
2487 __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2488 }
2489
2490 template <typename _Tp, typename _K, size_t _Np>
2491 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2492 _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2493 __type_identity_t<_Tp> __rhs)
2494 {
2495 if (__k._M_is_constprop_none_of())
2496 return;
2497 else if (__k._M_is_constprop_all_of())
2498 __lhs = __vector_broadcast<_Np>(__rhs);
2499 else if (__builtin_constant_p(__rhs) && __rhs == 0)
2500 {
2501 if constexpr (!is_same_v<_K, bool>
) 2502 // the __andnot optimization only makes sense if __k._M_data is a 2503 // vector register 2504 __lhs._M_data 2505 = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data); 2506 else 2507 // for AVX512/__mmask, a _mm512_maskz_mov is best 2508 __lhs 2509 = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>()); 2510 } 2511 else 2512 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2513 _SimdWrapper<_Tp, _Np>( 2514 __vector_broadcast<_Np>(__rhs))); 2515 } 2516 2517 // _S_masked_cassign {{{2 2518 template
2519 _GLIBCXX_SIMD_INTRINSIC static constexpr void 2520 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k, 2521 _SimdWrapper<_Tp, _Np>& __lhs, 2522 const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs, 2523 _Op __op) 2524 { 2525 if (__k._M_is_constprop_none_of()) 2526 return; 2527 else if (__k._M_is_constprop_all_of()) 2528 __lhs = __op(_SuperImpl{}, __lhs, __rhs); 2529 else 2530 __lhs = _CommonImpl::_S_blend(__k, __lhs, 2531 __op(_SuperImpl{}, __lhs, __rhs)); 2532 } 2533 2534 template
<typename _Op, typename _Tp, typename _K, size_t _Np>
2535 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2536 _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2537 _SimdWrapper<_Tp, _Np>& __lhs,
2538 const __type_identity_t<_Tp> __rhs, _Op __op)
2539 { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2540
2541 // _S_masked_unary {{{2
2542 template <template <typename> class _Op, typename _Tp, typename _K,
2543 size_t _Np>
2544 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2545 _S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2546 const _SimdWrapper<_Tp, _Np> __v)
2547 {
2548 if (__k._M_is_constprop_none_of())
2549 return __v;
2550 auto __vv = _M_make_simd(__v);
2551 _Op<_Tp>
__op;
2552 if (__k._M_is_constprop_all_of())
2553 return __data(__op(__vv));
2554 else if constexpr (is_same_v<_Op<_Tp>
, __increment<_Tp>
>)
2555 {
2556 static_assert(not std::is_same_v<_K, bool>);
2557 if constexpr (is_integral_v<_Tp>)
2558 // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2559 return __v._M_data - __vector_bitcast<_Tp>(__k._M_data);
2560 else if constexpr (not __have_avx2)
2561 return __v._M_data
2562 + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2563 _K, _Tp(1)));
2564 // starting with AVX2 it is more efficient to blend after add
2565 }
2566 else if constexpr (is_same_v<_Op<_Tp>
, __decrement<_Tp>
>) 2567 { 2568 static_assert(not std::is_same_v<_K, bool>); 2569 if constexpr (is_integral_v<_Tp>) 2570 // Take a shortcut knowing that __k is an integer vector with values -1 or 0. 2571 return __v._M_data + __vector_bitcast<_Tp>(__k._M_data); 2572 else if constexpr (not __have_avx2) 2573 return __v._M_data 2574 - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast( 2575 _K, _Tp(1))); 2576 // starting with AVX2 it is more efficient to blend after sub 2577 } 2578 return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv))); 2579 } 2580 2581 //}}}2 2582 }; 2583 2584 // _MaskImplBuiltinMixin {{{1 2585 struct _MaskImplBuiltinMixin 2586 { 2587 template
<typename _Tp>
2588 using _TypeTag = _Tp*;
2589
2590 // _S_to_maskvector {{{
2591 template <typename _Up, size_t _ToN = 1>
2592 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2593 _S_to_maskvector(bool __x)
2594 {
2595 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2596 return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2597 : __vector_type_t<_Up, _ToN>{};
2598 }
2599
2600 template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized,
2601 size_t _ToN = _UpN == 0 ? _Np : _UpN>
2602 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2603 _S_to_maskvector(_BitMask<_Np, _Sanitized> __x) 2604 { 2605 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2606 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2607 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2608 if constexpr (__i < _Np) 2609 return __x[__i] ? ~_Up() : _Up(); 2610 else 2611 return _Up(); 2612 }); 2613 } 2614 2615 template
2617 _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN> 2618 _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x) 2619 { 2620 static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>); 2621 using _TW = _SimdWrapper<_Tp, _Np>; 2622 using _UW = _SimdWrapper<_Up, _ToN>; 2623 if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW)) 2624 return __wrapper_bitcast<_Up, _ToN>(__x); 2625 else if constexpr (is_same_v<_Tp, bool>) // bits -> vector 2626 return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data)); 2627 else 2628 { // vector -> vector 2629 /* 2630 [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data); 2631 if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == 2632 16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr 2633 (sizeof(_Tp) == 4 && sizeof(_Up) == 2 2634 && sizeof(__y) == 16) 2635 return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y); 2636 else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2 2637 && sizeof(__y) == 16) 2638 return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y); 2639 else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1 2640 && sizeof(__y) == 16) 2641 return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, 2642 -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 && 2643 sizeof(_Up) == 1 2644 && sizeof(__y) == 16) 2645 return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, 2646 -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 && 2647 sizeof(_Up) == 1 2648 && sizeof(__y) == 16) 2649 return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2650 -1, -1, -1, -1, -1>(__y); else 2651 */ 2652 { 2653 return __generate_vector<__vector_type_t<_Up, _ToN>>( 2654 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2655 if constexpr (__i < _Np) 2656 return _Up(__x[__i.value]); 2657 else 2658 return _Up(); 2659 }); 2660 } 2661 } 2662 } 2663 2664 // }}} 2665 // _S_to_bits {{{ 2666 template
2667 _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np> 2668 _S_to_bits(_SimdWrapper<_Tp, _Np> __x) 2669 { 2670 static_assert(!is_same_v<_Tp, bool>); 2671 static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong)); 2672 using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>; 2673 const auto __bools 2674 = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1); 2675 _ULLong __r = 0; 2676 __execute_n_times<_Np>( 2677 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2678 __r |= _ULLong(__bools[__i.value]) << __i; 2679 }); 2680 return __r; 2681 } 2682 2683 // }}} 2684 }; 2685 2686 // _MaskImplBuiltin {{{1 2687 template
<typename _Abi, typename>
2688 struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2689 {
2690 using _MaskImplBuiltinMixin::_S_to_bits;
2691 using _MaskImplBuiltinMixin::_S_to_maskvector;
2692
2693 // member types {{{
2694 template <typename _Tp>
2695 using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2696
2697 template <typename _Tp>
2698 using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2699
2700 using _SuperImpl = typename _Abi::_MaskImpl;
2701 using _CommonImpl = typename _Abi::_CommonImpl;
2702
2703 template <typename _Tp>
2704 static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2705
2706 // }}}
2707 // _S_broadcast {{{
2708 template <typename _Tp>
2709 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2710 _S_broadcast(bool __x)
2711 { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); }
2712
2713 // }}}
2714 // _S_load {{{
2715 template <typename _Tp>
2716 _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> 2717 _S_load(const bool* __mem) 2718 { 2719 using _I = __int_for_sizeof_t<_Tp>; 2720 if (not __builtin_is_constant_evaluated()) 2721 if constexpr (sizeof(_Tp) == sizeof(bool)) 2722 { 2723 const auto __bools 2724 = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem); 2725 // bool is {0, 1}, everything else is UB 2726 return __bools > 0; 2727 } 2728 return __generate_vector<_I, _S_size<_Tp>>( 2729 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2730 return __mem[__i] ? ~_I() : _I(); 2731 }); 2732 } 2733 2734 // }}} 2735 // _S_convert {{{ 2736 template
<typename _Tp, size_t _Np, bool _Sanitized>
2737 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2738 _S_convert(_BitMask<_Np, _Sanitized> __x)
2739 {
2740 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2741 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2742 else
2743 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2744 _S_size<_Tp>>(
2745 __x._M_sanitized());
2746 }
2747
2748 template <typename _Tp, size_t _Np>
2749 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2750 _S_convert(_SimdWrapper<bool, _Np> __x)
2751 {
2752 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2753 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2754 else
2755 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2756 _S_size<_Tp>>(
2757 _BitMask<_Np>(__x._M_data)._M_sanitized());
2758 }
2759
2760 template <typename _Tp, typename _Up, size_t _Np>
2761 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2762 _S_convert(_SimdWrapper<_Up, _Np> __x)
2763 {
2764 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2765 return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2766 _SuperImpl::_S_to_bits(__x));
2767 else
2768 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2769 _S_size<_Tp>>(__x);
2770 }
2771
2772 template <typename _Tp, typename _Up, typename _UAbi>
2773 _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2774 _S_convert(simd_mask<_Up, _UAbi> __x)
2775 {
2776 if constexpr (__is_builtin_bitmask_abi<_Abi>())
2777 {
2778 using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>
>; 2779 if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits 2780 return _R(__data(__x)); 2781 else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits 2782 return _R(__data(__x)); 2783 else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits 2784 return _R(__data(__x)._M_to_bits()); 2785 else // vector -> bits 2786 return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits()); 2787 } 2788 else 2789 return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, 2790 _S_size<_Tp>>( 2791 __data(__x)); 2792 } 2793 2794 // }}} 2795 // _S_masked_load {{{2 2796 template
2797 static inline _SimdWrapper<_Tp, _Np> 2798 _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, 2799 _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept 2800 { 2801 // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity 2802 auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge); 2803 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask), 2804 [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { 2805 __tmp._M_set(__i, -__mem[__i]); 2806 }); 2807 __merge = __wrapper_bitcast<_Tp>(__tmp); 2808 return __merge; 2809 } 2810 2811 // _S_store {{{2 2812 template
<typename _Tp, size_t _Np>
2813 _GLIBCXX_SIMD_INTRINSIC static constexpr void
2814 _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
2815 {
2816 __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2817 __mem[__i] = __v[__i];
2818 });
2819 }
2820
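simd_mask I/O to plain bool arrays is what _S_store above and the masked variant below implement. A hedged user-level sketch (GCC 11+, <experimental/simd>):

#include <experimental/simd>
namespace stdx = std::experimental;

int main()
{
  stdx::fixed_size_simd<float, 4> x([](int i) { return i - 1.0f; }); // -1 0 1 2
  auto m = x >= 0.f;                        // false true true true
  bool flags[4];
  m.copy_to(flags, stdx::element_aligned);  // one bool per mask element
  return (!flags[0] && flags[3]) ? 0 : 1;
}
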
2821 // _S_masked_store {{{2
2822 template <typename _Tp, size_t _Np>
2823 static inline void
2824 _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2825 const _SimdWrapper<_Tp, _Np> __k) noexcept
2826 {
2827 _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
2828 [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2829 __mem[__i] = __v[__i];
2830 });
2831 }
2832
2833 // _S_from_bitmask{{{2
2834 template <typename _Tp, size_t _Np>
2835 _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 2836 _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>) 2837 { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); } 2838 2839 // logical and bitwise operators {{{2 2840 template