Where Online Learning is simpler!

The C and C++ Include Header Files

/usr/include/c++/11/experimental/bits/simd_x86_conversions.h


$ cat -n /usr/include/c++/11/experimental/bits/simd_x86_conversions.h

     1	// x86 specific conversion optimizations -*- C++ -*-
     2	
     3	// Copyright (C) 2020-2021 Free Software Foundation, Inc.
     4	//
     5	// This file is part of the GNU ISO C++ Library.  This library is free
     6	// software; you can redistribute it and/or modify it under the
     7	// terms of the GNU General Public License as published by the
     8	// Free Software Foundation; either version 3, or (at your option)
     9	// any later version.
    10	
    11	// This library is distributed in the hope that it will be useful,
    12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
    13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14	// GNU General Public License for more details.
    15	
    16	// Under Section 7 of GPL version 3, you are granted additional
    17	// permissions described in the GCC Runtime Library Exception, version
    18	// 3.1, as published by the Free Software Foundation.
    19	
    20	// You should have received a copy of the GNU General Public License and
    21	// a copy of the GCC Runtime Library Exception along with this program;
    22	// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    23	// <http://www.gnu.org/licenses/>.
    24	
    25	#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
    26	#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
    27	
    28	#if __cplusplus >= 201703L
    29	
    30	// work around PR85827
    31	// 1-arg __convert_x86 {{{1
    32	template <typename _To, typename _V, typename _Traits>
    33	  _GLIBCXX_SIMD_INTRINSIC _To
    34	  __convert_x86(_V __v)
    35	  {
    36	    static_assert(__is_vector_type_v<_V>);
    37	    using _Tp = typename _Traits::value_type;
    38	    constexpr size_t _Np = _Traits::_S_full_size;
    39	    [[maybe_unused]] const auto __intrin = __to_intrin(__v);
    40	    using _Up = typename _VectorTraits<_To>::value_type;
    41	    constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
    42	
    43	    // [xyz]_to_[xyz] {{{2
    44	    [[maybe_unused]] constexpr bool __x_to_x
    45	      = sizeof(__v) <= 16 && sizeof(_To) <= 16;
    46	    [[maybe_unused]] constexpr bool __x_to_y
    47	      = sizeof(__v) <= 16 && sizeof(_To) == 32;
    48	    [[maybe_unused]] constexpr bool __x_to_z
    49	      = sizeof(__v) <= 16 && sizeof(_To) == 64;
    50	    [[maybe_unused]] constexpr bool __y_to_x
    51	      = sizeof(__v) == 32 && sizeof(_To) <= 16;
    52	    [[maybe_unused]] constexpr bool __y_to_y
    53	      = sizeof(__v) == 32 && sizeof(_To) == 32;
    54	    [[maybe_unused]] constexpr bool __y_to_z
    55	      = sizeof(__v) == 32 && sizeof(_To) == 64;
    56	    [[maybe_unused]] constexpr bool __z_to_x
    57	      = sizeof(__v) == 64 && sizeof(_To) <= 16;
    58	    [[maybe_unused]] constexpr bool __z_to_y
    59	      = sizeof(__v) == 64 && sizeof(_To) == 32;
    60	    [[maybe_unused]] constexpr bool __z_to_z
    61	      = sizeof(__v) == 64 && sizeof(_To) == 64;
    62	
    63	    // iX_to_iX {{{2
    64	    [[maybe_unused]] constexpr bool __i_to_i
    65	      = is_integral_v<_Up> && is_integral_v<_Tp>;
    66	    [[maybe_unused]] constexpr bool __i8_to_i16
    67	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
    68	    [[maybe_unused]] constexpr bool __i8_to_i32
    69	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
    70	    [[maybe_unused]] constexpr bool __i8_to_i64
    71	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
    72	    [[maybe_unused]] constexpr bool __i16_to_i8
    73	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
    74	    [[maybe_unused]] constexpr bool __i16_to_i32
    75	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
    76	    [[maybe_unused]] constexpr bool __i16_to_i64
    77	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
    78	    [[maybe_unused]] constexpr bool __i32_to_i8
    79	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
    80	    [[maybe_unused]] constexpr bool __i32_to_i16
    81	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
    82	    [[maybe_unused]] constexpr bool __i32_to_i64
    83	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
    84	    [[maybe_unused]] constexpr bool __i64_to_i8
    85	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
    86	    [[maybe_unused]] constexpr bool __i64_to_i16
    87	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
    88	    [[maybe_unused]] constexpr bool __i64_to_i32
    89	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
    90	
    91	    // [fsu]X_to_[fsu]X {{{2
    92	    // ibw = integral && byte or word, i.e. char and short with any signedness
    93	    [[maybe_unused]] constexpr bool __s64_to_f32
    94	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
    95		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
    96	    [[maybe_unused]] constexpr bool __s32_to_f32
    97	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
    98		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
    99	    [[maybe_unused]] constexpr bool __s16_to_f32
   100	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
   101		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   102	    [[maybe_unused]] constexpr bool __s8_to_f32
   103	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
   104		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   105	    [[maybe_unused]] constexpr bool __u64_to_f32
   106	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
   107		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   108	    [[maybe_unused]] constexpr bool __u32_to_f32
   109	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
   110		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   111	    [[maybe_unused]] constexpr bool __u16_to_f32
   112	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
   113		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   114	    [[maybe_unused]] constexpr bool __u8_to_f32
   115	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
   116		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   117	    [[maybe_unused]] constexpr bool __s64_to_f64
   118	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
   119		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   120	    [[maybe_unused]] constexpr bool __s32_to_f64
   121	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
   122		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   123	    [[maybe_unused]] constexpr bool __u64_to_f64
   124	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
   125		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   126	    [[maybe_unused]] constexpr bool __u32_to_f64
   127	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
   128		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   129	    [[maybe_unused]] constexpr bool __f32_to_s64
   130	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
   131		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   132	    [[maybe_unused]] constexpr bool __f32_to_s32
   133	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
   134		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   135	    [[maybe_unused]] constexpr bool __f32_to_u64
   136	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
   137		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   138	    [[maybe_unused]] constexpr bool __f32_to_u32
   139	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
   140		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   141	    [[maybe_unused]] constexpr bool __f64_to_s64
   142	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
   143		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   144	    [[maybe_unused]] constexpr bool __f64_to_s32
   145	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
   146		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   147	    [[maybe_unused]] constexpr bool __f64_to_u64
   148	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
   149		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   150	    [[maybe_unused]] constexpr bool __f64_to_u32
   151	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
   152		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   153	    [[maybe_unused]] constexpr bool __ibw_to_f32
   154	      = is_integral_v<_Tp> && sizeof(_Tp) <= 2
   155		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   156	    [[maybe_unused]] constexpr bool __ibw_to_f64
   157	      = is_integral_v<_Tp> && sizeof(_Tp) <= 2
   158		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   159	    [[maybe_unused]] constexpr bool __f32_to_ibw
   160	      = is_integral_v<_Up> && sizeof(_Up) <= 2
   161		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   162	    [[maybe_unused]] constexpr bool __f64_to_ibw
   163	      = is_integral_v<_Up> && sizeof(_Up) <= 2
   164		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   165	    [[maybe_unused]] constexpr bool __f32_to_f64
   166	      = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
   167		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   168	    [[maybe_unused]] constexpr bool __f64_to_f32
   169	      = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
   170		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   171	
   172	    if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
   173	      return __convert_x86<_To>(__lo128(__v), __hi128(__v));
   174	    else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2
   175	      return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v),
   176			      __convert_x86<__vector_type_t<_Up, _M / 2>>(
   177				__extract_part<1, _Np / _M * 2>(__v)));
   178	    else if constexpr (__i_to_i) //{{{2
   179	      {
   180		static_assert(__x_to_x || __have_avx2,
   181			      "integral conversions with ymm registers require AVX2");
   182		static_assert(__have_avx512bw
   183				|| ((sizeof(_Tp) >= 4 || sizeof(__v) < 64)
   184				    && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
   185			      "8/16-bit integers in zmm registers require AVX512BW");
   186		static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) || __have_avx512f,
   187			      "integral conversions with ymm registers require AVX2");
   188	      }
   189	    if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2
   190			  sizeof(_Tp) == sizeof(_Up))
   191	      {
   192		// conversion uses simple bit reinterpretation (or no conversion at all)
   193		if constexpr (_Np >= _M)
   194		  return __intrin_bitcast<_To>(__v);
   195		else
   196		  return __zero_extend(__vector_bitcast<_Up>(__v));
   197	      }
   198	    else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2
   199	      // zero extend (eg. xmm -> ymm)
   200	      return __zero_extend(
   201		__convert_x86<__vector_type_t<
   202		  _Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v));
   203	    else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2
   204	      // partial input (eg. ymm -> xmm)
   205	      return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v));
   206	    else if constexpr (__i64_to_i32) //{{{2
   207	      {
   208		if constexpr (__x_to_x && __have_avx512vl)
   209		  return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin));
   210		else if constexpr (__x_to_x)
   211		  return __auto_bitcast(
   212		    _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8));
   213		else if constexpr (__y_to_x && __have_avx512vl)
   214		  return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin));
   215		else if constexpr (__y_to_x && __have_avx512f)
   216		  return __intrin_bitcast<_To>(
   217		    __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v))));
   218		else if constexpr (__y_to_x)
   219		  return __intrin_bitcast<_To>(
   220		    __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8),
   221						     0 + 4 * 2)));
   222		else if constexpr (__z_to_y)
   223		  return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin));
   224	      }
   225	    else if constexpr (__i64_to_i16) //{{{2
   226	      {
   227		if constexpr (__x_to_x && __have_avx512vl)
   228		  return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin));
   229		else if constexpr (__x_to_x && __have_avx512f)
   230		  return __intrin_bitcast<_To>(
   231		    __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
   232		else if constexpr (__x_to_x && __have_ssse3)
   233		  {
   234		    return __intrin_bitcast<_To>(
   235		      _mm_shuffle_epi8(__intrin,
   236				       _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
   237						     -0x80, -0x80, -0x80, -0x80, -0x80,
   238						     -0x80, -0x80, -0x80, -0x80)));
   239		    // fallback without SSSE3
   240		  }
   241		else if constexpr (__y_to_x && __have_avx512vl)
   242		  return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin));
   243		else if constexpr (__y_to_x && __have_avx512f)
   244		  return __intrin_bitcast<_To>(
   245		    __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
   246		else if constexpr (__y_to_x)
   247		  {
   248		    const auto __a = _mm256_shuffle_epi8(
   249		      __intrin,
   250		      _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
   251				       -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
   252				       -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
   253				       -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
   254				       -0x80));
   255		    return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
   256		  }
   257		else if constexpr (__z_to_x)
   258		  return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin));
   259	      }
   260	    else if constexpr (__i64_to_i8) //{{{2
   261	      {
   262		if constexpr (__x_to_x && __have_avx512vl)
   263		  return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin));
   264		else if constexpr (__x_to_x && __have_avx512f)
   265		  return __intrin_bitcast<_To>(
   266		    __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin))));
   267		else if constexpr (__y_to_x && __have_avx512vl)
   268		  return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin));
   269		else if constexpr (__y_to_x && __have_avx512f)
   270		  return __intrin_bitcast<_To>(
   271		    _mm512_cvtepi64_epi8(__zero_extend(__intrin)));
   272		else if constexpr (__z_to_x)
   273		  return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin));
   274	      }
   275	    else if constexpr (__i32_to_i64) //{{{2
   276	      {
   277		if constexpr (__have_sse4_1 && __x_to_x)
   278		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   279						 ? _mm_cvtepi32_epi64(__intrin)
   280						 : _mm_cvtepu32_epi64(__intrin));
   281		else if constexpr (__x_to_x)
   282		  {
   283		    return __intrin_bitcast<_To>(
   284		      _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp>
   285						     ? _mm_srai_epi32(__intrin, 31)
   286						     : __m128i()));
   287		  }
   288		else if constexpr (__x_to_y)
   289		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   290						 ? _mm256_cvtepi32_epi64(__intrin)
   291						 : _mm256_cvtepu32_epi64(__intrin));
   292		else if constexpr (__y_to_z)
   293		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   294						 ? _mm512_cvtepi32_epi64(__intrin)
   295						 : _mm512_cvtepu32_epi64(__intrin));
   296	      }
   297	    else if constexpr (__i32_to_i16) //{{{2
   298	      {
   299		if constexpr (__x_to_x && __have_avx512vl)
   300		  return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin));
   301		else if constexpr (__x_to_x && __have_avx512f)
   302		  return __intrin_bitcast<_To>(
   303		    __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
   304		else if constexpr (__x_to_x && __have_ssse3)
   305		  return __intrin_bitcast<_To>(_mm_shuffle_epi8(
   306		    __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
   307					    -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
   308		else if constexpr (__x_to_x)
   309		  {
   310		    auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o
   311		    auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o
   312		    auto __c = _mm_unpacklo_epi16(__a, __b);            // 02oo ..oo
   313		    auto __d = _mm_unpackhi_epi16(__a, __b);            // 13oo ..oo
   314		    return __intrin_bitcast<_To>(
   315		      _mm_unpacklo_epi16(__c, __d)); // 0123 oooo
   316		  }
   317		else if constexpr (__y_to_x && __have_avx512vl)
   318		  return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin));
   319		else if constexpr (__y_to_x && __have_avx512f)
   320		  return __intrin_bitcast<_To>(
   321		    __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
   322		else if constexpr (__y_to_x)
   323		  {
   324		    auto __a = _mm256_shuffle_epi8(
   325		      __intrin,
   326		      _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
   327				       -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
   328				       9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
   329				       -0x80, -0x80, -0x80));
   330		    return __intrin_bitcast<_To>(__lo128(
   331		      _mm256_permute4x64_epi64(__a,
   332					       0xf8))); // __a[0] __a[2] | __a[3] __a[3]
   333		  }
   334		else if constexpr (__z_to_y)
   335		  return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin));
   336	      }
   337	    else if constexpr (__i32_to_i8) //{{{2
   338	      {
   339		if constexpr (__x_to_x && __have_avx512vl)
   340		  return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin));
   341		else if constexpr (__x_to_x && __have_avx512f)
   342		  return __intrin_bitcast<_To>(
   343		    __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin))));
   344		else if constexpr (__x_to_x && __have_ssse3)
   345		  {
   346		    return __intrin_bitcast<_To>(
   347		      _mm_shuffle_epi8(__intrin,
   348				       _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
   349						     -0x80, -0x80, -0x80, -0x80, -0x80,
   350						     -0x80, -0x80, -0x80, -0x80)));
   351		  }
   352		else if constexpr (__x_to_x)
   353		  {
   354		    const auto __a
   355		      = _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... ....
   356		    const auto __b
   357		      = _mm_unpackhi_epi8(__intrin, __intrin);    // 2... .... 3... ....
   358		    const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... ....
   359		    const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... ....
   360		    const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... ....
   361		    return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
   362		  }
   363		else if constexpr (__y_to_x && __have_avx512vl)
   364		  return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin));
   365		else if constexpr (__y_to_x && __have_avx512f)
   366		  return __intrin_bitcast<_To>(
   367		    _mm512_cvtepi32_epi8(__zero_extend(__intrin)));
   368		else if constexpr (__z_to_x)
   369		  return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin));
   370	      }
   371	    else if constexpr (__i16_to_i64) //{{{2
   372	      {
   373		if constexpr (__x_to_x && __have_sse4_1)
   374		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   375						 ? _mm_cvtepi16_epi64(__intrin)
   376						 : _mm_cvtepu16_epi64(__intrin));
   377		else if constexpr (__x_to_x && is_signed_v<_Tp>)
   378		  {
   379		    auto __x = _mm_srai_epi16(__intrin, 15);
   380		    auto __y = _mm_unpacklo_epi16(__intrin, __x);
   381		    __x = _mm_unpacklo_epi16(__x, __x);
   382		    return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x));
   383		  }
   384		else if constexpr (__x_to_x)
   385		  return __intrin_bitcast<_To>(
   386		    _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()),
   387				       __m128i()));
   388		else if constexpr (__x_to_y)
   389		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   390						 ? _mm256_cvtepi16_epi64(__intrin)
   391						 : _mm256_cvtepu16_epi64(__intrin));
   392		else if constexpr (__x_to_z)
   393		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   394						 ? _mm512_cvtepi16_epi64(__intrin)
   395						 : _mm512_cvtepu16_epi64(__intrin));
   396	      }
   397	    else if constexpr (__i16_to_i32) //{{{2
   398	      {
   399		if constexpr (__x_to_x && __have_sse4_1)
   400		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   401						 ? _mm_cvtepi16_epi32(__intrin)
   402						 : _mm_cvtepu16_epi32(__intrin));
   403		else if constexpr (__x_to_x && is_signed_v<_Tp>)
   404		  return __intrin_bitcast<_To>(
   405		    _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16));
   406		else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
   407		  return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i()));
   408		else if constexpr (__x_to_y)
   409		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   410						 ? _mm256_cvtepi16_epi32(__intrin)
   411						 : _mm256_cvtepu16_epi32(__intrin));
   412		else if constexpr (__y_to_z)
   413		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   414						 ? _mm512_cvtepi16_epi32(__intrin)
   415						 : _mm512_cvtepu16_epi32(__intrin));
   416	      }
   417	    else if constexpr (__i16_to_i8) //{{{2
   418	      {
   419		if constexpr (__x_to_x && __have_avx512bw_vl)
   420		  return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin));
   421		else if constexpr (__x_to_x && __have_avx512bw)
   422		  return __intrin_bitcast<_To>(
   423		    __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
   424		else if constexpr (__x_to_x && __have_ssse3)
   425		  return __intrin_bitcast<_To>(_mm_shuffle_epi8(
   426		    __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
   427					    -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
   428		else if constexpr (__x_to_x)
   429		  {
   430		    auto __a
   431		      = _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33..
   432		    auto __b
   433		      = _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77..
   434		    auto __c = _mm_unpacklo_epi8(__a, __b);    // 0404 .... 1515 ....
   435		    auto __d = _mm_unpackhi_epi8(__a, __b);    // 2626 .... 3737 ....
   436		    auto __e = _mm_unpacklo_epi8(__c, __d);    // 0246 0246 .... ....
   437		    auto __f = _mm_unpackhi_epi8(__c, __d);    // 1357 1357 .... ....
   438		    return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
   439		  }
   440		else if constexpr (__y_to_x && __have_avx512bw_vl)
   441		  return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin));
   442		else if constexpr (__y_to_x && __have_avx512bw)
   443		  return __intrin_bitcast<_To>(
   444		    __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
   445		else if constexpr (__y_to_x)
   446		  {
   447		    auto __a = _mm256_shuffle_epi8(
   448		      __intrin,
   449		      _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
   450				       -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
   451				       -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
   452				       4, 6, 8, 10, 12, 14));
   453		    return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
   454		  }
   455		else if constexpr (__z_to_y && __have_avx512bw)
   456		  return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin));
   457		else if constexpr (__z_to_y)
   458		  __assert_unreachable<_Tp>();
   459	      }
   460	    else if constexpr (__i8_to_i64) //{{{2
   461	      {
   462		if constexpr (__x_to_x && __have_sse4_1)
   463		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   464						 ? _mm_cvtepi8_epi64(__intrin)
   465						 : _mm_cvtepu8_epi64(__intrin));
   466		else if constexpr (__x_to_x && is_signed_v<_Tp>)
   467		  {
   468		    if constexpr (__have_ssse3)
   469		      {
   470			auto __dup = _mm_unpacklo_epi8(__intrin, __intrin);
   471			auto __epi16 = _mm_srai_epi16(__dup, 8);
   472			_mm_shuffle_epi8(__epi16,
   473					 _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
   474						       3, 3, 3, 3, 3));
   475		      }
   476		    else
   477		      {
   478			auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
   479			__x = _mm_unpacklo_epi16(__x, __x);
   480			return __intrin_bitcast<_To>(
   481			  _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24),
   482					     _mm_srai_epi32(__x, 31)));
   483		      }
   484		  }
   485		else if constexpr (__x_to_x)
   486		  {
   487		    return __intrin_bitcast<_To>(_mm_unpacklo_epi32(
   488		      _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
   489					 __m128i()),
   490		      __m128i()));
   491		  }
   492		else if constexpr (__x_to_y)
   493		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   494						 ? _mm256_cvtepi8_epi64(__intrin)
   495						 : _mm256_cvtepu8_epi64(__intrin));
   496		else if constexpr (__x_to_z)
   497		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   498						 ? _mm512_cvtepi8_epi64(__intrin)
   499						 : _mm512_cvtepu8_epi64(__intrin));
   500	      }
   501	    else if constexpr (__i8_to_i32) //{{{2
   502	      {
   503		if constexpr (__x_to_x && __have_sse4_1)
   504		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   505						 ? _mm_cvtepi8_epi32(__intrin)
   506						 : _mm_cvtepu8_epi32(__intrin));
   507		else if constexpr (__x_to_x && is_signed_v<_Tp>)
   508		  {
   509		    const auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
   510		    return __intrin_bitcast<_To>(
   511		      _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24));
   512		  }
   513		else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
   514		  return __intrin_bitcast<_To>(
   515		    _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
   516				       __m128i()));
   517		else if constexpr (__x_to_y)
   518		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   519						 ? _mm256_cvtepi8_epi32(__intrin)
   520						 : _mm256_cvtepu8_epi32(__intrin));
   521		else if constexpr (__x_to_z)
   522		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   523						 ? _mm512_cvtepi8_epi32(__intrin)
   524						 : _mm512_cvtepu8_epi32(__intrin));
   525	      }
   526	    else if constexpr (__i8_to_i16) //{{{2
   527	      {
   528		if constexpr (__x_to_x && __have_sse4_1)
   529		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   530						 ? _mm_cvtepi8_epi16(__intrin)
   531						 : _mm_cvtepu8_epi16(__intrin));
   532		else if constexpr (__x_to_x && is_signed_v<_Tp>)
   533		  return __intrin_bitcast<_To>(
   534		    _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8));
   535		else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
   536		  return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i()));
   537		else if constexpr (__x_to_y)
   538		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   539						 ? _mm256_cvtepi8_epi16(__intrin)
   540						 : _mm256_cvtepu8_epi16(__intrin));
   541		else if constexpr (__y_to_z && __have_avx512bw)
   542		  return __intrin_bitcast<_To>(is_signed_v<_Tp>
   543						 ? _mm512_cvtepi8_epi16(__intrin)
   544						 : _mm512_cvtepu8_epi16(__intrin));
   545		else if constexpr (__y_to_z)
   546		  __assert_unreachable<_Tp>();
   547	      }
   548	    else if constexpr (__f32_to_s64) //{{{2
   549	      {
   550		if constexpr (__have_avx512dq_vl && __x_to_x)
   551		  return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin));
   552		else if constexpr (__have_avx512dq_vl && __x_to_y)
   553		  return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin));
   554		else if constexpr (__have_avx512dq && __y_to_z)
   555		  return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin));
   556		// else use scalar fallback
   557	      }
   558	    else if constexpr (__f32_to_u64) //{{{2
   559	      {
   560		if constexpr (__have_avx512dq_vl && __x_to_x)
   561		  return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin));
   562		else if constexpr (__have_avx512dq_vl && __x_to_y)
   563		  return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin));
   564		else if constexpr (__have_avx512dq && __y_to_z)
   565		  return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin));
   566		// else use scalar fallback
   567	      }
   568	    else if constexpr (__f32_to_s32) //{{{2
   569	      {
   570		if constexpr (__x_to_x || __y_to_y || __z_to_z)
   571		  {
   572		    // go to fallback, it does the right thing
   573		  }
   574		else
   575		  __assert_unreachable<_Tp>();
   576	      }
   577	    else if constexpr (__f32_to_u32) //{{{2
   578	      {
   579		if constexpr (__have_avx512vl && __x_to_x)
   580		  return __auto_bitcast(_mm_cvttps_epu32(__intrin));
   581		else if constexpr (__have_avx512f && __x_to_x)
   582		  return __auto_bitcast(
   583		    __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v))));
   584		else if constexpr (__have_avx512vl && __y_to_y)
   585		  return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin));
   586		else if constexpr (__have_avx512f && __y_to_y)
   587		  return __vector_bitcast<_Up>(
   588		    __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v))));
   589		else if constexpr (__x_to_x || __y_to_y || __z_to_z)
   590		  {
   591		    // go to fallback, it does the right thing. We can't use the
   592		    // _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would
   593		    // discard small input values (only 24 mantissa bits)
   594		  }
   595		else
   596		  __assert_unreachable<_Tp>();
   597	      }
   598	    else if constexpr (__f32_to_ibw) //{{{2
   599	      return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v));
   600	    else if constexpr (__f64_to_s64) //{{{2
   601	      {
   602		if constexpr (__have_avx512dq_vl && __x_to_x)
   603		  return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin));
   604		else if constexpr (__have_avx512dq_vl && __y_to_y)
   605		  return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin));
   606		else if constexpr (__have_avx512dq && __z_to_z)
   607		  return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin));
   608		// else use scalar fallback
   609	      }
   610	    else if constexpr (__f64_to_u64) //{{{2
   611	      {
   612		if constexpr (__have_avx512dq_vl && __x_to_x)
   613		  return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin));
   614		else if constexpr (__have_avx512dq_vl && __y_to_y)
   615		  return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin));
   616		else if constexpr (__have_avx512dq && __z_to_z)
   617		  return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin));
   618		// else use scalar fallback
   619	      }
   620	    else if constexpr (__f64_to_s32) //{{{2
   621	      {
   622		if constexpr (__x_to_x)
   623		  return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin));
   624		else if constexpr (__y_to_x)
   625		  return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin));
   626		else if constexpr (__z_to_y)
   627		  return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin));
   628	      }
   629	    else if constexpr (__f64_to_u32) //{{{2
   630	      {
   631		if constexpr (__have_avx512vl && __x_to_x)
   632		  return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin));
   633		else if constexpr (__have_sse4_1 && __x_to_x)
   634		  return __vector_bitcast<_Up, _M>(
   635			   _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u))
   636			 ^ 0x8000'0000u;
   637		else if constexpr (__x_to_x)
   638		  {
   639		    // use scalar fallback: it's only 2 values to convert, can't get
   640		    // much better than scalar decomposition
   641		  }
   642		else if constexpr (__have_avx512vl && __y_to_x)
   643		  return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin));
   644		else if constexpr (__y_to_x)
   645		  {
   646		    return __intrin_bitcast<_To>(
   647		      __vector_bitcast<_Up>(
   648			_mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u))
   649		      ^ 0x8000'0000u);
   650		  }
   651		else if constexpr (__z_to_y)
   652		  return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin));
   653	      }
   654	    else if constexpr (__f64_to_ibw) //{{{2
   655	      {
   656		return __convert_x86<_To>(
   657		  __convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v));
   658	      }
   659	    else if constexpr (__s64_to_f32) //{{{2
   660	      {
   661		if constexpr (__x_to_x && __have_avx512dq_vl)
   662		  return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin));
   663		else if constexpr (__y_to_x && __have_avx512dq_vl)
   664		  return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin));
   665		else if constexpr (__z_to_y && __have_avx512dq)
   666		  return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin));
   667		else if constexpr (__z_to_y)
   668		  return __intrin_bitcast<_To>(
   669		    _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v)));
   670	      }
   671	    else if constexpr (__u64_to_f32) //{{{2
   672	      {
   673		if constexpr (__x_to_x && __have_avx512dq_vl)
   674		  return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin));
   675		else if constexpr (__y_to_x && __have_avx512dq_vl)
   676		  return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin));
   677		else if constexpr (__z_to_y && __have_avx512dq)
   678		  return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin));
   679		else if constexpr (__z_to_y)
   680		  {
   681		    return __intrin_bitcast<_To>(
   682		      __lo256(_mm512_cvtepu32_ps(__auto_bitcast(
   683			_mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32)))))
   684			* 0x100000000LL
   685		      + __lo256(_mm512_cvtepu32_ps(
   686			__auto_bitcast(_mm512_cvtepi64_epi32(__intrin)))));
   687		  }
   688	      }
   689	    else if constexpr (__s32_to_f32) //{{{2
   690	      {
   691		// use fallback (builtin conversion)
   692	      }
   693	    else if constexpr (__u32_to_f32) //{{{2
   694	      {
   695		if constexpr (__x_to_x && __have_avx512vl)
   696		  {
   697		    // use fallback
   698		  }
   699		else if constexpr (__x_to_x && __have_avx512f)
   700		  return __intrin_bitcast<_To>(
   701		    __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
   702		else if constexpr (__x_to_x && (__have_fma || __have_fma4))
   703		  // work around PR85819
   704		  return __auto_bitcast(0x10000
   705					  * _mm_cvtepi32_ps(__to_intrin(__v >> 16))
   706					+ _mm_cvtepi32_ps(__to_intrin(__v & 0xffff)));
   707		else if constexpr (__y_to_y && __have_avx512vl)
   708		  {
   709		    // use fallback
   710		  }
   711		else if constexpr (__y_to_y && __have_avx512f)
   712		  return __intrin_bitcast<_To>(
   713		    __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
   714		else if constexpr (__y_to_y)
   715		  // work around PR85819
   716		  return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16))
   717			 + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff));
   718		// else use fallback (builtin conversion)
   719	      }
   720	    else if constexpr (__ibw_to_f32) //{{{2
   721	      {
   722		if constexpr (_M <= 4 || __have_avx2)
   723		  return __convert_x86<_To>(
   724		    __convert_x86<__vector_type_t<int, _M>>(__v));
   725		else
   726		  {
   727		    static_assert(__x_to_y);
   728		    __m128i __a, __b;
   729		    if constexpr (__have_sse4_1)
   730		      {
   731			__a = sizeof(_Tp) == 2
   732				? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin)
   733						    : _mm_cvtepu16_epi32(__intrin))
   734				: (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin)
   735						    : _mm_cvtepu8_epi32(__intrin));
   736			const auto __w
   737			  = _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9);
   738			__b = sizeof(_Tp) == 2
   739				? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
   740						    : _mm_cvtepu16_epi32(__w))
   741				: (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
   742						    : _mm_cvtepu8_epi32(__w));
   743		      }
   744		    else
   745		      {
   746			__m128i __tmp;
   747			if constexpr (sizeof(_Tp) == 1)
   748			  {
   749			    __tmp = is_signed_v<_Tp>
   750				      ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin,
   751									 __intrin),
   752						       8)
   753				      : _mm_unpacklo_epi8(__intrin, __m128i());
   754			  }
   755			else
   756			  {
   757			    static_assert(sizeof(_Tp) == 2);
   758			    __tmp = __intrin;
   759			  }
   760			__a = is_signed_v<_Tp>
   761				? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
   762				: _mm_unpacklo_epi16(__tmp, __m128i());
   763			__b = is_signed_v<_Tp>
   764				? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
   765				: _mm_unpackhi_epi16(__tmp, __m128i());
   766		      }
   767		    return __convert_x86<_To>(__vector_bitcast<int>(__a),
   768					      __vector_bitcast<int>(__b));
   769		  }
   770	      }
   771	    else if constexpr (__s64_to_f64) //{{{2
   772	      {
   773		if constexpr (__x_to_x && __have_avx512dq_vl)
   774		  return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin));
   775		else if constexpr (__y_to_y && __have_avx512dq_vl)
   776		  return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin));
   777		else if constexpr (__z_to_z && __have_avx512dq)
   778		  return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin));
   779		else if constexpr (__z_to_z)
   780		  {
   781		    return __intrin_bitcast<_To>(
   782		      _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
   783			* 0x100000000LL
   784		      + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
   785		  }
   786	      }
   787	    else if constexpr (__u64_to_f64) //{{{2
   788	      {
   789		if constexpr (__x_to_x && __have_avx512dq_vl)
   790		  return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin));
   791		else if constexpr (__y_to_y && __have_avx512dq_vl)
   792		  return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin));
   793		else if constexpr (__z_to_z && __have_avx512dq)
   794		  return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin));
   795		else if constexpr (__z_to_z)
   796		  {
   797		    return __intrin_bitcast<_To>(
   798		      _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
   799			* 0x100000000LL
   800		      + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
   801		  }
   802	      }
   803	    else if constexpr (__s32_to_f64) //{{{2
   804	      {
   805		if constexpr (__x_to_x)
   806		  return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin));
   807		else if constexpr (__x_to_y)
   808		  return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin));
   809		else if constexpr (__y_to_z)
   810		  return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin));
   811	      }
   812	    else if constexpr (__u32_to_f64) //{{{2
   813	      {
   814		if constexpr (__x_to_x && __have_avx512vl)
   815		  return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin));
   816		else if constexpr (__x_to_x && __have_avx512f)
   817		  return __intrin_bitcast<_To>(
   818		    __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
   819		else if constexpr (__x_to_x)
   820		  return __intrin_bitcast<_To>(
   821		    _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
   822		else if constexpr (__x_to_y && __have_avx512vl)
   823		  return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin));
   824		else if constexpr (__x_to_y && __have_avx512f)
   825		  return __intrin_bitcast<_To>(
   826		    __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
   827		else if constexpr (__x_to_y)
   828		  return __intrin_bitcast<_To>(
   829		    _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
   830		else if constexpr (__y_to_z)
   831		  return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin));
   832	      }
   833	    else if constexpr (__ibw_to_f64) //{{{2
   834	      {
   835		return __convert_x86<_To>(
   836		  __convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v));
   837	      }
   838	    else if constexpr (__f32_to_f64) //{{{2
   839	      {
   840		if constexpr (__x_to_x)
   841		  return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin));
   842		else if constexpr (__x_to_y)
   843		  return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin));
   844		else if constexpr (__y_to_z)
   845		  return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin));
   846	      }
   847	    else if constexpr (__f64_to_f32) //{{{2
   848	      {
   849		if constexpr (__x_to_x)
   850		  return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin));
   851		else if constexpr (__y_to_x)
   852		  return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin));
   853		else if constexpr (__z_to_y)
   854		  return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin));
   855	      }
   856	    else //{{{2
   857	      __assert_unreachable<_Tp>();
   858	
   859	    // fallback:{{{2
   860	    return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>());
   861	    //}}}
   862	  }
   863	
   864	// }}}
   865	// 2-arg __convert_x86 {{{1
   866	template <typename _To, typename _V, typename _Traits>
   867	  _GLIBCXX_SIMD_INTRINSIC _To
   868	  __convert_x86(_V __v0, _V __v1)
   869	  {
   870	    static_assert(__is_vector_type_v<_V>);
   871	    using _Tp = typename _Traits::value_type;
   872	    constexpr size_t _Np = _Traits::_S_full_size;
   873	    [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
   874	    [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
   875	    using _Up = typename _VectorTraits<_To>::value_type;
   876	    constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
   877	
   878	    static_assert(2 * _Np <= _M,
   879			  "__v1 would be discarded; use the one-argument "
   880			  "__convert_x86 overload instead");
   881	
   882	    // [xyz]_to_[xyz] {{{2
   883	    [[maybe_unused]] constexpr bool __x_to_x
   884	      = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
   885	    [[maybe_unused]] constexpr bool __x_to_y
   886	      = sizeof(__v0) <= 16 && sizeof(_To) == 32;
   887	    [[maybe_unused]] constexpr bool __x_to_z
   888	      = sizeof(__v0) <= 16 && sizeof(_To) == 64;
   889	    [[maybe_unused]] constexpr bool __y_to_x
   890	      = sizeof(__v0) == 32 && sizeof(_To) <= 16;
   891	    [[maybe_unused]] constexpr bool __y_to_y
   892	      = sizeof(__v0) == 32 && sizeof(_To) == 32;
   893	    [[maybe_unused]] constexpr bool __y_to_z
   894	      = sizeof(__v0) == 32 && sizeof(_To) == 64;
   895	    [[maybe_unused]] constexpr bool __z_to_x
   896	      = sizeof(__v0) == 64 && sizeof(_To) <= 16;
   897	    [[maybe_unused]] constexpr bool __z_to_y
   898	      = sizeof(__v0) == 64 && sizeof(_To) == 32;
   899	    [[maybe_unused]] constexpr bool __z_to_z
   900	      = sizeof(__v0) == 64 && sizeof(_To) == 64;
   901	
   902	    // iX_to_iX {{{2
   903	    [[maybe_unused]] constexpr bool __i_to_i
   904	      = is_integral_v<_Up> && is_integral_v<_Tp>;
   905	    [[maybe_unused]] constexpr bool __i8_to_i16
   906	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
   907	    [[maybe_unused]] constexpr bool __i8_to_i32
   908	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
   909	    [[maybe_unused]] constexpr bool __i8_to_i64
   910	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
   911	    [[maybe_unused]] constexpr bool __i16_to_i8
   912	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
   913	    [[maybe_unused]] constexpr bool __i16_to_i32
   914	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
   915	    [[maybe_unused]] constexpr bool __i16_to_i64
   916	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
   917	    [[maybe_unused]] constexpr bool __i32_to_i8
   918	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
   919	    [[maybe_unused]] constexpr bool __i32_to_i16
   920	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
   921	    [[maybe_unused]] constexpr bool __i32_to_i64
   922	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
   923	    [[maybe_unused]] constexpr bool __i64_to_i8
   924	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
   925	    [[maybe_unused]] constexpr bool __i64_to_i16
   926	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
   927	    [[maybe_unused]] constexpr bool __i64_to_i32
   928	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
   929	
   930	    // [fsu]X_to_[fsu]X {{{2
   931	    // ibw = integral && byte or word, i.e. char and short with any signedness
   932	    [[maybe_unused]] constexpr bool __i64_to_f32
   933	      = is_integral_v<_Tp> && sizeof(_Tp) == 8
   934		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   935	    [[maybe_unused]] constexpr bool __s32_to_f32
   936	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
   937		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   938	    [[maybe_unused]] constexpr bool __s16_to_f32
   939	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
   940		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   941	    [[maybe_unused]] constexpr bool __s8_to_f32
   942	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
   943		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   944	    [[maybe_unused]] constexpr bool __u32_to_f32
   945	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
   946		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   947	    [[maybe_unused]] constexpr bool __u16_to_f32
   948	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
   949		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   950	    [[maybe_unused]] constexpr bool __u8_to_f32
   951	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
   952		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
   953	    [[maybe_unused]] constexpr bool __s64_to_f64
   954	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
   955		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   956	    [[maybe_unused]] constexpr bool __s32_to_f64
   957	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
   958		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   959	    [[maybe_unused]] constexpr bool __s16_to_f64
   960	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
   961		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   962	    [[maybe_unused]] constexpr bool __s8_to_f64
   963	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
   964		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   965	    [[maybe_unused]] constexpr bool __u64_to_f64
   966	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
   967		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   968	    [[maybe_unused]] constexpr bool __u32_to_f64
   969	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
   970		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   971	    [[maybe_unused]] constexpr bool __u16_to_f64
   972	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
   973		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   974	    [[maybe_unused]] constexpr bool __u8_to_f64
   975	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
   976		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
   977	    [[maybe_unused]] constexpr bool __f32_to_s64
   978	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
   979		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   980	    [[maybe_unused]] constexpr bool __f32_to_s32
   981	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
   982		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   983	    [[maybe_unused]] constexpr bool __f32_to_u64
   984	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
   985		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   986	    [[maybe_unused]] constexpr bool __f32_to_u32
   987	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
   988		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
   989	    [[maybe_unused]] constexpr bool __f64_to_s64
   990	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
   991		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   992	    [[maybe_unused]] constexpr bool __f64_to_s32
   993	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
   994		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   995	    [[maybe_unused]] constexpr bool __f64_to_u64
   996	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
   997		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
   998	    [[maybe_unused]] constexpr bool __f64_to_u32
   999	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
  1000		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1001	    [[maybe_unused]] constexpr bool __f32_to_ibw
  1002	      = is_integral_v<_Up> && sizeof(_Up) <= 2
  1003		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
  1004	    [[maybe_unused]] constexpr bool __f64_to_ibw
  1005	      = is_integral_v<_Up> && sizeof(_Up) <= 2
  1006		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1007	    [[maybe_unused]] constexpr bool __f32_to_f64
  1008	      = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
  1009		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1010	    [[maybe_unused]] constexpr bool __f64_to_f32
  1011	      = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
  1012		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1013	
  1014	    if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
  1015	      // <double, 4>, <double, 4> => <short, 8>
  1016	      return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
  1017					__hi128(__v1));
  1018	    else if constexpr (__i_to_i) // assert ISA {{{2
  1019	      {
  1020		static_assert(__x_to_x || __have_avx2,
  1021			      "integral conversions with ymm registers require AVX2");
  1022		static_assert(__have_avx512bw
  1023				|| ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64)
  1024				    && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
  1025			      "8/16-bit integers in zmm registers require AVX512BW");
  1026		static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
  1027			      "integral conversions with ymm registers require AVX2");
  1028	      }
  1029	    // concat => use 1-arg __convert_x86 {{{2
  1030	    if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2)
  1031			  || (sizeof(__v0) == 16 && __have_avx
  1032			      && is_floating_point_v<_Tp>)
  1033			  || (sizeof(__v0) == 32 && __have_avx512f
  1034			      && (sizeof(_Tp) >= 4 || __have_avx512bw)))
  1035	      {
  1036		// The ISA can handle wider input registers, so concat and use one-arg
  1037		// implementation. This reduces code duplication considerably.
  1038		return __convert_x86<_To>(__concat(__v0, __v1));
  1039	      }
  1040	    else //{{{2
  1041	      {
  1042		// conversion using bit reinterpretation (or no conversion at all)
  1043		// should all go through the concat branch above:
  1044		static_assert(
  1045		  !(is_floating_point_v<
  1046		      _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
  1047		// handle all zero extension{{{2
  1048		if constexpr (2 * _Np < _M && sizeof(_To) > 16)
  1049		  {
  1050		    constexpr size_t Min = 16 / sizeof(_Up);
  1051		    return __zero_extend(
  1052		      __convert_x86<
  1053			__vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0,
  1054									       __v1));
  1055		  }
  1056		else if constexpr (__i64_to_i32) //{{{2
  1057		  {
  1058		    if constexpr (__x_to_x)
  1059		      return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0),
  1060							   __auto_bitcast(__v1), 0x88));
  1061		    else if constexpr (__y_to_y)
  1062		      {
  1063			// AVX512F is not available (would concat otherwise)
  1064			return __auto_bitcast(
  1065			  __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
  1066						   __auto_bitcast(__v1), 0x88)));
  1067			// alternative:
  1068			// const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8);
  1069			// const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8);
  1070			// const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx,
  1071			// v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh,
  1072			// 0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3);  // abcdefgh
  1073		      }
  1074		    else if constexpr (__z_to_z)
  1075		      return __intrin_bitcast<_To>(
  1076			__concat(_mm512_cvtepi64_epi32(__i0),
  1077				 _mm512_cvtepi64_epi32(__i1)));
  1078		  }
  1079		else if constexpr (__i64_to_i16) //{{{2
  1080		  {
  1081		    if constexpr (__x_to_x)
  1082		      {
  1083			// AVX2 is not available (would concat otherwise)
  1084			if constexpr (__have_sse4_1)
  1085			  {
  1086			    return __intrin_bitcast<_To>(_mm_shuffle_epi8(
  1087			      _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
  1088			      _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
  1089					    -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
  1090			  }
  1091			else
  1092			  {
  1093			    return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
  1094							    _Up(__v1[0]), _Up(__v1[1])};
  1095			  }
  1096		      }
  1097		    else if constexpr (__y_to_x)
  1098		      {
  1099			auto __a
  1100			  = _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. ....
  1101			auto __b
  1102			  = _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. ....
  1103			auto __c
  1104			  = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 ....
  1105			return __intrin_bitcast<_To>(
  1106			  _mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567
  1107		      }
  1108		    else if constexpr (__z_to_y)
  1109		      return __intrin_bitcast<_To>(
  1110			__concat(_mm512_cvtepi64_epi16(__i0),
  1111				 _mm512_cvtepi64_epi16(__i1)));
  1112		  }
  1113		else if constexpr (__i64_to_i8) //{{{2
  1114		  {
  1115		    if constexpr (__x_to_x && __have_sse4_1)
  1116		      {
  1117			return __intrin_bitcast<_To>(_mm_shuffle_epi8(
  1118			  _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
  1119			  _mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
  1120					-0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
  1121					-0x80)));
  1122		      }
  1123		    else if constexpr (__x_to_x && __have_ssse3)
  1124		      {
  1125			return __intrin_bitcast<_To>(_mm_unpacklo_epi16(
  1126			  _mm_shuffle_epi8(
  1127			    __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
  1128						-0x80, -0x80, -0x80, -0x80, -0x80,
  1129						-0x80, -0x80, -0x80, -0x80)),
  1130			  _mm_shuffle_epi8(
  1131			    __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
  1132						-0x80, -0x80, -0x80, -0x80, -0x80,
  1133						-0x80, -0x80, -0x80, -0x80))));
  1134		      }
  1135		    else if constexpr (__x_to_x)
  1136		      {
  1137			return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
  1138							_Up(__v1[0]), _Up(__v1[1])};
  1139		      }
  1140		    else if constexpr (__y_to_x)
  1141		      {
  1142			const auto __a = _mm256_shuffle_epi8(
  1143			  _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA),
  1144			  _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
  1145					   -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
  1146					   -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
  1147					   -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
  1148					   -0x80, -0x80, -0x80, -0x80));
  1149			return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
  1150		      } // __z_to_x uses concat fallback
  1151		  }
  1152		else if constexpr (__i32_to_i16) //{{{2
  1153		  {
  1154		    if constexpr (__x_to_x)
  1155		      {
  1156			// AVX2 is not available (would concat otherwise)
  1157			if constexpr (__have_sse4_1)
  1158			  {
  1159			    return __intrin_bitcast<_To>(_mm_shuffle_epi8(
  1160			      _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa),
  1161			      _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
  1162					    11, 14, 15)));
  1163			  }
  1164			else if constexpr (__have_ssse3)
  1165			  {
  1166			    return __intrin_bitcast<_To>(
  1167			      _mm_hadd_epi16(__to_intrin(__v0 << 16),
  1168					     __to_intrin(__v1 << 16)));
  1169			    /*
  1170			    return _mm_unpacklo_epi64(
  1171				_mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9,
  1172			    12, 13, 8, 9, 12, 13, 12, 13, 14, 15)),
  1173			    _mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12,
  1174			    13, 8, 9, 12, 13, 12, 13, 14, 15)));
  1175								   */
  1176			  }
  1177			else
  1178			  {
  1179			    auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15..
  1180			    auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37..
  1181			    auto __c = _mm_unpacklo_epi16(__a, __b);   // 0246 ....
  1182			    auto __d = _mm_unpackhi_epi16(__a, __b);   // 1357 ....
  1183			    return __intrin_bitcast<_To>(
  1184			      _mm_unpacklo_epi16(__c, __d)); // 0123 4567
  1185			  }
  1186		      }
  1187		    else if constexpr (__y_to_y)
  1188		      {
  1189			const auto __shuf
  1190			  = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
  1191					     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
  1192					     0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
  1193					     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
  1194			auto __a = _mm256_shuffle_epi8(__i0, __shuf);
  1195			auto __b = _mm256_shuffle_epi8(__i1, __shuf);
  1196			return __intrin_bitcast<_To>(
  1197			  __xzyw(_mm256_unpacklo_epi64(__a, __b)));
  1198		      } // __z_to_z uses concat fallback
  1199		  }
  1200		else if constexpr (__i32_to_i8) //{{{2
  1201		  {
  1202		    if constexpr (__x_to_x && __have_ssse3)
  1203		      {
  1204			const auto shufmask
  1205			  = _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
  1206					  -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
  1207					  -0x80, -0x80);
  1208			return __intrin_bitcast<_To>(
  1209			  _mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask),
  1210					     _mm_shuffle_epi8(__i1, shufmask)));
  1211		      }
  1212		    else if constexpr (__x_to_x)
  1213		      {
  1214			auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. ....
  1215			auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. ....
  1216			auto __c = _mm_unpacklo_epi8(__a, __b);   // 0246 .... .... ....
  1217			auto __d = _mm_unpackhi_epi8(__a, __b);   // 1357 .... .... ....
  1218			auto __e = _mm_unpacklo_epi8(__c, __d);   // 0123 4567 .... ....
  1219			return __intrin_bitcast<_To>(__e & __m128i{-1, 0});
  1220		      }
  1221		    else if constexpr (__y_to_x)
  1222		      {
  1223			const auto __a = _mm256_shuffle_epi8(
  1224			  _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA),
  1225			  _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
  1226					   6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
  1227					   -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
  1228					   -0x80, -0x80, -0x80, 2, 6, 10, 14));
  1229			return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
  1230		      } // __z_to_y uses concat fallback
  1231		  }
  1232		else if constexpr (__i16_to_i8) //{{{2
  1233		  {
  1234		    if constexpr (__x_to_x && __have_ssse3)
  1235		      {
  1236			const auto __shuf = reinterpret_cast<__m128i>(
  1237			  __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80,
  1238						      0x80, 0x80, 0x80, 0x80, 0x80,
  1239						      0x80, 0x80});
  1240			return __intrin_bitcast<_To>(
  1241			  _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf),
  1242					     _mm_shuffle_epi8(__i1, __shuf)));
  1243		      }
  1244		    else if constexpr (__x_to_x)
  1245		      {
  1246			auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B..
  1247			auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F..
  1248			auto __c = _mm_unpacklo_epi8(__a, __b);   // 048C .... 159D ....
  1249			auto __d = _mm_unpackhi_epi8(__a, __b);   // 26AE .... 37BF ....
  1250			auto __e = _mm_unpacklo_epi8(__c, __d);   // 0246 8ACE .... ....
  1251			auto __f = _mm_unpackhi_epi8(__c, __d);   // 1357 9BDF .... ....
  1252			return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
  1253		      }
  1254		    else if constexpr (__y_to_y)
  1255		      {
  1256			return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8(
  1257			  (__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
  1258			    | _mm256_slli_epi16(__i1, 8),
  1259			  _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
  1260					   13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
  1261					   7, 9, 11, 13, 15))));
  1262		      } // __z_to_z uses concat fallback
  1263		  }
  1264		else if constexpr (__i64_to_f32) //{{{2
  1265		  {
  1266		    if constexpr (__x_to_x)
  1267		      return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]);
  1268		    else if constexpr (__y_to_y)
  1269		      {
  1270			static_assert(__y_to_y && __have_avx2);
  1271			const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG
  1272			const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH
  1273			const auto __lo32
  1274			  = _mm256_unpacklo_epi32(__a, __b); // abef cdgh
  1275			const auto __hi32 = __vector_bitcast<
  1276			  conditional_t<is_signed_v<_Tp>, int, _UInt>>(
  1277			  _mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH
  1278			const auto __hi
  1279			  = 0x100000000LL
  1280			    * __convert_x86<__vector_type_t<float, 8>>(__hi32);
  1281			const auto __mid
  1282			  = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16));
  1283			const auto __lo
  1284			  = _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
  1285			return __xzyw((__hi + __mid) + __lo);
  1286		      }
  1287		    else if constexpr (__z_to_z && __have_avx512dq)
  1288		      {
  1289			return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0),
  1290							   _mm512_cvtepi64_ps(__i1))
  1291						: __concat(_mm512_cvtepu64_ps(__i0),
  1292							   _mm512_cvtepu64_ps(__i1));
  1293		      }
  1294		    else if constexpr (__z_to_z && is_signed_v<_Tp>)
  1295		      {
  1296			const __m512 __hi32 = _mm512_cvtepi32_ps(
  1297			  __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)),
  1298				   _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32))));
  1299			const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0),
  1300							_mm512_cvtepi64_epi32(__i1));
  1301			// split low 32-bits, because if __hi32 is a small negative
  1302			// number, the 24-bit mantissa may lose important information if
  1303			// any of the high 8 bits of __lo32 is set, leading to
  1304			// catastrophic cancelation in the FMA
  1305			const __m512 __hi16
  1306			  = _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
  1307			const __m512 __lo16
  1308			  = _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
  1309			return (__hi32 * 0x100000000LL + __hi16) + __lo16;
  1310		      }
  1311		    else if constexpr (__z_to_z && is_unsigned_v<_Tp>)
  1312		      {
  1313			return __intrin_bitcast<_To>(
  1314			  _mm512_cvtepu32_ps(__concat(
  1315			    _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)),
  1316			    _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32))))
  1317			    * 0x100000000LL
  1318			  + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0),
  1319							_mm512_cvtepi64_epi32(__i1))));
  1320		      }
  1321		  }
  1322		else if constexpr (__f64_to_s32) //{{{2
  1323		  {
  1324		    // use concat fallback
  1325		  }
  1326		else if constexpr (__f64_to_u32) //{{{2
  1327		  {
  1328		    if constexpr (__x_to_x && __have_sse4_1)
  1329		      {
  1330			return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64(
  1331				 _mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
  1332				 _mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
  1333			       ^ 0x8000'0000u;
  1334			// without SSE4.1 just use the scalar fallback, it's only four
  1335			// values
  1336		      }
  1337		    else if constexpr (__y_to_y)
  1338		      {
  1339			return __vector_bitcast<_Up>(
  1340				 __concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
  1341							      - 0x8000'0000u),
  1342					  _mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
  1343							      - 0x8000'0000u)))
  1344			       ^ 0x8000'0000u;
  1345		      } // __z_to_z uses fallback
  1346		  }
  1347		else if constexpr (__f64_to_ibw) //{{{2
  1348		  {
  1349		    // one-arg __f64_to_ibw goes via _SimdWrapper<int, ?>. The fallback
  1350		    // would go via two independet conversions to _SimdWrapper<_To> and
  1351		    // subsequent interleaving. This is better, because f64->__i32
  1352		    // allows to combine __v0 and __v1 into one register: if constexpr
  1353		    // (__z_to_x || __y_to_x) {
  1354		    return __convert_x86<_To>(
  1355		      __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1));
  1356		    //}
  1357		  }
  1358		else if constexpr (__f32_to_ibw) //{{{2
  1359		  {
  1360		    return __convert_x86<_To>(
  1361		      __convert_x86<__vector_type_t<int, _Np>>(__v0),
  1362		      __convert_x86<__vector_type_t<int, _Np>>(__v1));
  1363		  } //}}}
  1364	
  1365		// fallback: {{{2
  1366		if constexpr (sizeof(_To) >= 32)
  1367		  // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
  1368		  return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0),
  1369				  __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1));
  1370		else if constexpr (sizeof(_To) == 16)
  1371		  {
  1372		    const auto __lo = __to_intrin(__convert_x86<_To>(__v0));
  1373		    const auto __hi = __to_intrin(__convert_x86<_To>(__v1));
  1374		    if constexpr (sizeof(_Up) * _Np == 8)
  1375		      {
  1376			if constexpr (is_floating_point_v<_Up>)
  1377			  return __auto_bitcast(
  1378			    _mm_unpacklo_pd(__vector_bitcast<double>(__lo),
  1379					    __vector_bitcast<double>(__hi)));
  1380			else
  1381			  return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
  1382		      }
  1383		    else if constexpr (sizeof(_Up) * _Np == 4)
  1384		      {
  1385			if constexpr (is_floating_point_v<_Up>)
  1386			  return __auto_bitcast(
  1387			    _mm_unpacklo_ps(__vector_bitcast<float>(__lo),
  1388					    __vector_bitcast<float>(__hi)));
  1389			else
  1390			  return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
  1391		      }
  1392		    else if constexpr (sizeof(_Up) * _Np == 2)
  1393		      return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi));
  1394		    else
  1395		      __assert_unreachable<_Tp>();
  1396		  }
  1397		else
  1398		  return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>());
  1399		//}}}
  1400	      }
  1401	  }
  1402	
  1403	//}}}1
  1404	// 4-arg __convert_x86 {{{1
  1405	template <typename _To, typename _V, typename _Traits>
  1406	  _GLIBCXX_SIMD_INTRINSIC _To
  1407	  __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3)
  1408	  {
  1409	    static_assert(__is_vector_type_v<_V>);
  1410	    using _Tp = typename _Traits::value_type;
  1411	    constexpr size_t _Np = _Traits::_S_full_size;
  1412	    [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
  1413	    [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
  1414	    [[maybe_unused]] const auto __i2 = __to_intrin(__v2);
  1415	    [[maybe_unused]] const auto __i3 = __to_intrin(__v3);
  1416	    using _Up = typename _VectorTraits<_To>::value_type;
  1417	    constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
  1418	
  1419	    static_assert(4 * _Np <= _M,
  1420			  "__v2/__v3 would be discarded; use the two/one-argument "
  1421			  "__convert_x86 overload instead");
  1422	
  1423	    // [xyz]_to_[xyz] {{{2
  1424	    [[maybe_unused]] constexpr bool __x_to_x
  1425	      = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
  1426	    [[maybe_unused]] constexpr bool __x_to_y
  1427	      = sizeof(__v0) <= 16 && sizeof(_To) == 32;
  1428	    [[maybe_unused]] constexpr bool __x_to_z
  1429	      = sizeof(__v0) <= 16 && sizeof(_To) == 64;
  1430	    [[maybe_unused]] constexpr bool __y_to_x
  1431	      = sizeof(__v0) == 32 && sizeof(_To) <= 16;
  1432	    [[maybe_unused]] constexpr bool __y_to_y
  1433	      = sizeof(__v0) == 32 && sizeof(_To) == 32;
  1434	    [[maybe_unused]] constexpr bool __y_to_z
  1435	      = sizeof(__v0) == 32 && sizeof(_To) == 64;
  1436	    [[maybe_unused]] constexpr bool __z_to_x
  1437	      = sizeof(__v0) == 64 && sizeof(_To) <= 16;
  1438	    [[maybe_unused]] constexpr bool __z_to_y
  1439	      = sizeof(__v0) == 64 && sizeof(_To) == 32;
  1440	    [[maybe_unused]] constexpr bool __z_to_z
  1441	      = sizeof(__v0) == 64 && sizeof(_To) == 64;
  1442	
  1443	    // iX_to_iX {{{2
  1444	    [[maybe_unused]] constexpr bool __i_to_i
  1445	      = is_integral_v<_Up> && is_integral_v<_Tp>;
  1446	    [[maybe_unused]] constexpr bool __i8_to_i16
  1447	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
  1448	    [[maybe_unused]] constexpr bool __i8_to_i32
  1449	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
  1450	    [[maybe_unused]] constexpr bool __i8_to_i64
  1451	      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
  1452	    [[maybe_unused]] constexpr bool __i16_to_i8
  1453	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
  1454	    [[maybe_unused]] constexpr bool __i16_to_i32
  1455	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
  1456	    [[maybe_unused]] constexpr bool __i16_to_i64
  1457	      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
  1458	    [[maybe_unused]] constexpr bool __i32_to_i8
  1459	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
  1460	    [[maybe_unused]] constexpr bool __i32_to_i16
  1461	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
  1462	    [[maybe_unused]] constexpr bool __i32_to_i64
  1463	      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
  1464	    [[maybe_unused]] constexpr bool __i64_to_i8
  1465	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
  1466	    [[maybe_unused]] constexpr bool __i64_to_i16
  1467	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
  1468	    [[maybe_unused]] constexpr bool __i64_to_i32
  1469	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;
  1470	
  1471	    // [fsu]X_to_[fsu]X {{{2
  1472	    // ibw = integral && byte or word, i.e. char and short with any signedness
  1473	    [[maybe_unused]] constexpr bool __i64_to_f32
  1474	      = is_integral_v<_Tp> && sizeof(_Tp) == 8
  1475		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1476	    [[maybe_unused]] constexpr bool __s32_to_f32
  1477	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
  1478		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1479	    [[maybe_unused]] constexpr bool __s16_to_f32
  1480	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
  1481		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1482	    [[maybe_unused]] constexpr bool __s8_to_f32
  1483	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
  1484		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1485	    [[maybe_unused]] constexpr bool __u32_to_f32
  1486	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
  1487		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1488	    [[maybe_unused]] constexpr bool __u16_to_f32
  1489	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
  1490		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1491	    [[maybe_unused]] constexpr bool __u8_to_f32
  1492	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
  1493		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1494	    [[maybe_unused]] constexpr bool __s64_to_f64
  1495	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
  1496		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1497	    [[maybe_unused]] constexpr bool __s32_to_f64
  1498	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
  1499		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1500	    [[maybe_unused]] constexpr bool __s16_to_f64
  1501	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
  1502		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1503	    [[maybe_unused]] constexpr bool __s8_to_f64
  1504	      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
  1505		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1506	    [[maybe_unused]] constexpr bool __u64_to_f64
  1507	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
  1508		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1509	    [[maybe_unused]] constexpr bool __u32_to_f64
  1510	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
  1511		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1512	    [[maybe_unused]] constexpr bool __u16_to_f64
  1513	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
  1514		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1515	    [[maybe_unused]] constexpr bool __u8_to_f64
  1516	      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
  1517		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1518	    [[maybe_unused]] constexpr bool __f32_to_s64
  1519	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
  1520		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
  1521	    [[maybe_unused]] constexpr bool __f32_to_s32
  1522	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
  1523		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
  1524	    [[maybe_unused]] constexpr bool __f32_to_u64
  1525	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
  1526		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
  1527	    [[maybe_unused]] constexpr bool __f32_to_u32
  1528	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
  1529		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
  1530	    [[maybe_unused]] constexpr bool __f64_to_s64
  1531	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
  1532		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1533	    [[maybe_unused]] constexpr bool __f64_to_s32
  1534	      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
  1535		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1536	    [[maybe_unused]] constexpr bool __f64_to_u64
  1537	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
  1538		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1539	    [[maybe_unused]] constexpr bool __f64_to_u32
  1540	      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
  1541		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1542	    [[maybe_unused]] constexpr bool __f32_to_ibw
  1543	      = is_integral_v<_Up> && sizeof(_Up) <= 2
  1544		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
  1545	    [[maybe_unused]] constexpr bool __f64_to_ibw
  1546	      = is_integral_v<_Up> && sizeof(_Up) <= 2
  1547		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1548	    [[maybe_unused]] constexpr bool __f32_to_f64
  1549	      = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
  1550		&& is_floating_point_v<_Up> && sizeof(_Up) == 8;
  1551	    [[maybe_unused]] constexpr bool __f64_to_f32
  1552	      = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
  1553		&& is_floating_point_v<_Up> && sizeof(_Up) == 4;
  1554	
  1555	    if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
  1556	      {
  1557		// <double, 4>, <double, 4>, <double, 4>, <double, 4> => <char, 16>
  1558		return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
  1559					  __hi128(__v1), __lo128(__v2), __hi128(__v2),
  1560					  __lo128(__v3), __hi128(__v3));
  1561	      }
  1562	    else if constexpr (__i_to_i) // assert ISA {{{2
  1563	      {
  1564		static_assert(__x_to_x || __have_avx2,
  1565			      "integral conversions with ymm registers require AVX2");
  1566		static_assert(__have_avx512bw
  1567				|| ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64)
  1568				    && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
  1569			      "8/16-bit integers in zmm registers require AVX512BW");
  1570		static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
  1571			      "integral conversions with ymm registers require AVX2");
  1572	      }
  1573	    // concat => use 2-arg __convert_x86 {{{2
  1574	    if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2)
  1575			  || (sizeof(__v0) == 16 && __have_avx
  1576			      && is_floating_point_v<_Tp>)
  1577			  || (sizeof(__v0) == 32 && __have_avx512f))
  1578	      {
  1579		// The ISA can handle wider input registers, so concat and use two-arg
  1580		// implementation. This reduces code duplication considerably.
  1581		return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3));
  1582	      }
  1583	    else //{{{2
  1584	      {
  1585		// conversion using bit reinterpretation (or no conversion at all)
  1586		// should all go through the concat branch above:
  1587		static_assert(
  1588		  !(is_floating_point_v<
  1589		      _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
  1590		// handle all zero extension{{{2
  1591		if constexpr (4 * _Np < _M && sizeof(_To) > 16)
  1592		  {
  1593		    constexpr size_t Min = 16 / sizeof(_Up);
  1594		    return __zero_extend(
  1595		      __convert_x86<
  1596			__vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>(
  1597			__v0, __v1, __v2, __v3));
  1598		  }
  1599		else if constexpr (__i64_to_i16) //{{{2
  1600		  {
  1601		    if constexpr (__x_to_x && __have_sse4_1)
  1602		      {
  1603			return __intrin_bitcast<_To>(_mm_shuffle_epi8(
  1604			  _mm_blend_epi16(
  1605			    _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22),
  1606			    _mm_blend_epi16(_mm_slli_si128(__i2, 4),
  1607					    _mm_slli_si128(__i3, 6), 0x88),
  1608			    0xcc),
  1609			  _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
  1610					14, 15)));
  1611		      }
  1612		    else if constexpr (__y_to_y && __have_avx2)
  1613		      {
  1614			return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
  1615			  __xzyw(_mm256_blend_epi16(
  1616			    __auto_bitcast(
  1617			      _mm256_shuffle_ps(__vector_bitcast<float>(__v0),
  1618						__vector_bitcast<float>(__v2),
  1619						0x88)), // 0.1. 8.9. 2.3. A.B.
  1620			    __to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps(
  1621					  __vector_bitcast<float>(__v1),
  1622					  __vector_bitcast<float>(__v3), 0x88))
  1623					<< 16), // .4.5 .C.D .6.7 .E.F
  1624			    0xaa)               // 0415 8C9D 2637 AEBF
  1625				 ),             // 0415 2637 8C9D AEBF
  1626			  _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11,
  1627					   14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7,
  1628					   10, 11, 14, 15)));
  1629			/*
  1630			auto __a = _mm256_unpacklo_epi16(__v0, __v1);  // 04.. .... 26..
  1631			.... auto __b = _mm256_unpackhi_epi16(__v0, __v1);  // 15..
  1632			.... 37.. .... auto __c = _mm256_unpacklo_epi16(__v2, __v3);  //
  1633			8C.. .... AE.. .... auto __d = _mm256_unpackhi_epi16(__v2,
  1634			__v3);
  1635			// 9D.. .... BF.. .... auto __e = _mm256_unpacklo_epi16(__a,
  1636			__b);
  1637			// 0145 .... 2367 .... auto __f = _mm256_unpacklo_epi16(__c,
  1638			__d);
  1639			// 89CD .... ABEF .... auto __g = _mm256_unpacklo_epi64(__e,
  1640			__f);
  1641			// 0145 89CD 2367 ABEF return __concat(
  1642			    _mm_unpacklo_epi32(__lo128(__g), __hi128(__g)),
  1643			    _mm_unpackhi_epi32(__lo128(__g), __hi128(__g)));  // 0123
  1644			4567 89AB CDEF
  1645			    */
  1646		      } // else use fallback
  1647		  }
  1648		else if constexpr (__i64_to_i8) //{{{2
  1649		  {
  1650		    if constexpr (__x_to_x)
  1651		      {
  1652			// TODO: use fallback for now
  1653		      }
  1654		    else if constexpr (__y_to_x)
  1655		      {
  1656			auto __a
  1657			  = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24)
  1658			    | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16)
  1659			    | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8)
  1660			    | _mm256_slli_epi32(
  1661			      __i3, 24); // 048C .... 159D .... 26AE .... 37BF ....
  1662			/*return _mm_shuffle_epi8(
  1663			    _mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5),
  1664			    _mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15,
  1665			   3, 11));*/
  1666			auto __b = _mm256_unpackhi_epi64(
  1667			  __a, __a); // 159D .... 159D .... 37BF .... 37BF ....
  1668			auto __c = _mm256_unpacklo_epi8(
  1669			  __a, __b); // 0145 89CD .... .... 2367 ABEF .... ....
  1670			return __intrin_bitcast<_To>(
  1671			  _mm_unpacklo_epi16(__lo128(__c),
  1672					     __hi128(__c))); // 0123 4567 89AB CDEF
  1673		      }
  1674		  }
  1675		else if constexpr (__i32_to_i8) //{{{2
  1676		  {
  1677		    if constexpr (__x_to_x)
  1678		      {
  1679			if constexpr (__have_ssse3)
  1680			  {
  1681			    const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff;
  1682			    const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff)
  1683					      << 8;
  1684			    const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff)
  1685					      << 16;
  1686			    const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24;
  1687			    return __intrin_bitcast<_To>(
  1688			      _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3),
  1689					       _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
  1690							     2, 6, 10, 14, 3, 7, 11,
  1691							     15)));
  1692			  }
  1693			else
  1694			  {
  1695			    auto __a
  1696			      = _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. ....
  1697			    auto __b
  1698			      = _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. ....
  1699			    auto __c
  1700			      = _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. ....
  1701			    auto __d
  1702			      = _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. ....
  1703			    auto __e
  1704			      = _mm_unpacklo_epi8(__a, __c); // 048C .... .... ....
  1705			    auto __f
  1706			      = _mm_unpackhi_epi8(__a, __c); // 159D .... .... ....
  1707			    auto __g
  1708			      = _mm_unpacklo_epi8(__b, __d); // 26AE .... .... ....
  1709			    auto __h
  1710			      = _mm_unpackhi_epi8(__b, __d); // 37BF .... .... ....
  1711			    return __intrin_bitcast<_To>(_mm_unpacklo_epi8(
  1712			      _mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... ....
  1713			      _mm_unpacklo_epi8(__f, __h)  // 1357 9BDF .... ....
  1714			      ));                          // 0123 4567 89AB CDEF
  1715			  }
  1716		      }
  1717		    else if constexpr (__y_to_y)
  1718		      {
  1719			const auto __a = _mm256_shuffle_epi8(
  1720			  __to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16(
  1721					 __i0, _mm256_slli_epi32(__i1, 16), 0xAA))
  1722				       & 0xff)
  1723				      | (__vector_bitcast<_UShort>(_mm256_blend_epi16(
  1724					   __i2, _mm256_slli_epi32(__i3, 16), 0xAA))
  1725					 << 8)),
  1726			  _mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7,
  1727					   11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9,
  1728					   13, 3, 7, 11, 15));
  1729			return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32(
  1730			  __a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)));
  1731		      }
  1732		  }
  1733		else if constexpr (__i64_to_f32) //{{{2
  1734		  {
  1735		    // this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm
  1736		    // integers)
  1737		    if constexpr (__x_to_y)
  1738		      {
  1739			return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1],
  1740						     __v2[0], __v2[1], __v3[0],
  1741						     __v3[1]);
  1742	
  1743			const auto __a = _mm_unpacklo_epi32(__i0, __i1);   // acAC
  1744			const auto __b = _mm_unpackhi_epi32(__i0, __i1);   // bdBD
  1745			const auto __c = _mm_unpacklo_epi32(__i2, __i3);   // egEG
  1746			const auto __d = _mm_unpackhi_epi32(__i2, __i3);   // fhFH
  1747			const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd
  1748			const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh
  1749			const auto __hi32 = __vector_bitcast<
  1750			  conditional_t<is_signed_v<_Tp>, int, _UInt>>(
  1751			  __concat(_mm_unpackhi_epi32(__a, __b),
  1752				   _mm_unpackhi_epi32(__c, __d))); // ABCD EFGH
  1753			const auto __hi
  1754			  = 0x100000000LL
  1755			    * __convert_x86<__vector_type_t<float, 8>>(__hi32);
  1756			const auto __mid
  1757			  = 0x10000
  1758			    * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16),
  1759							  _mm_srli_epi32(__lo32b, 16)));
  1760			const auto __lo = _mm256_cvtepi32_ps(
  1761			  __concat(_mm_set1_epi32(0x0000ffffu) & __lo32a,
  1762				   _mm_set1_epi32(0x0000ffffu) & __lo32b));
  1763			return (__hi + __mid) + __lo;
  1764		      }
  1765		  }
  1766		else if constexpr (__f64_to_ibw) //{{{2
  1767		  {
  1768		    return __convert_x86<_To>(
  1769		      __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
  1770		      __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3));
  1771		  }
  1772		else if constexpr (__f32_to_ibw) //{{{2
  1773		  {
  1774		    return __convert_x86<_To>(
  1775		      __convert_x86<__vector_type_t<int, _Np>>(__v0),
  1776		      __convert_x86<__vector_type_t<int, _Np>>(__v1),
  1777		      __convert_x86<__vector_type_t<int, _Np>>(__v2),
  1778		      __convert_x86<__vector_type_t<int, _Np>>(__v3));
  1779		  } //}}}
  1780	
  1781		// fallback: {{{2
  1782		if constexpr (sizeof(_To) >= 32)
  1783		  // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
  1784		  return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0,
  1785									      __v1),
  1786				  __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2,
  1787									      __v3));
  1788		else if constexpr (sizeof(_To) == 16)
  1789		  {
  1790		    const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1));
  1791		    const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3));
  1792		    if constexpr (sizeof(_Up) * _Np * 2 == 8)
  1793		      {
  1794			if constexpr (is_floating_point_v<_Up>)
  1795			  return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi));
  1796			else
  1797			  return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
  1798		      }
  1799		    else if constexpr (sizeof(_Up) * _Np * 2 == 4)
  1800		      {
  1801			if constexpr (is_floating_point_v<_Up>)
  1802			  return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi));
  1803			else
  1804			  return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
  1805		      }
  1806		    else
  1807		      __assert_unreachable<_Tp>();
  1808		  }
  1809		else
  1810		  return __vector_convert<_To>(__v0, __v1, __v2, __v3,
  1811					       make_index_sequence<_Np>());
  1812		//}}}2
  1813	      }
  1814	  }
  1815	
  1816	//}}}
  1817	// 8-arg __convert_x86 {{{1
  1818	template <typename _To, typename _V, typename _Traits>
  1819	  _GLIBCXX_SIMD_INTRINSIC _To
  1820	  __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
  1821			_V __v7)
  1822	  {
  1823	    static_assert(__is_vector_type_v<_V>);
  1824	    using _Tp = typename _Traits::value_type;
  1825	    constexpr size_t _Np = _Traits::_S_full_size;
  1826	    [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
  1827	    [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
  1828	    [[maybe_unused]] const auto __i2 = __to_intrin(__v2);
  1829	    [[maybe_unused]] const auto __i3 = __to_intrin(__v3);
  1830	    [[maybe_unused]] const auto __i4 = __to_intrin(__v4);
  1831	    [[maybe_unused]] const auto __i5 = __to_intrin(__v5);
  1832	    [[maybe_unused]] const auto __i6 = __to_intrin(__v6);
  1833	    [[maybe_unused]] const auto __i7 = __to_intrin(__v7);
  1834	    using _Up = typename _VectorTraits<_To>::value_type;
  1835	    constexpr size_t _M = _VectorTraits<_To>::_S_full_size;
  1836	
  1837	    static_assert(8 * _Np <= _M,
  1838			  "__v4-__v7 would be discarded; use the four/two/one-argument "
  1839			  "__convert_x86 overload instead");
  1840	
  1841	    // [xyz]_to_[xyz] {{{2
  1842	    [[maybe_unused]] constexpr bool __x_to_x
  1843	      = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
  1844	    [[maybe_unused]] constexpr bool __x_to_y
  1845	      = sizeof(__v0) <= 16 && sizeof(_To) == 32;
  1846	    [[maybe_unused]] constexpr bool __x_to_z
  1847	      = sizeof(__v0) <= 16 && sizeof(_To) == 64;
  1848	    [[maybe_unused]] constexpr bool __y_to_x
  1849	      = sizeof(__v0) == 32 && sizeof(_To) <= 16;
  1850	    [[maybe_unused]] constexpr bool __y_to_y
  1851	      = sizeof(__v0) == 32 && sizeof(_To) == 32;
  1852	    [[maybe_unused]] constexpr bool __y_to_z
  1853	      = sizeof(__v0) == 32 && sizeof(_To) == 64;
  1854	    [[maybe_unused]] constexpr bool __z_to_x
  1855	      = sizeof(__v0) == 64 && sizeof(_To) <= 16;
  1856	    [[maybe_unused]] constexpr bool __z_to_y
  1857	      = sizeof(__v0) == 64 && sizeof(_To) == 32;
  1858	    [[maybe_unused]] constexpr bool __z_to_z
  1859	      = sizeof(__v0) == 64 && sizeof(_To) == 64;
  1860	
  1861	    // [if]X_to_i8 {{{2
  1862	    [[maybe_unused]] constexpr bool __i_to_i
  1863	      = is_integral_v<_Up> && is_integral_v<_Tp>;
  1864	    [[maybe_unused]] constexpr bool __i64_to_i8
  1865	      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
  1866	    [[maybe_unused]] constexpr bool __f64_to_i8
  1867	      = is_integral_v<_Up> && sizeof(_Up) == 1
  1868		&& is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
  1869	
  1870	    if constexpr (__i_to_i) // assert ISA {{{2
  1871	      {
  1872		static_assert(__x_to_x || __have_avx2,
  1873			      "integral conversions with ymm registers require AVX2");
  1874		static_assert(__have_avx512bw
  1875				|| ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64)
  1876				    && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
  1877			      "8/16-bit integers in zmm registers require AVX512BW");
  1878		static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
  1879			      "integral conversions with ymm registers require AVX2");
  1880	      }
  1881	    // concat => use 4-arg __convert_x86 {{{2
  1882	    if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2)
  1883			  || (sizeof(__v0) == 16 && __have_avx
  1884			      && is_floating_point_v<_Tp>)
  1885			  || (sizeof(__v0) == 32 && __have_avx512f))
  1886	      {
  1887		// The ISA can handle wider input registers, so concat and use two-arg
  1888		// implementation. This reduces code duplication considerably.
  1889		return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
  1890					  __concat(__v4, __v5), __concat(__v6, __v7));
  1891	      }
  1892	    else //{{{2
  1893	      {
  1894		// conversion using bit reinterpretation (or no conversion at all)
  1895		// should all go through the concat branch above:
  1896		static_assert(
  1897		  !(is_floating_point_v<
  1898		      _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
  1899		static_assert(!(8 * _Np < _M && sizeof(_To) > 16),
  1900			      "zero extension should be impossible");
  1901		if constexpr (__i64_to_i8) //{{{2
  1902		  {
  1903		    if constexpr (__x_to_x && __have_ssse3)
  1904		      {
  1905			// unsure whether this is better than the variant below
  1906			return __intrin_bitcast<_To>(_mm_shuffle_epi8(
  1907			  __to_intrin(
  1908			    (((__v0 & 0xff) | ((__v1 & 0xff) << 8))
  1909			     | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24)))
  1910			    | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40))
  1911			       | (((__v6 & 0xff) << 48) | (__v7 << 56)))),
  1912			  _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14,
  1913					7, 15)));
  1914		      }
  1915		    else if constexpr (__x_to_x)
  1916		      {
  1917			const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac
  1918			const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd
  1919			const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg
  1920			const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh
  1921			const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik
  1922			const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl
  1923			const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo
  1924			const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np
  1925			return __intrin_bitcast<_To>(_mm_unpacklo_epi64(
  1926			  _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b),  // abcd
  1927					     _mm_unpacklo_epi8(__c, __d)), // efgh
  1928			  _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f),  // ijkl
  1929					     _mm_unpacklo_epi8(__g, __h))  // mnop
  1930			  ));
  1931		      }
  1932		    else if constexpr (__y_to_y)
  1933		      {
  1934			auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV
  1935			  __to_intrin(
  1936			    (((__v0 & 0xff) | ((__v1 & 0xff) << 8))
  1937			     | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24)))
  1938			    | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40))
  1939			       | (((__v6 & 0xff) << 48) | ((__v7 << 56)))));
  1940			/*
  1941			auto __b = _mm256_unpackhi_epi64(__a, __a);  // 159D HLPT 159D
  1942			HLPT 37BF JNRV 37BF JNRV auto __c = _mm256_unpacklo_epi8(__a,
  1943			__b);  // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV auto __d =
  1944			__xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV return
  1945			_mm256_shuffle_epi8(
  1946			    __d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12,
  1947			13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7,
  1948			14, 15));
  1949						*/
  1950			auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF
  1951							// IJMN QRUV
  1952			  __a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
  1953						6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11,
  1954						4, 12, 5, 13, 6, 14, 7, 15));
  1955			auto __c
  1956			  = __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV
  1957			return __intrin_bitcast<_To>(_mm256_shuffle_epi8(
  1958			  __c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13,
  1959						6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11,
  1960						4, 5, 12, 13, 6, 7, 14, 15)));
  1961		      }
  1962		    else if constexpr (__z_to_z)
  1963		      {
  1964			return __concat(
  1965			  __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2,
  1966								      __v3),
  1967			  __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
  1968								      __v7));
  1969		      }
  1970		  }
  1971		else if constexpr (__f64_to_i8) //{{{2
  1972		  {
  1973		    return __convert_x86<_To>(
  1974		      __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1),
  1975		      __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3),
  1976		      __convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5),
  1977		      __convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7));
  1978		  }
  1979		else // unreachable {{{2
  1980		  __assert_unreachable<_Tp>();
  1981		//}}}
  1982	
  1983		// fallback: {{{2
  1984		if constexpr (sizeof(_To) >= 32)
  1985		  // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
  1986		  return __concat(
  1987		    __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3),
  1988		    __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6,
  1989								__v7));
  1990		else if constexpr (sizeof(_To) == 16)
  1991		  {
  1992		    const auto __lo
  1993		      = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3));
  1994		    const auto __hi
  1995		      = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7));
  1996		    static_assert(sizeof(_Up) == 1 && _Np == 2);
  1997		    return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
  1998		  }
  1999		else
  2000		  {
  2001		    __assert_unreachable<_Tp>();
  2002		    // return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5,
  2003		    // __v6, __v7,
  2004		    //                             make_index_sequence<_Np>());
  2005		  } //}}}2
  2006	      }
  2007	  }
  2008	
  2009	//}}}
  2010	// 16-arg __convert_x86 {{{1
  2011	template <typename _To, typename _V, typename _Traits>
  2012	  _GLIBCXX_SIMD_INTRINSIC _To
  2013	  __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6,
  2014			_V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12,
  2015			_V __v13, _V __v14, _V __v15)
  2016	  {
  2017	    // concat => use 8-arg __convert_x86
  2018	    return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3),
  2019				      __concat(__v4, __v5), __concat(__v6, __v7),
  2020				      __concat(__v8, __v9), __concat(__v10, __v11),
  2021				      __concat(__v12, __v13), __concat(__v14, __v15));
  2022	  }
  2023	
  2024	//}}}
  2025	
  2026	#endif // __cplusplus >= 201703L
  2027	#endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
  2028	
  2029	// vim: foldmethod=marker