The C and C++ Include Header Files
/usr/include/c++/11/experimental/bits/simd_x86_conversions.h
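This is an internal implementation header for std::experimental::simd (the Parallelism TS v2): the __convert_x86 overloads below select, at compile time, the cheapest cvt/unpack/shuffle instruction sequence for converting between vector element types on whatever x86 ISA is enabled (SSE2 up to AVX-512). It is not meant to be included directly; user code reaches it through <experimental/simd>. As a minimal sketch of the public entry point that ends up dispatching into these overloads (the exact instructions chosen depend on compiler flags such as -march):

    #include <experimental/simd>
    namespace stdx = std::experimental;

    int main()
    {
      // 8 ints -> 8 floats, element-wise; on x86 this conversion is
      // routed through the __convert_x86 machinery defined below.
      stdx::fixed_size_simd<int, 8> i([](auto n) { return int(n) - 4; });
      auto f = stdx::static_simd_cast<stdx::fixed_size_simd<float, 8>>(i);
      return f[0] == -4.0f ? 0 : 1;
    }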
$ cat /usr/include/c++/11/experimental/bits/simd_x86_conversions.h

// x86 specific conversion optimizations -*- C++ -*-

// Copyright (C) 2020-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H
#define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H

#if __cplusplus >= 201703L

// work around PR85827
// 1-arg __convert_x86 {{{1
template <typename _To, typename _V, typename _Traits>
  _GLIBCXX_SIMD_INTRINSIC _To
  __convert_x86(_V __v)
  {
    static_assert(__is_vector_type_v<_V>);
    using _Tp = typename _Traits::value_type;
    constexpr size_t _Np = _Traits::_S_full_size;
    [[maybe_unused]] const auto __intrin = __to_intrin(__v);
    using _Up = typename _VectorTraits<_To>::value_type;
    constexpr size_t _M = _VectorTraits<_To>::_S_full_size;

    // [xyz]_to_[xyz] {{{2
    [[maybe_unused]] constexpr bool __x_to_x
      = sizeof(__v) <= 16 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __x_to_y
      = sizeof(__v) <= 16 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __x_to_z
      = sizeof(__v) <= 16 && sizeof(_To) == 64;
    [[maybe_unused]] constexpr bool __y_to_x
      = sizeof(__v) == 32 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __y_to_y
      = sizeof(__v) == 32 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __y_to_z
      = sizeof(__v) == 32 && sizeof(_To) == 64;
    [[maybe_unused]] constexpr bool __z_to_x
      = sizeof(__v) == 64 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __z_to_y
      = sizeof(__v) == 64 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __z_to_z
      = sizeof(__v) == 64 && sizeof(_To) == 64;

    // iX_to_iX {{{2
    [[maybe_unused]] constexpr bool __i_to_i
      = is_integral_v<_Up> && is_integral_v<_Tp>;
    [[maybe_unused]] constexpr bool __i8_to_i16
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i8_to_i32
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __i8_to_i64
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i16_to_i8
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i16_to_i32
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __i16_to_i64
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i32_to_i8
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i32_to_i16
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i32_to_i64
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i64_to_i8
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i64_to_i16
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i64_to_i32
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;

    // [fsu]X_to_[fsu]X {{{2
    // ibw = integral && byte or word, i.e. char and short with any signedness
    [[maybe_unused]] constexpr bool __s64_to_f32
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s32_to_f32
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s16_to_f32
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s8_to_f32
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __u64_to_f32
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __u32_to_f32
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __u16_to_f32
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __u8_to_f32
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s64_to_f64
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __s32_to_f64
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __u64_to_f64
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __u32_to_f64
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __f32_to_s64
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f32_to_s32
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f32_to_u64
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f32_to_u32
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f64_to_s64
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f64_to_s32
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f64_to_u64
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f64_to_u32
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __ibw_to_f32
      = is_integral_v<_Tp> && sizeof(_Tp) <= 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __ibw_to_f64
      = is_integral_v<_Tp> && sizeof(_Tp) <= 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __f32_to_ibw
      = is_integral_v<_Up> && sizeof(_Up) <= 2
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f64_to_ibw
      = is_integral_v<_Up> && sizeof(_Up) <= 2
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f32_to_f64
      = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __f64_to_f32
      = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;

    if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
      return __convert_x86<_To>(__lo128(__v), __hi128(__v));
    else if constexpr (__i_to_i && __x_to_y && !__have_avx2) //{{{2
      return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v),
                      __convert_x86<__vector_type_t<_Up, _M / 2>>(
                        __extract_part<1, _Np / _M * 2>(__v)));
    else if constexpr (__i_to_i) //{{{2
      {
        static_assert(__x_to_x || __have_avx2,
                      "integral conversions with ymm registers require AVX2");
        static_assert(__have_avx512bw
                        || ((sizeof(_Tp) >= 4 || sizeof(__v) < 64)
                              && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
                      "8/16-bit integers in zmm registers require AVX512BW");
        static_assert((sizeof(__v) < 64 && sizeof(_To) < 64) || __have_avx512f,
                      "integral conversions with zmm registers require AVX512F");
      }
    if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> && //{{{2
                  sizeof(_Tp) == sizeof(_Up))
      {
        // conversion uses simple bit reinterpretation (or no conversion at all)
        if constexpr (_Np >= _M)
          return __intrin_bitcast<_To>(__v);
        else
          return __zero_extend(__vector_bitcast<_Up>(__v));
      }
    else if constexpr (_Np < _M && sizeof(_To) > 16) //{{{2
      // zero extend (eg. xmm -> ymm)
      return __zero_extend(
        __convert_x86<__vector_type_t<
          _Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v));
    else if constexpr (_Np > _M && sizeof(__v) > 16) //{{{2
      // partial input (eg. ymm -> xmm)
      return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v));
    else if constexpr (__i64_to_i32) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin));
        else if constexpr (__x_to_x)
          return __auto_bitcast(
            _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8));
        else if constexpr (__y_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin));
        else if constexpr (__y_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v))));
        else if constexpr (__y_to_x)
          return __intrin_bitcast<_To>(
            __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8),
                                             0 + 4 * 2)));
        else if constexpr (__z_to_y)
          return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin));
      }
    else if constexpr (__i64_to_i16) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin));
        else if constexpr (__x_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
        else if constexpr (__x_to_x && __have_ssse3)
          {
            return __intrin_bitcast<_To>(
              _mm_shuffle_epi8(__intrin,
                               _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
                                             -0x80, -0x80, -0x80, -0x80, -0x80,
                                             -0x80, -0x80, -0x80, -0x80)));
            // fallback without SSSE3
          }
        else if constexpr (__y_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin));
        else if constexpr (__y_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
        else if constexpr (__y_to_x)
          {
            const auto __a = _mm256_shuffle_epi8(
              __intrin,
              _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
                               -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
                               -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
                               -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
                               -0x80));
            return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
          }
        else if constexpr (__z_to_x)
          return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin));
      }
    else if constexpr (__i64_to_i8) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin));
        else if constexpr (__x_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin))));
        else if constexpr (__y_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin));
        else if constexpr (__y_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            _mm512_cvtepi64_epi8(__zero_extend(__intrin)));
        else if constexpr (__z_to_x)
          return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin));
      }
    else if constexpr (__i32_to_i64) //{{{2
      {
        if constexpr (__have_sse4_1 && __x_to_x)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm_cvtepi32_epi64(__intrin)
                                         : _mm_cvtepu32_epi64(__intrin));
        else if constexpr (__x_to_x)
          {
            return __intrin_bitcast<_To>(
              _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp>
                                             ? _mm_srai_epi32(__intrin, 31)
                                             : __m128i()));
          }
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm256_cvtepi32_epi64(__intrin)
                                         : _mm256_cvtepu32_epi64(__intrin));
        else if constexpr (__y_to_z)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm512_cvtepi32_epi64(__intrin)
                                         : _mm512_cvtepu32_epi64(__intrin));
      }
    else if constexpr (__i32_to_i16) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin));
        else if constexpr (__x_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
        else if constexpr (__x_to_x && __have_ssse3)
          return __intrin_bitcast<_To>(_mm_shuffle_epi8(
            __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
                                    -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
        else if constexpr (__x_to_x)
          {
            auto __a = _mm_unpacklo_epi16(__intrin, __m128i()); // 0o.o 1o.o
            auto __b = _mm_unpackhi_epi16(__intrin, __m128i()); // 2o.o 3o.o
            auto __c = _mm_unpacklo_epi16(__a, __b); // 02oo ..oo
            auto __d = _mm_unpackhi_epi16(__a, __b); // 13oo ..oo
            return __intrin_bitcast<_To>(
              _mm_unpacklo_epi16(__c, __d)); // 0123 oooo
          }
        else if constexpr (__y_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin));
        else if constexpr (__y_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
        else if constexpr (__y_to_x)
          {
            auto __a = _mm256_shuffle_epi8(
              __intrin,
              _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
                               -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
                               9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
                               -0x80, -0x80, -0x80));
            return __intrin_bitcast<_To>(__lo128(
              _mm256_permute4x64_epi64(__a,
                                       0xf8))); // __a[0] __a[2] | __a[3] __a[3]
          }
        else if constexpr (__z_to_y)
          return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin));
      }
    else if constexpr (__i32_to_i8) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin));
        else if constexpr (__x_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin))));
        else if constexpr (__x_to_x && __have_ssse3)
          {
            return __intrin_bitcast<_To>(
              _mm_shuffle_epi8(__intrin,
                               _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
                                             -0x80, -0x80, -0x80, -0x80, -0x80,
                                             -0x80, -0x80, -0x80, -0x80)));
          }
        else if constexpr (__x_to_x)
          {
            const auto __a
              = _mm_unpacklo_epi8(__intrin, __intrin); // 0... .... 1... ....
            const auto __b
              = _mm_unpackhi_epi8(__intrin, __intrin); // 2... .... 3... ....
            const auto __c = _mm_unpacklo_epi8(__a, __b); // 02.. .... .... ....
            const auto __d = _mm_unpackhi_epi8(__a, __b); // 13.. .... .... ....
            const auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 .... .... ....
            return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
          }
        else if constexpr (__y_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin));
        else if constexpr (__y_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            _mm512_cvtepi32_epi8(__zero_extend(__intrin)));
        else if constexpr (__z_to_x)
          return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin));
      }
    else if constexpr (__i16_to_i64) //{{{2
      {
        if constexpr (__x_to_x && __have_sse4_1)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm_cvtepi16_epi64(__intrin)
                                         : _mm_cvtepu16_epi64(__intrin));
        else if constexpr (__x_to_x && is_signed_v<_Tp>)
          {
            auto __x = _mm_srai_epi16(__intrin, 15);
            auto __y = _mm_unpacklo_epi16(__intrin, __x);
            __x = _mm_unpacklo_epi16(__x, __x);
            return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x));
          }
        else if constexpr (__x_to_x)
          return __intrin_bitcast<_To>(
            _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()),
                               __m128i()));
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm256_cvtepi16_epi64(__intrin)
                                         : _mm256_cvtepu16_epi64(__intrin));
        else if constexpr (__x_to_z)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm512_cvtepi16_epi64(__intrin)
                                         : _mm512_cvtepu16_epi64(__intrin));
      }
    else if constexpr (__i16_to_i32) //{{{2
      {
        if constexpr (__x_to_x && __have_sse4_1)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm_cvtepi16_epi32(__intrin)
                                         : _mm_cvtepu16_epi32(__intrin));
        else if constexpr (__x_to_x && is_signed_v<_Tp>)
          return __intrin_bitcast<_To>(
            _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16));
        else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
          return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i()));
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm256_cvtepi16_epi32(__intrin)
                                         : _mm256_cvtepu16_epi32(__intrin));
        else if constexpr (__y_to_z)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm512_cvtepi16_epi32(__intrin)
                                         : _mm512_cvtepu16_epi32(__intrin));
      }
    else if constexpr (__i16_to_i8) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512bw_vl)
          return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin));
        else if constexpr (__x_to_x && __have_avx512bw)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
        else if constexpr (__x_to_x && __have_ssse3)
          return __intrin_bitcast<_To>(_mm_shuffle_epi8(
            __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
                                    -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
        else if constexpr (__x_to_x)
          {
            auto __a
              = _mm_unpacklo_epi8(__intrin, __intrin); // 00.. 11.. 22.. 33..
            auto __b
              = _mm_unpackhi_epi8(__intrin, __intrin); // 44.. 55.. 66.. 77..
            auto __c = _mm_unpacklo_epi8(__a, __b); // 0404 .... 1515 ....
            auto __d = _mm_unpackhi_epi8(__a, __b); // 2626 .... 3737 ....
            auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 0246 .... ....
            auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 1357 .... ....
            return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
          }
        else if constexpr (__y_to_x && __have_avx512bw_vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin));
        else if constexpr (__y_to_x && __have_avx512bw)
          return __intrin_bitcast<_To>(
            __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
        else if constexpr (__y_to_x)
          {
            auto __a = _mm256_shuffle_epi8(
              __intrin,
              _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
                               -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
                               -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
                               4, 6, 8, 10, 12, 14));
            return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
          }
        else if constexpr (__z_to_y && __have_avx512bw)
          return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin));
        else if constexpr (__z_to_y)
          __assert_unreachable<_Tp>();
      }
    else if constexpr (__i8_to_i64) //{{{2
      {
        if constexpr (__x_to_x && __have_sse4_1)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm_cvtepi8_epi64(__intrin)
                                         : _mm_cvtepu8_epi64(__intrin));
        else if constexpr (__x_to_x && is_signed_v<_Tp>)
          {
            if constexpr (__have_ssse3)
              {
                auto __dup = _mm_unpacklo_epi8(__intrin, __intrin);
                auto __epi16 = _mm_srai_epi16(__dup, 8);
                // (the `return` was missing in the shipped header; without it
                // the shuffle result is discarded and the scalar fallback at
                // the end of the function is used instead)
                return __intrin_bitcast<_To>(
                  _mm_shuffle_epi8(__epi16,
                                   _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3,
                                                 3, 3, 3, 3, 3, 3)));
              }
            else
              {
                auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
                __x = _mm_unpacklo_epi16(__x, __x);
                return __intrin_bitcast<_To>(
                  _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24),
                                     _mm_srai_epi32(__x, 31)));
              }
          }
        else if constexpr (__x_to_x)
          {
            return __intrin_bitcast<_To>(_mm_unpacklo_epi32(
              _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
                                 __m128i()),
              __m128i()));
          }
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm256_cvtepi8_epi64(__intrin)
                                         : _mm256_cvtepu8_epi64(__intrin));
        else if constexpr (__x_to_z)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm512_cvtepi8_epi64(__intrin)
                                         : _mm512_cvtepu8_epi64(__intrin));
      }
    else if constexpr (__i8_to_i32) //{{{2
      {
        if constexpr (__x_to_x && __have_sse4_1)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm_cvtepi8_epi32(__intrin)
                                         : _mm_cvtepu8_epi32(__intrin));
        else if constexpr (__x_to_x && is_signed_v<_Tp>)
          {
            const auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
            return __intrin_bitcast<_To>(
              _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24));
          }
        else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
          return __intrin_bitcast<_To>(
            _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
                               __m128i()));
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm256_cvtepi8_epi32(__intrin)
                                         : _mm256_cvtepu8_epi32(__intrin));
        else if constexpr (__x_to_z)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm512_cvtepi8_epi32(__intrin)
                                         : _mm512_cvtepu8_epi32(__intrin));
      }
    else if constexpr (__i8_to_i16) //{{{2
      {
        if constexpr (__x_to_x && __have_sse4_1)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm_cvtepi8_epi16(__intrin)
                                         : _mm_cvtepu8_epi16(__intrin));
        else if constexpr (__x_to_x && is_signed_v<_Tp>)
          return __intrin_bitcast<_To>(
            _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8));
        else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
          return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i()));
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm256_cvtepi8_epi16(__intrin)
                                         : _mm256_cvtepu8_epi16(__intrin));
        else if constexpr (__y_to_z && __have_avx512bw)
          return __intrin_bitcast<_To>(is_signed_v<_Tp>
                                         ? _mm512_cvtepi8_epi16(__intrin)
                                         : _mm512_cvtepu8_epi16(__intrin));
        else if constexpr (__y_to_z)
          __assert_unreachable<_Tp>();
      }
    else if constexpr (__f32_to_s64) //{{{2
      {
        if constexpr (__have_avx512dq_vl && __x_to_x)
          return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin));
        else if constexpr (__have_avx512dq_vl && __x_to_y)
          return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin));
        else if constexpr (__have_avx512dq && __y_to_z)
          return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin));
        // else use scalar fallback
      }
    else if constexpr (__f32_to_u64) //{{{2
      {
        if constexpr (__have_avx512dq_vl && __x_to_x)
          return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin));
        else if constexpr (__have_avx512dq_vl && __x_to_y)
          return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin));
        else if constexpr (__have_avx512dq && __y_to_z)
          return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin));
        // else use scalar fallback
      }
    else if constexpr (__f32_to_s32) //{{{2
      {
        if constexpr (__x_to_x || __y_to_y || __z_to_z)
          {
            // go to fallback, it does the right thing
          }
        else
          __assert_unreachable<_Tp>();
      }
    else if constexpr (__f32_to_u32) //{{{2
      {
        if constexpr (__have_avx512vl && __x_to_x)
          return __auto_bitcast(_mm_cvttps_epu32(__intrin));
        else if constexpr (__have_avx512f && __x_to_x)
          return __auto_bitcast(
            __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v))));
        else if constexpr (__have_avx512vl && __y_to_y)
          return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin));
        else if constexpr (__have_avx512f && __y_to_y)
          return __vector_bitcast<_Up>(
            __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v))));
        else if constexpr (__x_to_x || __y_to_y || __z_to_z)
          {
            // go to fallback, it does the right thing. We can't use the
            // _mm_floor_ps - 0x8000'0000 trick for f32->u32 because it would
            // discard small input values (only 24 mantissa bits)
          }
        else
          __assert_unreachable<_Tp>();
      }
    else if constexpr (__f32_to_ibw) //{{{2
      return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v));
    else if constexpr (__f64_to_s64) //{{{2
      {
        if constexpr (__have_avx512dq_vl && __x_to_x)
          return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin));
        else if constexpr (__have_avx512dq_vl && __y_to_y)
          return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin));
        else if constexpr (__have_avx512dq && __z_to_z)
          return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin));
        // else use scalar fallback
      }
    else if constexpr (__f64_to_u64) //{{{2
      {
        if constexpr (__have_avx512dq_vl && __x_to_x)
          return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin));
        else if constexpr (__have_avx512dq_vl && __y_to_y)
          return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin));
        else if constexpr (__have_avx512dq && __z_to_z)
          return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin));
        // else use scalar fallback
      }
    else if constexpr (__f64_to_s32) //{{{2
      {
        if constexpr (__x_to_x)
          return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin));
        else if constexpr (__y_to_x)
          return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin));
        else if constexpr (__z_to_y)
          return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin));
      }
    else if constexpr (__f64_to_u32) //{{{2
      {
        if constexpr (__have_avx512vl && __x_to_x)
          return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin));
        else if constexpr (__have_sse4_1 && __x_to_x)
          return __vector_bitcast<_Up, _M>(
                   _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000'0000u))
                 ^ 0x8000'0000u;
        else if constexpr (__x_to_x)
          {
            // use scalar fallback: it's only 2 values to convert, can't get
            // much better than scalar decomposition
          }
        else if constexpr (__have_avx512vl && __y_to_x)
          return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin));
        else if constexpr (__y_to_x)
          {
            return __intrin_bitcast<_To>(
              __vector_bitcast<_Up>(
                _mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000'0000u))
              ^ 0x8000'0000u);
          }
        else if constexpr (__z_to_y)
          return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin));
      }
    else if constexpr (__f64_to_ibw) //{{{2
      {
        return __convert_x86<_To>(
          __convert_x86<__vector_type_t<int, _Np>>(__v));
      }
    else if constexpr (__s64_to_f32) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin));
        else if constexpr (__y_to_x && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin));
        else if constexpr (__z_to_y && __have_avx512dq)
          return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin));
        else if constexpr (__z_to_y)
          return __intrin_bitcast<_To>(
            _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, _Np>>(__v)));
      }
    else if constexpr (__u64_to_f32) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin));
        else if constexpr (__y_to_x && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin));
        else if constexpr (__z_to_y && __have_avx512dq)
          return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin));
        else if constexpr (__z_to_y)
          {
            return __intrin_bitcast<_To>(
              __lo256(_mm512_cvtepu32_ps(__auto_bitcast(
                _mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32)))))
                * 0x100000000LL
              + __lo256(_mm512_cvtepu32_ps(
                  __auto_bitcast(_mm512_cvtepi64_epi32(__intrin)))));
          }
      }
    else if constexpr (__s32_to_f32) //{{{2
      {
        // use fallback (builtin conversion)
      }
    else if constexpr (__u32_to_f32) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512vl)
          {
            // use fallback
          }
        else if constexpr (__x_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
        else if constexpr (__x_to_x && (__have_fma || __have_fma4))
          // work around PR85819
          return __auto_bitcast(0x10000
                                  * _mm_cvtepi32_ps(__to_intrin(__v >> 16))
                                + _mm_cvtepi32_ps(__to_intrin(__v & 0xffff)));
        else if constexpr (__y_to_y && __have_avx512vl)
          {
            // use fallback
          }
        else if constexpr (__y_to_y && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
        else if constexpr (__y_to_y)
          // work around PR85819
          return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16))
                 + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff));
        // else use fallback (builtin conversion)
      }
    else if constexpr (__ibw_to_f32) //{{{2
      {
        if constexpr (_M <= 4 || __have_avx2)
          return __convert_x86<_To>(
            __convert_x86<__vector_type_t<int, _M>>(__v));
        else
          {
            static_assert(__x_to_y);
            __m128i __a, __b;
            if constexpr (__have_sse4_1)
              {
                __a = sizeof(_Tp) == 2
                        ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin)
                                            : _mm_cvtepu16_epi32(__intrin))
                        : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin)
                                            : _mm_cvtepu8_epi32(__intrin));
                const auto __w
                  = _mm_shuffle_epi32(__intrin, sizeof(_Tp) == 2 ? 0xee : 0xe9);
                __b = sizeof(_Tp) == 2
                        ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
                                            : _mm_cvtepu16_epi32(__w))
                        : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
                                            : _mm_cvtepu8_epi32(__w));
              }
            else
              {
                __m128i __tmp;
                if constexpr (sizeof(_Tp) == 1)
                  {
                    __tmp = is_signed_v<_Tp>
                              ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin,
                                                                 __intrin),
                                               8)
                              : _mm_unpacklo_epi8(__intrin, __m128i());
                  }
                else
                  {
                    static_assert(sizeof(_Tp) == 2);
                    __tmp = __intrin;
                  }
                __a = is_signed_v<_Tp>
                        ? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
                        : _mm_unpacklo_epi16(__tmp, __m128i());
                __b = is_signed_v<_Tp>
                        ? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
                        : _mm_unpackhi_epi16(__tmp, __m128i());
              }
            return __convert_x86<_To>(__vector_bitcast<int>(__a),
                                      __vector_bitcast<int>(__b));
          }
      }
    else if constexpr (__s64_to_f64) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin));
        else if constexpr (__y_to_y && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin));
        else if constexpr (__z_to_z && __have_avx512dq)
          return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin));
        else if constexpr (__z_to_z)
          {
            return __intrin_bitcast<_To>(
              _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
                * 0x100000000LL
              + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
          }
      }
    else if constexpr (__u64_to_f64) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin));
        else if constexpr (__y_to_y && __have_avx512dq_vl)
          return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin));
        else if constexpr (__z_to_z && __have_avx512dq)
          return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin));
        else if constexpr (__z_to_z)
          {
            return __intrin_bitcast<_To>(
              _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
                * 0x100000000LL
              + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
          }
      }
    else if constexpr (__s32_to_f64) //{{{2
      {
        if constexpr (__x_to_x)
          return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin));
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin));
        else if constexpr (__y_to_z)
          return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin));
      }
    else if constexpr (__u32_to_f64) //{{{2
      {
        if constexpr (__x_to_x && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin));
        else if constexpr (__x_to_x && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
        else if constexpr (__x_to_x)
          return __intrin_bitcast<_To>(
            _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
        else if constexpr (__x_to_y && __have_avx512vl)
          return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin));
        else if constexpr (__x_to_y && __have_avx512f)
          return __intrin_bitcast<_To>(
            __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(
            _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000'0000u)) + 0x8000'0000u);
        else if constexpr (__y_to_z)
          return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin));
      }
    else if constexpr (__ibw_to_f64) //{{{2
      {
        return __convert_x86<_To>(
          __convert_x86<__vector_type_t<int, _Np>>(__v));
      }
    else if constexpr (__f32_to_f64) //{{{2
      {
        if constexpr (__x_to_x)
          return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin));
        else if constexpr (__x_to_y)
          return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin));
        else if constexpr (__y_to_z)
          return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin));
      }
    else if constexpr (__f64_to_f32) //{{{2
      {
        if constexpr (__x_to_x)
          return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin));
        else if constexpr (__y_to_x)
          return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin));
        else if constexpr (__z_to_y)
          return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin));
      }
    else //{{{2
      __assert_unreachable<_Tp>();

    // fallback:{{{2
    return __vector_convert<_To>(__v, make_index_sequence<_Np>());
    //}}}
  }

// }}}
// 2-arg __convert_x86 {{{1
template <typename _To, typename _V, typename _Traits>
  _GLIBCXX_SIMD_INTRINSIC _To
  __convert_x86(_V __v0, _V __v1)
  {
    static_assert(__is_vector_type_v<_V>);
    using _Tp = typename _Traits::value_type;
    constexpr size_t _Np = _Traits::_S_full_size;
    [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
    [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
    using _Up = typename _VectorTraits<_To>::value_type;
    constexpr size_t _M = _VectorTraits<_To>::_S_full_size;

    static_assert(2 * _Np <= _M,
                  "__v1 would be discarded; use the one-argument "
                  "__convert_x86 overload instead");

    // [xyz]_to_[xyz] {{{2
    [[maybe_unused]] constexpr bool __x_to_x
      = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __x_to_y
      = sizeof(__v0) <= 16 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __x_to_z
      = sizeof(__v0) <= 16 && sizeof(_To) == 64;
    [[maybe_unused]] constexpr bool __y_to_x
      = sizeof(__v0) == 32 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __y_to_y
      = sizeof(__v0) == 32 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __y_to_z
      = sizeof(__v0) == 32 && sizeof(_To) == 64;
    [[maybe_unused]] constexpr bool __z_to_x
      = sizeof(__v0) == 64 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __z_to_y
      = sizeof(__v0) == 64 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __z_to_z
      = sizeof(__v0) == 64 && sizeof(_To) == 64;

    // iX_to_iX {{{2
    [[maybe_unused]] constexpr bool __i_to_i
      = is_integral_v<_Up> && is_integral_v<_Tp>;
    [[maybe_unused]] constexpr bool __i8_to_i16
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i8_to_i32
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __i8_to_i64
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i16_to_i8
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i16_to_i32
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __i16_to_i64
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i32_to_i8
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i32_to_i16
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i32_to_i64
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i64_to_i8
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i64_to_i16
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i64_to_i32
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;

    // [fsu]X_to_[fsu]X {{{2
    // ibw = integral && byte or word, i.e. char and short with any signedness
    [[maybe_unused]] constexpr bool __i64_to_f32
      = is_integral_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s32_to_f32
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s16_to_f32
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s8_to_f32
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __u32_to_f32
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __u16_to_f32
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __u8_to_f32
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __s64_to_f64
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __s32_to_f64
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __s16_to_f64
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __s8_to_f64
      = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __u64_to_f64
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __u32_to_f64
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __u16_to_f64
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __u8_to_f64
      = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __f32_to_s64
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f32_to_s32
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f32_to_u64
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f32_to_u32
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f64_to_s64
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f64_to_s32
      = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f64_to_u64
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f64_to_u32
      = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f32_to_ibw
      = is_integral_v<_Up> && sizeof(_Up) <= 2
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 4;
    [[maybe_unused]] constexpr bool __f64_to_ibw
      = is_integral_v<_Up> && sizeof(_Up) <= 2
        && is_floating_point_v<_Tp> && sizeof(_Tp) == 8;
    [[maybe_unused]] constexpr bool __f32_to_f64
      = is_floating_point_v<_Tp> && sizeof(_Tp) == 4
        && is_floating_point_v<_Up> && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __f64_to_f32
      = is_floating_point_v<_Tp> && sizeof(_Tp) == 8
        && is_floating_point_v<_Up> && sizeof(_Up) == 4;

    if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2
      // two ymm inputs => one xmm result: split into the four xmm halves
      return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
                                __hi128(__v1));
    else if constexpr (__i_to_i) // assert ISA {{{2
      {
        static_assert(__x_to_x || __have_avx2,
                      "integral conversions with ymm registers require AVX2");
        static_assert(__have_avx512bw
                        || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64)
                              && (sizeof(_Up) >= 4 || sizeof(_To) < 64)),
                      "8/16-bit integers in zmm registers require AVX512BW");
        static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f,
                      "integral conversions with zmm registers require AVX512F");
      }
    // concat => use 1-arg __convert_x86 {{{2
    if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2)
                    || (sizeof(__v0) == 16 && __have_avx
                          && is_floating_point_v<_Tp>)
                    || (sizeof(__v0) == 32 && __have_avx512f
                          && (sizeof(_Tp) >= 4 || __have_avx512bw)))
      {
        // The ISA can handle wider input registers, so concat and use one-arg
        // implementation. This reduces code duplication considerably.
        return __convert_x86<_To>(__concat(__v0, __v1));
      }
    else //{{{2
      {
        // conversion using bit reinterpretation (or no conversion at all)
        // should all go through the concat branch above:
        static_assert(
          !(is_floating_point_v<
              _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up)));
        // handle all zero extension{{{2
        if constexpr (2 * _Np < _M && sizeof(_To) > 16)
          {
            constexpr size_t Min = 16 / sizeof(_Up);
            return __zero_extend(
              __convert_x86<
                __vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0,
                                                                       __v1));
          }
        else if constexpr (__i64_to_i32) //{{{2
          {
            if constexpr (__x_to_x)
              return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0),
                                                   __auto_bitcast(__v1), 0x88));
            else if constexpr (__y_to_y)
              {
                // AVX512F is not available (would concat otherwise)
                return __auto_bitcast(
                  __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
                                           __auto_bitcast(__v1), 0x88)));
                // alternative:
                // const auto v0_abxxcdxx = _mm256_shuffle_epi32(__v0, 8);
                // const auto v1_efxxghxx = _mm256_shuffle_epi32(__v1, 8);
                // const auto v_abefcdgh = _mm256_unpacklo_epi64(v0_abxxcdxx,
                //   v1_efxxghxx); return _mm256_permute4x64_epi64(v_abefcdgh,
                //   0x01 * 0 + 0x04 * 2 + 0x10 * 1 + 0x40 * 3); // abcdefgh
              }
            else if constexpr (__z_to_z)
              return __intrin_bitcast<_To>(
                __concat(_mm512_cvtepi64_epi32(__i0),
                         _mm512_cvtepi64_epi32(__i1)));
          }
        else if constexpr (__i64_to_i16) //{{{2
          {
            if constexpr (__x_to_x)
              {
                // AVX2 is not available (would concat otherwise)
                if constexpr (__have_sse4_1)
                  {
                    return __intrin_bitcast<_To>(_mm_shuffle_epi8(
                      _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
                      _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
                                    -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
                  }
                else
                  {
                    return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
                                                    _Up(__v1[0]), _Up(__v1[1])};
                  }
              }
            else if constexpr (__y_to_x)
              {
                auto __a
                  = _mm256_unpacklo_epi16(__i0, __i1); // 04.. .... 26.. ....
                auto __b
                  = _mm256_unpackhi_epi16(__i0, __i1); // 15.. .... 37.. ....
                auto __c
                  = _mm256_unpacklo_epi16(__a, __b); // 0145 .... 2367 ....
                return __intrin_bitcast<_To>(
                  _mm_unpacklo_epi32(__lo128(__c), __hi128(__c))); // 0123 4567
              }
            else if constexpr (__z_to_y)
              return __intrin_bitcast<_To>(
                __concat(_mm512_cvtepi64_epi16(__i0),
                         _mm512_cvtepi64_epi16(__i1)));
          }
        else if constexpr (__i64_to_i8) //{{{2
          {
            if constexpr (__x_to_x && __have_sse4_1)
              {
                return __intrin_bitcast<_To>(_mm_shuffle_epi8(
                  _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
                  _mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
                                -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
                                -0x80)));
              }
            else if constexpr (__x_to_x && __have_ssse3)
              {
                return __intrin_bitcast<_To>(_mm_unpacklo_epi16(
                  _mm_shuffle_epi8(
                    __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
                                        -0x80, -0x80, -0x80, -0x80, -0x80,
                                        -0x80, -0x80, -0x80, -0x80)),
                  _mm_shuffle_epi8(
                    __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
                                        -0x80, -0x80, -0x80, -0x80, -0x80,
                                        -0x80, -0x80, -0x80, -0x80))));
              }
            else if constexpr (__x_to_x)
              {
                return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
                                                _Up(__v1[0]), _Up(__v1[1])};
              }
            else if constexpr (__y_to_x)
              {
                const auto __a = _mm256_shuffle_epi8(
                  _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA),
                  _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
                                   -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
                                   -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
                                   -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
                                   -0x80, -0x80, -0x80, -0x80));
                return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
              } // __z_to_x uses concat fallback
          }
        else if constexpr (__i32_to_i16) //{{{2
          {
            if constexpr (__x_to_x)
              {
                // AVX2 is not available (would concat otherwise)
                if constexpr (__have_sse4_1)
                  {
                    return __intrin_bitcast<_To>(_mm_shuffle_epi8(
                      _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa),
                      _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
                                    11, 14, 15)));
                  }
                else if constexpr (__have_ssse3)
                  {
                    return __intrin_bitcast<_To>(
                      _mm_hadd_epi16(__to_intrin(__v0 << 16),
                                     __to_intrin(__v1 << 16)));
                    /*
                    return _mm_unpacklo_epi64(
                      _mm_shuffle_epi8(__i0, _mm_setr_epi8(0, 1, 4, 5, 8, 9,
                        12, 13, 8, 9, 12, 13, 12, 13, 14, 15)),
                      _mm_shuffle_epi8(__i1, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12,
                        13, 8, 9, 12, 13, 12, 13, 14, 15)));
                    */
                  }
                else
                  {
                    auto __a = _mm_unpacklo_epi16(__i0, __i1); // 04.. 15..
                    auto __b = _mm_unpackhi_epi16(__i0, __i1); // 26.. 37..
                    auto __c = _mm_unpacklo_epi16(__a, __b); // 0246 ....
                    auto __d = _mm_unpackhi_epi16(__a, __b); // 1357 ....
                    return __intrin_bitcast<_To>(
                      _mm_unpacklo_epi16(__c, __d)); // 0123 4567
                  }
              }
            else if constexpr (__y_to_y)
              {
                const auto __shuf
                  = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
                                     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
                                     0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
                                     -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
                auto __a = _mm256_shuffle_epi8(__i0, __shuf);
                auto __b = _mm256_shuffle_epi8(__i1, __shuf);
                return __intrin_bitcast<_To>(
                  __xzyw(_mm256_unpacklo_epi64(__a, __b)));
              } // __z_to_z uses concat fallback
          }
        else if constexpr (__i32_to_i8) //{{{2
          {
            if constexpr (__x_to_x && __have_ssse3)
              {
                const auto shufmask
                  = _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
                                  -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
                                  -0x80, -0x80);
                return __intrin_bitcast<_To>(
                  _mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask),
                                     _mm_shuffle_epi8(__i1, shufmask)));
              }
            else if constexpr (__x_to_x)
              {
                auto __a = _mm_unpacklo_epi8(__i0, __i1); // 04.. .... 15.. ....
                auto __b = _mm_unpackhi_epi8(__i0, __i1); // 26.. .... 37.. ....
                auto __c = _mm_unpacklo_epi8(__a, __b); // 0246 .... .... ....
                auto __d = _mm_unpackhi_epi8(__a, __b); // 1357 .... .... ....
                auto __e = _mm_unpacklo_epi8(__c, __d); // 0123 4567 .... ....
                return __intrin_bitcast<_To>(__e & __m128i{-1, 0});
              }
            else if constexpr (__y_to_x)
              {
                const auto __a = _mm256_shuffle_epi8(
                  _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA),
                  _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
                                   6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
                                   -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
                                   -0x80, -0x80, -0x80, 2, 6, 10, 14));
                return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
              } // __z_to_y uses concat fallback
          }
        else if constexpr (__i16_to_i8) //{{{2
          {
            if constexpr (__x_to_x && __have_ssse3)
              {
                const auto __shuf = reinterpret_cast<__m128i>(
                  __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80,
                                              0x80, 0x80, 0x80, 0x80, 0x80,
                                              0x80, 0x80});
                return __intrin_bitcast<_To>(
                  _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf),
                                     _mm_shuffle_epi8(__i1, __shuf)));
              }
            else if constexpr (__x_to_x)
              {
                auto __a = _mm_unpacklo_epi8(__i0, __i1); // 08.. 19.. 2A.. 3B..
                auto __b = _mm_unpackhi_epi8(__i0, __i1); // 4C.. 5D.. 6E.. 7F..
                auto __c = _mm_unpacklo_epi8(__a, __b); // 048C .... 159D ....
                auto __d = _mm_unpackhi_epi8(__a, __b); // 26AE .... 37BF ....
                auto __e = _mm_unpacklo_epi8(__c, __d); // 0246 8ACE .... ....
                auto __f = _mm_unpackhi_epi8(__c, __d); // 1357 9BDF .... ....
                return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
              }
            else if constexpr (__y_to_y)
              {
                return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8(
                  (__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
                    | _mm256_slli_epi16(__i1, 8),
                  _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
                                   13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
                                   7, 9, 11, 13, 15))));
              } // __z_to_z uses concat fallback
          }
        else if constexpr (__i64_to_f32) //{{{2
          {
            if constexpr (__x_to_x)
              return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]);
            else if constexpr (__y_to_y)
              {
                static_assert(__y_to_y && __have_avx2);
                const auto __a = _mm256_unpacklo_epi32(__i0, __i1); // aeAE cgCG
                const auto __b = _mm256_unpackhi_epi32(__i0, __i1); // bfBF dhDH
                const auto __lo32
                  = _mm256_unpacklo_epi32(__a, __b); // abef cdgh
                const auto __hi32 = __vector_bitcast<
                  conditional_t<is_signed_v<_Tp>, int, _UInt>>(
                  _mm256_unpackhi_epi32(__a, __b)); // ABEF CDGH
                const auto __hi
                  = 0x100000000LL
                    * __convert_x86<__vector_type_t<float, 8>>(__hi32);
                const auto __mid
                  = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16));
                const auto __lo
                  = _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
                return __xzyw((__hi + __mid) + __lo);
              }
            else if constexpr (__z_to_z && __have_avx512dq)
              {
                return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0),
                                                   _mm512_cvtepi64_ps(__i1))
                                        : __concat(_mm512_cvtepu64_ps(__i0),
                                                   _mm512_cvtepu64_ps(__i1));
              }
            else if constexpr (__z_to_z && is_signed_v<_Tp>)
              {
                const __m512 __hi32 = _mm512_cvtepi32_ps(
                  __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)),
                           _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32))));
                const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0),
                                                _mm512_cvtepi64_epi32(__i1));
                // split low 32-bits, because if __hi32 is a small negative
                // number, the 24-bit mantissa may lose important information if
                // any of the high 8 bits of __lo32 is set, leading to
                // catastrophic cancellation in the FMA
                const __m512 __hi16
                  = _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
                const __m512 __lo16
                  = _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
                return (__hi32 * 0x100000000LL + __hi16) + __lo16;
              }
            else if constexpr (__z_to_z && is_unsigned_v<_Tp>)
              {
                return __intrin_bitcast<_To>(
                  _mm512_cvtepu32_ps(__concat(
                    _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)),
                    _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32))))
                    * 0x100000000LL
                  + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0),
                                                _mm512_cvtepi64_epi32(__i1))));
              }
          }
        else if constexpr (__f64_to_s32) //{{{2
          {
            // use concat fallback
          }
        else if constexpr (__f64_to_u32) //{{{2
          {
            if constexpr (__x_to_x && __have_sse4_1)
              {
                return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64(
                         _mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000'0000u),
                         _mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
                       ^ 0x8000'0000u;
                // without SSE4.1 just use the scalar fallback, it's only four
                // values
              }
            else if constexpr (__y_to_y)
              {
                return __vector_bitcast<_Up>(
                         __concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
                                                        - 0x8000'0000u),
                                  _mm256_cvttpd_epi32(_mm256_floor_pd(__i1)
                                                        - 0x8000'0000u)))
                       ^ 0x8000'0000u;
              } // __z_to_z uses fallback
          }
        else if constexpr (__f64_to_ibw) //{{{2
          {
            // one-arg __f64_to_ibw goes via _SimdWrapper<int, _Np>. The fallback
            // would go via two independent conversions to _SimdWrapper<_To> and
            // subsequent interleaving. This is better, because f64->__i32
            // allows to combine __v0 and __v1 into one register: if constexpr
            // (__z_to_x || __y_to_x) {
            return __convert_x86<_To>(
              __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1));
            //}
          }
        else if constexpr (__f32_to_ibw) //{{{2
          {
            return __convert_x86<_To>(
              __convert_x86<__vector_type_t<int, _Np>>(__v0),
              __convert_x86<__vector_type_t<int, _Np>>(__v1));
          } //}}}

        // fallback: {{{2
        if constexpr (sizeof(_To) >= 32)
          // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm
          return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0),
                          __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1));
        else if constexpr (sizeof(_To) == 16)
          {
            const auto __lo = __to_intrin(__convert_x86<_To>(__v0));
            const auto __hi = __to_intrin(__convert_x86<_To>(__v1));
            if constexpr (sizeof(_Up) * _Np == 8)
              {
                if constexpr (is_floating_point_v<_Up>)
                  return __auto_bitcast(
                    _mm_unpacklo_pd(__vector_bitcast<double>(__lo),
                                    __vector_bitcast<double>(__hi)));
                else
                  return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi));
              }
            else if constexpr (sizeof(_Up) * _Np == 4)
              {
                if constexpr (is_floating_point_v<_Up>)
                  return __auto_bitcast(
                    _mm_unpacklo_ps(__vector_bitcast<float>(__lo),
                                    __vector_bitcast<float>(__hi)));
                else
                  return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi));
              }
            else if constexpr (sizeof(_Up) * _Np == 2)
              return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi));
            else
              __assert_unreachable<_Tp>();
          }
        else
          return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>());
        //}}}
      }
  }

//}}}1
// 4-arg __convert_x86 {{{1
template <typename _To, typename _V, typename _Traits>
  _GLIBCXX_SIMD_INTRINSIC _To
  __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3)
  {
    static_assert(__is_vector_type_v<_V>);
    using _Tp = typename _Traits::value_type;
    constexpr size_t _Np = _Traits::_S_full_size;
    [[maybe_unused]] const auto __i0 = __to_intrin(__v0);
    [[maybe_unused]] const auto __i1 = __to_intrin(__v1);
    [[maybe_unused]] const auto __i2 = __to_intrin(__v2);
    [[maybe_unused]] const auto __i3 = __to_intrin(__v3);
    using _Up = typename _VectorTraits<_To>::value_type;
    constexpr size_t _M = _VectorTraits<_To>::_S_full_size;

    static_assert(4 * _Np <= _M,
                  "__v2/__v3 would be discarded; use the two/one-argument "
                  "__convert_x86 overload instead");

    // [xyz]_to_[xyz] {{{2
    [[maybe_unused]] constexpr bool __x_to_x
      = sizeof(__v0) <= 16 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __x_to_y
      = sizeof(__v0) <= 16 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __x_to_z
      = sizeof(__v0) <= 16 && sizeof(_To) == 64;
    [[maybe_unused]] constexpr bool __y_to_x
      = sizeof(__v0) == 32 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __y_to_y
      = sizeof(__v0) == 32 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __y_to_z
      = sizeof(__v0) == 32 && sizeof(_To) == 64;
    [[maybe_unused]] constexpr bool __z_to_x
      = sizeof(__v0) == 64 && sizeof(_To) <= 16;
    [[maybe_unused]] constexpr bool __z_to_y
      = sizeof(__v0) == 64 && sizeof(_To) == 32;
    [[maybe_unused]] constexpr bool __z_to_z
      = sizeof(__v0) == 64 && sizeof(_To) == 64;

    // iX_to_iX {{{2
    [[maybe_unused]] constexpr bool __i_to_i
      = is_integral_v<_Up> && is_integral_v<_Tp>;
    [[maybe_unused]] constexpr bool __i8_to_i16
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i8_to_i32
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __i8_to_i64
      = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i16_to_i8
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i16_to_i32
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4;
    [[maybe_unused]] constexpr bool __i16_to_i64
      = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i32_to_i8
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i32_to_i16
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i32_to_i64
      = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8;
    [[maybe_unused]] constexpr bool __i64_to_i8
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1;
    [[maybe_unused]] constexpr bool __i64_to_i16
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2;
    [[maybe_unused]] constexpr bool __i64_to_i32
      = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4;

    // [fsu]X_to_[fsu]X {{{2
char and short with any signedness 1473 [[maybe_unused]] constexpr bool __i64_to_f32 1474 = is_integral_v<_Tp> && sizeof(_Tp) == 8 1475 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1476 [[maybe_unused]] constexpr bool __s32_to_f32 1477 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 1478 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1479 [[maybe_unused]] constexpr bool __s16_to_f32 1480 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 1481 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1482 [[maybe_unused]] constexpr bool __s8_to_f32 1483 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 1484 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1485 [[maybe_unused]] constexpr bool __u32_to_f32 1486 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 1487 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1488 [[maybe_unused]] constexpr bool __u16_to_f32 1489 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 1490 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1491 [[maybe_unused]] constexpr bool __u8_to_f32 1492 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 1493 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1494 [[maybe_unused]] constexpr bool __s64_to_f64 1495 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 1496 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1497 [[maybe_unused]] constexpr bool __s32_to_f64 1498 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 1499 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1500 [[maybe_unused]] constexpr bool __s16_to_f64 1501 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 1502 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1503 [[maybe_unused]] constexpr bool __s8_to_f64 1504 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 1505 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1506 [[maybe_unused]] constexpr bool __u64_to_f64 1507 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 1508 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1509 [[maybe_unused]] constexpr bool __u32_to_f64 1510 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 1511 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1512 [[maybe_unused]] constexpr bool __u16_to_f64 1513 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 1514 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1515 [[maybe_unused]] constexpr bool __u8_to_f64 1516 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 1517 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1518 [[maybe_unused]] constexpr bool __f32_to_s64 1519 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 1520 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1521 [[maybe_unused]] constexpr bool __f32_to_s32 1522 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 1523 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1524 [[maybe_unused]] constexpr bool __f32_to_u64 1525 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 1526 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1527 [[maybe_unused]] constexpr bool __f32_to_u32 1528 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 1529 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1530 [[maybe_unused]] constexpr bool __f64_to_s64 1531 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 1532 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1533 [[maybe_unused]] constexpr bool __f64_to_s32 1534 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 1535 && 
is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1536 [[maybe_unused]] constexpr bool __f64_to_u64 1537 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 1538 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1539 [[maybe_unused]] constexpr bool __f64_to_u32 1540 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 1541 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1542 [[maybe_unused]] constexpr bool __f32_to_ibw 1543 = is_integral_v<_Up> && sizeof(_Up) <= 2 1544 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1545 [[maybe_unused]] constexpr bool __f64_to_ibw 1546 = is_integral_v<_Up> && sizeof(_Up) <= 2 1547 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1548 [[maybe_unused]] constexpr bool __f32_to_f64 1549 = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 1550 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1551 [[maybe_unused]] constexpr bool __f64_to_f32 1552 = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 1553 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1554 1555 if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 1556 { 1557 //
<long long, 4>, <long long, 4>, <long long, 4>, <long long, 4> => <char, 16>
1558 return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), 1559 __hi128(__v1), __lo128(__v2), __hi128(__v2), 1560 __lo128(__v3), __hi128(__v3)); 1561 } 1562 else if constexpr (__i_to_i) // assert ISA {{{2 1563 { 1564 static_assert(__x_to_x || __have_avx2, 1565 "integral conversions with ymm registers require AVX2"); 1566 static_assert(__have_avx512bw 1567 || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) 1568 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), 1569 "8/16-bit integers in zmm registers require AVX512BW"); 1570 static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f, 1571 "integral conversions with ymm registers require AVX2"); 1572 } 1573 // concat => use 2-arg __convert_x86 {{{2 1574 if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) 1575 || (sizeof(__v0) == 16 && __have_avx 1576 && is_floating_point_v<_Tp>) 1577 || (sizeof(__v0) == 32 && __have_avx512f)) 1578 { 1579 // The ISA can handle wider input registers, so concat and use two-arg 1580 // implementation. This reduces code duplication considerably. 1581 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3)); 1582 } 1583 else //{{{2 1584 { 1585 // conversion using bit reinterpretation (or no conversion at all) 1586 // should all go through the concat branch above: 1587 static_assert( 1588 !(is_floating_point_v< 1589 _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); 1590 // handle all zero extension{{{2 1591 if constexpr (4 * _Np < _M && sizeof(_To) > 16) 1592 { 1593 constexpr size_t Min = 16 / sizeof(_Up); 1594 return __zero_extend( 1595 __convert_x86< 1596 __vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>( 1597 __v0, __v1, __v2, __v3)); 1598 } 1599 else if constexpr (__i64_to_i16) //{{{2 1600 { 1601 if constexpr (__x_to_x && __have_sse4_1) 1602 { 1603 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 1604 _mm_blend_epi16( 1605 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22), 1606 _mm_blend_epi16(_mm_slli_si128(__i2, 4), 1607 _mm_slli_si128(__i3, 6), 0x88), 1608 0xcc), 1609 _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 1610 14, 15))); 1611 } 1612 else if constexpr (__y_to_y && __have_avx2) 1613 { 1614 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 1615 __xzyw(_mm256_blend_epi16( 1616 __auto_bitcast( 1617 _mm256_shuffle_ps(__vector_bitcast
<float>(__v0), 1618 __vector_bitcast<float>
(__v2), 1619 0x88)), // 0.1. 8.9. 2.3. A.B. 1620 __to_intrin(__vector_bitcast<int>
(_mm256_shuffle_ps( 1621 __vector_bitcast<float>
(__v1), 1622 __vector_bitcast<float>
(__v3), 0x88)) 1623 << 16), // .4.5 .C.D .6.7 .E.F 1624 0xaa) // 0415 8C9D 2637 AEBF 1625 ), // 0415 2637 8C9D AEBF 1626 _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 1627 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 1628 10, 11, 14, 15))); 1629 /* 1630 auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26.. 1631 .... auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15.. 1632 .... 37.. .... auto __c = _mm256_unpacklo_epi16(__v2, __v3); // 1633 8C.. .... AE.. .... auto __d = _mm256_unpackhi_epi16(__v2, 1634 __v3); 1635 // 9D.. .... BF.. .... auto __e = _mm256_unpacklo_epi16(__a, 1636 __b); 1637 // 0145 .... 2367 .... auto __f = _mm256_unpacklo_epi16(__c, 1638 __d); 1639 // 89CD .... ABEF .... auto __g = _mm256_unpacklo_epi64(__e, 1640 __f); 1641 // 0145 89CD 2367 ABEF return __concat( 1642 _mm_unpacklo_epi32(__lo128(__g), __hi128(__g)), 1643 _mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123 1644 4567 89AB CDEF 1645 */ 1646 } // else use fallback 1647 } 1648 else if constexpr (__i64_to_i8) //{{{2 1649 { 1650 if constexpr (__x_to_x) 1651 { 1652 // TODO: use fallback for now 1653 } 1654 else if constexpr (__y_to_x) 1655 { 1656 auto __a 1657 = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24) 1658 | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16) 1659 | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8) 1660 | _mm256_slli_epi32( 1661 __i3, 24); // 048C .... 159D .... 26AE .... 37BF .... 1662 /*return _mm_shuffle_epi8( 1663 _mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5), 1664 _mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15, 1665 3, 11));*/ 1666 auto __b = _mm256_unpackhi_epi64( 1667 __a, __a); // 159D .... 159D .... 37BF .... 37BF .... 1668 auto __c = _mm256_unpacklo_epi8( 1669 __a, __b); // 0145 89CD .... .... 2367 ABEF .... .... 1670 return __intrin_bitcast<_To>( 1671 _mm_unpacklo_epi16(__lo128(__c), 1672 __hi128(__c))); // 0123 4567 89AB CDEF 1673 } 1674 } 1675 else if constexpr (__i32_to_i8) //{{{2 1676 { 1677 if constexpr (__x_to_x) 1678 { 1679 if constexpr (__have_ssse3) 1680 { 1681 const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff; 1682 const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff) 1683 << 8; 1684 const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff) 1685 << 16; 1686 const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24; 1687 return __intrin_bitcast<_To>( 1688 _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3), 1689 _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 1690 2, 6, 10, 14, 3, 7, 11, 1691 15))); 1692 } 1693 else 1694 { 1695 auto __a 1696 = _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. .... 1697 auto __b 1698 = _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. .... 1699 auto __c 1700 = _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. .... 1701 auto __d 1702 = _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. .... 1703 auto __e 1704 = _mm_unpacklo_epi8(__a, __c); // 048C .... .... .... 1705 auto __f 1706 = _mm_unpackhi_epi8(__a, __c); // 159D .... .... .... 1707 auto __g 1708 = _mm_unpacklo_epi8(__b, __d); // 26AE .... .... .... 1709 auto __h 1710 = _mm_unpackhi_epi8(__b, __d); // 37BF .... .... .... 1711 return __intrin_bitcast<_To>(_mm_unpacklo_epi8( 1712 _mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... .... 1713 _mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... .... 
1714 )); // 0123 4567 89AB CDEF 1715 } 1716 } 1717 else if constexpr (__y_to_y) 1718 { 1719 const auto __a = _mm256_shuffle_epi8( 1720 __to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16( 1721 __i0, _mm256_slli_epi32(__i1, 16), 0xAA)) 1722 & 0xff) 1723 | (__vector_bitcast<_UShort>(_mm256_blend_epi16( 1724 __i2, _mm256_slli_epi32(__i3, 16), 0xAA)) 1725 << 8)), 1726 _mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 1727 11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 1728 13, 3, 7, 11, 15)); 1729 return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32( 1730 __a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7))); 1731 } 1732 } 1733 else if constexpr (__i64_to_f32) //{{{2 1734 { 1735 // this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm 1736 // integers) 1737 if constexpr (__x_to_y) 1738 { 1739 return __make_wrapper
<float>(__v0[0], __v0[1], __v1[0], __v1[1], 1740 __v2[0], __v2[1], __v3[0], 1741 __v3[1]); 1742 1743 const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC 1744 const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD 1745 const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG 1746 const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH 1747 const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd 1748 const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh 1749 const auto __hi32 = __vector_bitcast< 1750 conditional_t<is_signed_v<_Tp>
, int, _UInt>>( 1751 __concat(_mm_unpackhi_epi32(__a, __b), 1752 _mm_unpackhi_epi32(__c, __d))); // ABCD EFGH 1753 const auto __hi 1754 = 0x100000000LL 1755 * __convert_x86<__vector_type_t<float, 8>
>(__hi32); 1756 const auto __mid 1757 = 0x10000 1758 * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16), 1759 _mm_srli_epi32(__lo32b, 16))); 1760 const auto __lo = _mm256_cvtepi32_ps( 1761 __concat(_mm_set1_epi32(0x0000ffffu) & __lo32a, 1762 _mm_set1_epi32(0x0000ffffu) & __lo32b)); 1763 return (__hi + __mid) + __lo; 1764 } 1765 } 1766 else if constexpr (__f64_to_ibw) //{{{2 1767 { 1768 return __convert_x86<_To>( 1769 __convert_x86<__vector_type_t
<int, _Np * 2>>(__v0, __v1), 1770 __convert_x86<__vector_type_t<int, _Np * 2>
>(__v2, __v3)); 1771 } 1772 else if constexpr (__f32_to_ibw) //{{{2 1773 { 1774 return __convert_x86<_To>( 1775 __convert_x86<__vector_type_t
<int, _Np>>(__v0), 1776 __convert_x86<__vector_type_t<int, _Np>
>(__v1), 1777 __convert_x86<__vector_type_t<int, _Np>
>(__v2), 1778 __convert_x86<__vector_type_t<int, _Np>
>(__v3)); 1779 } //}}} 1780 1781 // fallback: {{{2 1782 if constexpr (sizeof(_To) >= 32) 1783 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm 1784 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, 1785 __v1), 1786 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2, 1787 __v3)); 1788 else if constexpr (sizeof(_To) == 16) 1789 { 1790 const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1)); 1791 const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3)); 1792 if constexpr (sizeof(_Up) * _Np * 2 == 8) 1793 { 1794 if constexpr (is_floating_point_v<_Up>) 1795 return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi)); 1796 else 1797 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); 1798 } 1799 else if constexpr (sizeof(_Up) * _Np * 2 == 4) 1800 { 1801 if constexpr (is_floating_point_v<_Up>) 1802 return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi)); 1803 else 1804 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); 1805 } 1806 else 1807 __assert_unreachable<_Tp>(); 1808 } 1809 else 1810 return __vector_convert<_To>(__v0, __v1, __v2, __v3, 1811 make_index_sequence<_Np>()); 1812 //}}}2 1813 } 1814 } 1815 1816 //}}} 1817 // 8-arg __convert_x86 {{{1 1818 template <typename _To, typename _V, typename _Traits>
1819 _GLIBCXX_SIMD_INTRINSIC _To 1820 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, 1821 _V __v7) 1822 { 1823 static_assert(__is_vector_type_v<_V>); 1824 using _Tp = typename _Traits::value_type; 1825 constexpr size_t _Np = _Traits::_S_full_size; 1826 [[maybe_unused]] const auto __i0 = __to_intrin(__v0); 1827 [[maybe_unused]] const auto __i1 = __to_intrin(__v1); 1828 [[maybe_unused]] const auto __i2 = __to_intrin(__v2); 1829 [[maybe_unused]] const auto __i3 = __to_intrin(__v3); 1830 [[maybe_unused]] const auto __i4 = __to_intrin(__v4); 1831 [[maybe_unused]] const auto __i5 = __to_intrin(__v5); 1832 [[maybe_unused]] const auto __i6 = __to_intrin(__v6); 1833 [[maybe_unused]] const auto __i7 = __to_intrin(__v7); 1834 using _Up = typename _VectorTraits<_To>::value_type; 1835 constexpr size_t _M = _VectorTraits<_To>::_S_full_size; 1836 1837 static_assert(8 * _Np <= _M, 1838 "__v4-__v7 would be discarded; use the four/two/one-argument " 1839 "__convert_x86 overload instead"); 1840 1841 // [xyz]_to_[xyz] {{{2 1842 [[maybe_unused]] constexpr bool __x_to_x 1843 = sizeof(__v0) <= 16 && sizeof(_To) <= 16; 1844 [[maybe_unused]] constexpr bool __x_to_y 1845 = sizeof(__v0) <= 16 && sizeof(_To) == 32; 1846 [[maybe_unused]] constexpr bool __x_to_z 1847 = sizeof(__v0) <= 16 && sizeof(_To) == 64; 1848 [[maybe_unused]] constexpr bool __y_to_x 1849 = sizeof(__v0) == 32 && sizeof(_To) <= 16; 1850 [[maybe_unused]] constexpr bool __y_to_y 1851 = sizeof(__v0) == 32 && sizeof(_To) == 32; 1852 [[maybe_unused]] constexpr bool __y_to_z 1853 = sizeof(__v0) == 32 && sizeof(_To) == 64; 1854 [[maybe_unused]] constexpr bool __z_to_x 1855 = sizeof(__v0) == 64 && sizeof(_To) <= 16; 1856 [[maybe_unused]] constexpr bool __z_to_y 1857 = sizeof(__v0) == 64 && sizeof(_To) == 32; 1858 [[maybe_unused]] constexpr bool __z_to_z 1859 = sizeof(__v0) == 64 && sizeof(_To) == 64; 1860 1861 // [if]X_to_i8 {{{2 1862 [[maybe_unused]] constexpr bool __i_to_i 1863 = is_integral_v<_Up> && is_integral_v<_Tp>; 1864 [[maybe_unused]] constexpr bool __i64_to_i8 1865 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; 1866 [[maybe_unused]] constexpr bool __f64_to_i8 1867 = is_integral_v<_Up> && sizeof(_Up) == 1 1868 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1869 1870 if constexpr (__i_to_i) // assert ISA {{{2 1871 { 1872 static_assert(__x_to_x || __have_avx2, 1873 "integral conversions with ymm registers require AVX2"); 1874 static_assert(__have_avx512bw 1875 || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) 1876 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), 1877 "8/16-bit integers in zmm registers require AVX512BW"); 1878 static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f, 1879 "integral conversions with ymm registers require AVX2"); 1880 } 1881 // concat => use 4-arg __convert_x86 {{{2 1882 if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) 1883 || (sizeof(__v0) == 16 && __have_avx 1884 && is_floating_point_v<_Tp>) 1885 || (sizeof(__v0) == 32 && __have_avx512f)) 1886 { 1887 // The ISA can handle wider input registers, so concat and use two-arg 1888 // implementation. This reduces code duplication considerably. 
1889 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), 1890 __concat(__v4, __v5), __concat(__v6, __v7)); 1891 } 1892 else //{{{2 1893 { 1894 // conversion using bit reinterpretation (or no conversion at all) 1895 // should all go through the concat branch above: 1896 static_assert( 1897 !(is_floating_point_v< 1898 _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); 1899 static_assert(!(8 * _Np < _M && sizeof(_To) > 16), 1900 "zero extension should be impossible"); 1901 if constexpr (__i64_to_i8) //{{{2 1902 { 1903 if constexpr (__x_to_x && __have_ssse3) 1904 { 1905 // unsure whether this is better than the variant below 1906 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 1907 __to_intrin( 1908 (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) 1909 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) 1910 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) 1911 | (((__v6 & 0xff) << 48) | (__v7 << 56)))), 1912 _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 1913 7, 15))); 1914 } 1915 else if constexpr (__x_to_x) 1916 { 1917 const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac 1918 const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd 1919 const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg 1920 const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh 1921 const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik 1922 const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl 1923 const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo 1924 const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np 1925 return __intrin_bitcast<_To>(_mm_unpacklo_epi64( 1926 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd 1927 _mm_unpacklo_epi8(__c, __d)), // efgh 1928 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl 1929 _mm_unpacklo_epi8(__g, __h)) // mnop 1930 )); 1931 } 1932 else if constexpr (__y_to_y) 1933 { 1934 auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV 1935 __to_intrin( 1936 (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) 1937 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) 1938 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) 1939 | (((__v6 & 0xff) << 48) | ((__v7 << 56))))); 1940 /* 1941 auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D 1942 HLPT 37BF JNRV 37BF JNRV auto __c = _mm256_unpacklo_epi8(__a, 1943 __b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV auto __d = 1944 __xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV return 1945 _mm256_shuffle_epi8( 1946 __d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 1947 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 1948 14, 15)); 1949 */ 1950 auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF 1951 // IJMN QRUV 1952 __a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 1953 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11, 1954 4, 12, 5, 13, 6, 14, 7, 15)); 1955 auto __c 1956 = __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV 1957 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 1958 __c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 1959 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 1960 4, 5, 12, 13, 6, 7, 14, 15))); 1961 } 1962 else if constexpr (__z_to_z) 1963 { 1964 return __concat( 1965 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, 1966 __v3), 1967 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, 1968 __v7)); 1969 } 1970 } 1971 else if constexpr (__f64_to_i8) //{{{2 1972 { 1973 return __convert_x86<_To>( 1974 __convert_x86<__vector_type_t
<int, _Np * 2>>(__v0, __v1), 1975 __convert_x86<__vector_type_t<int, _Np * 2>
>(__v2, __v3), 1976 __convert_x86<__vector_type_t<int, _Np * 2>
>(__v4, __v5), 1977 __convert_x86<__vector_type_t<int, _Np * 2>
>(__v6, __v7)); 1978 } 1979 else // unreachable {{{2 1980 __assert_unreachable<_Tp>(); 1981 //}}} 1982 1983 // fallback: {{{2 1984 if constexpr (sizeof(_To) >= 32) 1985 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm 1986 return __concat( 1987 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3), 1988 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, 1989 __v7)); 1990 else if constexpr (sizeof(_To) == 16) 1991 { 1992 const auto __lo 1993 = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3)); 1994 const auto __hi 1995 = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7)); 1996 static_assert(sizeof(_Up) == 1 && _Np == 2); 1997 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); 1998 } 1999 else 2000 { 2001 __assert_unreachable<_Tp>(); 2002 // return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5, 2003 // __v6, __v7, 2004 // make_index_sequence<_Np>()); 2005 } //}}}2 2006 } 2007 } 2008 2009 //}}} 2010 // 16-arg __convert_x86 {{{1 2011 template
<typename _To, typename _V, typename _Traits> 2012 _GLIBCXX_SIMD_INTRINSIC _To 2013 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, 2014 _V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12, 2015 _V __v13, _V __v14, _V __v15) 2016 { 2017 // concat => use 8-arg __convert_x86 2018 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), 2019 __concat(__v4, __v5), __concat(__v6, __v7), 2020 __concat(__v8, __v9), __concat(__v10, __v11), 2021 __concat(__v12, __v13), __concat(__v14, __v15)); 2022 } 2023 2024 //}}} 2025 2026 #endif // __cplusplus >= 201703L 2027 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H 2028 2029 // vim: foldmethod=marker
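
Usage note (editorial addition, not part of the header file above): the __convert_x86 overloads are internal helpers of libstdc++'s std::experimental::simd implementation. User code never calls them directly; it reaches them through the element-type conversion API of <experimental/simd>. The following minimal sketch assumes a C++17 compiler using libstdc++ and a hypothetical file name, built with e.g. g++ -std=c++17 -O2 -march=native example.cpp:

  #include <experimental/simd>
  #include <cstddef>
  #include <cstdio>

  namespace stdx = std::experimental;

  int main()
  {
    // Eight floats; the element count is chosen to map onto xmm/ymm registers.
    stdx::fixed_size_simd<float, 8> f([](int i) { return i * 1.5f; });

    // float -> int with equal element count: on x86 this is dispatched
    // through __convert_x86 (the __f32_to_s32 case in the header above).
    auto n = stdx::static_simd_cast<stdx::fixed_size_simd<int, 8>>(f);

    // int -> short narrows each element (the __i32_to_i16 case); the
    // multi-argument overloads pack several source registers into one result.
    auto s = stdx::static_simd_cast<stdx::fixed_size_simd<short, 8>>(n);

    for (std::size_t k = 0; k < s.size(); ++k)
      std::printf("%d ", (int) s[k]);
    std::printf("\n");
  }

Which of the one-, two-, four-, eight-, or sixteen-argument overloads runs, and whether a specialized branch such as __i64_to_i16 or the generic __vector_convert fallback is taken, depends on the element types, the register widths involved, and the ISA extensions (SSE4.1/AVX/AVX2/AVX512) enabled at compile time.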