/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"

static bool (*buffer_is_zero_accel)(const void *, size_t);

static bool buffer_is_zero_integer(const void *buf, size_t len)
{
    if (unlikely(len < 8)) {
        /* For a very small buffer, simply accumulate all the bytes. */
        const unsigned char *p = buf;
        const unsigned char *e = buf + len;
        unsigned char t = 0;

        do {
            t |= *p++;
        } while (p < e);

        return t == 0;
    } else {
        /* Otherwise, use the unaligned memory access functions to
           handle the beginning and end of the buffer, with a couple
           of loops handling the middle aligned section. */
        uint64_t t = ldq_he_p(buf);
        const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
        const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);

        for (; p + 8 <= e; p += 8) {
            if (t) {
                return false;
            }
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        }
        while (p < e) {
            t |= *p++;
        }
        t |= ldq_he_p(buf + len - 8);

        return t == 0;
    }
}

#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/* Helper for preventing the compiler from reassociating
   chains of binary vector operations. */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))

/* Note that these vectorized functions may assume len >= 256. */

static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries. */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}

#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries. */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /* Loop over complete 256-byte blocks. */
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here. */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */

static unsigned __attribute__((noinline))
select_accel_cpuinfo(unsigned info)
{
    /* Array is sorted in order of algorithm preference. */
    static const struct {
        unsigned bit;
        bool (*fn)(const void *, size_t);
    } all[] = {
#ifdef CONFIG_AVX2_OPT
        { CPUINFO_AVX2, buffer_zero_avx2 },
#endif
        { CPUINFO_SSE2, buffer_zero_sse2 },
        { CPUINFO_ALWAYS, buffer_is_zero_integer },
    };

    for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
        if (info & all[i].bit) {
            buffer_is_zero_accel = all[i].fn;
            return all[i].bit;
        }
    }
    return 0;
}

static unsigned used_accel;

static void __attribute__((constructor)) init_accel(void)
{
    used_accel = select_accel_cpuinfo(cpuinfo_init());
}

#define INIT_ACCEL NULL

bool test_buffer_is_zero_next_accel(void)
{
    /*
     * Accumulate the accelerators that we've already tested, and
     * remove them from the set to test this round.  We'll get back
     * a zero from select_accel_cpuinfo when there are no more.
     */
    unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel);
    used_accel |= used;
    return used;
}
#else
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}

#define INIT_ACCEL buffer_is_zero_integer
#endif

static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL;

bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* All bytes are covered for any len <= 3. */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_integer(buf, len);
}

bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
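
/*
 * The block below is an illustrative usage sketch, not part of the original
 * file: it shows how the out-of-line entry point and the accelerator-cycling
 * hook above might be exercised, in the spirit of a unit test.  The
 * BUFFER_IS_ZERO_DEMO guard and the buffer_is_zero_demo() name are
 * hypothetical, chosen only so the sketch stays out of normal builds.
 */
#ifdef BUFFER_IS_ZERO_DEMO
static bool buffer_is_zero_demo(void)
{
    /* Static storage guarantees the buffer starts out all-zero. */
    static unsigned char buf[4096];
    bool ok = true;

    /* Run the same checks once per available accelerator. */
    do {
        /* An all-zero buffer must be reported as zero. */
        ok &= buffer_is_zero_ool(buf, sizeof(buf));

        /*
         * Dirty a byte away from the start, midpoint and end of the buffer,
         * so the full scan, not only the three-byte sampling check done
         * before it, has to notice it.
         */
        buf[sizeof(buf) / 3] = 0x01;
        ok &= !buffer_is_zero_ool(buf, sizeof(buf));
        buf[sizeof(buf) / 3] = 0;
    } while (test_buffer_is_zero_next_accel());

    return ok;
}
#endif /* BUFFER_IS_ZERO_DEMO */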