1 /* 2 * Simple C functions to supplement the C library 3 * 4 * Copyright (c) 2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include "qemu/osdep.h" 25 #include "qemu/cutils.h" 26 #include "qemu/bswap.h" 27 #include "host/cpuinfo.h" 28 29 static bool (*buffer_is_zero_accel)(const void *, size_t); 30 31 static bool buffer_is_zero_integer(const void *buf, size_t len) 32 { 33 if (unlikely(len < 8)) { 34 /* For a very small buffer, simply accumulate all the bytes. */ 35 const unsigned char *p = buf; 36 const unsigned char *e = buf + len; 37 unsigned char t = 0; 38 39 do { 40 t |= *p++; 41 } while (p < e); 42 43 return t == 0; 44 } else { 45 /* Otherwise, use the unaligned memory access functions to 46 handle the beginning and end of the buffer, with a couple 47 of loops handling the middle aligned section. */ 48 uint64_t t = ldq_he_p(buf); 49 const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8); 50 const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8); 51 52 for (; p + 8 <= e; p += 8) { 53 __builtin_prefetch(p + 8); 54 if (t) { 55 return false; 56 } 57 t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7]; 58 } 59 while (p < e) { 60 t |= *p++; 61 } 62 t |= ldq_he_p(buf + len - 8); 63 64 return t == 0; 65 } 66 } 67 68 #if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) 69 #include <immintrin.h> 70 71 /* Note that each of these vectorized functions require len >= 64. */ 72 73 static bool __attribute__((target("sse2"))) 74 buffer_zero_sse2(const void *buf, size_t len) 75 { 76 __m128i t = _mm_loadu_si128(buf); 77 __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16); 78 __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16); 79 __m128i zero = _mm_setzero_si128(); 80 81 /* Loop over 16-byte aligned blocks of 64. */ 82 while (likely(p <= e)) { 83 __builtin_prefetch(p); 84 t = _mm_cmpeq_epi8(t, zero); 85 if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) { 86 return false; 87 } 88 t = p[-4] | p[-3] | p[-2] | p[-1]; 89 p += 4; 90 } 91 92 /* Finish the aligned tail. */ 93 t |= e[-3]; 94 t |= e[-2]; 95 t |= e[-1]; 96 97 /* Finish the unaligned tail. */ 98 t |= _mm_loadu_si128(buf + len - 16); 99 100 return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF; 101 } 102 103 #ifdef CONFIG_AVX2_OPT 104 static bool __attribute__((target("avx2"))) 105 buffer_zero_avx2(const void *buf, size_t len) 106 { 107 /* Begin with an unaligned head of 32 bytes. */ 108 __m256i t = _mm256_loadu_si256(buf); 109 __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32); 110 __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32); 111 112 /* Loop over 32-byte aligned blocks of 128. */ 113 while (p <= e) { 114 __builtin_prefetch(p); 115 if (unlikely(!_mm256_testz_si256(t, t))) { 116 return false; 117 } 118 t = p[-4] | p[-3] | p[-2] | p[-1]; 119 p += 4; 120 } ; 121 122 /* Finish the last block of 128 unaligned. */ 123 t |= _mm256_loadu_si256(buf + len - 4 * 32); 124 t |= _mm256_loadu_si256(buf + len - 3 * 32); 125 t |= _mm256_loadu_si256(buf + len - 2 * 32); 126 t |= _mm256_loadu_si256(buf + len - 1 * 32); 127 128 return _mm256_testz_si256(t, t); 129 } 130 #endif /* CONFIG_AVX2_OPT */ 131 132 static unsigned __attribute__((noinline)) 133 select_accel_cpuinfo(unsigned info) 134 { 135 /* Array is sorted in order of algorithm preference. */ 136 static const struct { 137 unsigned bit; 138 bool (*fn)(const void *, size_t); 139 } all[] = { 140 #ifdef CONFIG_AVX2_OPT 141 { CPUINFO_AVX2, buffer_zero_avx2 }, 142 #endif 143 { CPUINFO_SSE2, buffer_zero_sse2 }, 144 { CPUINFO_ALWAYS, buffer_is_zero_integer }, 145 }; 146 147 for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) { 148 if (info & all[i].bit) { 149 buffer_is_zero_accel = all[i].fn; 150 return all[i].bit; 151 } 152 } 153 return 0; 154 } 155 156 static unsigned used_accel; 157 158 static void __attribute__((constructor)) init_accel(void) 159 { 160 used_accel = select_accel_cpuinfo(cpuinfo_init()); 161 } 162 163 #define INIT_ACCEL NULL 164 165 bool test_buffer_is_zero_next_accel(void) 166 { 167 /* 168 * Accumulate the accelerators that we've already tested, and 169 * remove them from the set to test this round. We'll get back 170 * a zero from select_accel_cpuinfo when there are no more. 171 */ 172 unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel); 173 used_accel |= used; 174 return used; 175 } 176 #else 177 bool test_buffer_is_zero_next_accel(void) 178 { 179 return false; 180 } 181 182 #define INIT_ACCEL buffer_is_zero_integer 183 #endif 184 185 static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL; 186 187 bool buffer_is_zero_ool(const void *buf, size_t len) 188 { 189 if (unlikely(len == 0)) { 190 return true; 191 } 192 if (!buffer_is_zero_sample3(buf, len)) { 193 return false; 194 } 195 /* All bytes are covered for any len <= 3. */ 196 if (unlikely(len <= 3)) { 197 return true; 198 } 199 200 if (likely(len >= 256)) { 201 return buffer_is_zero_accel(buf, len); 202 } 203 return buffer_is_zero_integer(buf, len); 204 } 205 206 bool buffer_is_zero_ge256(const void *buf, size_t len) 207 { 208 return buffer_is_zero_accel(buf, len); 209 } 210