xref: /qemu/util/bufferiszero.c (revision 8212ff86f4405b6128d89dd1d97ff2d6cfcf9842)
1 /*
2  * Simple C functions to supplement the C library
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu/osdep.h"
25 #include "qemu-common.h"
26 #include "qemu/cutils.h"
27 #include "qemu/bswap.h"
28 
29 
30 /* vector definitions */
31 
/*
 * Declared here but not defined in this file.  It is referenced only from
 * the final "else" branch of ACCEL_BUFFER_ZERO, which is reached when SIZE
 * is neither 4 nor 8 times sizeof(VECTYPE).
 * NOTE(review): presumably the symbol is intentionally left undefined so
 * that an unsupported instantiation fails at link time -- confirm.
 */
extern void link_error(void);

/*
 * ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO)
 *
 * Expands to the definition of
 *
 *     static bool NAME(const void *buf, size_t len)
 *
 * which returns true when every chunk it examines is all zero bits and
 * false as soon as one chunk contains a set bit.
 *
 *   NAME     identifier of the generated function.
 *   SIZE     number of bytes consumed per loop iteration; must be exactly
 *            4 or 8 times sizeof(VECTYPE).
 *   VECTYPE  vector type that the buffer is read as; it must support the
 *            "|" operator.
 *   NONZERO  macro taking one VECTYPE value and evaluating to true when
 *            that value has any bit set.
 *
 * Each iteration prefetches the following chunk, ORs together all the
 * vectors of the current chunk and tests the result once.
 *
 * The loop is a do/while that advances in whole SIZE-byte steps, so it
 * always reads at least one chunk and only stops once buf >= buf + len.
 * The generated function therefore relies on its caller passing a
 * non-zero len that is a multiple of SIZE and a buf that may be
 * dereferenced as VECTYPE; select_accel_fn() performs these length and
 * alignment checks and buffer_is_zero() filters out len == 0.
 *
 * NOTE(review): barrier() comes from a project header; presumably it is a
 * compiler barrier that keeps the prefetch ahead of the loads -- confirm.
 */
#define ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO)         \
static bool NAME(const void *buf, size_t len)                   \
{                                                               \
    const void *end = buf + len;                                \
    do {                                                        \
        const VECTYPE *p = buf;                                 \
        VECTYPE t;                                              \
        __builtin_prefetch(buf + SIZE);                         \
        barrier();                                              \
        if (SIZE == sizeof(VECTYPE) * 4) {                      \
            t = (p[0] | p[1]) | (p[2] | p[3]);                  \
        } else if (SIZE == sizeof(VECTYPE) * 8) {               \
            t  = p[0] | p[1];                                   \
            t |= p[2] | p[3];                                   \
            t |= p[4] | p[5];                                   \
            t |= p[6] | p[7];                                   \
        } else {                                                \
            link_error();                                       \
        }                                                       \
        if (unlikely(NONZERO(t))) {                             \
            return false;                                       \
        }                                                       \
        buf += SIZE;                                            \
    } while (buf < end);                                        \
    return true;                                                \
}
60 
/*
 * Portable zero check built on plain integer loads.
 *
 * buf: start of the region to examine (any alignment).
 * len: number of bytes to examine; the caller must pass len > 0, since
 *      the short-buffer path always reads at least one byte.
 *
 * Returns true when every examined byte is zero.
 */
static bool
buffer_zero_int(const void *buf, size_t len)
{
    uint64_t acc;
    const uint64_t *word;
    const uint64_t *word_end;

    if (unlikely(len < 8)) {
        /* Too short for a 64-bit load: OR the individual bytes together. */
        const unsigned char *byte = buf;
        const unsigned char *byte_end = byte + len;
        unsigned char bits = 0;

        do {
            bits |= *byte;
            byte++;
        } while (byte < byte_end);

        return !bits;
    }

    /*
     * The head and the tail are covered by two (possibly overlapping)
     * unaligned 8-byte loads; everything in between is read as aligned
     * 64-bit words.
     */
    acc = ldq_he_p(buf);
    word = (const uint64_t *)(((uintptr_t)buf + 8) & -8);
    word_end = (const uint64_t *)(((uintptr_t)buf + len) & -8);

    /*
     * Unrolled loop over groups of eight words.  The accumulator is
     * tested at the top of each pass, i.e. it checks the head load or the
     * previous group while the next group is being prefetched.
     */
    while (word_end - word >= 8) {
        __builtin_prefetch(word + 8);
        if (acc != 0) {
            return false;
        }
        acc = word[0] | word[1] | word[2] | word[3]
            | word[4] | word[5] | word[6] | word[7];
        word += 8;
    }

    /* Fewer than eight aligned words remain; fold them in one by one. */
    for (; word < word_end; word++) {
        acc |= *word;
    }

    /* Finally the last eight bytes of the buffer, wherever they start. */
    acc |= ldq_he_p(buf + len - 8);

    return acc == 0;
}
98 
99 #if defined(CONFIG_AVX2_OPT) || (defined(CONFIG_CPUID_H) && defined(__SSE2__))
100 #include <cpuid.h>
101 
/* Do not use push_options pragmas unnecessarily, because clang
 * does not support them.  When the compiler already targets SSE2
 * (__SSE2__ is defined) the push/target/pop pragmas below are skipped.
 */
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#endif
#include <emmintrin.h>
/*
 * True when the 128-bit vector X has any non-zero byte: every byte is
 * compared against zero and the resulting 16-bit "byte is zero" mask is
 * checked for not being all ones.
 */
#define SSE2_NONZERO(X) \
    (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
/* buffer_zero_sse2(): examines 64 bytes (four __m128i) per iteration. */
ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
#ifndef __SSE2__
#pragma GCC pop_options
#endif
116 
/*
 * SSE4 and AVX2 variants, built only when CONFIG_AVX2_OPT is defined.
 * Each one is compiled inside its own push_options/target/pop_options
 * region so that the wider instruction set applies to that function only.
 */
#ifdef CONFIG_AVX2_OPT
#pragma GCC push_options
#pragma GCC target("sse4")
#include <smmintrin.h>
/* True when the 128-bit vector X has any bit set (X AND X is non-zero). */
#define SSE4_NONZERO(X)  !_mm_testz_si128((X), (X))
/* buffer_zero_sse4(): examines 64 bytes (four __m128i) per iteration. */
ACCEL_BUFFER_ZERO(buffer_zero_sse4, 64, __m128i, SSE4_NONZERO)
#pragma GCC pop_options

#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>
/* True when the 256-bit vector X has any bit set (X AND X is non-zero). */
#define AVX2_NONZERO(X)  !_mm256_testz_si256((X), (X))
/* buffer_zero_avx2(): examines 128 bytes (four __m256i) per iteration. */
ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO)
#pragma GCC pop_options
#endif
132 
/*
 * Feature bits stored in cpuid_cache.  Each value is a distinct single
 * bit, and the numeric order matters: test_buffer_is_zero_next_accel()
 * disables accelerators by clearing the lowest set bit, so AVX2 is
 * dropped first and SSE2 last.
 *
 * CACHE_AVX1 is recorded by init_cpuid_cache() but is not consulted by
 * select_accel_fn(); it still occupies one step of the sequence above.
 */
#define CACHE_AVX2    2
#define CACHE_AVX1    4
#define CACHE_SSE4    8
#define CACHE_SSE2    16

/*
 * Bitmask of CACHE_* flags for the accelerators currently enabled.
 * Written by init_cpuid_cache(), narrowed by
 * test_buffer_is_zero_next_accel() and read by select_accel_fn().
 */
static unsigned cpuid_cache;
139 
140 static void __attribute__((constructor)) init_cpuid_cache(void)
141 {
142     int max = __get_cpuid_max(0, NULL);
143     int a, b, c, d;
144     unsigned cache = 0;
145 
146     if (max >= 1) {
147         __cpuid(1, a, b, c, d);
148         if (d & bit_SSE2) {
149             cache |= CACHE_SSE2;
150         }
151 #ifdef CONFIG_AVX2_OPT
152         if (c & bit_SSE4_1) {
153             cache |= CACHE_SSE4;
154         }
155 
156         /* We must check that AVX is not just available, but usable.  */
157         if ((c & bit_OSXSAVE) && (c & bit_AVX)) {
158             __asm("xgetbv" : "=a"(a), "=d"(d) : "c"(0));
159             if ((a & 6) == 6) {
160                 cache |= CACHE_AVX1;
161                 if (max >= 7) {
162                     __cpuid_count(7, 0, a, b, c, d);
163                     if (b & bit_AVX2) {
164                         cache |= CACHE_AVX2;
165                     }
166                 }
167             }
168         }
169 #endif
170     }
171     cpuid_cache = cache;
172 }
173 
174 #define HAVE_NEXT_ACCEL
175 bool test_buffer_is_zero_next_accel(void)
176 {
177     /* If no bits set, we just tested buffer_zero_int, and there
178        are no more acceleration options to test.  */
179     if (cpuid_cache == 0) {
180         return false;
181     }
182     /* Disable the accelerator we used before and select a new one.  */
183     cpuid_cache &= cpuid_cache - 1;
184     return true;
185 }
186 
187 static bool select_accel_fn(const void *buf, size_t len)
188 {
189     uintptr_t ibuf = (uintptr_t)buf;
190 #ifdef CONFIG_AVX2_OPT
191     if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
192         return buffer_zero_avx2(buf, len);
193     }
194     if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) {
195         return buffer_zero_sse4(buf, len);
196     }
197 #endif
198     if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
199         return buffer_zero_sse2(buf, len);
200     }
201     return buffer_zero_int(buf, len);
202 }
203 
204 #else
/*
 * Neither CONFIG_AVX2_OPT nor (CONFIG_CPUID_H together with __SSE2__) is
 * defined: there is nothing to choose between, so the selector is simply
 * an alias for the integer implementation.
 */
#define select_accel_fn  buffer_zero_int
206 #endif
207 
#ifndef HAVE_NEXT_ACCEL
/*
 * Fallback used when the accelerated section above was not compiled in
 * (HAVE_NEXT_ACCEL is not defined).  There is no alternative
 * implementation to switch to, so this always returns false.
 */
bool test_buffer_is_zero_next_accel(void)
{
    return false;
}
#endif
214 
/*
 * Report whether a buffer consists solely of zero bytes.
 *
 * buf: start of the buffer.
 * len: its length in bytes.
 *
 * Returns true when all len bytes are zero; an empty buffer (len == 0)
 * is reported as zero without buf being touched.
 */
bool buffer_is_zero(const void *buf, size_t len)
{
    const bool empty = unlikely(len == 0);

    if (!empty) {
        /* Start pulling in the first cache line while the routine is chosen. */
        __builtin_prefetch(buf);

        /*
         * select_accel_fn picks an optimized routine when one applies
         * and otherwise the unrolled 64-bit integer loop.
         */
        return select_accel_fn(buf, len);
    }

    return true;
}
231