xref: /qemu/util/bufferiszero.c (revision bf67aa3dd2d8b28d7618d8ec62cd9f6055366751)
1 /*
2  * Simple C functions to supplement the C library
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu/osdep.h"
25 #include "qemu/cutils.h"
26 #include "qemu/bswap.h"
27 #include "host/cpuinfo.h"
28 
/* Signature shared by every buffer-is-zero implementation: (buf, len). */
typedef bool (*biz_accel_fn)(const void *, size_t);
30 
/*
 * Portable integer check for buffers shorter than 256 bytes.
 * Caller guarantees len >= 4 (buffer_is_zero_ool handles len <= 3).
 */
static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
{
    /*
     * Tiny case, 4..8 bytes: two possibly-overlapping unaligned
     * 32-bit loads cover every byte.
     */
    if (unlikely(len <= 8)) {
        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
    }

    /*
     * Possibly-overlapping unaligned 64-bit loads cover the head and
     * tail of the buffer.
     */
    uint64_t acc = ldq_he_p(buf) | ldq_he_p(buf + len - 8);

    /* Aligned words strictly inside the head/tail loads. */
    const uint64_t *cur = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    const uint64_t *end = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* OR together 0 to 31 aligned words from the middle. */
    for (; cur < end; cur++) {
        acc |= *cur;
    }
    return acc == 0;
}
54 
/*
 * Portable integer check that a buffer is all zeroes.
 * Per the name and the loop-count comment below, callers must
 * guarantee len >= 256.
 */
static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
{
    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Collect a partial block at the tail end. */
    t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];

    /*
     * Loop over 64 byte blocks.
     * With the head and tail removed, e - p >= 30,
     * so the loop must iterate at least 3 times.
     *
     * Note the test of t is at the top of the loop: each iteration
     * checks the accumulation from the *previous* iteration (or, on
     * the first pass, from the head/tail gathering above), so the
     * final block's accumulation is checked by the return below.
     */
    do {
        if (t) {
            return false;
        }
        t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        p += 8;
    } while (p < e - 7);

    return t == 0;
}
83 
#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/*
 * Helper for preventing the compiler from reassociating
 * chains of binary vector operations: the "+x" constraints make both
 * vectors appear read and written by the (empty) asm, so the two
 * independent OR chains in the functions below stay interleaved as
 * written.
 */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))

/* Note that these vectorized functions may assume len >= 256.  */
92 
/*
 * SSE2 check that a buffer is all zeroes; callers must guarantee
 * len >= 256 (see note above).  The |= on __m128i values uses the
 * GCC/Clang generic vector extension (element-wise OR).
 */
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail.  */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries.  */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end, as two OR chains (v, w)
       kept separate by the barriers.  */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     *
     * The compare/movemask at the top tests the accumulation from the
     * previous iteration (first pass: the head/tail gathering above);
     * the last block's accumulation is tested by the return below.
     */
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    /* All bytes zero iff every byte of v compares equal to zero. */
    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}
137 
138 #ifdef CONFIG_AVX2_OPT
/*
 * AVX2 check that a buffer is all zeroes; callers must guarantee
 * len >= 256 (see note above).  Mirrors buffer_zero_sse2 with
 * 32-byte vectors and 256-byte blocks.
 */
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail.  */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries.  */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end, as two OR chains (v, w)
       kept separate by the barriers.  */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /* Loop over complete 256-byte blocks.  Unlike the SSE2 version
       this may iterate zero times; the final movemask below then
       checks the head/tail accumulation directly.  */
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here.  */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    /* All bytes zero iff every byte of v compares equal to zero. */
    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
179 #endif /* CONFIG_AVX2_OPT */
180 
/*
 * Acceleration candidates, ordered weakest to strongest; index 0 is
 * the portable integer fallback.  best_accel() picks an index into
 * this table, and test_buffer_is_zero_next_accel() walks it downward.
 */
static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_zero_sse2,
#ifdef CONFIG_AVX2_OPT
    buffer_zero_avx2,
#endif
};
188 
189 static unsigned best_accel(void)
190 {
191     unsigned info = cpuinfo_init();
192 
193 #ifdef CONFIG_AVX2_OPT
194     if (info & CPUINFO_AVX2) {
195         return 2;
196     }
197 #endif
198     return info & CPUINFO_SSE2 ? 1 : 0;
199 }
200 
#else
/* No vector acceleration compiled in: only the integer fallback. */
#define best_accel() 0
static biz_accel_fn const accel_table[1] = {
    buffer_is_zero_int_ge256
};
#endif
207 
/* Currently selected implementation; set by init_accel() below. */
static biz_accel_fn buffer_is_zero_accel;
/* Index of buffer_is_zero_accel within accel_table. */
static unsigned accel_index;
210 
/*
 * Out-of-line check that a buffer of any length is entirely zero.
 */
bool buffer_is_zero_ool(const void *buf, size_t len)
{
    /* An empty buffer is trivially zero; do not touch memory. */
    if (unlikely(len == 0)) {
        return true;
    }

    /* Cheap probe of a few sample bytes; most non-zero buffers
       are rejected here without a full scan. */
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* For any len <= 3 the samples covered every byte. */
    if (unlikely(len <= 3)) {
        return true;
    }

    /* Full scan: accelerated path for large buffers, scalar otherwise. */
    return likely(len >= 256)
           ? buffer_is_zero_accel(buf, len)
           : buffer_is_zero_int_lt256(buf, len);
}
229 
/*
 * As buffer_is_zero_ool, but the caller guarantees len >= 256, so the
 * sampling probe and small-buffer paths are skipped entirely.
 */
bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
234 
235 bool test_buffer_is_zero_next_accel(void)
236 {
237     if (accel_index != 0) {
238         buffer_is_zero_accel = accel_table[--accel_index];
239         return true;
240     }
241     return false;
242 }
243 
244 static void __attribute__((constructor)) init_accel(void)
245 {
246     accel_index = best_accel();
247     buffer_is_zero_accel = accel_table[accel_index];
248 }
249