1 // SPDX-License-Identifier: LGPL-2.1
2 /*
3  * rseq.c
4  *
5  * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; only
10  * version 2.1 of the License.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  */
17 
18 #define _GNU_SOURCE
19 #include <errno.h>
20 #include <sched.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <syscall.h>
26 #include <assert.h>
27 #include <signal.h>
28 #include <limits.h>
29 #include <dlfcn.h>
30 #include <stddef.h>
31 #include <sys/auxv.h>
32 #include <linux/auxvec.h>
33 
34 #include <linux/compiler.h>
35 
36 #include "../kselftest.h"
37 #include "rseq.h"
38 
/*
 * Define weak versions to play nice with binaries that are statically linked
 * against a libc that doesn't support registering its own rseq.
 *
 * When no other definition overrides them, these file-scope objects are
 * zero-initialized; rseq_init() treats a zero size read through
 * libc_rseq_size_p as "libc has not provided a registration here".
 */
__weak ptrdiff_t __rseq_offset;
__weak unsigned int __rseq_size;
__weak unsigned int __rseq_flags;

/*
 * Pointers through which the libc-provided values are read. They start out
 * referring to the symbols above; rseq_init() may replace them with the
 * results of dlsym(RTLD_NEXT, ...), which it checks for NULL before use.
 */
static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
static const unsigned int *libc_rseq_size_p = &__rseq_size;
static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
50 
/* Offset from the thread pointer to the rseq area. */
ptrdiff_t rseq_offset;

/*
 * Size of the registered rseq area. 0 if the registration was
 * unsuccessful.
 *
 * The initial value is -1U. rseq_init() assigns it on every path (0 when
 * this file owns the registration and no thread has registered yet), and
 * rseq_exit() restores -1U when this file owns the registration.
 */
unsigned int rseq_size = -1U;

/* Flags used during rseq registration.  */
unsigned int rseq_flags;

/*
 * Non-zero when this file, rather than libc, is responsible for the rseq
 * registration. Set by rseq_init() and cleared by rseq_exit(); the
 * register/unregister entry points return 0 immediately while it is zero.
 */
static int rseq_ownership;
64 
/*
 * Allocate a large area for the TLS.
 *
 * This is also the upper bound that get_rseq_kernel_feature_size() asserts
 * for the AT_RSEQ_ALIGN and AT_RSEQ_FEATURE_SIZE auxiliary vector values.
 */
#define RSEQ_THREAD_AREA_ALLOC_SIZE	1024

/* Original struct rseq feature size is 20 bytes. */
#define ORIG_RSEQ_FEATURE_SIZE		20

/* Original struct rseq allocation size is 32 bytes. */
#define ORIG_RSEQ_ALLOC_SIZE		32
73 
/*
 * Use a union to ensure we allocate a TLS area of 1024 bytes to accommodate
 * an rseq registration that is larger than the current rseq ABI.
 *
 * The dummy member only pads the union to RSEQ_THREAD_AREA_ALLOC_SIZE; the
 * code in this file accesses the area through the abi member.
 */
union rseq_tls {
	struct rseq_abi abi;
	char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE];
};
82 
/*
 * Per-thread rseq area handed to sys_rseq() by
 * rseq_register_current_thread() and rseq_unregister_current_thread() when
 * this file owns the registration. Its cpu_id field starts out as
 * RSEQ_ABI_CPU_ID_UNINITIALIZED.
 */
static
__thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = {
	.abi = {
		.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
	},
};
89 
/*
 * Thin wrapper around the raw rseq system call.
 *
 * All four arguments are forwarded unchanged to syscall(__NR_rseq, ...).
 * The long returned by syscall() is handed back narrowed to int.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
		    int flags, uint32_t sig)
{
	long ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);

	return (int)ret;
}
95 
/*
 * Thin wrapper around the raw getcpu system call.
 *
 * @cpu and @node are forwarded as the first two arguments; the third
 * argument is always NULL. The long returned by syscall() is handed back
 * narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
	long ret = syscall(__NR_getcpu, cpu, node, NULL);

	return (int)ret;
}
100 
/*
 * Probe for the rseq system call.
 *
 * sys_rseq() is invoked with a NULL area and zero length, flags and
 * signature; that call is required to fail with -1, otherwise the process
 * aborts. The resulting errno decides the answer:
 *
 *   ENOSYS: return false.
 *   EINVAL: return true.
 *   other:  abort().
 */
bool rseq_available(void)
{
	if (sys_rseq(NULL, 0, 0, 0) != -1)
		abort();

	if (errno == ENOSYS)
		return false;
	if (errno == EINVAL)
		return true;

	/* Any other errno is unexpected for this probe. */
	abort();
}
117 
118 /* The rseq areas need to be at least 32 bytes. */
119 static
get_rseq_min_alloc_size(void)120 unsigned int get_rseq_min_alloc_size(void)
121 {
122 	unsigned int alloc_size = rseq_size;
123 
124 	if (alloc_size < ORIG_RSEQ_ALLOC_SIZE)
125 		alloc_size = ORIG_RSEQ_ALLOC_SIZE;
126 	return alloc_size;
127 }
128 
129 /*
130  * Return the feature size supported by the kernel.
131  *
132  * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE):
133  *
134  * 0:   Return ORIG_RSEQ_FEATURE_SIZE (20)
135  * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE).
136  *
137  * It should never return a value below ORIG_RSEQ_FEATURE_SIZE.
138  */
139 static
get_rseq_kernel_feature_size(void)140 unsigned int get_rseq_kernel_feature_size(void)
141 {
142 	unsigned long auxv_rseq_feature_size, auxv_rseq_align;
143 
144 	auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
145 	assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);
146 
147 	auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
148 	assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);
149 	if (auxv_rseq_feature_size)
150 		return auxv_rseq_feature_size;
151 	else
152 		return ORIG_RSEQ_FEATURE_SIZE;
153 }
154 
rseq_register_current_thread(void)155 int rseq_register_current_thread(void)
156 {
157 	int rc;
158 
159 	if (!rseq_ownership) {
160 		/* Treat libc's ownership as a successful registration. */
161 		return 0;
162 	}
163 	rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG);
164 	if (rc) {
165 		/*
166 		 * After at least one thread has registered successfully
167 		 * (rseq_size > 0), the registration of other threads should
168 		 * never fail.
169 		 */
170 		if (RSEQ_READ_ONCE(rseq_size) > 0) {
171 			/* Incoherent success/failure within process. */
172 			abort();
173 		}
174 		return -1;
175 	}
176 	assert(rseq_current_cpu_raw() >= 0);
177 
178 	/*
179 	 * The first thread to register sets the rseq_size to mimic the libc
180 	 * behavior.
181 	 */
182 	if (RSEQ_READ_ONCE(rseq_size) == 0) {
183 		RSEQ_WRITE_ONCE(rseq_size, get_rseq_kernel_feature_size());
184 	}
185 
186 	return 0;
187 }
188 
rseq_unregister_current_thread(void)189 int rseq_unregister_current_thread(void)
190 {
191 	int rc;
192 
193 	if (!rseq_ownership) {
194 		/* Treat libc's ownership as a successful unregistration. */
195 		return 0;
196 	}
197 	rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
198 	if (rc)
199 		return -1;
200 	return 0;
201 }
202 
203 static __attribute__((constructor))
rseq_init(void)204 void rseq_init(void)
205 {
206 	/*
207 	 * If the libc's registered rseq size isn't already valid, it may be
208 	 * because the binary is dynamically linked and not necessarily due to
209 	 * libc not having registered a restartable sequence.  Try to find the
210 	 * symbols if that's the case.
211 	 */
212 	if (!*libc_rseq_size_p) {
213 		libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
214 		libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
215 		libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
216 	}
217 	if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
218 			*libc_rseq_size_p != 0) {
219 		unsigned int libc_rseq_size;
220 
221 		/* rseq registration owned by glibc */
222 		rseq_offset = *libc_rseq_offset_p;
223 		libc_rseq_size = *libc_rseq_size_p;
224 		rseq_flags = *libc_rseq_flags_p;
225 
226 		/*
227 		 * Previous versions of glibc expose the value
228 		 * 32 even though the kernel only supported 20
229 		 * bytes initially. Therefore treat 32 as a
230 		 * special-case. glibc 2.40 exposes a 20 bytes
231 		 * __rseq_size without using getauxval(3) to
232 		 * query the supported size, while still allocating a 32
233 		 * bytes area. Also treat 20 as a special-case.
234 		 *
235 		 * Special-cases are handled by using the following
236 		 * value as active feature set size:
237 		 *
238 		 *   rseq_size = min(32, get_rseq_kernel_feature_size())
239 		 */
240 		switch (libc_rseq_size) {
241 		case ORIG_RSEQ_FEATURE_SIZE:
242 			fallthrough;
243 		case ORIG_RSEQ_ALLOC_SIZE:
244 		{
245 			unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();
246 
247 			if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
248 				rseq_size = rseq_kernel_feature_size;
249 			else
250 				rseq_size = ORIG_RSEQ_ALLOC_SIZE;
251 			break;
252 		}
253 		default:
254 			/* Otherwise just use the __rseq_size from libc as rseq_size. */
255 			rseq_size = libc_rseq_size;
256 			break;
257 		}
258 		return;
259 	}
260 	rseq_ownership = 1;
261 
262 	/* Calculate the offset of the rseq area from the thread pointer. */
263 	rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer();
264 
265 	/* rseq flags are deprecated, always set to 0. */
266 	rseq_flags = 0;
267 
268 	/*
269 	 * Set the size to 0 until at least one thread registers to mimic the
270 	 * libc behavior.
271 	 */
272 	rseq_size = 0;
273 }
274 
275 static __attribute__((destructor))
rseq_exit(void)276 void rseq_exit(void)
277 {
278 	if (!rseq_ownership)
279 		return;
280 	rseq_offset = 0;
281 	rseq_size = -1U;
282 	rseq_ownership = 0;
283 }
284 
/*
 * Return the value reported by sched_getcpu() for the calling thread.
 *
 * A negative result from sched_getcpu() is treated as fatal: the error is
 * reported with perror() and the process aborts.
 */
int32_t rseq_fallback_current_cpu(void)
{
	const int32_t cpu = sched_getcpu();

	if (cpu >= 0)
		return cpu;

	perror("sched_getcpu()");
	abort();
}
296 
/*
 * Return the node id reported by sys_getcpu() for the calling thread.
 *
 * When sys_getcpu() returns non-zero, the error is reported with perror()
 * and that return value is passed back to the caller instead of a node id.
 */
int32_t rseq_fallback_current_node(void)
{
	uint32_t cpu_id, node_id;
	const int ret = sys_getcpu(&cpu_id, &node_id);

	if (ret == 0)
		return (int32_t) node_id;

	perror("sys_getcpu()");
	return ret;
}
309