1 // Copyright 2021, 2023 Alyssa Ross <hi@alyssa.is>
2 // SPDX-License-Identifier: Apache-2.0
3
4 use std::cell::RefCell;
5 use std::collections::BTreeSet;
6 use std::fs::{read_dir, File};
7 use std::io::{self, ErrorKind, Read, Write};
8 use std::iter::once;
9 use std::mem::{size_of, MaybeUninit};
10 use std::os::unix::prelude::*;
11 use std::process::exit;
12 use std::ptr::null_mut;
13
14 use arch::_NSIG;
15 use hypervisor::HypervisorType;
16 use libc::{
17 c_int, c_void, close, fork, getpgrp, ioctl, pipe2, poll, pollfd, setsid, sigemptyset,
18 siginfo_t, signal, sigprocmask, syscall, tcgetpgrp, tcsetpgrp, SYS_close_range, EINVAL, ENOSYS,
19 ENOTTY, O_CLOEXEC, POLLERR, SIGCHLD, SIGWINCH, SIG_DFL, SIG_SETMASK, STDERR_FILENO, TIOCSCTTY,
20 };
21 use seccompiler::{apply_filter, BpfProgram, SeccompAction};
22 use vmm_sys_util::signal::register_signal_handler;
23
24 use crate::clone3::{clone3, clone_args, CLONE_CLEAR_SIGHAND};
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26
thread_local! {
    // Write end of the notification pipe. It is kept in a
    // thread-local static rather than a local variable so that it
    // can be reached from the SIGWINCH signal handler, which cannot
    // be handed any state of its own. The slot starts out as `None`
    // and is filled in by `sigwinch_listener_main`.
    static TX: RefCell<Option<File>> = const { RefCell::new(None) };
}
32
with_tx<R, F: FnOnce(&File) -> R>(f: F) -> R33 fn with_tx<R, F: FnOnce(&File) -> R>(f: F) -> R {
34 TX.with(|tx| f(tx.borrow().as_ref().unwrap()))
35 }
36
37 // This function has to be safe to call from a signal handler, and
38 // therefore must not panic.
notify()39 fn notify() {
40 if let Err(e) = with_tx(|mut tx| tx.write_all(b"\n")) {
41 if e.kind() == ErrorKind::BrokenPipe {
42 exit(0);
43 }
44 exit(1);
45 }
46 }
47
/// Signal handler installed for `SIGWINCH` by `sigwinch_listener_main`.
///
/// All three arguments supplied by the signal machinery are ignored;
/// the handler only forwards a notification through `notify`.
extern "C" fn sigwinch_handler(_signo: c_int, _info: *mut siginfo_t, _unused: *mut c_void) {
    notify();
}
51
unblock_all_signals() -> io::Result<()>52 fn unblock_all_signals() -> io::Result<()> {
53 let mut set = MaybeUninit::uninit();
54 // SAFETY: set is a correct structure for sigemptyset
55 if unsafe { sigemptyset(set.as_mut_ptr()) } == -1 {
56 return Err(io::Error::last_os_error());
57 }
58 // SAFETY: set is initialized above
59 let set = unsafe { set.assume_init() };
60
61 // SAFETY: all arguments are correct
62 if unsafe { sigprocmask(SIG_SETMASK, &set, null_mut()) } == -1 {
63 return Err(io::Error::last_os_error());
64 }
65
66 Ok(())
67 }
68
69 /// # Safety
70 ///
71 /// Caller is responsible for ensuring all file descriptors not listed
72 /// in `keep_fds` are not accessed after this point, and that no other
73 /// thread is opening file descriptors while this function is
74 /// running.
close_fds_fallback(keep_fds: &BTreeSet<RawFd>)75 unsafe fn close_fds_fallback(keep_fds: &BTreeSet<RawFd>) {
76 // We collect these instead of iterating through them, because we
77 // don't want to close the descriptor for /proc/self/fd while
78 // we're iterating through it.
79 let open_fds: BTreeSet<RawFd> = read_dir("/proc/self/fd")
80 .unwrap()
81 .map(Result::unwrap)
82 .filter_map(|s| s.file_name().into_string().ok()?.parse().ok())
83 .collect();
84
85 for fd in open_fds.difference(keep_fds) {
86 close(*fd);
87 }
88 }
89
90 /// # Safety
91 ///
92 /// Caller is responsible for ensuring all file descriptors not listed
93 /// in `keep_fds` are not accessed after this point, and that no other
94 /// thread is opening file descriptors while this function is
95 /// running.
close_unused_fds(keep_fds: &mut [RawFd])96 unsafe fn close_unused_fds(keep_fds: &mut [RawFd]) {
97 keep_fds.sort();
98
99 // Iterate over the gaps between descriptors we want to keep.
100 let firsts = keep_fds.iter().map(|fd| fd + 1);
101 for (i, first) in once(0).chain(firsts).enumerate() {
102 // The next fd is the one at i, because the indexes in the
103 // iterator are offset by one due to the initial 0.
104 let next_keep_fd = keep_fds.get(i);
105 let last = next_keep_fd.map(|fd| fd - 1).unwrap_or(RawFd::MAX);
106
107 if first > last {
108 continue;
109 }
110
111 if syscall(SYS_close_range, first, last, 0) == -1 {
112 // The kernel might be too old to have close_range, in
113 // which case we need to fall back to an uglier method.
114 let e = io::Error::last_os_error();
115 if e.raw_os_error() == Some(ENOSYS) {
116 return close_fds_fallback(&keep_fds.iter().copied().collect());
117 }
118
119 panic!("close_range: {e}");
120 }
121 }
122 }
123
set_foreground_process_group(tty: &File) -> io::Result<()>124 fn set_foreground_process_group(tty: &File) -> io::Result<()> {
125 // SAFETY: trivially safe.
126 let my_pgrp = unsafe { getpgrp() };
127 // SAFETY: we have borrowed tty.
128 let tty_pgrp = unsafe { tcgetpgrp(tty.as_raw_fd()) };
129
130 if tty_pgrp == -1 {
131 let e = io::Error::last_os_error();
132 if e.raw_os_error() != Some(ENOTTY) {
133 return Err(e);
134 }
135 }
136 if tty_pgrp == my_pgrp {
137 return Ok(());
138 }
139
140 // SAFETY: trivially safe.
141 let my_pgrp = unsafe { setsid() };
142 if my_pgrp == -1 {
143 return Err(io::Error::last_os_error());
144 }
145
146 // Set the tty to be this process's controlling terminal.
147 // SAFETY: we have borrowed tty.
148 if unsafe { ioctl(tty.as_raw_fd(), TIOCSCTTY, 0) } == -1 {
149 return Err(io::Error::last_os_error());
150 }
151
152 // Become the foreground process group of the tty.
153 // SAFETY: we have borrowed tty.
154 if unsafe { tcsetpgrp(tty.as_raw_fd(), my_pgrp) } == -1 {
155 return Err(io::Error::last_os_error());
156 }
157
158 Ok(())
159 }
160
/// Body of the listener child process created by
/// `start_sigwinch_listener`; it never returns.
///
/// The steps, in order:
///
/// 1. Close every file descriptor except stderr, `tx` and `tty`.
/// 2. Store `tx` in `TX` so the signal handler can reach it.
/// 3. Unblock all signals and, if `seccomp_filter` is non-empty,
///    apply it.
/// 4. Install `sigwinch_handler` for `SIGWINCH`.
/// 5. Make this process the foreground process group of `tty`, then
///    drop `tty`.
/// 6. Send one notification through `tx`; the parent waits for it as
///    the readiness signal.
/// 7. Block in `poll` on `tx` until the pipe reports an error
///    condition, then exit with status 0.
///
/// # Panics
///
/// Panics if any setup step fails, if `poll` fails with an error
/// other than `Interrupted`/`WouldBlock`, or if `poll` returns with
/// anything other than exactly `POLLERR` set.
fn sigwinch_listener_main(seccomp_filter: BpfProgram, tx: File, tty: File) -> ! {
    // SAFETY: any references to these file descriptors are
    // unreachable, because this function never returns.
    unsafe {
        close_unused_fds(&mut [STDERR_FILENO, tx.as_raw_fd(), tty.as_raw_fd()]);
    }

    // Hand the pipe over to the thread-local slot read by `with_tx`
    // (and therefore by `notify` and the signal handler).
    TX.with(|opt| opt.replace(Some(tx)));

    unblock_all_signals().unwrap();

    // An empty program means there is nothing to install.
    if !seccomp_filter.is_empty() {
        apply_filter(&seccomp_filter).unwrap();
    }

    register_signal_handler(SIGWINCH, sigwinch_handler).unwrap();

    set_foreground_process_group(&tty).unwrap();
    drop(tty);

    // Initial notification: the parent blocks on reading one byte
    // before it considers the listener ready.
    notify();

    // Wait for the pipe to close, indicating the parent has exited.
    with_tx(|tx| {
        // No events are requested, so poll should only return for
        // conditions it reports regardless of `events`; the
        // assertion below expects that to be POLLERR.
        let mut pollfd = pollfd {
            fd: tx.as_raw_fd(),
            events: 0,
            revents: 0,
        };

        // Retry when poll is interrupted (for example by the
        // SIGWINCH handler running); any other failure is fatal.
        // SAFETY: FFI call with valid arguments
        while unsafe { poll(&mut pollfd, 1, -1) } == -1 {
            let e = io::Error::last_os_error();
            assert!(
                matches!(e.kind(), ErrorKind::Interrupted | ErrorKind::WouldBlock),
                "poll: {e}"
            );
        }

        assert_eq!(pollfd.revents, POLLERR);
    });

    exit(0);
}
205
206 /// # Safety
207 ///
208 /// Same as [`fork`].
clone_clear_sighand() -> io::Result<u64>209 unsafe fn clone_clear_sighand() -> io::Result<u64> {
210 let mut args = clone_args {
211 exit_signal: SIGCHLD as u64,
212 ..Default::default()
213 };
214 args.flags |= CLONE_CLEAR_SIGHAND;
215 let r = clone3(&mut args, size_of::<clone_args>());
216 if r != -1 {
217 return Ok(r.try_into().unwrap());
218 }
219 let e = io::Error::last_os_error();
220 if e.raw_os_error() != Some(ENOSYS) && e.raw_os_error() != Some(EINVAL) {
221 return Err(e);
222 }
223
224 // If CLONE_CLEAR_SIGHAND isn't available, fall back to resetting
225 // all the signal handlers one by one.
226 let r = fork();
227 if r == -1 {
228 return Err(io::Error::last_os_error());
229 }
230 if r == 0 {
231 for signum in 1.._NSIG {
232 let _ = signal(signum, SIG_DFL);
233 }
234 }
235 Ok(r.try_into().unwrap())
236 }
237
start_sigwinch_listener(seccomp_filter: BpfProgram, tty_sub: File) -> io::Result<File>238 pub fn start_sigwinch_listener(seccomp_filter: BpfProgram, tty_sub: File) -> io::Result<File> {
239 let mut pipe = [-1; 2];
240 // SAFETY: FFI call with valid arguments
241 if unsafe { pipe2(pipe.as_mut_ptr(), O_CLOEXEC) } == -1 {
242 return Err(io::Error::last_os_error());
243 }
244
245 // SAFETY: pipe[0] is valid
246 let mut rx = unsafe { File::from_raw_fd(pipe[0]) };
247 // SAFETY: pipe[1] is valid
248 let tx = unsafe { File::from_raw_fd(pipe[1]) };
249
250 // SAFETY: FFI call
251 if unsafe { clone_clear_sighand() }? == 0 {
252 sigwinch_listener_main(seccomp_filter, tx, tty_sub);
253 }
254
255 drop(tx);
256
257 // Wait for a notification indicating readiness.
258 rx.read_exact(&mut [0])?;
259
260 Ok(rx)
261 }
262
listen_for_sigwinch_on_tty( pty_sub: File, seccomp_action: &SeccompAction, hypervisor_type: HypervisorType, ) -> std::io::Result<File>263 pub fn listen_for_sigwinch_on_tty(
264 pty_sub: File,
265 seccomp_action: &SeccompAction,
266 hypervisor_type: HypervisorType,
267 ) -> std::io::Result<File> {
268 let seccomp_filter =
269 get_seccomp_filter(seccomp_action, Thread::PtyForeground, hypervisor_type).unwrap();
270
271 let console_resize_pipe = start_sigwinch_listener(seccomp_filter, pty_sub)?;
272
273 Ok(console_resize_pipe)
274 }
275