1 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
2 //
3 // Copyright © 2020 Intel Corporation
4 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
5 // Use of this source code is governed by a BSD-style license that can be
6 // found in the LICENSE-BSD-3-Clause file.
7
8 use std::fs::File;
9 use std::io::{self, Read};
10 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
11 use std::result;
12 use std::sync::atomic::AtomicBool;
13 use std::sync::{Arc, Barrier, Mutex};
14 use std::time::Instant;
15
16 use anyhow::anyhow;
17 use seccompiler::SeccompAction;
18 use serde::{Deserialize, Serialize};
19 use thiserror::Error;
20 use virtio_queue::{Queue, QueueT};
21 use vm_memory::{Bytes, GuestAddressSpace, GuestMemoryAtomic};
22 use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
23 use vmm_sys_util::eventfd::EventFd;
24
25 use super::{
26 ActivateError, ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler,
27 Error as DeviceError, VirtioCommon, VirtioDevice, VirtioDeviceType, EPOLL_HELPER_EVENT_LAST,
28 VIRTIO_F_VERSION_1,
29 };
30 use crate::seccomp_filters::Thread;
31 use crate::thread_helper::spawn_virtio_thread;
32 use crate::{GuestMemoryMmap, VirtioInterrupt, VirtioInterruptType};
33
// Size of the device's single virtqueue.
const QUEUE_SIZE: u16 = 8;
// Queue sizes exposed by the device: exactly one queue of QUEUE_SIZE entries.
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

// Epoll event id: new descriptors are pending on the virtio queue.
const QUEUE_AVAIL_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1;
// Epoll event id: the watchdog timer expired.
const TIMER_EXPIRED_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2;

// Interval, in seconds, at which the device checks whether there has been a ping.
// This needs to match what the driver is using.
const WATCHDOG_TIMER_INTERVAL: i64 = 15;

// Number of seconds since the last ping beyond which a reset is signalled
// (the check interval plus 5 seconds of slack).
const WATCHDOG_TIMEOUT: u64 = WATCHDOG_TIMER_INTERVAL as u64 + 5;
48
// Errors that can occur while processing the watchdog virtqueue.
#[derive(Error, Debug)]
enum Error {
    // Arming the timer fd on the first ping failed.
    #[error("Error programming timer fd")]
    TimerfdSetup(#[source] io::Error),
    // The descriptor chain popped from the queue contained no descriptor.
    #[error("Descriptor chain too short")]
    DescriptorChainTooShort,
    // Returning the descriptor to the used ring failed.
    #[error("Failed adding used index")]
    QueueAddUsed(#[source] virtio_queue::Error),
    // The ping descriptor was not write-only or had a length of zero.
    #[error("Invalid descriptor")]
    InvalidDescriptor,
    // Writing the ping reply into guest memory failed.
    #[error("Failed to write to guest memory")]
    GuestMemoryWrite(#[source] vm_memory::guest_memory::Error),
}
62
// State owned by the watchdog worker thread: it services the ping queue and
// checks, on every timer expiry, how long ago the last ping was received.
struct WatchdogEpollHandler {
    // Guest memory used to access the virtqueue and its descriptors.
    mem: GuestMemoryAtomic<GuestMemoryMmap>,
    // The device's single virtqueue, on which the driver submits pings.
    queue: Queue,
    // Used to raise the queue interrupt after descriptors have been returned.
    interrupt_cb: Arc<dyn VirtioInterrupt>,
    // Signalled when new descriptors are available on the queue.
    queue_evt: EventFd,
    // Handed to the EpollHelper together with pause_evt when the loop is set up.
    kill_evt: EventFd,
    pause_evt: EventFd,
    // timerfd armed on the first ping; its expiry drives the timeout check.
    timer: File,
    // Time of the most recent ping, shared with the `Watchdog` device.
    // `None` until the first ping has been received.
    last_ping_time: Arc<Mutex<Option<Instant>>>,
    // Written to when the watchdog triggers.
    reset_evt: EventFd,
}
74
75 impl WatchdogEpollHandler {
76 // The main queue is very simple - the driver "pings" the device by passing it a (write-only)
77 // descriptor. In response the device writes a 1 into the descriptor and returns it to the driver
process_queue(&mut self) -> result::Result<bool, Error>78 fn process_queue(&mut self) -> result::Result<bool, Error> {
79 let queue = &mut self.queue;
80 let mut used_descs = false;
81 while let Some(mut desc_chain) = queue.pop_descriptor_chain(self.mem.memory()) {
82 let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?;
83
84 if !(desc.is_write_only() && desc.len() > 0) {
85 return Err(Error::InvalidDescriptor);
86 }
87
88 desc_chain
89 .memory()
90 .write_obj(1u8, desc.addr())
91 .map_err(Error::GuestMemoryWrite)?;
92
93 // If this is the first "ping" then setup the timer
94 if self.last_ping_time.lock().unwrap().is_none() {
95 info!(
96 "First ping received. Starting timer (every {} seconds)",
97 WATCHDOG_TIMER_INTERVAL
98 );
99 timerfd_setup(&self.timer, WATCHDOG_TIMER_INTERVAL).map_err(Error::TimerfdSetup)?;
100 }
101 self.last_ping_time.lock().unwrap().replace(Instant::now());
102
103 queue
104 .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len())
105 .map_err(Error::QueueAddUsed)?;
106 used_descs = true;
107 }
108
109 Ok(used_descs)
110 }
111
signal_used_queue(&self) -> result::Result<(), DeviceError>112 fn signal_used_queue(&self) -> result::Result<(), DeviceError> {
113 self.interrupt_cb
114 .trigger(VirtioInterruptType::Queue(0))
115 .map_err(|e| {
116 error!("Failed to signal used queue: {:?}", e);
117 DeviceError::FailedSignalingUsedQueue(e)
118 })
119 }
120
run( &mut self, paused: Arc<AtomicBool>, paused_sync: Arc<Barrier>, ) -> result::Result<(), EpollHelperError>121 fn run(
122 &mut self,
123 paused: Arc<AtomicBool>,
124 paused_sync: Arc<Barrier>,
125 ) -> result::Result<(), EpollHelperError> {
126 let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?;
127 helper.add_event(self.queue_evt.as_raw_fd(), QUEUE_AVAIL_EVENT)?;
128 helper.add_event(self.timer.as_raw_fd(), TIMER_EXPIRED_EVENT)?;
129 helper.run(paused, paused_sync, self)?;
130
131 Ok(())
132 }
133 }
134
135 impl EpollHelperHandler for WatchdogEpollHandler {
handle_event( &mut self, _helper: &mut EpollHelper, event: &epoll::Event, ) -> result::Result<(), EpollHelperError>136 fn handle_event(
137 &mut self,
138 _helper: &mut EpollHelper,
139 event: &epoll::Event,
140 ) -> result::Result<(), EpollHelperError> {
141 let ev_type = event.data as u16;
142 match ev_type {
143 QUEUE_AVAIL_EVENT => {
144 self.queue_evt.read().map_err(|e| {
145 EpollHelperError::HandleEvent(anyhow!("Failed to get queue event: {:?}", e))
146 })?;
147
148 let needs_notification = self.process_queue().map_err(|e| {
149 EpollHelperError::HandleEvent(anyhow!("Failed to process queue : {:?}", e))
150 })?;
151 if needs_notification {
152 self.signal_used_queue().map_err(|e| {
153 EpollHelperError::HandleEvent(anyhow!(
154 "Failed to signal used queue: {:?}",
155 e
156 ))
157 })?;
158 }
159 }
160 TIMER_EXPIRED_EVENT => {
161 // When reading from the timerfd you get 8 bytes indicating
162 // the number of times this event has elapsed since the last read.
163 let mut buf = vec![0; 8];
164 self.timer.read_exact(&mut buf).map_err(|e| {
165 EpollHelperError::HandleEvent(anyhow!("Error reading from timer fd: {:}", e))
166 })?;
167
168 if let Some(last_ping_time) = self.last_ping_time.lock().unwrap().as_ref() {
169 let now = Instant::now();
170 let gap = now.duration_since(*last_ping_time).as_secs();
171 if gap > WATCHDOG_TIMEOUT {
172 error!("Watchdog triggered: {} seconds since last ping", gap);
173 self.reset_evt.write(1).ok();
174 }
175 }
176 }
177 _ => {
178 return Err(EpollHelperError::HandleEvent(anyhow!(
179 "Unexpected event: {}",
180 ev_type
181 )));
182 }
183 }
184 Ok(())
185 }
186 }
187
/// Virtio device for exposing a watchdog to the guest
pub struct Watchdog {
    // Shared virtio device state (features, queue sizes, pause/kill plumbing,
    // worker threads).
    common: VirtioCommon,
    // Device identifier, used in log/event messages and as the snapshot id.
    id: String,
    // Passed to `spawn_virtio_thread` when the worker thread is started.
    seccomp_action: SeccompAction,
    // Cloned into the worker, which writes to it when the watchdog triggers.
    reset_evt: EventFd,
    // Time of the most recent ping, shared with the worker thread.
    // `None` means no ping has been received yet.
    last_ping_time: Arc<Mutex<Option<Instant>>>,
    // timerfd driving the periodic timeout check; cloned into the worker.
    timer: File,
    // Passed to `spawn_virtio_thread` when the worker thread is started.
    exit_evt: EventFd,
}
198
/// Serializable state of a [`Watchdog`], used for snapshot and restore.
#[derive(Serialize, Deserialize)]
pub struct WatchdogState {
    /// Feature bits offered by the device.
    pub avail_features: u64,
    /// Feature bits acknowledged by the driver.
    pub acked_features: u64,
    /// Whether a ping had been received (i.e. the watchdog was in use) when
    /// the state was captured.
    pub enabled: bool,
}
205
206 impl Watchdog {
207 /// Create a new virtio watchdog device that will reboot VM if the guest hangs
new( id: String, reset_evt: EventFd, seccomp_action: SeccompAction, exit_evt: EventFd, state: Option<WatchdogState>, ) -> io::Result<Watchdog>208 pub fn new(
209 id: String,
210 reset_evt: EventFd,
211 seccomp_action: SeccompAction,
212 exit_evt: EventFd,
213 state: Option<WatchdogState>,
214 ) -> io::Result<Watchdog> {
215 let mut last_ping_time = None;
216 let (avail_features, acked_features, paused) = if let Some(state) = state {
217 info!("Restoring virtio-watchdog {}", id);
218
219 // When restoring enable the watchdog if it was previously enabled.
220 // We reset the timer to ensure that we don't unnecessarily reboot
221 // due to the offline time.
222 if state.enabled {
223 last_ping_time = Some(Instant::now());
224 }
225
226 (state.avail_features, state.acked_features, true)
227 } else {
228 (1u64 << VIRTIO_F_VERSION_1, 0, false)
229 };
230
231 let timer_fd = timerfd_create().map_err(|e| {
232 error!("Failed to create timer fd {}", e);
233 e
234 })?;
235 // SAFETY: timer_fd is a valid fd
236 let timer = unsafe { File::from_raw_fd(timer_fd) };
237
238 Ok(Watchdog {
239 common: VirtioCommon {
240 device_type: VirtioDeviceType::Watchdog as u32,
241 queue_sizes: QUEUE_SIZES.to_vec(),
242 paused_sync: Some(Arc::new(Barrier::new(2))),
243 avail_features,
244 acked_features,
245 min_queues: 1,
246 paused: Arc::new(AtomicBool::new(paused)),
247 ..Default::default()
248 },
249 id,
250 seccomp_action,
251 reset_evt,
252 last_ping_time: Arc::new(Mutex::new(last_ping_time)),
253 timer,
254 exit_evt,
255 })
256 }
257
state(&self) -> WatchdogState258 fn state(&self) -> WatchdogState {
259 WatchdogState {
260 avail_features: self.common.avail_features,
261 acked_features: self.common.acked_features,
262 enabled: self.last_ping_time.lock().unwrap().is_some(),
263 }
264 }
265
266 #[cfg(fuzzing)]
wait_for_epoll_threads(&mut self)267 pub fn wait_for_epoll_threads(&mut self) {
268 self.common.wait_for_epoll_threads();
269 }
270 }
271
272 impl Drop for Watchdog {
drop(&mut self)273 fn drop(&mut self) {
274 if let Some(kill_evt) = self.common.kill_evt.take() {
275 // Ignore the result because there is nothing we can do about it.
276 let _ = kill_evt.write(1);
277 }
278 self.common.wait_for_epoll_threads();
279 }
280 }
281
timerfd_create() -> Result<RawFd, io::Error>282 fn timerfd_create() -> Result<RawFd, io::Error> {
283 // SAFETY: FFI call, trivially safe
284 let res = unsafe { libc::timerfd_create(libc::CLOCK_MONOTONIC, 0) };
285 if res < 0 {
286 Err(io::Error::last_os_error())
287 } else {
288 Ok(res as RawFd)
289 }
290 }
291
timerfd_setup(timer: &File, secs: i64) -> Result<(), io::Error>292 fn timerfd_setup(timer: &File, secs: i64) -> Result<(), io::Error> {
293 let periodic = libc::itimerspec {
294 it_interval: libc::timespec {
295 tv_sec: secs,
296 tv_nsec: 0,
297 },
298 it_value: libc::timespec {
299 tv_sec: secs,
300 tv_nsec: 0,
301 },
302 };
303
304 let res =
305 // SAFETY: FFI call with correct arguments
306 unsafe { libc::timerfd_settime(timer.as_raw_fd(), 0, &periodic, std::ptr::null_mut()) };
307
308 if res < 0 {
309 Err(io::Error::last_os_error())
310 } else {
311 Ok(())
312 }
313 }
314
315 impl VirtioDevice for Watchdog {
device_type(&self) -> u32316 fn device_type(&self) -> u32 {
317 self.common.device_type
318 }
319
queue_max_sizes(&self) -> &[u16]320 fn queue_max_sizes(&self) -> &[u16] {
321 &self.common.queue_sizes
322 }
323
features(&self) -> u64324 fn features(&self) -> u64 {
325 self.common.avail_features
326 }
327
ack_features(&mut self, value: u64)328 fn ack_features(&mut self, value: u64) {
329 self.common.ack_features(value)
330 }
331
activate( &mut self, mem: GuestMemoryAtomic<GuestMemoryMmap>, interrupt_cb: Arc<dyn VirtioInterrupt>, mut queues: Vec<(usize, Queue, EventFd)>, ) -> ActivateResult332 fn activate(
333 &mut self,
334 mem: GuestMemoryAtomic<GuestMemoryMmap>,
335 interrupt_cb: Arc<dyn VirtioInterrupt>,
336 mut queues: Vec<(usize, Queue, EventFd)>,
337 ) -> ActivateResult {
338 self.common.activate(&queues, &interrupt_cb)?;
339 let (kill_evt, pause_evt) = self.common.dup_eventfds();
340
341 let reset_evt = self.reset_evt.try_clone().map_err(|e| {
342 error!("Failed to clone reset_evt eventfd: {}", e);
343 ActivateError::BadActivate
344 })?;
345
346 let timer = self.timer.try_clone().map_err(|e| {
347 error!("Failed to clone timer fd: {}", e);
348 ActivateError::BadActivate
349 })?;
350
351 let (_, queue, queue_evt) = queues.remove(0);
352
353 let mut handler = WatchdogEpollHandler {
354 mem,
355 queue,
356 interrupt_cb,
357 queue_evt,
358 kill_evt,
359 pause_evt,
360 timer,
361 last_ping_time: self.last_ping_time.clone(),
362 reset_evt,
363 };
364
365 let paused = self.common.paused.clone();
366 let paused_sync = self.common.paused_sync.clone();
367 let mut epoll_threads = Vec::new();
368
369 spawn_virtio_thread(
370 &self.id,
371 &self.seccomp_action,
372 Thread::VirtioWatchdog,
373 &mut epoll_threads,
374 &self.exit_evt,
375 move || handler.run(paused, paused_sync.unwrap()),
376 )?;
377
378 self.common.epoll_threads = Some(epoll_threads);
379
380 event!("virtio-device", "activated", "id", &self.id);
381 Ok(())
382 }
383
reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>>384 fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> {
385 let result = self.common.reset();
386 event!("virtio-device", "reset", "id", &self.id);
387 result
388 }
389 }
390
391 impl Pausable for Watchdog {
pause(&mut self) -> result::Result<(), MigratableError>392 fn pause(&mut self) -> result::Result<(), MigratableError> {
393 info!("Watchdog paused - disabling timer");
394 timerfd_setup(&self.timer, 0)
395 .map_err(|e| MigratableError::Pause(anyhow!("Error clearing timer: {:?}", e)))?;
396 self.common.pause()
397 }
398
resume(&mut self) -> result::Result<(), MigratableError>399 fn resume(&mut self) -> result::Result<(), MigratableError> {
400 // Reset the timer on pause if it was previously used
401 if self.last_ping_time.lock().unwrap().is_some() {
402 info!(
403 "Watchdog resumed - enabling timer (every {} seconds)",
404 WATCHDOG_TIMER_INTERVAL
405 );
406 self.last_ping_time.lock().unwrap().replace(Instant::now());
407 timerfd_setup(&self.timer, WATCHDOG_TIMER_INTERVAL)
408 .map_err(|e| MigratableError::Resume(anyhow!("Error setting timer: {:?}", e)))?;
409 }
410 self.common.resume()
411 }
412 }
413
414 impl Snapshottable for Watchdog {
id(&self) -> String415 fn id(&self) -> String {
416 self.id.clone()
417 }
418
snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError>419 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
420 Snapshot::new_from_state(&self.state())
421 }
422 }
423
// Empty impl: no `Transportable` methods are overridden for the watchdog.
impl Transportable for Watchdog {}
// Empty impl: no `Migratable` methods are overridden for the watchdog.
impl Migratable for Watchdog {}
426