// SPDX-License-Identifier: GPL-2.0-only
/*
 * eventfd support for mshv
 *
 * Heavily inspired by the KVM implementation of irqfd/ioeventfd. The basic
 * framework code is taken from the KVM implementation.
 *
 * All credits to the KVM developers.
 */

#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/eventfd.h>

#if IS_ENABLED(CONFIG_X86_64)
#include <asm/apic.h>
#endif
#include <asm/mshyperv.h>

#include "mshv_eventfd.h"
#include "mshv.h"
#include "mshv_root.h"

static struct workqueue_struct *irqfd_cleanup_wq;

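/*
 * Ack-notifier registration. Writers are serialized by pt_irq_lock;
 * readers walk the list under RCU (see mshv_notify_acked_gsi()), so
 * unregistration waits for a grace period before the caller may free
 * the notifier.
 */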
void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
				    struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
	mutex_unlock(&partition->pt_irq_lock);
}

void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
				      struct mshv_irq_ack_notifier *mian)
{
	mutex_lock(&partition->pt_irq_lock);
	hlist_del_init_rcu(&mian->link);
	mutex_unlock(&partition->pt_irq_lock);
	synchronize_rcu();
}

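/*
 * Run the ack callback of every notifier registered for @gsi.
 * Returns true if at least one notifier matched.
 */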
bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
{
	struct mshv_irq_ack_notifier *mian;
	bool acked = false;

	rcu_read_lock();
	hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
				 link) {
		if (mian->irq_ack_gsi == gsi) {
			mian->irq_acked(mian);
			acked = true;
		}
	}
	rcu_read_unlock();

	return acked;
}

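/*
 * On x86, the resampler ack path explicitly clears EXTINT-type virtual
 * interrupts in the hypervisor; ARM64 never needs the explicit clear,
 * so the helper compiles to a constant false there.
 */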
#if IS_ENABLED(CONFIG_ARM64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return false;
}
#elif IS_ENABLED(CONFIG_X86_64)
static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
{
	return type == HV_X64_INTERRUPT_TYPE_EXTINT;
}
#endif

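/*
 * Ack notifier for a resampler: the guest acked the level-triggered
 * interrupt, so signal the resample eventfd of every irqfd attached to
 * this GSI so userspace can re-check the line and re-assert if needed.
 */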
static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
{
	struct mshv_irqfd_resampler *resampler;
	struct mshv_partition *partition;
	struct mshv_irqfd *irqfd;
	int idx;

	resampler = container_of(mian, struct mshv_irqfd_resampler,
				 rsmplr_notifier);
	partition = resampler->rsmplr_partn;

	idx = srcu_read_lock(&partition->pt_irq_srcu);

	hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
				 irqfd_resampler_hnode) {
		if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
			hv_call_clear_virtual_interrupt(partition->pt_id);

		eventfd_signal(irqfd->irqfd_resamplefd);
	}

	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}

#if IS_ENABLED(CONFIG_X86_64)
static bool
mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
			    u32 vector)
{
	int i;

	for (i = 0; i < iv.vector_count; i++) {
		if (iv.vector[i] == vector)
			return true;
	}

	return false;
}

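/*
 * Publish @vector in the VP's shared register page with a lockless
 * cmpxchg on the 64-bit snapshot of the vector array. Returns 0 if the
 * vector was added or is already present, -ENOSPC if the array is
 * full, and -EAGAIN if the page changed underneath us and the caller
 * should retry.
 */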
static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
{
	union hv_vp_register_page_interrupt_vectors iv, new_iv;

	iv = vp->vp_register_page->interrupt_vectors;
	new_iv = iv;

	if (mshv_vp_irq_vector_injected(iv, vector))
		return 0;

	if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
		return -ENOSPC;

	new_iv.vector[new_iv.vector_count++] = vector;

	if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
		    iv.as_uint64, new_iv.as_uint64) != iv.as_uint64)
		return -EAGAIN;

	return 0;
}

static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
{
	int ret;

	do {
		ret = mshv_vp_irq_try_set_vector(vp, vector);
	} while (ret == -EAGAIN && !need_resched());

	return ret;
}

/*
 * Try to raise an interrupt for the guest via the shared vector array.
 * The hypervisor does the actual injection of the interrupt.
 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	struct mshv_vp *vp;

	if (!(ms_hyperv.ext_features &
	      HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
		return -EOPNOTSUPP;

	if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
		return -EOPNOTSUPP;

	if (irq->lapic_control.logical_dest_mode)
		return -EOPNOTSUPP;

	vp = partition->pt_vp_array[irq->lapic_apic_id];

	if (!vp->vp_register_page)
		return -EOPNOTSUPP;

	if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
		return -EINVAL;

	if (vp->run.flags.root_sched_dispatched &&
	    vp->vp_register_page->interrupt_vectors.as_uint64)
		return -EBUSY;

	wake_up(&vp->run.vp_suspend_queue);

	return 0;
}
#else /* CONFIG_X86_64 */
static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
{
	return -EOPNOTSUPP;
}
#endif

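/*
 * Slow interrupt assertion path: issue the assert hypercall directly.
 * If the irqfd is backed by a routing entry, wait out any in-flight
 * update of the cached entry (seqcount) first, and bail out if the
 * entry is no longer valid.
 */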
static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
{
	struct mshv_partition *partition = irqfd->irqfd_partn;
	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
	unsigned int seq;
	int idx;

	WARN_ON(irqfd->irqfd_resampler &&
		!irq->lapic_control.level_triggered);

	idx = srcu_read_lock(&partition->pt_irq_srcu);
	if (irqfd->irqfd_girq_ent.guest_irq_num) {
		if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
			srcu_read_unlock(&partition->pt_irq_srcu, idx);
			return;
		}

		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
	}

	hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
					 irq->lapic_vector, irq->lapic_apic_id,
					 irq->lapic_control);
	srcu_read_unlock(&partition->pt_irq_srcu, idx);
}

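/*
 * Detach an irqfd from its resampler. If it was the last user, also
 * unregister the resampler's ack notifier and free it.
 */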
static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
{
	struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
	struct mshv_partition *pt = rp->rsmplr_partn;

	mutex_lock(&pt->irqfds_resampler_lock);

	hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
	synchronize_srcu(&pt->pt_irq_srcu);

	if (hlist_empty(&rp->rsmplr_irqfd_list)) {
		hlist_del(&rp->rsmplr_hnode);
		mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
		kfree(rp);
	}

	mutex_unlock(&pt->irqfds_resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void mshv_irqfd_shutdown(struct work_struct *work)
{
	struct mshv_irqfd *irqfd =
			container_of(work, struct mshv_irqfd, irqfd_shutdown);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait);

	if (irqfd->irqfd_resampler) {
		mshv_irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->irqfd_resamplefd);
	}

	/*
	 * It is now safe to release the object's resources
	 */
	eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
	kfree(irqfd);
}

/* assumes partition->pt_irqfds_lock is held */
static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
{
	return !hlist_unhashed(&irqfd->irqfd_hnode);
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes partition->pt_irqfds_lock is held
 */
static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
{
	if (!mshv_irqfd_is_active(irqfd))
		return;

	hlist_del(&irqfd->irqfd_hnode);

	queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
			     int sync, void *key)
{
	struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
						irqfd_wait);
	unsigned long flags = (unsigned long)key;
	int idx;
	unsigned int seq;
	struct mshv_partition *pt = irqfd->irqfd_partn;
	int ret = 0;

	if (flags & POLLIN) {
		u64 cnt;

		eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
		idx = srcu_read_lock(&pt->pt_irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));

		/* An event has been signaled, raise an interrupt */
		ret = mshv_try_assert_irq_fast(irqfd);
		if (ret)
			mshv_assert_irq_slow(irqfd);

		srcu_read_unlock(&pt->pt_irq_srcu, idx);

		ret = 1;
	}

	if (flags & POLLHUP) {
		/* The eventfd is closing, detach from the partition */
		unsigned long flags;

		spin_lock_irqsave(&pt->pt_irqfds_lock, flags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the pt_irqfds_lock, since the item is
		 * deactivated on the mshv side before it is unhooked from
		 * the wait-queue.  If it is already deactivated, we can
		 * simply return, knowing the other side will clean up for
		 * us.  We cannot race against the irqfd going away, since
		 * the other side is required to acquire wqh->lock, which
		 * we hold.
		 */
		if (mshv_irqfd_is_active(irqfd))
			mshv_irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
	}

	return ret;
}

/* Must be called under pt_irqfds_lock */
static void mshv_irqfd_update(struct mshv_partition *pt,
			      struct mshv_irqfd *irqfd)
{
	write_seqcount_begin(&irqfd->irqfd_irqe_sc);
	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
						    irqfd->irqfd_irqnum);
	mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
	write_seqcount_end(&irqfd->irqfd_irqe_sc);
}

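/*
 * Called when the partition's guest interrupt routing changes:
 * re-resolve the routing entry cached in every registered irqfd.
 */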
void mshv_irqfd_routing_update(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;

	spin_lock_irq(&pt->pt_irqfds_lock);
	hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_update(pt, irqfd);
	spin_unlock_irq(&pt->pt_irqfds_lock);
}

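/*
 * poll_table callback: remember the eventfd's wait queue head and add
 * our wait entry to it so that mshv_irqfd_wakeup() runs on every
 * signal.
 */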
static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
				  poll_table *polltbl)
{
	struct mshv_irqfd *irqfd =
			container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);

	irqfd->irqfd_wqh = wqh;
	add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}

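/*
 * Bind an eventfd to a guest interrupt (GSI), optionally with a
 * resample eventfd for level-triggered interrupts. The irqfd hooks
 * into the eventfd's wait queue so that a signal asserts the interrupt
 * without a trip through userspace.
 */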
static int mshv_irqfd_assign(struct mshv_partition *pt,
			     struct mshv_user_irqfd *args)
{
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	struct mshv_irqfd *irqfd, *tmp;
	unsigned int events;
	struct fd f;
	int ret;
	int idx;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
	if (!irqfd)
		return -ENOMEM;

	irqfd->irqfd_partn = pt;
	irqfd->irqfd_irqnum = args->gsi;
	INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);

	f = fdget(args->fd);
	if (!fd_file(f)) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(fd_file(f));
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->irqfd_eventfd_ctx = eventfd;

	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
		struct mshv_irqfd_resampler *rp;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->irqfd_resamplefd = resamplefd;

		mutex_lock(&pt->irqfds_resampler_lock);

		hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
				     rsmplr_hnode) {
			if (rp->rsmplr_notifier.irq_ack_gsi ==
							 irqfd->irqfd_irqnum) {
				irqfd->irqfd_resampler = rp;
				break;
			}
		}

		if (!irqfd->irqfd_resampler) {
			rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT);
			if (!rp) {
				ret = -ENOMEM;
				mutex_unlock(&pt->irqfds_resampler_lock);
				goto fail;
			}

			rp->rsmplr_partn = pt;
			INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
			rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
			rp->rsmplr_notifier.irq_acked =
						      mshv_irqfd_resampler_ack;

			hlist_add_head(&rp->rsmplr_hnode,
				       &pt->irqfds_resampler_list);
			mshv_register_irq_ack_notifier(pt,
						       &rp->rsmplr_notifier);
			irqfd->irqfd_resampler = rp;
		}

		hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
				   &irqfd->irqfd_resampler->rsmplr_irqfd_list);

		mutex_unlock(&pt->irqfds_resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
	init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);

	spin_lock_irq(&pt->pt_irqfds_lock);
	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
	    !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
		/*
		 * A resample fd is only valid for level-triggered
		 * interrupts; otherwise fail.
		 */
		spin_unlock_irq(&pt->pt_irqfds_lock);
		ret = -EINVAL;
		goto fail;
	}
	ret = 0;
	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&pt->pt_irqfds_lock);
		goto fail;
	}

	idx = srcu_read_lock(&pt->pt_irq_srcu);
	mshv_irqfd_update(pt, irqfd);
	hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);

	if (events & POLLIN)
		mshv_assert_irq_slow(irqfd);

	srcu_read_unlock(&pt->pt_irq_srcu, idx);
	/*
	 * Do not drop the file until the irqfd is fully initialized,
	 * otherwise we might race against the POLLHUP.
	 */
	fdput(f);

	return 0;

fail:
	if (irqfd->irqfd_resampler)
		mshv_irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	fdput(f);

out:
	kfree(irqfd);
	return ret;
}

/*
 * Shut down any irqfds that match fd+gsi
 */
static int mshv_irqfd_deassign(struct mshv_partition *pt,
			       struct mshv_user_irqfd *args)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
				  irqfd_hnode) {
		if (irqfd->irqfd_eventfd_ctx == eventfd &&
		    irqfd->irqfd_irqnum == args->gsi)
			mshv_irqfd_deactivate(irqfd);
	}

	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

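/*
 * Validate the flags, then either tear down (deassign) or set up
 * (assign) an irqfd for the partition.
 */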
int mshv_set_unset_irqfd(struct mshv_partition *pt,
			 struct mshv_user_irqfd *args)
{
	if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
		return -EINVAL;

	if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
		return mshv_irqfd_deassign(pt, args);

	return mshv_irqfd_assign(pt, args);
}

/*
 * This function is called as the mshv VM fd is being released.
 * Shut down all irqfds that are still open.
 */
static void mshv_irqfd_release(struct mshv_partition *pt)
{
	struct mshv_irqfd *irqfd;
	struct hlist_node *n;

	spin_lock_irq(&pt->pt_irqfds_lock);

	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
		mshv_irqfd_deactivate(irqfd);

	spin_unlock_irq(&pt->pt_irqfds_lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a mshv_partition* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}

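/*
 * The cleanup workqueue is shared by all partitions; it runs the
 * deferred shutdown work queued by mshv_irqfd_deactivate().
 */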
int mshv_irqfd_wq_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void mshv_irqfd_wq_cleanup(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate an MMIO write into an eventfd signal.
 *
 * Userspace can register an MMIO address with an eventfd to receive a
 * notification when the memory has been written.
 * --------------------------------------------------------------------
 */

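/* Unregister the doorbell (if any) and drop the eventfd reference. */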
static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
{
	if (p->iovntfd_doorbell_id > 0)
		mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
	eventfd_ctx_put(p->iovntfd_eventfd);
	kfree(p);
}

/* MMIO writes trigger an event if the addr/val match */
static void ioeventfd_mmio_write(int doorbell_id, void *data)
{
	struct mshv_partition *partition = (struct mshv_partition *)data;
	struct mshv_ioeventfd *p;

	rcu_read_lock();
	hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
		if (p->iovntfd_doorbell_id == doorbell_id) {
			eventfd_signal(p->iovntfd_eventfd);
			break;
		}

	rcu_read_unlock();
}

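/*
 * Two registrations collide if they cover the same address and length
 * and either side is a wildcard or their datamatch values are equal.
 */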
static bool ioeventfd_check_collision(struct mshv_partition *pt,
				      struct mshv_ioeventfd *p)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *_p;

	hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
		if (_p->iovntfd_addr == p->iovntfd_addr &&
		    _p->iovntfd_length == p->iovntfd_length &&
		    (_p->iovntfd_wildcard || p->iovntfd_wildcard ||
		     _p->iovntfd_datamatch == p->iovntfd_datamatch))
			return true;

	return false;
}

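/*
 * Register an MMIO ioeventfd: map the requested length to a doorbell
 * trigger size, reject colliding registrations, then register a
 * hypervisor doorbell whose callback signals the eventfd.
 */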
static int mshv_assign_ioeventfd(struct mshv_partition *pt,
				 struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	u64 doorbell_flags = 0;
	int ret;

	/* The partition mutex currently protects the ioeventfds_list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	/* length must be 0 (any size) or 1, 2, 4, or 8 bytes */
	switch (args->len) {
	case 0:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
		break;
	case 1:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
		break;
	case 2:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
		break;
	case 4:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
		break;
	case 8:
		doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
		return -EINVAL;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	p->iovntfd_addr = args->addr;
	p->iovntfd_length  = args->len;
	p->iovntfd_eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
		p->iovntfd_datamatch = args->datamatch;
	} else {
		p->iovntfd_wildcard = true;
		doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
	}

	if (ioeventfd_check_collision(pt, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
				     (void *)pt, p->iovntfd_addr,
				     p->iovntfd_datamatch, doorbell_flags);
	if (ret < 0)
		goto unlock_fail;

	p->iovntfd_doorbell_id = ret;

	hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);

	return 0;

unlock_fail:
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

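/*
 * Find and release the ioeventfd matching fd, address, length and
 * (unless wildcard) datamatch. Returns -ENOENT if nothing matched.
 */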
static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
				   struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	struct mshv_ioeventfd *p;
	struct eventfd_ctx *eventfd;
	struct hlist_node *n;
	int ret = -ENOENT;

	/* The partition mutex currently protects the ioeventfds_list */
	WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
		bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));

		if (p->iovntfd_eventfd != eventfd  ||
		    p->iovntfd_addr != args->addr  ||
		    p->iovntfd_length != args->len ||
		    p->iovntfd_wildcard != wildcard)
			continue;

		if (!p->iovntfd_wildcard &&
		    p->iovntfd_datamatch != args->datamatch)
			continue;

		hlist_del_rcu(&p->iovntfd_hnode);
		synchronize_rcu();
		ioeventfd_release(p, pt->pt_id);
		ret = 0;
		break;
	}

	eventfd_ctx_put(eventfd);

	return ret;
}

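/*
 * Validate flags and reserved fields, then dispatch to assign or
 * deassign. Port I/O ioeventfds are not supported yet.
 */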
int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
			     struct mshv_user_ioeventfd *args)
	__must_hold(&pt->mutex)
{
	if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
	    mshv_field_nonzero(*args, rsvd))
		return -EINVAL;

	/* PIO not yet implemented */
	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
		return -EOPNOTSUPP;

	if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
		return mshv_deassign_ioeventfd(pt, args);

	return mshv_assign_ioeventfd(pt, args);
}

void mshv_eventfd_init(struct mshv_partition *pt)
{
	spin_lock_init(&pt->pt_irqfds_lock);
	INIT_HLIST_HEAD(&pt->pt_irqfds_list);

	INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
	mutex_init(&pt->irqfds_resampler_lock);

	INIT_HLIST_HEAD(&pt->ioeventfds_list);
}

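/*
 * Partition teardown: move the ioeventfd list aside, wait for RCU
 * readers (the doorbell callback), release every entry, then shut
 * down all remaining irqfds.
 */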
void mshv_eventfd_release(struct mshv_partition *pt)
{
	struct hlist_head items;
	struct hlist_node *n;
	struct mshv_ioeventfd *p;

	hlist_move_list(&pt->ioeventfds_list, &items);
	synchronize_rcu();

	hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
		hlist_del(&p->iovntfd_hnode);
		ioeventfd_release(p, pt->pt_id);
	}

	mshv_irqfd_release(pt);
}