1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21
22 #include "cmd.h"
23
24 /* Device specification max LOAD size */
25 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
26
27 #define MAX_CHUNK_SIZE SZ_8M
28
mlx5vf_drvdata(struct pci_dev * pdev)29 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
30 {
31 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
32
33 return container_of(core_device, struct mlx5vf_pci_core_device,
34 core_device);
35 }
36
37 struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer * buf,unsigned long offset)38 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
39 unsigned long offset)
40 {
41 unsigned long cur_offset = 0;
42 struct scatterlist *sg;
43 unsigned int i;
44
45 /* All accesses are sequential */
46 if (offset < buf->last_offset || !buf->last_offset_sg) {
47 buf->last_offset = 0;
48 buf->last_offset_sg = buf->table.sgt.sgl;
49 buf->sg_last_entry = 0;
50 }
51
52 cur_offset = buf->last_offset;
53
54 for_each_sg(buf->last_offset_sg, sg,
55 buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
56 if (offset < sg->length + cur_offset) {
57 buf->last_offset_sg = sg;
58 buf->sg_last_entry += i;
59 buf->last_offset = cur_offset;
60 return nth_page(sg_page(sg),
61 (offset - cur_offset) / PAGE_SIZE);
62 }
63 cur_offset += sg->length;
64 }
65 return NULL;
66 }
67
mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer * buf,unsigned int npages)68 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
69 unsigned int npages)
70 {
71 unsigned int to_alloc = npages;
72 struct page **page_list;
73 unsigned long filled;
74 unsigned int to_fill;
75 int ret;
76
77 to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
78 page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
79 if (!page_list)
80 return -ENOMEM;
81
82 do {
83 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
84 page_list);
85 if (!filled) {
86 ret = -ENOMEM;
87 goto err;
88 }
89 to_alloc -= filled;
90 ret = sg_alloc_append_table_from_pages(
91 &buf->table, page_list, filled, 0,
92 filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
93 GFP_KERNEL_ACCOUNT);
94
95 if (ret)
96 goto err;
97 buf->allocated_length += filled * PAGE_SIZE;
98 /* clean input for another bulk allocation */
99 memset(page_list, 0, filled * sizeof(*page_list));
100 to_fill = min_t(unsigned int, to_alloc,
101 PAGE_SIZE / sizeof(*page_list));
102 } while (to_alloc > 0);
103
104 kvfree(page_list);
105 return 0;
106
107 err:
108 kvfree(page_list);
109 return ret;
110 }
111
mlx5vf_disable_fd(struct mlx5_vf_migration_file * migf)112 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
113 {
114 mutex_lock(&migf->lock);
115 migf->state = MLX5_MIGF_STATE_ERROR;
116 migf->filp->f_pos = 0;
117 mutex_unlock(&migf->lock);
118 }
119
mlx5vf_release_file(struct inode * inode,struct file * filp)120 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
121 {
122 struct mlx5_vf_migration_file *migf = filp->private_data;
123
124 mlx5vf_disable_fd(migf);
125 mutex_destroy(&migf->lock);
126 kfree(migf);
127 return 0;
128 }
129
130 static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file * migf,loff_t pos,bool * end_of_data)131 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
132 bool *end_of_data)
133 {
134 struct mlx5_vhca_data_buffer *buf;
135 bool found = false;
136
137 *end_of_data = false;
138 spin_lock_irq(&migf->list_lock);
139 if (list_empty(&migf->buf_list)) {
140 *end_of_data = true;
141 goto end;
142 }
143
144 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
145 buf_elm);
146 if (pos >= buf->start_pos &&
147 pos < buf->start_pos + buf->length) {
148 found = true;
149 goto end;
150 }
151
152 /*
153 * As we use a stream based FD we may expect having the data always
154 * on first chunk
155 */
156 migf->state = MLX5_MIGF_STATE_ERROR;
157
158 end:
159 spin_unlock_irq(&migf->list_lock);
160 return found ? buf : NULL;
161 }
162
mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer * vhca_buf)163 static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
164 {
165 struct mlx5_vf_migration_file *migf = vhca_buf->migf;
166
167 if (vhca_buf->stop_copy_chunk_num) {
168 bool is_header = vhca_buf->dma_dir == DMA_NONE;
169 u8 chunk_num = vhca_buf->stop_copy_chunk_num;
170 size_t next_required_umem_size = 0;
171
172 if (is_header)
173 migf->buf_header[chunk_num - 1] = vhca_buf;
174 else
175 migf->buf[chunk_num - 1] = vhca_buf;
176
177 spin_lock_irq(&migf->list_lock);
178 list_del_init(&vhca_buf->buf_elm);
179 if (!is_header) {
180 next_required_umem_size =
181 migf->next_required_umem_size;
182 migf->next_required_umem_size = 0;
183 migf->num_ready_chunks--;
184 }
185 spin_unlock_irq(&migf->list_lock);
186 if (next_required_umem_size)
187 mlx5vf_mig_file_set_save_work(migf, chunk_num,
188 next_required_umem_size);
189 return;
190 }
191
192 spin_lock_irq(&migf->list_lock);
193 list_del_init(&vhca_buf->buf_elm);
194 list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
195 spin_unlock_irq(&migf->list_lock);
196 }
197
mlx5vf_buf_read(struct mlx5_vhca_data_buffer * vhca_buf,char __user ** buf,size_t * len,loff_t * pos)198 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
199 char __user **buf, size_t *len, loff_t *pos)
200 {
201 unsigned long offset;
202 ssize_t done = 0;
203 size_t copy_len;
204
205 copy_len = min_t(size_t,
206 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
207 while (copy_len) {
208 size_t page_offset;
209 struct page *page;
210 size_t page_len;
211 u8 *from_buff;
212 int ret;
213
214 offset = *pos - vhca_buf->start_pos;
215 page_offset = offset % PAGE_SIZE;
216 offset -= page_offset;
217 page = mlx5vf_get_migration_page(vhca_buf, offset);
218 if (!page)
219 return -EINVAL;
220 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
221 from_buff = kmap_local_page(page);
222 ret = copy_to_user(*buf, from_buff + page_offset, page_len);
223 kunmap_local(from_buff);
224 if (ret)
225 return -EFAULT;
226 *pos += page_len;
227 *len -= page_len;
228 *buf += page_len;
229 done += page_len;
230 copy_len -= page_len;
231 }
232
233 if (*pos >= vhca_buf->start_pos + vhca_buf->length)
234 mlx5vf_buf_read_done(vhca_buf);
235
236 return done;
237 }
238
mlx5vf_save_read(struct file * filp,char __user * buf,size_t len,loff_t * pos)239 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
240 loff_t *pos)
241 {
242 struct mlx5_vf_migration_file *migf = filp->private_data;
243 struct mlx5_vhca_data_buffer *vhca_buf;
244 bool first_loop_call = true;
245 bool end_of_data;
246 ssize_t done = 0;
247
248 if (pos)
249 return -ESPIPE;
250 pos = &filp->f_pos;
251
252 if (!(filp->f_flags & O_NONBLOCK)) {
253 if (wait_event_interruptible(migf->poll_wait,
254 !list_empty(&migf->buf_list) ||
255 migf->state == MLX5_MIGF_STATE_ERROR ||
256 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
257 migf->state == MLX5_MIGF_STATE_PRE_COPY ||
258 migf->state == MLX5_MIGF_STATE_COMPLETE))
259 return -ERESTARTSYS;
260 }
261
262 mutex_lock(&migf->lock);
263 if (migf->state == MLX5_MIGF_STATE_ERROR) {
264 done = -ENODEV;
265 goto out_unlock;
266 }
267
268 while (len) {
269 ssize_t count;
270
271 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
272 &end_of_data);
273 if (first_loop_call) {
274 first_loop_call = false;
275 /* Temporary end of file as part of PRE_COPY */
276 if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
277 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
278 done = -ENOMSG;
279 goto out_unlock;
280 }
281
282 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
283 if (filp->f_flags & O_NONBLOCK) {
284 done = -EAGAIN;
285 goto out_unlock;
286 }
287 }
288 }
289
290 if (end_of_data)
291 goto out_unlock;
292
293 if (!vhca_buf) {
294 done = -EINVAL;
295 goto out_unlock;
296 }
297
298 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
299 if (count < 0) {
300 done = count;
301 goto out_unlock;
302 }
303 done += count;
304 }
305
306 out_unlock:
307 mutex_unlock(&migf->lock);
308 return done;
309 }
310
mlx5vf_save_poll(struct file * filp,struct poll_table_struct * wait)311 static __poll_t mlx5vf_save_poll(struct file *filp,
312 struct poll_table_struct *wait)
313 {
314 struct mlx5_vf_migration_file *migf = filp->private_data;
315 __poll_t pollflags = 0;
316
317 poll_wait(filp, &migf->poll_wait, wait);
318
319 mutex_lock(&migf->lock);
320 if (migf->state == MLX5_MIGF_STATE_ERROR)
321 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
322 else if (!list_empty(&migf->buf_list) ||
323 migf->state == MLX5_MIGF_STATE_COMPLETE)
324 pollflags = EPOLLIN | EPOLLRDNORM;
325 mutex_unlock(&migf->lock);
326
327 return pollflags;
328 }
329
330 /*
331 * FD is exposed and user can use it after receiving an error.
332 * Mark migf in error, and wake the user.
333 */
mlx5vf_mark_err(struct mlx5_vf_migration_file * migf)334 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
335 {
336 migf->state = MLX5_MIGF_STATE_ERROR;
337 wake_up_interruptible(&migf->poll_wait);
338 }
339
mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file * migf,u8 chunk_num,size_t next_required_umem_size)340 void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
341 u8 chunk_num, size_t next_required_umem_size)
342 {
343 migf->save_data[chunk_num - 1].next_required_umem_size =
344 next_required_umem_size;
345 migf->save_data[chunk_num - 1].migf = migf;
346 get_file(migf->filp);
347 queue_work(migf->mvdev->cb_wq,
348 &migf->save_data[chunk_num - 1].work);
349 }
350
351 static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file * migf,u8 index,size_t required_length)352 mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
353 u8 index, size_t required_length)
354 {
355 struct mlx5_vhca_data_buffer *buf = migf->buf[index];
356 u8 chunk_num;
357
358 WARN_ON(!buf);
359 chunk_num = buf->stop_copy_chunk_num;
360 buf->migf->buf[index] = NULL;
361 /* Checking whether the pre-allocated buffer can fit */
362 if (buf->allocated_length >= required_length)
363 return buf;
364
365 mlx5vf_put_data_buffer(buf);
366 buf = mlx5vf_get_data_buffer(buf->migf, required_length,
367 DMA_FROM_DEVICE);
368 if (IS_ERR(buf))
369 return buf;
370
371 buf->stop_copy_chunk_num = chunk_num;
372 return buf;
373 }
374
mlx5vf_mig_file_save_work(struct work_struct * _work)375 static void mlx5vf_mig_file_save_work(struct work_struct *_work)
376 {
377 struct mlx5vf_save_work_data *save_data = container_of(_work,
378 struct mlx5vf_save_work_data, work);
379 struct mlx5_vf_migration_file *migf = save_data->migf;
380 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
381 struct mlx5_vhca_data_buffer *buf;
382
383 mutex_lock(&mvdev->state_mutex);
384 if (migf->state == MLX5_MIGF_STATE_ERROR)
385 goto end;
386
387 buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
388 save_data->chunk_num - 1,
389 save_data->next_required_umem_size);
390 if (IS_ERR(buf))
391 goto err;
392
393 if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
394 goto err_save;
395
396 goto end;
397
398 err_save:
399 mlx5vf_put_data_buffer(buf);
400 err:
401 mlx5vf_mark_err(migf);
402 end:
403 mlx5vf_state_mutex_unlock(mvdev);
404 fput(migf->filp);
405 }
406
mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file * migf,bool track)407 static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
408 bool track)
409 {
410 size_t size = sizeof(struct mlx5_vf_migration_header) +
411 sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
412 struct mlx5_vf_migration_tag_stop_copy_data data = {};
413 struct mlx5_vhca_data_buffer *header_buf = NULL;
414 struct mlx5_vf_migration_header header = {};
415 unsigned long flags;
416 struct page *page;
417 u8 *to_buff;
418 int ret;
419
420 header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
421 if (IS_ERR(header_buf))
422 return PTR_ERR(header_buf);
423
424 header.record_size = cpu_to_le64(sizeof(data));
425 header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
426 header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
427 page = mlx5vf_get_migration_page(header_buf, 0);
428 if (!page) {
429 ret = -EINVAL;
430 goto err;
431 }
432 to_buff = kmap_local_page(page);
433 memcpy(to_buff, &header, sizeof(header));
434 header_buf->length = sizeof(header);
435 data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
436 memcpy(to_buff + sizeof(header), &data, sizeof(data));
437 header_buf->length += sizeof(data);
438 kunmap_local(to_buff);
439 header_buf->start_pos = header_buf->migf->max_pos;
440 migf->max_pos += header_buf->length;
441 spin_lock_irqsave(&migf->list_lock, flags);
442 list_add_tail(&header_buf->buf_elm, &migf->buf_list);
443 spin_unlock_irqrestore(&migf->list_lock, flags);
444 if (track)
445 migf->pre_copy_initial_bytes = size;
446 return 0;
447 err:
448 mlx5vf_put_data_buffer(header_buf);
449 return ret;
450 }
451
mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device * mvdev,struct mlx5_vf_migration_file * migf,size_t state_size,u64 full_size,bool track)452 static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
453 struct mlx5_vf_migration_file *migf,
454 size_t state_size, u64 full_size,
455 bool track)
456 {
457 struct mlx5_vhca_data_buffer *buf;
458 size_t inc_state_size;
459 int num_chunks;
460 int ret;
461 int i;
462
463 if (mvdev->chunk_mode) {
464 size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
465
466 /* from firmware perspective at least 'state_size' buffer should be set */
467 inc_state_size = max(state_size, chunk_size);
468 } else {
469 if (track) {
470 /* let's be ready for stop_copy size that might grow by 10 percents */
471 if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
472 inc_state_size = state_size;
473 } else {
474 inc_state_size = state_size;
475 }
476 }
477
478 /* let's not overflow the device specification max SAVE size */
479 inc_state_size = min_t(size_t, inc_state_size,
480 (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
481
482 num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
483 for (i = 0; i < num_chunks; i++) {
484 buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
485 if (IS_ERR(buf)) {
486 ret = PTR_ERR(buf);
487 goto err;
488 }
489
490 migf->buf[i] = buf;
491 buf = mlx5vf_get_data_buffer(migf,
492 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
493 if (IS_ERR(buf)) {
494 ret = PTR_ERR(buf);
495 goto err;
496 }
497 migf->buf_header[i] = buf;
498 if (mvdev->chunk_mode) {
499 migf->buf[i]->stop_copy_chunk_num = i + 1;
500 migf->buf_header[i]->stop_copy_chunk_num = i + 1;
501 INIT_WORK(&migf->save_data[i].work,
502 mlx5vf_mig_file_save_work);
503 migf->save_data[i].chunk_num = i + 1;
504 }
505 }
506
507 ret = mlx5vf_add_stop_copy_header(migf, track);
508 if (ret)
509 goto err;
510 return 0;
511
512 err:
513 for (i = 0; i < num_chunks; i++) {
514 if (migf->buf[i]) {
515 mlx5vf_put_data_buffer(migf->buf[i]);
516 migf->buf[i] = NULL;
517 }
518 if (migf->buf_header[i]) {
519 mlx5vf_put_data_buffer(migf->buf_header[i]);
520 migf->buf_header[i] = NULL;
521 }
522 }
523
524 return ret;
525 }
526
mlx5vf_precopy_ioctl(struct file * filp,unsigned int cmd,unsigned long arg)527 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
528 unsigned long arg)
529 {
530 struct mlx5_vf_migration_file *migf = filp->private_data;
531 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
532 struct mlx5_vhca_data_buffer *buf;
533 struct vfio_precopy_info info = {};
534 loff_t *pos = &filp->f_pos;
535 unsigned long minsz;
536 size_t inc_length = 0;
537 bool end_of_data = false;
538 int ret;
539
540 if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
541 return -ENOTTY;
542
543 minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
544
545 if (copy_from_user(&info, (void __user *)arg, minsz))
546 return -EFAULT;
547
548 if (info.argsz < minsz)
549 return -EINVAL;
550
551 mutex_lock(&mvdev->state_mutex);
552 if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
553 mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
554 ret = -EINVAL;
555 goto err_state_unlock;
556 }
557
558 /*
559 * We can't issue a SAVE command when the device is suspended, so as
560 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
561 * bytes that can't be read.
562 */
563 if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
564 /*
565 * Once the query returns it's guaranteed that there is no
566 * active SAVE command.
567 * As so, the other code below is safe with the proper locks.
568 */
569 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
570 NULL, MLX5VF_QUERY_INC);
571 if (ret)
572 goto err_state_unlock;
573 }
574
575 mutex_lock(&migf->lock);
576 if (migf->state == MLX5_MIGF_STATE_ERROR) {
577 ret = -ENODEV;
578 goto err_migf_unlock;
579 }
580
581 if (migf->pre_copy_initial_bytes > *pos) {
582 info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
583 } else {
584 info.dirty_bytes = migf->max_pos - *pos;
585 if (!info.dirty_bytes)
586 end_of_data = true;
587 info.dirty_bytes += inc_length;
588 }
589
590 if (!end_of_data || !inc_length) {
591 mutex_unlock(&migf->lock);
592 goto done;
593 }
594
595 mutex_unlock(&migf->lock);
596 /*
597 * We finished transferring the current state and the device has a
598 * dirty state, save a new state to be ready for.
599 */
600 buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
601 if (IS_ERR(buf)) {
602 ret = PTR_ERR(buf);
603 mlx5vf_mark_err(migf);
604 goto err_state_unlock;
605 }
606
607 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
608 if (ret) {
609 mlx5vf_mark_err(migf);
610 mlx5vf_put_data_buffer(buf);
611 goto err_state_unlock;
612 }
613
614 done:
615 mlx5vf_state_mutex_unlock(mvdev);
616 if (copy_to_user((void __user *)arg, &info, minsz))
617 return -EFAULT;
618 return 0;
619
620 err_migf_unlock:
621 mutex_unlock(&migf->lock);
622 err_state_unlock:
623 mlx5vf_state_mutex_unlock(mvdev);
624 return ret;
625 }
626
627 static const struct file_operations mlx5vf_save_fops = {
628 .owner = THIS_MODULE,
629 .read = mlx5vf_save_read,
630 .poll = mlx5vf_save_poll,
631 .unlocked_ioctl = mlx5vf_precopy_ioctl,
632 .compat_ioctl = compat_ptr_ioctl,
633 .release = mlx5vf_release_file,
634 .llseek = no_llseek,
635 };
636
mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device * mvdev)637 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
638 {
639 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
640 struct mlx5_vhca_data_buffer *buf;
641 size_t length;
642 int ret;
643
644 if (migf->state == MLX5_MIGF_STATE_ERROR)
645 return -ENODEV;
646
647 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
648 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
649 if (ret)
650 goto err;
651
652 buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
653 if (IS_ERR(buf)) {
654 ret = PTR_ERR(buf);
655 goto err;
656 }
657
658 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
659 if (ret)
660 goto err_save;
661
662 return 0;
663
664 err_save:
665 mlx5vf_put_data_buffer(buf);
666 err:
667 mlx5vf_mark_err(migf);
668 return ret;
669 }
670
671 static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device * mvdev,bool track)672 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
673 {
674 struct mlx5_vf_migration_file *migf;
675 struct mlx5_vhca_data_buffer *buf;
676 size_t length;
677 u64 full_size;
678 int ret;
679
680 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
681 if (!migf)
682 return ERR_PTR(-ENOMEM);
683
684 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
685 O_RDONLY);
686 if (IS_ERR(migf->filp)) {
687 ret = PTR_ERR(migf->filp);
688 goto end;
689 }
690
691 migf->mvdev = mvdev;
692 ret = mlx5vf_cmd_alloc_pd(migf);
693 if (ret)
694 goto out_free;
695
696 stream_open(migf->filp->f_inode, migf->filp);
697 mutex_init(&migf->lock);
698 init_waitqueue_head(&migf->poll_wait);
699 init_completion(&migf->save_comp);
700 /*
701 * save_comp is being used as a binary semaphore built from
702 * a completion. A normal mutex cannot be used because the lock is
703 * passed between kernel threads and lockdep can't model this.
704 */
705 complete(&migf->save_comp);
706 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
707 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
708 INIT_LIST_HEAD(&migf->buf_list);
709 INIT_LIST_HEAD(&migf->avail_list);
710 spin_lock_init(&migf->list_lock);
711 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
712 if (ret)
713 goto out_pd;
714
715 ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
716 if (ret)
717 goto out_pd;
718
719 if (track) {
720 /* leave the allocated buffer ready for the stop-copy phase */
721 buf = mlx5vf_alloc_data_buffer(migf,
722 migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
723 if (IS_ERR(buf)) {
724 ret = PTR_ERR(buf);
725 goto out_pd;
726 }
727 } else {
728 buf = migf->buf[0];
729 migf->buf[0] = NULL;
730 }
731
732 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
733 if (ret)
734 goto out_save;
735 return migf;
736 out_save:
737 mlx5vf_free_data_buffer(buf);
738 out_pd:
739 mlx5fv_cmd_clean_migf_resources(migf);
740 out_free:
741 fput(migf->filp);
742 end:
743 kfree(migf);
744 return ERR_PTR(ret);
745 }
746
747 static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer * vhca_buf,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done)748 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
749 const char __user **buf, size_t *len,
750 loff_t *pos, ssize_t *done)
751 {
752 unsigned long offset;
753 size_t page_offset;
754 struct page *page;
755 size_t page_len;
756 u8 *to_buff;
757 int ret;
758
759 offset = *pos - vhca_buf->start_pos;
760 page_offset = offset % PAGE_SIZE;
761
762 page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
763 if (!page)
764 return -EINVAL;
765 page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
766 to_buff = kmap_local_page(page);
767 ret = copy_from_user(to_buff + page_offset, *buf, page_len);
768 kunmap_local(to_buff);
769 if (ret)
770 return -EFAULT;
771
772 *pos += page_len;
773 *done += page_len;
774 *buf += page_len;
775 *len -= page_len;
776 vhca_buf->length += page_len;
777 return 0;
778 }
779
780 static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer * vhca_buf,loff_t requested_length,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done)781 mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
782 loff_t requested_length,
783 const char __user **buf, size_t *len,
784 loff_t *pos, ssize_t *done)
785 {
786 int ret;
787
788 if (requested_length > MAX_LOAD_SIZE)
789 return -ENOMEM;
790
791 if (vhca_buf->allocated_length < requested_length) {
792 ret = mlx5vf_add_migration_pages(
793 vhca_buf,
794 DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
795 PAGE_SIZE));
796 if (ret)
797 return ret;
798 }
799
800 while (*len) {
801 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
802 done);
803 if (ret)
804 return ret;
805 }
806
807 return 0;
808 }
809
810 static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file * migf,struct mlx5_vhca_data_buffer * vhca_buf,size_t image_size,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done,bool * has_work)811 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
812 struct mlx5_vhca_data_buffer *vhca_buf,
813 size_t image_size, const char __user **buf,
814 size_t *len, loff_t *pos, ssize_t *done,
815 bool *has_work)
816 {
817 size_t copy_len, to_copy;
818 int ret;
819
820 to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
821 copy_len = to_copy;
822 while (to_copy) {
823 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
824 done);
825 if (ret)
826 return ret;
827 }
828
829 *len -= copy_len;
830 if (vhca_buf->length == image_size) {
831 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
832 migf->max_pos += image_size;
833 *has_work = true;
834 }
835
836 return 0;
837 }
838
839 static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file * migf,struct mlx5_vhca_data_buffer * vhca_buf,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done)840 mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
841 struct mlx5_vhca_data_buffer *vhca_buf,
842 const char __user **buf, size_t *len,
843 loff_t *pos, ssize_t *done)
844 {
845 size_t copy_len, to_copy;
846 size_t required_data;
847 u8 *to_buff;
848 int ret;
849
850 required_data = migf->record_size - vhca_buf->length;
851 to_copy = min_t(size_t, *len, required_data);
852 copy_len = to_copy;
853 while (to_copy) {
854 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
855 done);
856 if (ret)
857 return ret;
858 }
859
860 *len -= copy_len;
861 if (vhca_buf->length == migf->record_size) {
862 switch (migf->record_tag) {
863 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
864 {
865 struct page *page;
866
867 page = mlx5vf_get_migration_page(vhca_buf, 0);
868 if (!page)
869 return -EINVAL;
870 to_buff = kmap_local_page(page);
871 migf->stop_copy_prep_size = min_t(u64,
872 le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
873 kunmap_local(to_buff);
874 break;
875 }
876 default:
877 /* Optional tag */
878 break;
879 }
880
881 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
882 migf->max_pos += migf->record_size;
883 vhca_buf->length = 0;
884 }
885
886 return 0;
887 }
888
889 static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file * migf,struct mlx5_vhca_data_buffer * vhca_buf,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done,bool * has_work)890 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
891 struct mlx5_vhca_data_buffer *vhca_buf,
892 const char __user **buf,
893 size_t *len, loff_t *pos,
894 ssize_t *done, bool *has_work)
895 {
896 struct page *page;
897 size_t copy_len;
898 u8 *to_buff;
899 int ret;
900
901 copy_len = min_t(size_t, *len,
902 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
903 page = mlx5vf_get_migration_page(vhca_buf, 0);
904 if (!page)
905 return -EINVAL;
906 to_buff = kmap_local_page(page);
907 ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
908 if (ret) {
909 ret = -EFAULT;
910 goto end;
911 }
912
913 *buf += copy_len;
914 *pos += copy_len;
915 *done += copy_len;
916 *len -= copy_len;
917 vhca_buf->length += copy_len;
918 if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
919 u64 record_size;
920 u32 flags;
921
922 record_size = le64_to_cpup((__le64 *)to_buff);
923 if (record_size > MAX_LOAD_SIZE) {
924 ret = -ENOMEM;
925 goto end;
926 }
927
928 migf->record_size = record_size;
929 flags = le32_to_cpup((__le32 *)(to_buff +
930 offsetof(struct mlx5_vf_migration_header, flags)));
931 migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
932 offsetof(struct mlx5_vf_migration_header, tag)));
933 switch (migf->record_tag) {
934 case MLX5_MIGF_HEADER_TAG_FW_DATA:
935 migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
936 break;
937 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
938 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
939 break;
940 default:
941 if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
942 ret = -EOPNOTSUPP;
943 goto end;
944 }
945 /* We may read and skip this optional record data */
946 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
947 }
948
949 migf->max_pos += vhca_buf->length;
950 vhca_buf->length = 0;
951 *has_work = true;
952 }
953 end:
954 kunmap_local(to_buff);
955 return ret;
956 }
957
mlx5vf_resume_write(struct file * filp,const char __user * buf,size_t len,loff_t * pos)958 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
959 size_t len, loff_t *pos)
960 {
961 struct mlx5_vf_migration_file *migf = filp->private_data;
962 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
963 struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
964 loff_t requested_length;
965 bool has_work = false;
966 ssize_t done = 0;
967 int ret = 0;
968
969 if (pos)
970 return -ESPIPE;
971 pos = &filp->f_pos;
972
973 if (*pos < 0 ||
974 check_add_overflow((loff_t)len, *pos, &requested_length))
975 return -EINVAL;
976
977 mutex_lock(&migf->mvdev->state_mutex);
978 mutex_lock(&migf->lock);
979 if (migf->state == MLX5_MIGF_STATE_ERROR) {
980 ret = -ENODEV;
981 goto out_unlock;
982 }
983
984 while (len || has_work) {
985 has_work = false;
986 switch (migf->load_state) {
987 case MLX5_VF_LOAD_STATE_READ_HEADER:
988 ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
989 &buf, &len, pos,
990 &done, &has_work);
991 if (ret)
992 goto out_unlock;
993 break;
994 case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
995 if (vhca_buf_header->allocated_length < migf->record_size) {
996 mlx5vf_free_data_buffer(vhca_buf_header);
997
998 migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
999 migf->record_size, DMA_NONE);
1000 if (IS_ERR(migf->buf_header[0])) {
1001 ret = PTR_ERR(migf->buf_header[0]);
1002 migf->buf_header[0] = NULL;
1003 goto out_unlock;
1004 }
1005
1006 vhca_buf_header = migf->buf_header[0];
1007 }
1008
1009 vhca_buf_header->start_pos = migf->max_pos;
1010 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
1011 break;
1012 case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
1013 ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
1014 &buf, &len, pos, &done);
1015 if (ret)
1016 goto out_unlock;
1017 break;
1018 case MLX5_VF_LOAD_STATE_PREP_IMAGE:
1019 {
1020 u64 size = max(migf->record_size,
1021 migf->stop_copy_prep_size);
1022
1023 if (vhca_buf->allocated_length < size) {
1024 mlx5vf_free_data_buffer(vhca_buf);
1025
1026 migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
1027 size, DMA_TO_DEVICE);
1028 if (IS_ERR(migf->buf[0])) {
1029 ret = PTR_ERR(migf->buf[0]);
1030 migf->buf[0] = NULL;
1031 goto out_unlock;
1032 }
1033
1034 vhca_buf = migf->buf[0];
1035 }
1036
1037 vhca_buf->start_pos = migf->max_pos;
1038 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
1039 break;
1040 }
1041 case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
1042 ret = mlx5vf_resume_read_image_no_header(vhca_buf,
1043 requested_length,
1044 &buf, &len, pos, &done);
1045 if (ret)
1046 goto out_unlock;
1047 break;
1048 case MLX5_VF_LOAD_STATE_READ_IMAGE:
1049 ret = mlx5vf_resume_read_image(migf, vhca_buf,
1050 migf->record_size,
1051 &buf, &len, pos, &done, &has_work);
1052 if (ret)
1053 goto out_unlock;
1054 break;
1055 case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
1056 ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
1057 if (ret)
1058 goto out_unlock;
1059 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1060
1061 /* prep header buf for next image */
1062 vhca_buf_header->length = 0;
1063 /* prep data buf for next image */
1064 vhca_buf->length = 0;
1065
1066 break;
1067 default:
1068 break;
1069 }
1070 }
1071
1072 out_unlock:
1073 if (ret)
1074 migf->state = MLX5_MIGF_STATE_ERROR;
1075 mutex_unlock(&migf->lock);
1076 mlx5vf_state_mutex_unlock(migf->mvdev);
1077 return ret ? ret : done;
1078 }
1079
1080 static const struct file_operations mlx5vf_resume_fops = {
1081 .owner = THIS_MODULE,
1082 .write = mlx5vf_resume_write,
1083 .release = mlx5vf_release_file,
1084 .llseek = no_llseek,
1085 };
1086
1087 static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device * mvdev)1088 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
1089 {
1090 struct mlx5_vf_migration_file *migf;
1091 struct mlx5_vhca_data_buffer *buf;
1092 int ret;
1093
1094 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
1095 if (!migf)
1096 return ERR_PTR(-ENOMEM);
1097
1098 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
1099 O_WRONLY);
1100 if (IS_ERR(migf->filp)) {
1101 ret = PTR_ERR(migf->filp);
1102 goto end;
1103 }
1104
1105 migf->mvdev = mvdev;
1106 ret = mlx5vf_cmd_alloc_pd(migf);
1107 if (ret)
1108 goto out_free;
1109
1110 buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
1111 if (IS_ERR(buf)) {
1112 ret = PTR_ERR(buf);
1113 goto out_pd;
1114 }
1115
1116 migf->buf[0] = buf;
1117 if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
1118 buf = mlx5vf_alloc_data_buffer(migf,
1119 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
1120 if (IS_ERR(buf)) {
1121 ret = PTR_ERR(buf);
1122 goto out_buf;
1123 }
1124
1125 migf->buf_header[0] = buf;
1126 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1127 } else {
1128 /* Initial state will be to read the image */
1129 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
1130 }
1131
1132 stream_open(migf->filp->f_inode, migf->filp);
1133 mutex_init(&migf->lock);
1134 INIT_LIST_HEAD(&migf->buf_list);
1135 INIT_LIST_HEAD(&migf->avail_list);
1136 spin_lock_init(&migf->list_lock);
1137 return migf;
1138 out_buf:
1139 mlx5vf_free_data_buffer(migf->buf[0]);
1140 out_pd:
1141 mlx5vf_cmd_dealloc_pd(migf);
1142 out_free:
1143 fput(migf->filp);
1144 end:
1145 kfree(migf);
1146 return ERR_PTR(ret);
1147 }
1148
mlx5vf_disable_fds(struct mlx5vf_pci_core_device * mvdev)1149 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
1150 {
1151 if (mvdev->resuming_migf) {
1152 mlx5vf_disable_fd(mvdev->resuming_migf);
1153 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1154 fput(mvdev->resuming_migf->filp);
1155 mvdev->resuming_migf = NULL;
1156 }
1157 if (mvdev->saving_migf) {
1158 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1159 cancel_work_sync(&mvdev->saving_migf->async_data.work);
1160 mlx5vf_disable_fd(mvdev->saving_migf);
1161 wake_up_interruptible(&mvdev->saving_migf->poll_wait);
1162 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1163 fput(mvdev->saving_migf->filp);
1164 mvdev->saving_migf = NULL;
1165 }
1166 }
1167
1168 static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device * mvdev,u32 new)1169 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1170 u32 new)
1171 {
1172 u32 cur = mvdev->mig_state;
1173 int ret;
1174
1175 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1176 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1177 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1178 if (ret)
1179 return ERR_PTR(ret);
1180 return NULL;
1181 }
1182
1183 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1184 ret = mlx5vf_cmd_resume_vhca(mvdev,
1185 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1186 if (ret)
1187 return ERR_PTR(ret);
1188 return NULL;
1189 }
1190
1191 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1192 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1193 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1194 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1195 if (ret)
1196 return ERR_PTR(ret);
1197 return NULL;
1198 }
1199
1200 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1201 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1202 ret = mlx5vf_cmd_resume_vhca(mvdev,
1203 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1204 if (ret)
1205 return ERR_PTR(ret);
1206 return NULL;
1207 }
1208
1209 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1210 struct mlx5_vf_migration_file *migf;
1211
1212 migf = mlx5vf_pci_save_device_data(mvdev, false);
1213 if (IS_ERR(migf))
1214 return ERR_CAST(migf);
1215 get_file(migf->filp);
1216 mvdev->saving_migf = migf;
1217 return migf->filp;
1218 }
1219
1220 if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
1221 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1222 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1223 new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1224 mlx5vf_disable_fds(mvdev);
1225 return NULL;
1226 }
1227
1228 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1229 struct mlx5_vf_migration_file *migf;
1230
1231 migf = mlx5vf_pci_resume_device_data(mvdev);
1232 if (IS_ERR(migf))
1233 return ERR_CAST(migf);
1234 get_file(migf->filp);
1235 mvdev->resuming_migf = migf;
1236 return migf->filp;
1237 }
1238
1239 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1240 if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
1241 ret = mlx5vf_cmd_load_vhca_state(mvdev,
1242 mvdev->resuming_migf,
1243 mvdev->resuming_migf->buf[0]);
1244 if (ret)
1245 return ERR_PTR(ret);
1246 }
1247 mlx5vf_disable_fds(mvdev);
1248 return NULL;
1249 }
1250
1251 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1252 (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1253 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1254 struct mlx5_vf_migration_file *migf;
1255
1256 migf = mlx5vf_pci_save_device_data(mvdev, true);
1257 if (IS_ERR(migf))
1258 return ERR_CAST(migf);
1259 get_file(migf->filp);
1260 mvdev->saving_migf = migf;
1261 return migf->filp;
1262 }
1263
1264 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1265 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1266 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1267 if (ret)
1268 return ERR_PTR(ret);
1269 ret = mlx5vf_pci_save_device_inc_data(mvdev);
1270 return ret ? ERR_PTR(ret) : NULL;
1271 }
1272
1273 /*
1274 * vfio_mig_get_next_state() does not use arcs other than the above
1275 */
1276 WARN_ON(true);
1277 return ERR_PTR(-EINVAL);
1278 }
1279
1280 /*
1281 * This function is called in all state_mutex unlock cases to
1282 * handle a 'deferred_reset' if exists.
1283 */
mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device * mvdev)1284 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1285 {
1286 again:
1287 spin_lock(&mvdev->reset_lock);
1288 if (mvdev->deferred_reset) {
1289 mvdev->deferred_reset = false;
1290 spin_unlock(&mvdev->reset_lock);
1291 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1292 mlx5vf_disable_fds(mvdev);
1293 goto again;
1294 }
1295 mutex_unlock(&mvdev->state_mutex);
1296 spin_unlock(&mvdev->reset_lock);
1297 }
1298
1299 static struct file *
mlx5vf_pci_set_device_state(struct vfio_device * vdev,enum vfio_device_mig_state new_state)1300 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1301 enum vfio_device_mig_state new_state)
1302 {
1303 struct mlx5vf_pci_core_device *mvdev = container_of(
1304 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1305 enum vfio_device_mig_state next_state;
1306 struct file *res = NULL;
1307 int ret;
1308
1309 mutex_lock(&mvdev->state_mutex);
1310 while (new_state != mvdev->mig_state) {
1311 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1312 new_state, &next_state);
1313 if (ret) {
1314 res = ERR_PTR(ret);
1315 break;
1316 }
1317 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1318 if (IS_ERR(res))
1319 break;
1320 mvdev->mig_state = next_state;
1321 if (WARN_ON(res && new_state != mvdev->mig_state)) {
1322 fput(res);
1323 res = ERR_PTR(-EINVAL);
1324 break;
1325 }
1326 }
1327 mlx5vf_state_mutex_unlock(mvdev);
1328 return res;
1329 }
1330
mlx5vf_pci_get_data_size(struct vfio_device * vdev,unsigned long * stop_copy_length)1331 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1332 unsigned long *stop_copy_length)
1333 {
1334 struct mlx5vf_pci_core_device *mvdev = container_of(
1335 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1336 size_t state_size;
1337 u64 total_size;
1338 int ret;
1339
1340 mutex_lock(&mvdev->state_mutex);
1341 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
1342 &total_size, 0);
1343 if (!ret)
1344 *stop_copy_length = total_size;
1345 mlx5vf_state_mutex_unlock(mvdev);
1346 return ret;
1347 }
1348
mlx5vf_pci_get_device_state(struct vfio_device * vdev,enum vfio_device_mig_state * curr_state)1349 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1350 enum vfio_device_mig_state *curr_state)
1351 {
1352 struct mlx5vf_pci_core_device *mvdev = container_of(
1353 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1354
1355 mutex_lock(&mvdev->state_mutex);
1356 *curr_state = mvdev->mig_state;
1357 mlx5vf_state_mutex_unlock(mvdev);
1358 return 0;
1359 }
1360
mlx5vf_pci_aer_reset_done(struct pci_dev * pdev)1361 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1362 {
1363 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1364
1365 if (!mvdev->migrate_cap)
1366 return;
1367
1368 /*
1369 * As the higher VFIO layers are holding locks across reset and using
1370 * those same locks with the mm_lock we need to prevent ABBA deadlock
1371 * with the state_mutex and mm_lock.
1372 * In case the state_mutex was taken already we defer the cleanup work
1373 * to the unlock flow of the other running context.
1374 */
1375 spin_lock(&mvdev->reset_lock);
1376 mvdev->deferred_reset = true;
1377 if (!mutex_trylock(&mvdev->state_mutex)) {
1378 spin_unlock(&mvdev->reset_lock);
1379 return;
1380 }
1381 spin_unlock(&mvdev->reset_lock);
1382 mlx5vf_state_mutex_unlock(mvdev);
1383 }
1384
mlx5vf_pci_open_device(struct vfio_device * core_vdev)1385 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1386 {
1387 struct mlx5vf_pci_core_device *mvdev = container_of(
1388 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1389 struct vfio_pci_core_device *vdev = &mvdev->core_device;
1390 int ret;
1391
1392 ret = vfio_pci_core_enable(vdev);
1393 if (ret)
1394 return ret;
1395
1396 if (mvdev->migrate_cap)
1397 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1398 vfio_pci_core_finish_enable(vdev);
1399 return 0;
1400 }
1401
mlx5vf_pci_close_device(struct vfio_device * core_vdev)1402 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1403 {
1404 struct mlx5vf_pci_core_device *mvdev = container_of(
1405 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1406
1407 mlx5vf_cmd_close_migratable(mvdev);
1408 vfio_pci_core_close_device(core_vdev);
1409 }
1410
1411 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1412 .migration_set_state = mlx5vf_pci_set_device_state,
1413 .migration_get_state = mlx5vf_pci_get_device_state,
1414 .migration_get_data_size = mlx5vf_pci_get_data_size,
1415 };
1416
1417 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1418 .log_start = mlx5vf_start_page_tracker,
1419 .log_stop = mlx5vf_stop_page_tracker,
1420 .log_read_and_clear = mlx5vf_tracker_read_and_clear,
1421 };
1422
mlx5vf_pci_init_dev(struct vfio_device * core_vdev)1423 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1424 {
1425 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1426 struct mlx5vf_pci_core_device, core_device.vdev);
1427 int ret;
1428
1429 ret = vfio_pci_core_init_dev(core_vdev);
1430 if (ret)
1431 return ret;
1432
1433 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1434 &mlx5vf_pci_log_ops);
1435
1436 return 0;
1437 }
1438
mlx5vf_pci_release_dev(struct vfio_device * core_vdev)1439 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1440 {
1441 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1442 struct mlx5vf_pci_core_device, core_device.vdev);
1443
1444 mlx5vf_cmd_remove_migratable(mvdev);
1445 vfio_pci_core_release_dev(core_vdev);
1446 }
1447
1448 static const struct vfio_device_ops mlx5vf_pci_ops = {
1449 .name = "mlx5-vfio-pci",
1450 .init = mlx5vf_pci_init_dev,
1451 .release = mlx5vf_pci_release_dev,
1452 .open_device = mlx5vf_pci_open_device,
1453 .close_device = mlx5vf_pci_close_device,
1454 .ioctl = vfio_pci_core_ioctl,
1455 .device_feature = vfio_pci_core_ioctl_feature,
1456 .read = vfio_pci_core_read,
1457 .write = vfio_pci_core_write,
1458 .mmap = vfio_pci_core_mmap,
1459 .request = vfio_pci_core_request,
1460 .match = vfio_pci_core_match,
1461 .bind_iommufd = vfio_iommufd_physical_bind,
1462 .unbind_iommufd = vfio_iommufd_physical_unbind,
1463 .attach_ioas = vfio_iommufd_physical_attach_ioas,
1464 .detach_ioas = vfio_iommufd_physical_detach_ioas,
1465 };
1466
mlx5vf_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)1467 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1468 const struct pci_device_id *id)
1469 {
1470 struct mlx5vf_pci_core_device *mvdev;
1471 int ret;
1472
1473 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1474 &pdev->dev, &mlx5vf_pci_ops);
1475 if (IS_ERR(mvdev))
1476 return PTR_ERR(mvdev);
1477
1478 dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1479 ret = vfio_pci_core_register_device(&mvdev->core_device);
1480 if (ret)
1481 goto out_put_vdev;
1482 return 0;
1483
1484 out_put_vdev:
1485 vfio_put_device(&mvdev->core_device.vdev);
1486 return ret;
1487 }
1488
mlx5vf_pci_remove(struct pci_dev * pdev)1489 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1490 {
1491 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1492
1493 vfio_pci_core_unregister_device(&mvdev->core_device);
1494 vfio_put_device(&mvdev->core_device.vdev);
1495 }
1496
1497 static const struct pci_device_id mlx5vf_pci_table[] = {
1498 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1499 {}
1500 };
1501
1502 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1503
1504 static const struct pci_error_handlers mlx5vf_err_handlers = {
1505 .reset_done = mlx5vf_pci_aer_reset_done,
1506 .error_detected = vfio_pci_core_aer_err_detected,
1507 };
1508
1509 static struct pci_driver mlx5vf_pci_driver = {
1510 .name = KBUILD_MODNAME,
1511 .id_table = mlx5vf_pci_table,
1512 .probe = mlx5vf_pci_probe,
1513 .remove = mlx5vf_pci_remove,
1514 .err_handler = &mlx5vf_err_handlers,
1515 .driver_managed_dma = true,
1516 };
1517
1518 module_pci_driver(mlx5vf_pci_driver);
1519
1520 MODULE_IMPORT_NS(IOMMUFD);
1521 MODULE_LICENSE("GPL");
1522 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1523 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1524 MODULE_DESCRIPTION(
1525 "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1526