// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 1999, 2023
 */

#include <linux/cpuhotplug.h>
#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/pfault.h>
#include <asm/diag.h>

#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000UL

/*
 * 'pfault' pseudo page fault routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}
early_param("nopfault", nopfault);

struct pfault_refbk {
	u16 refdiagc;
	u16 reffcode;
	u16 refdwlen;
	u16 refversn;
	u64 refgaddr;
	u64 refselmk;
	u64 refcmpmk;
	u64 reserved;
};

static struct pfault_refbk pfault_init_refbk = {
	.refdiagc = 0x258,
	.reffcode = 0,
	.refdwlen = 5,
	.refversn = 2,
	.refgaddr = __LC_LPP,
	.refselmk = 1UL << 48,
	.refcmpmk = 1UL << 48,
	.reserved = __PF_RES_FIELD
};

int __pfault_init(void)
{
	int rc = -EOPNOTSUPP;

	if (pfault_disable)
		return rc;
	diag_stat_inc(DIAG_STAT_X258);
	asm_inline volatile(
		"	diag	%[refbk],%[rc],0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b, 0b)
		: [rc] "+d" (rc)
		: [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
		: "cc");
	return rc;
}

static struct pfault_refbk pfault_fini_refbk = {
	.refdiagc = 0x258,
	.reffcode = 1,
	.refdwlen = 5,
	.refversn = 2,
};

void __pfault_fini(void)
{
	if (pfault_disable)
		return;
	diag_stat_inc(DIAG_STAT_X258);
	asm_inline volatile(
		"	diag	%[refbk],0,0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b, 0b)
		:
		: [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
		: "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE	0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest, runs a
 * user space process, and the user space process accesses a page that the
 * host has paged out, we get a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt, we set the state of the current task to
 * uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). If we later want to return to user space we
 * recognize the need_resched flag and then call schedule(). It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt (->
 * host signals that a page of a process has been paged in and the process can
 * continue to run). This interrupt can arrive on any cpu and, since we have
 * virtual cpus, actually arrive before the interrupt that signals that a page
 * is missing.
 */
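/*
 * Summary of the tsk->thread.pfault_wait values used by the handler below:
 *   0: no pseudo page fault pending for the task
 *   1: initial interrupt received; the task is on pfault_list and sleeps
 *      until the completion interrupt wakes it up
 *  -1: the completion interrupt arrived before the initial interrupt; the
 *      initial interrupt must not put the task to sleep
 */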
static void pfault_interrupt(struct ext_code ext_code,
			     unsigned int param32, unsigned long param64)
{
	struct task_struct *tsk;
	__u16 subcode;
	pid_t pid;

	/*
	 * Get the external interruption subcode & pfault initial/completion
	 * signal bit. VM stores this in the 'cpu address' field associated
	 * with the external interrupt.
	 */
	subcode = ext_code.subcode;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;
	inc_irq_stat(IRQEXT_PFL);
	/* Get the token (= pid of the affected task). */
	pid = param64 & LPP_PID_MASK;
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	if (tsk)
		get_task_struct(tsk);
	rcu_read_unlock();
	if (!tsk)
		return;
	spin_lock(&pfault_lock);
	if (subcode & PF_COMPLETE) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (tsk->thread.pfault_wait == 1) {
			/*
			 * Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults.
			 */
			tsk->thread.pfault_wait = 0;
			list_del(&tsk->thread.list);
			wake_up_process(tsk);
			put_task_struct(tsk);
		} else {
			/*
			 * Completion interrupt was faster than initial
			 * interrupt. Set pfault_wait to -1 so the initial
			 * interrupt doesn't put the task to sleep.
			 * If the task is not running, ignore the completion
			 * interrupt since it must be a leftover of a PFAULT
			 * CANCEL operation which didn't remove all pending
			 * completion interrupts.
			 */
			if (task_is_running(tsk))
				tsk->thread.pfault_wait = -1;
		}
	} else {
		/* signal bit not set -> a real page is missing. */
		if (WARN_ON_ONCE(tsk != current))
			goto out;
		if (tsk->thread.pfault_wait == 1) {
			/* Already on the list with a reference: put to sleep */
			goto block;
		} else if (tsk->thread.pfault_wait == -1) {
			/*
			 * Completion interrupt was faster than the initial
			 * interrupt (pfault_wait == -1). Set pfault_wait
			 * back to zero and exit.
			 */
			tsk->thread.pfault_wait = 0;
		} else {
			/*
			 * Initial interrupt arrived before completion
			 * interrupt. Let the task sleep.
			 * An extra task reference is needed since a different
			 * cpu may set the task state to TASK_RUNNING again
			 * before the scheduler is reached.
			 */
			get_task_struct(tsk);
			tsk->thread.pfault_wait = 1;
			list_add(&tsk->thread.list, &pfault_list);
block:
			/*
			 * Since this must be a userspace fault, there
			 * is no kernel task state to trample. Rely on the
			 * return to userspace schedule() to block.
			 */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			set_tsk_need_resched(tsk);
			set_preempt_need_resched();
		}
	}
out:
	spin_unlock(&pfault_lock);
	put_task_struct(tsk);
}

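/*
 * CPU hotplug 'dead' callback: when a cpu is taken offline, wake up and
 * dequeue every task still waiting for a pfault completion, so no task is
 * left sleeping on a completion interrupt that may never arrive.
 */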
static int pfault_cpu_dead(unsigned int cpu)
{
	struct thread_struct *thread, *next;
	struct task_struct *tsk;

	spin_lock_irq(&pfault_lock);
	list_for_each_entry_safe(thread, next, &pfault_list, list) {
		thread->pfault_wait = 0;
		list_del(&thread->list);
		tsk = container_of(thread, struct task_struct, thread);
		wake_up_process(tsk);
		put_task_struct(tsk);
	}
	spin_unlock_irq(&pfault_lock);
	return 0;
}

static int __init pfault_irq_init(void)
{
	int rc;

	rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
	if (rc)
		goto out_extint;
	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
	if (rc)
		goto out_pfault;
	irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
	cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
				  NULL, pfault_cpu_dead);
	return 0;

out_pfault:
	unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
	pfault_disable = 1;
	return rc;
}
early_initcall(pfault_irq_init);