arch/s390/mm/fault.c

   1 /*
   2  *  S390 version
   3  *    Copyright IBM Corp. 1999
   4  *    Author(s): Hartmut Penner (hp@de.ibm.com)
   5  *               Ulrich Weigand (uweigand@de.ibm.com)
   6  *
   7  *  Derived from "arch/i386/mm/fault.c"
   8  *    Copyright (C) 1995  Linus Torvalds
   9  */
  10
  11 #include <linux/kernel_stat.h>
  12 #include <linux/perf_event.h>
  13 #include <linux/signal.h>
  14 #include <linux/sched.h>
  15 #include <linux/kernel.h>
  16 #include <linux/errno.h>
  17 #include <linux/string.h>
  18 #include <linux/types.h>
  19 #include <linux/ptrace.h>
  20 #include <linux/mman.h>
  21 #include <linux/mm.h>
  22 #include <linux/compat.h>
  23 #include <linux/smp.h>
  24 #include <linux/kdebug.h>
  25 #include <linux/init.h>
  26 #include <linux/console.h>
  27 #include <linux/module.h>
  28 #include <linux/hardirq.h>
  29 #include <linux/kprobes.h>
  30 #include <linux/uaccess.h>
  31 #include <linux/hugetlb.h>
  32 #include <asm/asm-offsets.h>
  33 #include <asm/pgtable.h>
  34 #include <asm/irq.h>
  35 #include <asm/mmu_context.h>
  36 #include <asm/facility.h>
  37 #include "../kernel/entry.h"
  38
  39 #ifndef CONFIG_64BIT
  40 #define __FAIL_ADDR_MASK 0x7ffff000
  41 #define __SUBCODE_MASK 0x0200
  42 #define __PF_RES_FIELD 0ULL
  43 #else /* CONFIG_64BIT */
  44 #define __FAIL_ADDR_MASK -4096L
  45 #define __SUBCODE_MASK 0x0600
  46 #define __PF_RES_FIELD 0x8000000000000000ULL
  47 #endif /* CONFIG_64BIT */
  48
  49 #define VM_FAULT_BADCONTEXT     0x010000
  50 #define VM_FAULT_BADMAP         0x020000
  51 #define VM_FAULT_BADACCESS      0x040000
  52 #define VM_FAULT_SIGNAL         0x080000
  53 #define VM_FAULT_PFAULT         0x100000
  54
  55 static unsigned long store_indication __read_mostly;
  56
  57 #ifdef CONFIG_64BIT
  58 static int __init fault_init(void)
  59 {
  60         if (test_facility(75))
  61                 store_indication = 0xc00;
  62         return 0;
  63 }
  64 early_initcall(fault_init);
  65 #endif
  66
  67 static inline int notify_page_fault(struct pt_regs *regs)
  68 {
  69         int ret = 0;
  70
  71         /* kprobe_running() needs smp_processor_id() */
  72         if (kprobes_built_in() && !user_mode(regs)) {
  73                 preempt_disable();
  74                 if (kprobe_running() && kprobe_fault_handler(regs, 14))
  75                         ret = 1;
  76                 preempt_enable();
  77         }
  78         return ret;
  79 }
  80
  81
  82 /*
  83  * Unlock any spinlocks which will prevent us from getting the
  84  * message out.
  85  */
  86 void bust_spinlocks(int yes)
  87 {
  88         if (yes) {
  89                 oops_in_progress = 1;
  90         } else {
  91                 int loglevel_save = console_loglevel;
  92                 console_unblank();
  93                 oops_in_progress = 0;
  94                 /*
  95                  * OK, the message is on the console.  Now we call printk()
  96                  * without oops_in_progress set so that printk will give klogd
  97                  * a poke.  Hold onto your hats...
  98                  */
  99                 console_loglevel = 15;
 100                 printk(" ");
 101                 console_loglevel = loglevel_save;
 102         }
 103 }
 104
 105 /*
 106  * Returns the address space associated with the fault.
 107  * Returns 0 for kernel space and 1 for user space.
 108  */
 109 static inline int user_space_fault(struct pt_regs *regs)
 110 {
 111         unsigned long trans_exc_code;
 112
 113         /*
 114          * The lowest two bits of the translation exception
 115          * identification indicate which paging table was used.
 116          */
 117         trans_exc_code = regs->int_parm_long & 3;
 118         if (trans_exc_code == 3) /* home space -> kernel */
 119                 return 0;
 120         if (user_mode(regs))
 121                 return 1;
 122         if (trans_exc_code == 2) /* secondary space -> set_fs */
 123                 return current->thread.mm_segment.ar4;
 124         if (current->flags & PF_VCPU)
 125                 return 1;
 126         return 0;
 127 }
 128
 129 static inline void report_user_fault(struct pt_regs *regs, long signr)
 130 {
 131         if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
 132                 return;
 133         if (!unhandled_signal(current, signr))
 134                 return;
 135         if (!printk_ratelimit())
 136                 return;
 137         printk(KERN_ALERT "User process fault: interruption code 0x%X ",
 138                regs->int_code);
 139         print_vma_addr(KERN_CONT "in ", regs->psw.addr & PSW_ADDR_INSN);
 140         printk(KERN_CONT "\n");
 141         printk(KERN_ALERT "failing address: %lX\n",
 142                regs->int_parm_long & __FAIL_ADDR_MASK);
 143         show_regs(regs);
 144 }
 145
 146 /*
 147  * Send SIGSEGV to task.  This is an external routine
 148  * to keep the stack usage of do_page_fault small.
 149  */
 150 static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 151 {
 152         struct siginfo si;
 153
 154         report_user_fault(regs, SIGSEGV);
 155         si.si_signo = SIGSEGV;
 156         si.si_code = si_code;
 157         si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
 158         force_sig_info(SIGSEGV, &si, current);
 159 }
 160
 161 static noinline void do_no_context(struct pt_regs *regs)
 162 {
 163         const struct exception_table_entry *fixup;
 164         unsigned long address;
 165
 166         /* Are we prepared to handle this kernel fault?  */
 167         fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN);
 168         if (fixup) {
 169                 regs->psw.addr = extable_fixup(fixup) | PSW_ADDR_AMODE;
 170                 return;
 171         }
 172
 173         /*
 174          * Oops. The kernel tried to access some bad page. We'll have to
 175          * terminate things with extreme prejudice.
 176          */
 177         address = regs->int_parm_long & __FAIL_ADDR_MASK;
 178         if (!user_space_fault(regs))
 179                 printk(KERN_ALERT "Unable to handle kernel pointer dereference"
 180                        " at virtual kernel address %p\n", (void *)address);
 181         else
 182                 printk(KERN_ALERT "Unable to handle kernel paging request"
 183                        " at virtual user address %p\n", (void *)address);
 184
 185         die(regs, "Oops");
 186         do_exit(SIGKILL);
 187 }
 188
 189 static noinline void do_low_address(struct pt_regs *regs)
 190 {
 191         /* Low-address protection hit in kernel mode means
 192            NULL pointer write access in kernel mode.  */
 193         if (regs->psw.mask & PSW_MASK_PSTATE) {
 194                 /* Low-address protection hit in user mode 'cannot happen'. */
 195                 die (regs, "Low-address protection");
 196                 do_exit(SIGKILL);
 197         }
 198
 199         do_no_context(regs);
 200 }
 201
 202 static noinline void do_sigbus(struct pt_regs *regs)
 203 {
 204         struct task_struct *tsk = current;
 205         struct siginfo si;
 206
 207         /*
 208          * Send a sigbus, regardless of whether we were in kernel
 209          * or user mode.
 210          */
 211         si.si_signo = SIGBUS;
 212         si.si_errno = 0;
 213         si.si_code = BUS_ADRERR;
 214         si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
 215         force_sig_info(SIGBUS, &si, tsk);
 216 }
 217
 218 static noinline void do_fault_error(struct pt_regs *regs, int fault)
 219 {
 220         int si_code;
 221
 222         switch (fault) {
 223         case VM_FAULT_BADACCESS:
 224         case VM_FAULT_BADMAP:
 225                 /* Bad memory access. Check if it is kernel or user space. */
 226                 if (user_mode(regs)) {
 227                         /* User mode accesses just cause a SIGSEGV */
 228                         si_code = (fault == VM_FAULT_BADMAP) ?
 229                                 SEGV_MAPERR : SEGV_ACCERR;
 230                         do_sigsegv(regs, si_code);
 231                         return;
 232                 }
 233         case VM_FAULT_BADCONTEXT:
 234         case VM_FAULT_PFAULT:
 235                 do_no_context(regs);
 236                 break;
 237         case VM_FAULT_SIGNAL:
 238                 if (!user_mode(regs))
 239                         do_no_context(regs);
 240                 break;
 241         default: /* fault & VM_FAULT_ERROR */
 242                 if (fault & VM_FAULT_OOM) {
 243                         if (!user_mode(regs))
 244                                 do_no_context(regs);
 245                         else
 246                                 pagefault_out_of_memory();
 247                 } else if (fault & VM_FAULT_SIGBUS) {
 248                         /* Kernel mode? Handle exceptions or die */
 249                         if (!user_mode(regs))
 250                                 do_no_context(regs);
 251                         else
 252                                 do_sigbus(regs);
 253                 } else
 254                         BUG();
 255                 break;
 256         }
 257 }
 258
 259 /*
 260  * This routine handles page faults.  It determines the address,
 261  * and the problem, and then passes it off to one of the appropriate
 262  * routines.
 263  *
 264  * interruption code (int_code):
 265  *   04       Protection           ->  Write-Protection  (suprression)
 266  *   10       Segment translation  ->  Not present       (nullification)
 267  *   11       Page translation     ->  Not present       (nullification)
 268  *   3b       Region third trans.  ->  Not present       (nullification)
 269  */
 270 static inline int do_exception(struct pt_regs *regs, int access)
 271 {
 272 #ifdef CONFIG_PGSTE
 273         struct gmap *gmap;
 274 #endif
 275         struct task_struct *tsk;
 276         struct mm_struct *mm;
 277         struct vm_area_struct *vma;
 278         unsigned long trans_exc_code;
 279         unsigned long address;
 280         unsigned int flags;
 281         int fault;
 282
 283         tsk = current;
 284         /*
 285          * The instruction that caused the program check has
 286          * been nullified. Don't signal single step via SIGTRAP.
 287          */
 288         clear_tsk_thread_flag(tsk, TIF_PER_TRAP);
 289
 290         if (notify_page_fault(regs))
 291                 return 0;
 292
 293         mm = tsk->mm;
 294         trans_exc_code = regs->int_parm_long;
 295
 296         /*
 297          * Verify that the fault happened in user space, that
 298          * we are not in an interrupt and that there is a
 299          * user context.
 300          */
 301         fault = VM_FAULT_BADCONTEXT;
 302         if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))
 303                 goto out;
 304
 305         address = trans_exc_code & __FAIL_ADDR_MASK;
 306         perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 307         flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 308         if (user_mode(regs))
 309                 flags |= FAULT_FLAG_USER;
 310         if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
 311                 flags |= FAULT_FLAG_WRITE;
 312         down_read(&mm->mmap_sem);
 313
 314 #ifdef CONFIG_PGSTE
 315         gmap = (struct gmap *)
 316                 ((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0);
 317         if (gmap) {
 318                 address = __gmap_fault(address, gmap);
 319                 if (address == -EFAULT) {
 320                         fault = VM_FAULT_BADMAP;
 321                         goto out_up;
 322                 }
 323                 if (address == -ENOMEM) {
 324                         fault = VM_FAULT_OOM;
 325                         goto out_up;
 326                 }
 327                 if (gmap->pfault_enabled)
 328                         flags |= FAULT_FLAG_RETRY_NOWAIT;
 329         }
 330 #endif
 331
 332 retry:
 333         fault = VM_FAULT_BADMAP;
 334         vma = find_vma(mm, address);
 335         if (!vma)
 336                 goto out_up;
 337
 338         if (unlikely(vma->vm_start > address)) {
 339                 if (!(vma->vm_flags & VM_GROWSDOWN))
 340                         goto out_up;
 341                 if (expand_stack(vma, address))
 342                         goto out_up;
 343         }
 344
 345         /*
 346          * Ok, we have a good vm_area for this memory access, so
 347          * we can handle it..
 348          */
 349         fault = VM_FAULT_BADACCESS;
 350         if (unlikely(!(vma->vm_flags & access)))
 351                 goto out_up;
 352
 353         if (is_vm_hugetlb_page(vma))
 354                 address &= HPAGE_MASK;
 355         /*
 356          * If for any reason at all we couldn't handle the fault,
 357          * make sure we exit gracefully rather than endlessly redo
 358          * the fault.
 359          */
 360         fault = handle_mm_fault(mm, vma, address, flags);
 361         /* No reason to continue if interrupted by SIGKILL. */
 362         if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
 363                 fault = VM_FAULT_SIGNAL;
 364                 goto out;
 365         }
 366         if (unlikely(fault & VM_FAULT_ERROR))
 367                 goto out_up;
 368
 369         /*
 370          * Major/minor page fault accounting is only done on the
 371          * initial attempt. If we go through a retry, it is extremely
 372          * likely that the page will be found in page cache at that point.
 373          */
 374         if (flags & FAULT_FLAG_ALLOW_RETRY) {
 375                 if (fault & VM_FAULT_MAJOR) {
 376                         tsk->maj_flt++;
 377                         perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 378                                       regs, address);
 379                 } else {
 380                         tsk->min_flt++;
 381                         perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 382                                       regs, address);
 383                 }
 384                 if (fault & VM_FAULT_RETRY) {
 385 #ifdef CONFIG_PGSTE
 386                         if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
 387                                 /* FAULT_FLAG_RETRY_NOWAIT has been set,
 388                                  * mmap_sem has not been released */
 389                                 current->thread.gmap_pfault = 1;
 390                                 fault = VM_FAULT_PFAULT;
 391                                 goto out_up;
 392                         }
 393 #endif
 394                         /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
 395                          * of starvation. */
 396                         flags &= ~(FAULT_FLAG_ALLOW_RETRY |
 397                                    FAULT_FLAG_RETRY_NOWAIT);
 398                         flags |= FAULT_FLAG_TRIED;
 399                         down_read(&mm->mmap_sem);
 400                         goto retry;
 401                 }
 402         }
 403         fault = 0;
 404 out_up:
 405         up_read(&mm->mmap_sem);
 406 out:
 407         return fault;
 408 }
 409
 410 void __kprobes do_protection_exception(struct pt_regs *regs)
 411 {
 412         unsigned long trans_exc_code;
 413         int fault;
 414
 415         trans_exc_code = regs->int_parm_long;
 416         /*
 417          * Protection exceptions are suppressing, decrement psw address.
 418          * The exception to this rule are aborted transactions, for these
 419          * the PSW already points to the correct location.
 420          */
 421         if (!(regs->int_code & 0x200))
 422                 regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
 423         /*
 424          * Check for low-address protection.  This needs to be treated
 425          * as a special case because the translation exception code
 426          * field is not guaranteed to contain valid data in this case.
 427          */
 428         if (unlikely(!(trans_exc_code & 4))) {
 429                 do_low_address(regs);
 430                 return;
 431         }
 432         fault = do_exception(regs, VM_WRITE);
 433         if (unlikely(fault))
 434                 do_fault_error(regs, fault);
 435 }
 436
 437 void __kprobes do_dat_exception(struct pt_regs *regs)
 438 {
 439         int access, fault;
 440
 441         access = VM_READ | VM_EXEC | VM_WRITE;
 442         fault = do_exception(regs, access);
 443         if (unlikely(fault))
 444                 do_fault_error(regs, fault);
 445 }
 446
 447 #ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page faults routines.
 */

/*
 * Non-zero disables pfault handling. Set by the "nopfault" boot option
 * and by pfault_irq_init() when setup fails; checked by pfault_init()
 * and pfault_fini().
 */
static int pfault_disable;

/*
 * Handler for the "nopfault" kernel parameter: disables pfault.
 * @str is ignored. Always returns 1.
 */
static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

__setup("nopfault", nopfault);
 460
/*
 * Parameter block passed to the "diag ... 0x258" instruction by
 * pfault_init() and pfault_fini(). Packed and 8-byte aligned.
 *
 * The per-field notes only record the values this file stores.
 * NOTE(review): confirm field semantics against the diagnose 0x258
 * specification.
 */
struct pfault_refbk {
	u16 refdiagc;	/* 0x258 in both pfault_init() and pfault_fini() */
	u16 reffcode;	/* 0 in pfault_init(), 1 in pfault_fini() */
	u16 refdwlen;	/* 5 in both callers */
	u16 refversn;	/* 2 in both callers */
	u64 refgaddr;	/* __LC_CURRENT_PID in pfault_init() */
	u64 refselmk;	/* 1ULL << 48 in pfault_init() */
	u64 refcmpmk;	/* 1ULL << 48 in pfault_init() */
	u64 reserved;	/* __PF_RES_FIELD in pfault_init() */
} __attribute__ ((packed, aligned(8)));
 471
/*
 * Issue the "diag ... 0x258" request with function code 0.
 * NOTE(review): presumably this asks the hypervisor to start pfault
 * handling for this cpu - confirm.
 *
 * Returns -1 without executing the instruction when pfault_disable is
 * set. Otherwise returns the value the asm leaves in rc: whatever the
 * diag instruction stored there, or 8 if the exception-table fixup at
 * label 1 ran. pfault_irq_init() treats 0 as success.
 */
int pfault_init(void)
{
	struct pfault_refbk refbk = {
		.refdiagc = 0x258,
		.reffcode = 0,
		.refdwlen = 5,
		.refversn = 2,
		.refgaddr = __LC_CURRENT_PID,
		.refselmk = 1ULL << 48,
		.refcmpmk = 1ULL << 48,
		.reserved = __PF_RES_FIELD };
	int rc;

	if (pfault_disable)
		return -1;
	/*
	 * EX_TABLE(0b,1b) registers label 1 as the fixup for label 0.
	 * The normal path jumps over the fixup to label 2.
	 * NOTE(review): presumably the fixup covers a program check raised
	 * by the diag instruction itself - confirm.
	 * The "m" (refbk) input tells the compiler the block is read.
	 */
	asm volatile(
		"       diag    %1,%0,0x258\n"
		"0:     j       2f\n"
		"1:     la      %0,8\n"
		"2:\n"
		EX_TABLE(0b,1b)
		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
	return rc;
}
 496
/*
 * Issue the "diag ... 0x258" request with function code 1.
 * NOTE(review): presumably the counterpart of pfault_init() that stops
 * pfault handling - confirm.
 *
 * Does nothing when pfault_disable is set. No result is reported to the
 * caller.
 */
void pfault_fini(void)
{
	struct pfault_refbk refbk = {
		.refdiagc = 0x258,
		.reffcode = 1,
		.refdwlen = 5,
		.refversn = 2,
	};

	if (pfault_disable)
		return;
	/*
	 * EX_TABLE(0b,0b) uses label 0 as its own fixup target, so a fault
	 * recorded at that label simply continues after the instruction.
	 */
	asm volatile(
		"       diag    %0,0,0x258\n"
		"0:\n"
		EX_TABLE(0b,0b)
		: : "a" (&refbk), "m" (refbk) : "cc");
}
 514
 515 static DEFINE_SPINLOCK(pfault_lock);
 516 static LIST_HEAD(pfault_list);
 517
 518 static void pfault_interrupt(struct ext_code ext_code,
 519                              unsigned int param32, unsigned long param64)
 520 {
 521         struct task_struct *tsk;
 522         __u16 subcode;
 523         pid_t pid;
 524
 525         /*
 526          * Get the external interruption subcode & pfault
 527          * initial/completion signal bit. VM stores this
 528          * in the 'cpu address' field associated with the
 529          * external interrupt.
 530          */
 531         subcode = ext_code.subcode;
 532         if ((subcode & 0xff00) != __SUBCODE_MASK)
 533                 return;
 534         inc_irq_stat(IRQEXT_PFL);
 535         /* Get the token (= pid of the affected task). */
 536         pid = sizeof(void *) == 4 ? param32 : param64;
 537         rcu_read_lock();
 538         tsk = find_task_by_pid_ns(pid, &init_pid_ns);
 539         if (tsk)
 540                 get_task_struct(tsk);
 541         rcu_read_unlock();
 542         if (!tsk)
 543                 return;
 544         spin_lock(&pfault_lock);
 545         if (subcode & 0x0080) {
 546                 /* signal bit is set -> a page has been swapped in by VM */
 547                 if (tsk->thread.pfault_wait == 1) {
 548                         /* Initial interrupt was faster than the completion
 549                          * interrupt. pfault_wait is valid. Set pfault_wait
 550                          * back to zero and wake up the process. This can
 551                          * safely be done because the task is still sleeping
 552                          * and can't produce new pfaults. */
 553                         tsk->thread.pfault_wait = 0;
 554                         list_del(&tsk->thread.list);
 555                         wake_up_process(tsk);
 556                         put_task_struct(tsk);
 557                 } else {
 558                         /* Completion interrupt was faster than initial
 559                          * interrupt. Set pfault_wait to -1 so the initial
 560                          * interrupt doesn't put the task to sleep.
 561                          * If the task is not running, ignore the completion
 562                          * interrupt since it must be a leftover of a PFAULT
 563                          * CANCEL operation which didn't remove all pending
 564                          * completion interrupts. */
 565                         if (tsk->state == TASK_RUNNING)
 566                                 tsk->thread.pfault_wait = -1;
 567                 }
 568         } else {
 569                 /* signal bit not set -> a real page is missing. */
 570                 if (WARN_ON_ONCE(tsk != current))
 571                         goto out;
 572                 if (tsk->thread.pfault_wait == 1) {
 573                         /* Already on the list with a reference: put to sleep */
 574                         __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 575                         set_tsk_need_resched(tsk);
 576                 } else if (tsk->thread.pfault_wait == -1) {
 577                         /* Completion interrupt was faster than the initial
 578                          * interrupt (pfault_wait == -1). Set pfault_wait
 579                          * back to zero and exit. */
 580                         tsk->thread.pfault_wait = 0;
 581                 } else {
 582                         /* Initial interrupt arrived before completion
 583                          * interrupt. Let the task sleep.
 584                          * An extra task reference is needed since a different
 585                          * cpu may set the task state to TASK_RUNNING again
 586                          * before the scheduler is reached. */
 587                         get_task_struct(tsk);
 588                         tsk->thread.pfault_wait = 1;
 589                         list_add(&tsk->thread.list, &pfault_list);
 590                         __set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 591                         set_tsk_need_resched(tsk);
 592                 }
 593         }
 594 out:
 595         spin_unlock(&pfault_lock);
 596         put_task_struct(tsk);
 597 }
 598
 599 static int pfault_cpu_notify(struct notifier_block *self, unsigned long action,
 600                              void *hcpu)
 601 {
 602         struct thread_struct *thread, *next;
 603         struct task_struct *tsk;
 604
 605         switch (action & ~CPU_TASKS_FROZEN) {
 606         case CPU_DEAD:
 607                 spin_lock_irq(&pfault_lock);
 608                 list_for_each_entry_safe(thread, next, &pfault_list, list) {
 609                         thread->pfault_wait = 0;
 610                         list_del(&thread->list);
 611                         tsk = container_of(thread, struct task_struct, thread);
 612                         wake_up_process(tsk);
 613                         put_task_struct(tsk);
 614                 }
 615                 spin_unlock_irq(&pfault_lock);
 616                 break;
 617         default:
 618                 break;
 619         }
 620         return NOTIFY_OK;
 621 }
 622
 623 static int __init pfault_irq_init(void)
 624 {
 625         int rc;
 626
 627         rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
 628         if (rc)
 629                 goto out_extint;
 630         rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
 631         if (rc)
 632                 goto out_pfault;
 633         irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
 634         hotcpu_notifier(pfault_cpu_notify, 0);
 635         return 0;
 636
 637 out_pfault:
 638         unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
 639 out_extint:
 640         pfault_disable = 1;
 641         return rc;
 642 }
 643 early_initcall(pfault_irq_init);
 644
 645 #endif /* CONFIG_PFAULT */