Merge branch 'auto-ftrace-next' into tracing/for-linus

author Ingo Molnar <mingo@elte.hu>

Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)

committer Ingo Molnar <mingo@elte.hu>

Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)
author Ingo Molnar <mingo@elte.hu>
Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)
diff --combined Makefile

index e3c5eb66ec52dee13127e3b2b83f84c3184bd8be,8e519953d2db955d7f78ae01fa5b6197c17fd90c..4ac1d2f71ac3598df817199fc87d573831e848a2
--- 1/Makefile
--- 2/Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
   VERSION = 2
   PATCHLEVEL = 6
   SUBLEVEL = 26
- -EXTRAVERSION = -rc9
+ +EXTRAVERSION =
   NAME = Rotary Wombat
   
   # *DOCUMENTATION*
@@@ -528,6 -528,10 +528,10 @@@ KBUILD_CFLAGS    += -
   KBUILD_AFLAGS += -gdwarf-2
   endif
   
+ ifdef CONFIG_FTRACE
+ KBUILD_CFLAGS += -pg
+ endif
+ 
   # We trigger additional mismatches with less inlining
   ifdef CONFIG_DEBUG_SECTION_MISMATCH
   KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
diff --combined arch/x86/Kconfig

index 2cfccc987a2605e81f7017f4e055e09ddf8e3da3,c3a4c03c08003e23b373456e57f7dc17c08534dd..6958d6bcaf704c8cc0c9d5af066e4787353cbb5b
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -23,6 -23,8 +23,8 @@@ config X8
         select HAVE_OPROFILE
         select HAVE_KPROBES
         select HAVE_KRETPROBES
+       select HAVE_DYNAMIC_FTRACE
+       select HAVE_FTRACE
         select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
         select HAVE_ARCH_KGDB if !X86_VOYAGER
   
@@@ -121,7 -123,7 +123,7 @@@ config ARCH_HAS_CACHE_LINE_SIZ
         def_bool y
   
   config HAVE_SETUP_PER_CPU_AREA
- -      def_bool X86_64 || (X86_SMP && !X86_VOYAGER)
+ +      def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
   
   config HAVE_CPUMASK_OF_CPU_MAP
         def_bool X86_64_SMP
@@@ -181,12 -183,12 +183,12 @@@ config X86_64_SM
   config X86_HT
         bool
         depends on SMP
- -      depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64
+ +      depends on (X86_32 && !X86_VOYAGER) || X86_64
         default y
   
   config X86_BIOS_REBOOT
         bool
- -      depends on !X86_VISWS && !X86_VOYAGER
+ +      depends on !X86_VOYAGER
         default y
   
   config X86_TRAMPOLINE
@@@ -230,26 -232,6 +232,26 @@@ config SM
   
           If you don't know what to do here, say N.
   
+ +config X86_FIND_SMP_CONFIG
+ +      def_bool y
+ +      depends on X86_MPPARSE || X86_VOYAGER
+ +
+ +if ACPI
+ +config X86_MPPARSE
+ +      def_bool y
+ +      bool "Enable MPS table"
+ +      depends on X86_LOCAL_APIC
+ +      help
+ +        For old smp systems that do not have proper acpi support. Newer systems
+ +        (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
+ +endif
+ +
+ +if !ACPI
+ +config X86_MPPARSE
+ +      def_bool y
+ +      depends on X86_LOCAL_APIC
+ +endif
+ +
   choice
         prompt "Subarchitecture Type"
         default X86_PC
@@@ -271,7 -253,7 +273,7 @@@ config X86_ELA
   
   config X86_VOYAGER
         bool "Voyager (NCR)"
- -      depends on X86_32 && (SMP || BROKEN)
+ +      depends on X86_32 && (SMP || BROKEN) && !PCI
         help
           Voyager is an MCA-based 32-way capable SMP architecture proprietary
           to NCR Corp.  Machine classes 345x/35xx/4100/51xx are Voyager-based.
@@@ -281,27 -263,16 +283,27 @@@
           If you do not specifically know you have a Voyager based machine,
           say N here, otherwise the kernel you build will not be bootable.
   
+ +config X86_GENERICARCH
+ +       bool "Generic architecture"
+ +      depends on X86_32
+ +       help
+ +          This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
+ +        subarchitectures.  It is intended for a generic binary kernel.
+ +        If you select them all, the kernel will probe them one by one and
+ +        will fall back to the default.
+ +
+ +if X86_GENERICARCH
+ +
   config X86_NUMAQ
         bool "NUMAQ (IBM/Sequent)"
- -      depends on SMP && X86_32
+ +      depends on SMP && X86_32 && PCI && X86_MPPARSE
         select NUMA
         help
- -        This option is used for getting Linux to run on a (IBM/Sequent) NUMA
- -        multiquad box. This changes the way that processors are bootstrapped,
- -        and uses Clustered Logical APIC addressing mode instead of Flat Logical.
- -        You will need a new lynxer.elf file to flash your firmware with - send
- -        email to <Martin.Bligh@us.ibm.com>.
+ +        This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
+ +        NUMA multiquad box. This changes the way that processors are
+ +        bootstrapped, and uses Clustered Logical APIC addressing mode instead
+ +        of Flat Logical.  You will need a new lynxer.elf file to flash your
+ +        firmware with - send email to <Martin.Bligh@us.ibm.com>.
   
   config X86_SUMMIT
         bool "Summit/EXA (IBM x440)"
@@@ -310,21 -281,46 +312,21 @@@
           This option is needed for IBM systems that use the Summit/EXA chipset.
           In particular, it is needed for the x440.
   
- -        If you don't have one of these computers, you should say N here.
- -        If you want to build a NUMA kernel, you must select ACPI.
+ +config X86_ES7000
+ +      bool "Support for Unisys ES7000 IA32 series"
+ +      depends on X86_32 && SMP
+ +      help
+ +        Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
+ +        supposed to run on an IA32-based Unisys ES7000 system.
   
   config X86_BIGSMP
- -      bool "Support for other sub-arch SMP systems with more than 8 CPUs"
+ +      bool "Support for big SMP systems with more than 8 CPUs"
         depends on X86_32 && SMP
         help
           This option is needed for the systems that have more than 8 CPUs
           and if the system is not of any sub-arch type above.
   
- -        If you don't have such a system, you should say N here.
- -
- -config X86_VISWS
- -      bool "SGI 320/540 (Visual Workstation)"
- -      depends on X86_32
- -      help
- -        The SGI Visual Workstation series is an IA32-based workstation
- -        based on SGI systems chips with some legacy PC hardware attached.
- -
- -        Say Y here to create a kernel to run on the SGI 320 or 540.
- -
- -        A kernel compiled for the Visual Workstation will not run on PCs
- -        and vice versa. See <file:Documentation/sgi-visws.txt> for details.
- -
- -config X86_GENERICARCH
- -       bool "Generic architecture (Summit, bigsmp, ES7000, default)"
- -      depends on X86_32
- -       help
- -          This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
- -        It is intended for a generic binary kernel.
- -        If you want a NUMA kernel, select ACPI.   We need SRAT for NUMA.
- -
- -config X86_ES7000
- -      bool "Support for Unisys ES7000 IA32 series"
- -      depends on X86_32 && SMP
- -      help
- -        Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
- -        supposed to run on an IA32-based Unisys ES7000 system.
- -        Only choose this option if you have such a system, otherwise you
- -        should say N here.
+ +endif
   
   config X86_RDC321X
         bool "RDC R-321x SoC"
@@@ -343,7 -339,7 +345,7 @@@
   config X86_VSMP
         bool "Support for ScaleMP vSMP"
         select PARAVIRT
- -      depends on X86_64
+ +      depends on X86_64 && PCI
         help
           Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
           supposed to run on these EM64T-based machines.  Only choose this option
@@@ -351,18 -347,6 +353,18 @@@
   
   endchoice
   
+ +config X86_VISWS
+ +      bool "SGI 320/540 (Visual Workstation)"
+ +      depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
+ +      help
+ +        The SGI Visual Workstation series is an IA32-based workstation
+ +        based on SGI systems chips with some legacy PC hardware attached.
+ +
+ +        Say Y here to create a kernel to run on the SGI 320 or 540.
+ +
+ +        A kernel compiled for the Visual Workstation will run on general
+ +        PCs as well. See <file:Documentation/sgi-visws.txt> for details.
+ +
   config SCHED_NO_NO_OMIT_FRAME_POINTER
         def_bool y
         prompt "Single-depth WCHAN output"
@@@ -391,7 -375,7 +393,7 @@@ config VM
         bool "VMI Guest support"
         select PARAVIRT
         depends on X86_32
- -      depends on !(X86_VISWS || X86_VOYAGER)
+ +      depends on !X86_VOYAGER
         help
           VMI provides a paravirtualized interface to the VMware ESX server
           (it could be used by other hypervisors in theory too, but is not
@@@ -402,7 -386,7 +404,7 @@@ config KVM_CLOC
         bool "KVM paravirtualized clock"
         select PARAVIRT
         select PARAVIRT_CLOCK
- -      depends on !(X86_VISWS || X86_VOYAGER)
+ +      depends on !X86_VOYAGER
         help
           Turning on this option will allow you to run a paravirtualized clock
           when running over the KVM hypervisor. Instead of relying on a PIT
@@@ -413,7 -397,7 +415,7 @@@
   config KVM_GUEST
         bool "KVM Guest support"
         select PARAVIRT
- -      depends on !(X86_VISWS || X86_VOYAGER)
+ +      depends on !X86_VOYAGER
         help
          This option enables various optimizations for running under the KVM
          hypervisor.
@@@ -422,7 -406,7 +424,7 @@@ source "arch/x86/lguest/Kconfig
   
   config PARAVIRT
         bool "Enable paravirtualization code"
- -      depends on !(X86_VISWS || X86_VOYAGER)
+ +      depends on !X86_VOYAGER
         help
           This changes the kernel so it can modify itself when it is run
           under a hypervisor, potentially improving performance significantly
@@@ -435,33 -419,51 +437,33 @@@ config PARAVIRT_CLOC
   
   endif
   
- -config MEMTEST_BOOTPARAM
- -      bool "Memtest boot parameter"
+ +config PARAVIRT_DEBUG
+ +       bool "paravirt-ops debugging"
+ +       depends on PARAVIRT && DEBUG_KERNEL
+ +       help
+ +         Enable to debug paravirt_ops internals.  Specifically, BUG if
+ +       a paravirt_op is missing when it is called.
+ +
+ +config MEMTEST
+ +      bool "Memtest"
         depends on X86_64
         default y
         help
           This option adds a kernel parameter 'memtest', which allows memtest
- -        to be disabled at boot.  If this option is selected, memtest
- -        functionality can be disabled with memtest=0 on the kernel
- -        command line.  The purpose of this option is to allow a single
- -        kernel image to be distributed with memtest built in, but not
- -        necessarily enabled.
- -
+ +        to be set.
+ +              memtest=0 means disabled (the default);
+ +              memtest=1 means run 1 test pattern;
+ +              ...
+ +              memtest=4 means run 4 test patterns.
           If you are unsure how to answer this question, answer Y.
   
- -config MEMTEST_BOOTPARAM_VALUE
- -      int "Memtest boot parameter default value (0-4)"
- -      depends on MEMTEST_BOOTPARAM
- -      range 0 4
- -      default 0
- -      help
- -        This option sets the default value for the kernel parameter
- -        'memtest', which allows memtest to be disabled at boot.  If this
- -        option is set to 0 (zero), the memtest kernel parameter will
- -        default to 0, disabling memtest at bootup.  If this option is
- -        set to 4, the memtest kernel parameter will default to 4,
- -        enabling memtest at bootup, and use that as pattern number.
- -
- -        If you are unsure how to answer this question, answer 0.
- -
- -config ACPI_SRAT
- -      def_bool y
- -      depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
- -      select ACPI_NUMA
- -
- -config HAVE_ARCH_PARSE_SRAT
- -      def_bool y
- -      depends on ACPI_SRAT
- -
   config X86_SUMMIT_NUMA
         def_bool y
- -      depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH)
+ +      depends on X86_32 && NUMA && X86_GENERICARCH
   
   config X86_CYCLONE_TIMER
         def_bool y
- -      depends on X86_32 && X86_SUMMIT || X86_GENERICARCH
+ +      depends on X86_GENERICARCH
   
   config ES7000_CLUSTERED_APIC
         def_bool y
@@@ -549,21 -551,6 +551,21 @@@ config CALGARY_IOMMU_ENABLED_BY_DEFAUL
           Calgary anyway, pass 'iommu=calgary' on the kernel command line.
           If unsure, say Y.
   
+ +config AMD_IOMMU
+ +      bool "AMD IOMMU support"
+ +      select SWIOTLB
+ +      depends on X86_64 && PCI && ACPI
+ +      help
+ +        With this option you can enable support for AMD IOMMU hardware in
+ +        your system. An IOMMU is a hardware component which provides
+ +        remapping of DMA memory accesses from devices. With an AMD IOMMU you
+ +        can isolate the DMA memory of different devices and protect the
+ +        system from misbehaving device drivers or hardware.
+ +
+ +        You can find out if your system has an AMD IOMMU if you look into
+ +        your BIOS for an option to enable it or if you have an IVRS ACPI
+ +        table.
+ +
   # need this always selected by IOMMU for the VIA workaround
   config SWIOTLB
         bool
@@@ -575,36 -562,21 +577,36 @@@
           3 GB of memory. If unsure, say Y.
   
   config IOMMU_HELPER
- -      def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB)
+ +      def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+ +config MAXSMP
+ +      bool "Configure Maximum number of SMP Processors and NUMA Nodes"
+ +      depends on X86_64 && SMP
+ +      default n
+ +      help
+ +        Configure maximum number of CPUS and NUMA Nodes for this architecture.
+ +        If unsure, say N.
   
+ +if MAXSMP
   config NR_CPUS
- -      int "Maximum number of CPUs (2-255)"
- -      range 2 255
+ +      int
+ +      default "4096"
+ +endif
+ +
+ +if !MAXSMP
+ +config NR_CPUS
+ +      int "Maximum number of CPUs (2-4096)"
+ +      range 2 4096
         depends on SMP
         default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
         default "8"
         help
           This allows you to specify the maximum number of CPUs which this
- -        kernel will support.  The maximum supported value is 255 and the
+ +        kernel will support.  The maximum supported value is 4096 and the
           minimum value which makes sense is 2.
   
           This is purely to save memory - each supported CPU adds
           approximately eight kilobytes to the kernel image.
+ +endif
   
   config SCHED_SMT
         bool "SMT (Hyperthreading) scheduler support"
@@@ -628,7 -600,7 +630,7 @@@ source "kernel/Kconfig.preempt
   
   config X86_UP_APIC
         bool "Local APIC support on uniprocessors"
- -      depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH)
+ +      depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
         help
           A local APIC (Advanced Programmable Interrupt Controller) is an
           integrated interrupt controller in the CPU. If you have a single-CPU
@@@ -653,11 -625,11 +655,11 @@@ config X86_UP_IOAPI
   
   config X86_LOCAL_APIC
         def_bool y
- -      depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH))
+ +      depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
   
   config X86_IO_APIC
         def_bool y
- -      depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH))
+ +      depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
   
   config X86_VISWS_APIC
         def_bool y
@@@ -711,7 -683,7 +713,7 @@@ config X86_MCE_NONFATA
   
   config X86_MCE_P4THERMAL
         bool "check for P4 thermal throttling interrupt."
- -      depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS
+ +      depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
         help
           Enabling this feature will cause a message to be printed when the P4
           enters thermal throttling.
@@@ -941,9 -913,9 +943,9 @@@ config X86_PA
   config NUMA
         bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
         depends on SMP
- -      depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL)
+ +      depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
         default n if X86_PC
- -      default y if (X86_NUMAQ || X86_SUMMIT)
+ +      default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
         help
           Enable NUMA (Non Uniform Memory Access) support.
           The kernel will try to allocate memory used by a CPU on the
@@@ -995,25 -967,13 +997,25 @@@ config NUMA_EM
           into virtual nodes when booted with "numa=fake=N", where N is the
           number of nodes. This is only useful for debugging.
   
+ +if MAXSMP
+ +
+ +config NODES_SHIFT
+ +      int
+ +      default "9"
+ +endif
+ +
+ +if !MAXSMP
   config NODES_SHIFT
- -      int "Max num nodes shift(1-9)"
- -      range 1 9  if X86_64
+ +      int "Maximum NUMA Nodes (as a power of 2)"
+ +      range 1 9   if X86_64
         default "6" if X86_64
         default "4" if X86_NUMAQ
         default "3"
         depends on NEED_MULTIPLE_NODES
+ +      help
+ +        Specify the maximum number of NUMA Nodes available on the target
+ +        system.  Increases memory reserved to accommodate various tables.
+ +endif
   
   config HAVE_ARCH_BOOTMEM_NODE
         def_bool y
@@@ -1132,40 -1092,6 +1134,40 @@@ config MTR
   
           See <file:Documentation/mtrr.txt> for more information.
   
+ +config MTRR_SANITIZER
+ +      def_bool y
+ +      prompt "MTRR cleanup support"
+ +      depends on MTRR
+ +      help
+ +        Convert MTRR layout from continuous to discrete, so some X driver
+ +        could add WB entries.
+ +
+ +        Say N here if you see bootup problems (boot crash, boot hang,
+ +        spontaneous reboots).
+ +
+ +        Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size
+ +        could be used to send largest mtrr entry size for continuous block
+ +        to hold holes (aka. UC entries)
+ +
+ +        If unsure, say Y.
+ +
+ +config MTRR_SANITIZER_ENABLE_DEFAULT
+ +      int "MTRR cleanup enable value (0-1)"
+ +      range 0 1
+ +      default "0"
+ +      depends on MTRR_SANITIZER
+ +      help
+ +        Enable mtrr cleanup default value
+ +
+ +config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
+ +      int "MTRR cleanup spare reg num (0-7)"
+ +      range 0 7
+ +      default "1"
+ +      depends on MTRR_SANITIZER
+ +      help
+ +        mtrr cleanup spare entries default, it can be changed via
+ +        mtrr_spare_reg_nr=
+ +
   config X86_PAT
         bool
         prompt "x86 PAT support"
@@@ -1266,6 -1192,7 +1268,6 @@@ config KEXE
   
   config CRASH_DUMP
         bool "kernel crash dumps (EXPERIMENTAL)"
- -      depends on EXPERIMENTAL
         depends on X86_64 || (X86_32 && HIGHMEM)
         help
           Generate crash dump after being started by kexec.
@@@ -1414,7 -1341,7 +1416,7 @@@ config X86_APM_BOO
   
   menuconfig APM
         tristate "APM (Advanced Power Management) BIOS support"
- -      depends on X86_32 && PM_SLEEP && !X86_VISWS
+ +      depends on X86_32 && PM_SLEEP
         ---help---
           APM is a BIOS specification for saving power using several different
           techniques. This is mostly useful for battery powered laptops with
@@@ -1550,7 -1477,8 +1552,7 @@@ endmen
   menu "Bus options (PCI etc.)"
   
   config PCI
- -      bool "PCI support" if !X86_VISWS && !X86_VSMP
- -      depends on !X86_VOYAGER
+ +      bool "PCI support"
         default y
         select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
         help
@@@ -1561,7 -1489,7 +1563,7 @@@
   
   choice
         prompt "PCI access mode"
- -      depends on X86_32 && PCI && !X86_VISWS
+ +      depends on X86_32 && PCI
         default PCI_GOANY
         ---help---
           On PCI systems, the BIOS can be used to detect the PCI devices and
@@@ -1598,12 -1526,12 +1600,12 @@@ endchoic
   
   config PCI_BIOS
         def_bool y
- -      depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
+ +      depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
   
   # x86-64 doesn't support PCI BIOS access from long mode so always go direct.
   config PCI_DIRECT
         def_bool y
- -      depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC) || X86_VISWS)
+ +      depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
   
   config PCI_MMCONFIG
         def_bool y
@@@ -1663,7 -1591,7 +1665,7 @@@ if X86_3
   
   config ISA
         bool "ISA support"
- -      depends on !(X86_VOYAGER || X86_VISWS)
+ +      depends on !X86_VOYAGER
         help
           Find out whether you have ISA slots on your motherboard.  ISA is the
           name of a bus system, i.e. the way the CPU talks to the other stuff
@@@ -1690,7 -1618,7 +1692,7 @@@ config EIS
   source "drivers/eisa/Kconfig"
   
   config MCA
- -      bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
+ +      bool "MCA support" if !X86_VOYAGER
         default y if X86_VOYAGER
         help
           MicroChannel Architecture is found in some IBM PS/2 machines and
diff --combined arch/x86/Kconfig.debug

index acc0271920f2e3d80be6a50bb882066727e297e5,f7169edfbeab935eb34f4bc9aebb6a9275fc2940..5236621350bc9169d2be181e3de37d56c1a4a08b
--- 1/arch/x86/Kconfig.debug
--- 2/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@@ -20,14 -20,6 +20,14 @@@ config NONPROMISC_DEVME
   
           If in doubt, say Y.
   
+ +config X86_VERBOSE_BOOTUP
+ +      bool "Enable verbose x86 bootup info messages"
+ +      default y
+ +      help
+ +        Enables the informational output from the decompression stage
+ +        (e.g. bzImage) of the boot. If you disable this you will still
+ +        see errors. Disable this if you want silent bootup.
+ +
   config EARLY_PRINTK
         bool "Early printk" if EMBEDDED
         default y
@@@ -68,7 -60,7 +68,7 @@@ config DEBUG_PAGEALLO
   config DEBUG_PER_CPU_MAPS
         bool "Debug access to per_cpu maps"
         depends on DEBUG_KERNEL
- -      depends on X86_64_SMP
+ +      depends on X86_SMP
         default n
         help
           Say Y to verify that the per_cpu map being accessed has
@@@ -137,6 -129,15 +137,6 @@@ config 4KSTACK
           on the VM subsystem for higher order allocations. This option
           will also use IRQ stacks to compensate for the reduced stackspace.
   
- -config X86_FIND_SMP_CONFIG
- -      def_bool y
- -      depends on X86_LOCAL_APIC || X86_VOYAGER
- -      depends on X86_32
- -
- -config X86_MPPARSE
- -      def_bool y
- -      depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64
- -
   config DOUBLEFAULT
         default y
         bool "Enable doublefault exception handler" if EMBEDDED
@@@ -171,6 -172,34 +171,34 @@@ config IOMMU_LEA
           Add a simple leak tracer to the IOMMU code. This is useful when you
           are debugging a buggy device driver that leaks IOMMU mappings.
   
+ config MMIOTRACE_HOOKS
+       bool
+ 
+ config MMIOTRACE
+       bool "Memory mapped IO tracing"
+       depends on DEBUG_KERNEL && PCI
+       select TRACING
+       select MMIOTRACE_HOOKS
+       default y
+       help
+         Mmiotrace traces Memory Mapped I/O access and is meant for
+         debugging and reverse engineering. It is called from the ioremap
+         implementation and works via page faults. Tracing is disabled by
+         default and can be enabled at run-time.
+ 
+         See Documentation/tracers/mmiotrace.txt.
+         If you are not helping to develop drivers, say N.
+ 
+ config MMIOTRACE_TEST
+       tristate "Test module for mmiotrace"
+       depends on MMIOTRACE && m
+       help
+         This is a dumb module for testing mmiotrace. It is very dangerous
+         as it will write garbage to IO memory starting at a given address.
+         However, it should be safe to use on e.g. unused portion of VRAM.
+ 
+         Say N, unless you absolutely know what you are doing.
+ 
   #
   # IO delay types:
   #
diff --combined arch/x86/kernel/Makefile

index 55ff016e9f694f61d5dc143675797860a4f4de3d,5ff67208d4ae51c5c57ef531f0d7bc8b76b3025d..5112c84f542164e37a74ed9785451d127cd8da95
--- 1/arch/x86/kernel/Makefile
--- 2/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@@ -2,10 -2,17 +2,17 @@@
   # Makefile for the linux kernel.
   #
   
- -extra-y                := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+ +extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
   
   CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
   
+ ifdef CONFIG_FTRACE
+ # Do not profile debug utilities
+ CFLAGS_REMOVE_tsc_64.o = -pg
+ CFLAGS_REMOVE_tsc_32.o = -pg
+ CFLAGS_REMOVE_rtc.o = -pg
+ endif
+ 
   #
   # vsyscalls (which work on the user stack) should have
   # no stack-protector checks:
@@@ -13,21 -20,20 +20,21 @@@
   nostackp := $(call cc-option, -fno-stack-protector)
   CFLAGS_vsyscall_64.o  := $(PROFILING) -g0 $(nostackp)
   CFLAGS_hpet.o         := $(nostackp)
- -CFLAGS_tsc_64.o               := $(nostackp)
+ +CFLAGS_tsc.o          := $(nostackp)
   
   obj-y                 := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
   obj-y                 += traps_$(BITS).o irq_$(BITS).o
   obj-y                 += time_$(BITS).o ioport.o ldt.o
- -obj-y                 += setup_$(BITS).o i8259_$(BITS).o setup.o
+ +obj-y                 += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+ +obj-$(CONFIG_X86_VISWS)       += visws_quirks.o
+ +obj-$(CONFIG_X86_32)  += probe_roms_32.o
   obj-$(CONFIG_X86_32)  += sys_i386_32.o i386_ksyms_32.o
   obj-$(CONFIG_X86_64)  += sys_x86_64.o x8664_ksyms_64.o
- -obj-$(CONFIG_X86_64)  += syscall_64.o vsyscall_64.o setup64.o
- -obj-y                 += bootflag.o e820_$(BITS).o
+ +obj-$(CONFIG_X86_64)  += syscall_64.o vsyscall_64.o
+ +obj-y                 += bootflag.o e820.o
   obj-y                 += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
   obj-y                 += alternative.o i8253.o pci-nommu.o
- -obj-$(CONFIG_X86_64)  += bugs_64.o
- -obj-y                 += tsc_$(BITS).o io_delay.o rtc.o
+ +obj-y                 += tsc.o io_delay.o rtc.o
   
   obj-$(CONFIG_X86_TRAMPOLINE)  += trampoline.o
   obj-y                         += process.o
@@@ -54,9 -60,10 +61,10 @@@ obj-$(CONFIG_X86_32_SMP)    += smpcommon.
   obj-$(CONFIG_X86_64_SMP)      += tsc_sync.o smpcommon.o
   obj-$(CONFIG_X86_TRAMPOLINE)  += trampoline_$(BITS).o
   obj-$(CONFIG_X86_MPPARSE)     += mpparse.o
- -obj-$(CONFIG_X86_LOCAL_APIC)  += apic_$(BITS).o nmi_$(BITS).o
+ +obj-$(CONFIG_X86_LOCAL_APIC)  += apic_$(BITS).o nmi.o
   obj-$(CONFIG_X86_IO_APIC)     += io_apic_$(BITS).o
   obj-$(CONFIG_X86_REBOOTFIXUPS)        += reboot_fixups_32.o
+ obj-$(CONFIG_DYNAMIC_FTRACE)  += ftrace.o
   obj-$(CONFIG_KEXEC)           += machine_kexec_$(BITS).o
   obj-$(CONFIG_KEXEC)           += relocate_kernel_$(BITS).o crash.o
   obj-$(CONFIG_CRASH_DUMP)      += crash_dump_$(BITS).o
@@@ -65,6 -72,7 +73,6 @@@ obj-$(CONFIG_X86_SUMMIT_NUMA) += summit
   obj-y                         += vsmp_64.o
   obj-$(CONFIG_KPROBES)         += kprobes.o
   obj-$(CONFIG_MODULES)         += module_$(BITS).o
- -obj-$(CONFIG_ACPI_SRAT)       += srat_32.o
   obj-$(CONFIG_EFI)             += efi.o efi_$(BITS).o efi_stub_$(BITS).o
   obj-$(CONFIG_DOUBLEFAULT)     += doublefault_32.o
   obj-$(CONFIG_KGDB)            += kgdb.o
@@@ -94,13 -102,12 +102,13 @@@ obj-$(CONFIG_OLPC)               += olpc.
   ###
   # 64 bit specific files
   ifeq ($(CONFIG_X86_64),y)
- -        obj-y                         += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
+ +        obj-y                         += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
           obj-$(CONFIG_X86_PM_TIMER)    += pmtimer_64.o
           obj-$(CONFIG_AUDIT)           += audit_64.o
   
           obj-$(CONFIG_GART_IOMMU)      += pci-gart_64.o aperture_64.o
           obj-$(CONFIG_CALGARY_IOMMU)   += pci-calgary_64.o tce_64.o
+ +        obj-$(CONFIG_AMD_IOMMU)               += amd_iommu_init.o amd_iommu.o
           obj-$(CONFIG_SWIOTLB)         += pci-swiotlb_64.o
   
           obj-$(CONFIG_PCI_MMCONFIG)    += mmconf-fam10h_64.o
diff --combined arch/x86/kernel/entry_32.S

index cfe28a715434762352df73207da0fe422c76e713,95e6bbe3665e1c240daa0400618a7d00c63af5a3..6bc07f0f1202eeb1eaac0b55064acad7f355371a
--- 1/arch/x86/kernel/entry_32.S
--- 2/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@@ -51,14 -51,15 +51,15 @@@
   #include <asm/percpu.h>
   #include <asm/dwarf2.h>
   #include <asm/processor-flags.h>
- -#include "irq_vectors.h"
+ #include <asm/ftrace.h>
+ +#include <asm/irq_vectors.h>
   
   /*
    * We use macros for low-level operations which need to be overridden
    * for paravirtualization.  The following will never clobber any registers:
    *   INTERRUPT_RETURN (aka. "iret")
    *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- - *   ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
+ + *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
    *
    * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
    * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@@ -349,7 -350,7 +350,7 @@@ sysenter_past_esp
         xorl %ebp,%ebp
         TRACE_IRQS_ON
   1:    mov  PT_FS(%esp), %fs
- -      ENABLE_INTERRUPTS_SYSCALL_RET
+ +      ENABLE_INTERRUPTS_SYSEXIT
         CFI_ENDPROC
   .pushsection .fixup,"ax"
   2:    movl $0,PT_FS(%esp)
@@@ -874,10 -875,10 +875,10 @@@ ENTRY(native_iret
   .previous
   END(native_iret)
   
- -ENTRY(native_irq_enable_syscall_ret)
+ +ENTRY(native_irq_enable_sysexit)
         sti
         sysexit
- -END(native_irq_enable_syscall_ret)
+ +END(native_irq_enable_sysexit)
   #endif
   
   KPROBE_ENTRY(int3)
@@@ -1024,7 -1025,6 +1025,7 @@@ ENTRY(xen_sysenter_target
         RING0_INT_FRAME
         addl $5*4, %esp         /* remove xen-provided frame */
         jmp sysenter_past_esp
+ +      CFI_ENDPROC
   
   ENTRY(xen_hypervisor_callback)
         CFI_STARTPROC
@@@ -1111,6 -1111,77 +1112,77 @@@ ENDPROC(xen_failsafe_callback
   
   #endif        /* CONFIG_XEN */
   
+ #ifdef CONFIG_FTRACE
+ #ifdef CONFIG_DYNAMIC_FTRACE
+ 
+ ENTRY(mcount)
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       subl $MCOUNT_INSN_SIZE, %eax
+ 
+ .globl mcount_call
+ mcount_call:
+       call ftrace_stub
+ 
+       popl %edx
+       popl %ecx
+       popl %eax
+ 
+       ret
+ END(mcount)
+ 
+ ENTRY(ftrace_caller)
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       movl 0x4(%ebp), %edx
+       subl $MCOUNT_INSN_SIZE, %eax
+ 
+ .globl ftrace_call
+ ftrace_call:
+       call ftrace_stub
+ 
+       popl %edx
+       popl %ecx
+       popl %eax
+ 
+ .globl ftrace_stub
+ ftrace_stub:
+       ret
+ END(ftrace_caller)
+ 
+ #else /* ! CONFIG_DYNAMIC_FTRACE */
+ 
+ ENTRY(mcount)
+       cmpl $ftrace_stub, ftrace_trace_function
+       jnz trace
+ .globl ftrace_stub
+ ftrace_stub:
+       ret
+ 
+       /* taken from glibc */
+ trace:
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       movl 0x4(%ebp), %edx
+       subl $MCOUNT_INSN_SIZE, %eax
+ 
+       call *ftrace_trace_function
+ 
+       popl %edx
+       popl %ecx
+       popl %eax
+ 
+       jmp ftrace_stub
+ END(mcount)
+ #endif /* CONFIG_DYNAMIC_FTRACE */
+ #endif /* CONFIG_FTRACE */
+ 
   .section .rodata,"a"
   #include "syscall_table_32.S"
   
diff --combined arch/x86/kernel/entry_64.S

index bb4e22f4892fd807cf4dfaee0a8ae41aefb9111b,b0f7308f78a6809135c8d84a163c1b4568d9bd8c..ba41bf42748d7657548b8e39191d9496f86a9480
--- 1/arch/x86/kernel/entry_64.S
--- 2/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@@ -51,15 -51,122 +51,121 @@@
   #include <asm/page.h>
   #include <asm/irqflags.h>
   #include <asm/paravirt.h>
+ #include <asm/ftrace.h>
   
         .code64
   
+ #ifdef CONFIG_FTRACE
+ #ifdef CONFIG_DYNAMIC_FTRACE
+ ENTRY(mcount)
+ 
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+ 
+       movq 0x38(%rsp), %rdi
+       subq $MCOUNT_INSN_SIZE, %rdi
+ 
+ .globl mcount_call
+ mcount_call:
+       call ftrace_stub
+ 
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+ 
+       retq
+ END(mcount)
+ 
+ ENTRY(ftrace_caller)
+ 
+       /* taken from glibc */
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+ 
+       movq 0x38(%rsp), %rdi
+       movq 8(%rbp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rdi
+ 
+ .globl ftrace_call
+ ftrace_call:
+       call ftrace_stub
+ 
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+ 
+ .globl ftrace_stub
+ ftrace_stub:
+       retq
+ END(ftrace_caller)
+ 
+ #else /* ! CONFIG_DYNAMIC_FTRACE */
+ ENTRY(mcount)
+       cmpq $ftrace_stub, ftrace_trace_function
+       jnz trace
+ .globl ftrace_stub
+ ftrace_stub:
+       retq
+ 
+ trace:
+       /* taken from glibc */
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+ 
+       movq 0x38(%rsp), %rdi
+       movq 8(%rbp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rdi
+ 
+       call   *ftrace_trace_function
+ 
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+ 
+       jmp ftrace_stub
+ END(mcount)
+ #endif /* CONFIG_DYNAMIC_FTRACE */
+ #endif /* CONFIG_FTRACE */
+ 
   #ifndef CONFIG_PREEMPT
   #define retint_kernel retint_restore_args
   #endif        
   
   #ifdef CONFIG_PARAVIRT
- -ENTRY(native_irq_enable_syscall_ret)
- -      movq    %gs:pda_oldrsp,%rsp
+ +ENTRY(native_usergs_sysret64)
         swapgs
         sysretq
   #endif /* CONFIG_PARAVIRT */
@@@ -103,7 -210,7 +209,7 @@@
         .macro FAKE_STACK_FRAME child_rip
         /* push in order ss, rsp, eflags, cs, rip */
         xorl %eax, %eax
- -      pushq %rax /* ss */
+ +      pushq $__KERNEL_DS /* ss */
         CFI_ADJUST_CFA_OFFSET   8
         /*CFI_REL_OFFSET        ss,0*/
         pushq %rax /* rsp */
@@@ -168,13 -275,13 +274,13 @@@ ENTRY(ret_from_fork
         CFI_ADJUST_CFA_OFFSET -4
         call schedule_tail
         GET_THREAD_INFO(%rcx)
- -      testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+ +      testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
         jnz rff_trace
   rff_action:   
         RESTORE_REST
         testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
         je   int_ret_from_sys_call
- -      testl $_TIF_IA32,threadinfo_flags(%rcx)
+ +      testl $_TIF_IA32,TI_flags(%rcx)
         jnz  int_ret_from_sys_call
         RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
         jmp ret_from_sys_call
@@@ -243,8 -350,7 +349,8 @@@ ENTRY(system_call_after_swapgs
         movq  %rcx,RIP-ARGOFFSET(%rsp)
         CFI_REL_OFFSET rip,RIP-ARGOFFSET
         GET_THREAD_INFO(%rcx)
- -      testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+ +      testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
+ +              TI_flags(%rcx)
         jnz tracesys
         cmpq $__NR_syscall_max,%rax
         ja badsys
@@@ -263,7 -369,7 +369,7 @@@ sysret_check
         GET_THREAD_INFO(%rcx)
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
- -      movl threadinfo_flags(%rcx),%edx
+ +      movl TI_flags(%rcx),%edx
         andl %edi,%edx
         jnz  sysret_careful 
         CFI_REMEMBER_STATE
@@@ -275,8 -381,7 +381,8 @@@
         CFI_REGISTER    rip,rcx
         RESTORE_ARGS 0,-ARG_SKIP,1
         /*CFI_REGISTER  rflags,r11*/
- -      ENABLE_INTERRUPTS_SYSCALL_RET
+ +      movq    %gs:pda_oldrsp, %rsp
+ +      USERGS_SYSRET64
   
         CFI_RESTORE_STATE
         /* Handle reschedules */
@@@ -306,7 -411,7 +412,7 @@@ sysret_signal
         leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
         xorl %esi,%esi # oldset -> arg2
         call ptregscall_common
- -1:    movl $_TIF_NEED_RESCHED,%edi
+ +1:    movl $_TIF_WORK_MASK,%edi
         /* Use IRET because user could have changed frame. This
            works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
         DISABLE_INTERRUPTS(CLBR_NONE)
@@@ -348,10 -453,10 +454,10 @@@ int_ret_from_sys_call
   int_with_check:
         LOCKDEP_SYS_EXIT_IRQ
         GET_THREAD_INFO(%rcx)
- -      movl threadinfo_flags(%rcx),%edx
+ +      movl TI_flags(%rcx),%edx
         andl %edi,%edx
         jnz   int_careful
- -      andl    $~TS_COMPAT,threadinfo_status(%rcx)
+ +      andl    $~TS_COMPAT,TI_status(%rcx)
         jmp   retint_swapgs
   
         /* Either reschedule or signal or syscall exit tracking needed. */
@@@ -394,7 -499,7 +500,7 @@@ int_signal
         movq %rsp,%rdi          # &ptregs -> arg1
         xorl %esi,%esi          # oldset -> arg2
         call do_notify_resume
- -1:    movl $_TIF_NEED_RESCHED,%edi    
+ +1:    movl $_TIF_WORK_MASK,%edi
   int_restore_rest:
         RESTORE_REST
         DISABLE_INTERRUPTS(CLBR_NONE)
@@@ -421,6 -526,7 +527,6 @@@ END(\label
         PTREGSCALL stub_clone, sys_clone, %r8
         PTREGSCALL stub_fork, sys_fork, %rdi
         PTREGSCALL stub_vfork, sys_vfork, %rdi
- -      PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
         PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
         PTREGSCALL stub_iopl, sys_iopl, %rsi
   
@@@ -559,7 -665,7 +665,7 @@@ retint_with_reschedule
         movl $_TIF_WORK_MASK,%edi
   retint_check:
         LOCKDEP_SYS_EXIT_IRQ
- -      movl threadinfo_flags(%rcx),%edx
+ +      movl TI_flags(%rcx),%edx
         andl %edi,%edx
         CFI_REMEMBER_STATE
         jnz  retint_careful
@@@ -647,16 -753,17 +753,16 @@@ retint_signal
         RESTORE_REST
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
- -      movl $_TIF_NEED_RESCHED,%edi
         GET_THREAD_INFO(%rcx)
- -      jmp retint_check
+ +      jmp retint_with_reschedule
   
   #ifdef CONFIG_PREEMPT
         /* Returning to kernel space. Check if we need preemption */
         /* rcx:  threadinfo. interrupts off. */
   ENTRY(retint_kernel)
- -      cmpl $0,threadinfo_preempt_count(%rcx)
+ +      cmpl $0,TI_preempt_count(%rcx)
         jnz  retint_restore_args
- -      bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+ +      bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
         jnc  retint_restore_args
         bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
         jnc  retint_restore_args
@@@ -719,10 -826,6 +825,10 @@@ ENTRY(apic_timer_interrupt
         apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
   END(apic_timer_interrupt)
   
+ +ENTRY(uv_bau_message_intr1)
+ +      apicinterrupt 220,uv_bau_message_interrupt
+ +END(uv_bau_message_intr1)
+ +
   ENTRY(error_interrupt)
         apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
   END(error_interrupt)
@@@ -736,7 -839,6 +842,7 @@@ END(spurious_interrupt
    */           
         .macro zeroentry sym
         INTR_FRAME
+ +      PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq $0        /* push error code/oldrax */ 
         CFI_ADJUST_CFA_OFFSET 8
         pushq %rax      /* push real oldrax to the rdi slot */ 
@@@ -749,7 -851,6 +855,7 @@@
   
         .macro errorentry sym
         XCPT_FRAME
+ +      PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq %rax
         CFI_ADJUST_CFA_OFFSET 8
         CFI_REL_OFFSET rax,0
@@@ -819,7 -920,7 +925,7 @@@ paranoid_restore\trace
         jmp irq_return
   paranoid_userspace\trace:
         GET_THREAD_INFO(%rcx)
- -      movl threadinfo_flags(%rcx),%ebx
+ +      movl TI_flags(%rcx),%ebx
         andl $_TIF_WORK_MASK,%ebx
         jz paranoid_swapgs\trace
         movq %rsp,%rdi                  /* &pt_regs */
@@@ -917,7 -1018,7 +1023,7 @@@ error_exit
         testl %eax,%eax
         jne  retint_kernel
         LOCKDEP_SYS_EXIT_IRQ
- -      movl  threadinfo_flags(%rcx),%edx
+ +      movl  TI_flags(%rcx),%edx
         movl  $_TIF_WORK_MASK,%edi
         andl  %edi,%edx
         jnz  retint_careful
@@@ -931,11 -1032,11 +1037,11 @@@ error_kernelspace
            iret run with kernel gs again, so don't set the user space flag.
            B stepping K8s sometimes report an truncated RIP for IRET 
            exceptions returning to compat mode. Check for these here too. */
- -      leaq irq_return(%rip),%rbp
- -      cmpq %rbp,RIP(%rsp) 
+ +      leaq irq_return(%rip),%rcx
+ +      cmpq %rcx,RIP(%rsp)
         je   error_swapgs
- -      movl %ebp,%ebp  /* zero extend */
- -      cmpq %rbp,RIP(%rsp) 
+ +      movl %ecx,%ecx  /* zero extend */
+ +      cmpq %rcx,RIP(%rsp)
         je   error_swapgs
         cmpq $gs_change,RIP(%rsp)
           je   error_swapgs
@@@ -944,7 -1045,7 +1050,7 @@@ KPROBE_END(error_entry
         
          /* Reload gs selector with exception handling */
          /* edi:  new selector */ 
- -ENTRY(load_gs_index)
+ +ENTRY(native_load_gs_index)
         CFI_STARTPROC
         pushf
         CFI_ADJUST_CFA_OFFSET 8
@@@ -958,7 -1059,7 +1064,7 @@@ gs_change
         CFI_ADJUST_CFA_OFFSET -8
           ret
         CFI_ENDPROC
- -ENDPROC(load_gs_index)
+ +ENDPROC(native_load_gs_index)
          
           .section __ex_table,"a"
           .align 8
@@@ -1125,6 -1226,10 +1231,6 @@@ ENTRY(coprocessor_segment_overrun
         zeroentry do_coprocessor_segment_overrun
   END(coprocessor_segment_overrun)
   
- -ENTRY(reserved)
- -      zeroentry do_reserved
- -END(reserved)
- -
         /* runs on exception stack */
   ENTRY(double_fault)
         XCPT_FRAME
diff --combined arch/x86/kernel/machine_kexec_32.c

index f4960171bc66625ebac582a172990b0531031fcb,88923fd7a6fc5c3943bee384a43860f7fe1104fd..8864230d55afd11476e96e13429c4a1ba03ce8cd
--- 1/arch/x86/kernel/machine_kexec_32.c
--- 2/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@@ -11,6 -11,8 +11,8 @@@
   #include <linux/delay.h>
   #include <linux/init.h>
   #include <linux/numa.h>
+ #include <linux/ftrace.h>
+ 
   #include <asm/pgtable.h>
   #include <asm/pgalloc.h>
   #include <asm/tlbflush.h>
@@@ -39,7 -41,7 +41,7 @@@ static void set_idt(void *newidt, __u1
         curidt.address = (unsigned long)newidt;
   
         load_idt(&curidt);
- -};
+ +}
   
   
   static void set_gdt(void *newgdt, __u16 limit)
@@@ -51,7 -53,7 +53,7 @@@
         curgdt.address = (unsigned long)newgdt;
   
         load_gdt(&curgdt);
- -};
+ +}
   
   static void load_segments(void)
   {
@@@ -107,6 -109,8 +109,8 @@@ NORET_TYPE void machine_kexec(struct ki
         unsigned long page_list[PAGES_NR];
         void *control_page;
   
+       tracer_disable();
+ 
         /* Interrupts aren't acceptable while we reboot */
         local_irq_disable();
   
diff --combined arch/x86/kernel/machine_kexec_64.c

index 7830dc4a8380d4f2631beef1df15f183cd69b721,1558fdc174f96155c8e3af95f0a2426f441848c3..9dd9262693a330ae8aa81f386af90b0defb8c904
--- 1/arch/x86/kernel/machine_kexec_64.c
--- 2/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@@ -11,6 -11,8 +11,8 @@@
   #include <linux/string.h>
   #include <linux/reboot.h>
   #include <linux/numa.h>
+ #include <linux/ftrace.h>
+ 
   #include <asm/pgtable.h>
   #include <asm/tlbflush.h>
   #include <asm/mmu_context.h>
@@@ -110,7 -112,7 +112,7 @@@ static int init_pgtable(struct kimage *
   {
         pgd_t *level4p;
         level4p = (pgd_t *)__va(start_pgtable);
- -      return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ +      return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
   }
   
   static void set_idt(void *newidt, u16 limit)
@@@ -184,6 -186,8 +186,8 @@@ NORET_TYPE void machine_kexec(struct ki
         unsigned long page_list[PAGES_NR];
         void *control_page;
   
+       tracer_disable();
+ 
         /* Interrupts aren't acceptable while we reboot */
         local_irq_disable();
   
diff --combined arch/x86/kernel/process_32.c

index 9a139f6c9df30fa2c304e96da89b99fd62b78a8d,347a7aba8b1637612e098edd0921825df8a66b51..0c3927accb0054b71c7de9eb828a93559232737e
--- 1/arch/x86/kernel/process_32.c
--- 2/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@@ -58,6 -58,11 +58,6 @@@
   
   asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
   
- -static int hlt_counter;
- -
- -unsigned long boot_option_idle_override = 0;
- -EXPORT_SYMBOL(boot_option_idle_override);
- -
   DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
   EXPORT_PER_CPU_SYMBOL(current_task);
   
@@@ -72,24 -77,57 +72,24 @@@ unsigned long thread_saved_pc(struct ta
         return ((unsigned long *)tsk->thread.sp)[3];
   }
   
- -/*
- - * Powermanagement idle function, if any..
- - */
- -void (*pm_idle)(void);
- -EXPORT_SYMBOL(pm_idle);
+ +#ifdef CONFIG_HOTPLUG_CPU
+ +#include <asm/nmi.h>
   
- -void disable_hlt(void)
+ +static void cpu_exit_clear(void)
   {
- -      hlt_counter++;
- -}
+ +      int cpu = raw_smp_processor_id();
   
- -EXPORT_SYMBOL(disable_hlt);
+ +      idle_task_exit();
   
- -void enable_hlt(void)
- -{
- -      hlt_counter--;
- -}
+ +      cpu_uninit();
+ +      irq_ctx_exit(cpu);
   
- -EXPORT_SYMBOL(enable_hlt);
+ +      cpu_clear(cpu, cpu_callout_map);
+ +      cpu_clear(cpu, cpu_callin_map);
   
- -/*
- - * We use this if we don't have any better
- - * idle routine..
- - */
- -void default_idle(void)
- -{
- -      if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- -              current_thread_info()->status &= ~TS_POLLING;
- -              /*
- -               * TS_POLLING-cleared state must be visible before we
- -               * test NEED_RESCHED:
- -               */
- -              smp_mb();
- -
- -              if (!need_resched())
- -                      safe_halt();    /* enables interrupts racelessly */
- -              else
- -                      local_irq_enable();
- -              current_thread_info()->status |= TS_POLLING;
- -      } else {
- -              local_irq_enable();
- -              /* loop is done by the caller */
- -              cpu_relax();
- -      }
+ +      numa_remove_cpu(cpu);
   }
- -#ifdef CONFIG_APM_MODULE
- -EXPORT_SYMBOL(default_idle);
- -#endif
   
- -#ifdef CONFIG_HOTPLUG_CPU
- -#include <asm/nmi.h>
   /* We don't actually take CPU down, just spin without interrupts. */
   static inline void play_dead(void)
   {
@@@ -130,19 -168,27 +130,22 @@@ void cpu_idle(void
         while (1) {
                 tick_nohz_stop_sched_tick();
                 while (!need_resched()) {
- -                      void (*idle)(void);
   
                         check_pgt_cache();
                         rmb();
- -                      idle = pm_idle;
   
                         if (rcu_pending(cpu))
                                 rcu_check_callbacks(cpu, 0);
   
- -                      if (!idle)
- -                              idle = default_idle;
- -
                         if (cpu_is_offline(cpu))
                                 play_dead();
   
                         local_irq_disable();
                         __get_cpu_var(irq_stat).idle_timestamp = jiffies;
- -                      idle();
+                       /* Don't trace irqs off for idle */
+                       stop_critical_timings();
+ +                      pm_idle();
+                       start_critical_timings();
                 }
                 tick_nohz_restart_sched_tick();
                 preempt_enable_no_resched();
diff --combined arch/x86/kernel/process_64.c

index db5eb963e4df2e7d573012f96bc3696eff6b52ca,ea090e6cfe396fc4afc0d8b5c76ba6e4734f9a6f..a8e53626ac9aaf5fc8290908aaf42552556a1b11
--- 1/arch/x86/kernel/process_64.c
--- 2/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -56,6 -56,15 +56,6 @@@ asmlinkage extern void ret_from_fork(vo
   
   unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
   
- -unsigned long boot_option_idle_override = 0;
- -EXPORT_SYMBOL(boot_option_idle_override);
- -
- -/*
- - * Powermanagement idle function, if any..
- - */
- -void (*pm_idle)(void);
- -EXPORT_SYMBOL(pm_idle);
- -
   static ATOMIC_NOTIFIER_HEAD(idle_notifier);
   
   void idle_notifier_register(struct notifier_block *n)
@@@ -85,6 -94,25 +85,6 @@@ void exit_idle(void
         __exit_idle();
   }
   
- -/*
- - * We use this if we don't have any better
- - * idle routine..
- - */
- -void default_idle(void)
- -{
- -      current_thread_info()->status &= ~TS_POLLING;
- -      /*
- -       * TS_POLLING-cleared state must be visible before we
- -       * test NEED_RESCHED:
- -       */
- -      smp_mb();
- -      if (!need_resched())
- -              safe_halt();    /* enables interrupts racelessly */
- -      else
- -              local_irq_enable();
- -      current_thread_info()->status |= TS_POLLING;
- -}
- -
   #ifdef CONFIG_HOTPLUG_CPU
   DECLARE_PER_CPU(int, cpu_state);
   
@@@ -122,9 -150,12 +122,9 @@@ void cpu_idle(void
         while (1) {
                 tick_nohz_stop_sched_tick();
                 while (!need_resched()) {
- -                      void (*idle)(void);
   
                         rmb();
- -                      idle = pm_idle;
- -                      if (!idle)
- -                              idle = default_idle;
+ +
                         if (cpu_is_offline(smp_processor_id()))
                                 play_dead();
                         /*
@@@ -134,7 -165,10 +134,10 @@@
                          */
                         local_irq_disable();
                         enter_idle();
- -                      idle();
+                       /* Don't trace irqs off for idle */
+                       stop_critical_timings();
+ +                      pm_idle();
+                       start_critical_timings();
                         /* In many cases the interrupt that ended idle
                            has already called exit_idle. But some idle
                            loops can be woken up without interrupt. */
@@@ -335,10 -369,10 +338,10 @@@ int copy_thread(int nr, unsigned long c
         p->thread.fs = me->thread.fs;
         p->thread.gs = me->thread.gs;
   
- -      asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
- -      asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
- -      asm("mov %%es,%0" : "=m" (p->thread.es));
- -      asm("mov %%ds,%0" : "=m" (p->thread.ds));
+ +      savesegment(gs, p->thread.gsindex);
+ +      savesegment(fs, p->thread.fsindex);
+ +      savesegment(es, p->thread.es);
+ +      savesegment(ds, p->thread.ds);
   
         if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
@@@ -377,9 -411,7 +380,9 @@@ out
   void
   start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
   {
- -      asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+ +      loadsegment(fs, 0);
+ +      loadsegment(es, 0);
+ +      loadsegment(ds, 0);
         load_gs_index(0);
         regs->ip                = new_ip;
         regs->sp                = new_sp;
@@@ -538,7 -570,6 +541,7 @@@ __switch_to(struct task_struct *prev_p
                                  *next = &next_p->thread;
         int cpu = smp_processor_id();
         struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ +      unsigned fsindex, gsindex;
   
         /* we're going to use this soon, after a few expensive things */
         if (next_p->fpu_counter>5)
@@@ -553,38 -584,22 +556,38 @@@
          * Switch DS and ES.
          * This won't pick up thread selector changes, but I guess that is ok.
          */
- -      asm volatile("mov %%es,%0" : "=m" (prev->es));
+ +      savesegment(es, prev->es);
         if (unlikely(next->es | prev->es))
                 loadsegment(es, next->es); 
- -      
- -      asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
+ +
+ +      savesegment(ds, prev->ds);
         if (unlikely(next->ds | prev->ds))
                 loadsegment(ds, next->ds);
   
+ +
+ +      /* We must save %fs and %gs before load_TLS() because
+ +       * %fs and %gs may be cleared by load_TLS().
+ +       *
+ +       * (e.g. xen_load_tls())
+ +       */
+ +      savesegment(fs, fsindex);
+ +      savesegment(gs, gsindex);
+ +
         load_TLS(next, cpu);
   
+ +      /*
+ +       * Leave lazy mode, flushing any hypercalls made here.
+ +       * This must be done before restoring TLS segments so
+ +       * the GDT and LDT are properly updated, and must be
+ +       * done before math_state_restore, so the TS bit is up
+ +       * to date.
+ +       */
+ +      arch_leave_lazy_cpu_mode();
+ +
         /* 
          * Switch FS and GS.
          */
         { 
- -              unsigned fsindex;
- -              asm volatile("movl %%fs,%0" : "=r" (fsindex)); 
                 /* segment register != 0 always requires a reload. 
                    also reload when it has changed. 
                    when prev process used 64bit base always reload
@@@ -602,7 -617,10 +605,7 @@@
                 if (next->fs) 
                         wrmsrl(MSR_FS_BASE, next->fs); 
                 prev->fsindex = fsindex;
- -      }
- -      { 
- -              unsigned gsindex;
- -              asm volatile("movl %%gs,%0" : "=r" (gsindex)); 
+ +
                 if (unlikely(gsindex | next->gsindex | prev->gs)) {
                         load_gs_index(next->gsindex);
                         if (gsindex)
@@@ -783,7 -801,7 +786,7 @@@ long do_arch_prctl(struct task_struct *
                         set_32bit_tls(task, FS_TLS, addr);
                         if (doit) {
                                 load_TLS(&task->thread, cpu);
- -                              asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+ +                              loadsegment(fs, FS_TLS_SEL);
                         }
                         task->thread.fsindex = FS_TLS_SEL;
                         task->thread.fs = 0;
@@@ -793,7 -811,7 +796,7 @@@
                         if (doit) {
                                 /* set the selector to 0 to not confuse
                                    __switch_to */
- -                              asm volatile("movl %0,%%fs" :: "r" (0));
+ +                              loadsegment(fs, 0);
                                 ret = checking_wrmsrl(MSR_FS_BASE, addr);
                         }
                 }
@@@ -816,7 -834,7 +819,7 @@@
                 if (task->thread.gsindex == GS_TLS_SEL)
                         base = read_32bit_tls(task, GS_TLS);
                 else if (doit) {
- -                      asm("movl %%gs,%0" : "=r" (gsindex));
+ +                      savesegment(gs, gsindex);
                         if (gsindex)
                                 rdmsrl(MSR_KERNEL_GS_BASE, base);
                         else
diff --combined arch/x86/kernel/vsyscall_64.c

index c87cbd84c3e521ca92a567c751bc852a74239122,4063dfa2a02d54da181c363be5f215db7b161bbb..e50740d32314e2608e13da780fa2c704f2172df4
--- 1/arch/x86/kernel/vsyscall_64.c
--- 2/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@@ -42,7 -42,8 +42,8 @@@
   #include <asm/topology.h>
   #include <asm/vgtod.h>
   
- #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+ #define __vsyscall(nr) \
+               __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
   #define __syscall_clobber "r11","cx","memory"
   
   /*
@@@ -249,7 -250,7 +250,7 @@@ static ctl_table kernel_root_table2[] 
      doesn't violate that. We'll find out if it does. */
   static void __cpuinit vsyscall_set_cpu(int cpu)
   {
- -      unsigned long *d;
+ +      unsigned long d;
         unsigned long node = 0;
   #ifdef CONFIG_NUMA
         node = cpu_to_node(cpu);
@@@ -260,11 -261,11 +261,11 @@@
         /* Store cpu number in limit so that it can be loaded quickly
            in user space in vgetcpu.
            12 bits for the CPU and 8 bits for the node. */
- -      d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
- -      *d = 0x0f40000000000ULL;
- -      *d |= cpu;
- -      *d |= (node & 0xf) << 12;
- -      *d |= (node >> 4) << 48;
+ +      d = 0x0f40000000000ULL;
+ +      d |= cpu;
+ +      d |= (node & 0xf) << 12;
+ +      d |= (node >> 4) << 48;
+ +      write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
   }
   
   static void __cpuinit cpu_vsyscall_init(void *arg)
diff --combined arch/x86/kernel/x8664_ksyms_64.c

index 2f306a8268973bb531402dc7b8aa8cb14ebbd22a,16ff4bf418d9de6b00295ef16490d3f20ec00f6b..b545f371b5f542243de7c9962ce551fb10647f3a
--- 1/arch/x86/kernel/x8664_ksyms_64.c
--- 2/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@@ -2,13 -2,20 +2,20 @@@
      All C exports should go in the respective C files. */
   
   #include <linux/module.h>
- #include <net/checksum.h>
   #include <linux/smp.h>
   
+ #include <net/checksum.h>
+ 
   #include <asm/processor.h>
- #include <asm/uaccess.h>
   #include <asm/pgtable.h>
+ #include <asm/uaccess.h>
   #include <asm/desc.h>
+ #include <asm/ftrace.h>
+ 
+ #ifdef CONFIG_FTRACE
+ /* mcount is defined in assembly */
+ EXPORT_SYMBOL(mcount);
+ #endif
   
   EXPORT_SYMBOL(kernel_thread);
   
@@@ -53,3 -60,8 +60,3 @@@ EXPORT_SYMBOL(init_level4_pgt)
   EXPORT_SYMBOL(load_gs_index);
   
   EXPORT_SYMBOL(_proxy_pda);
- -
- -#ifdef CONFIG_PARAVIRT
- -/* Virtualized guests may want to use it */
- -EXPORT_SYMBOL_GPL(cpu_gdt_descr);
- -#endif
diff --combined arch/x86/lib/Makefile

index 83226e0a7ce4e331aa1aac3277b1d6301bf025f8,84aa2883fe153fba9094169cef5f447bb79f2bd0..aa3fa4119424205ba297e1af4ce4100ba5514baf
--- 1/arch/x86/lib/Makefile
--- 2/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@@ -4,8 -4,9 +4,9 @@@
   
   obj-$(CONFIG_SMP) := msr-on-cpu.o
   
- -lib-y := delay_$(BITS).o
+ +lib-y := delay.o
+ lib-y += thunk_$(BITS).o
- -lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
+ +lib-y += usercopy_$(BITS).o getuser.o putuser.o
   lib-y += memcpy_$(BITS).o
   
   ifeq ($(CONFIG_X86_32),y)
diff --combined arch/x86/mm/Makefile

index c107641cd39bfadb4969b6b86183fe666034616e,07dab503c9e37de210223f56538bc8816140204d..9873716e9f764bcd7c5bc0f369a0269efdffb35a
--- 1/arch/x86/mm/Makefile
--- 2/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@@ -8,11 -8,15 +8,16 @@@ obj-$(CONFIG_X86_PTDUMP)      += dump_pageta
   
   obj-$(CONFIG_HIGHMEM)         += highmem_32.o
   
+ obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
+ obj-$(CONFIG_MMIOTRACE)               += mmiotrace.o
+ mmiotrace-y                   := pf_in.o mmio-mod.o
+ obj-$(CONFIG_MMIOTRACE_TEST)  += testmmiotrace.o
+ 
   ifeq ($(CONFIG_X86_32),y)
   obj-$(CONFIG_NUMA)            += discontig_32.o
   else
   obj-$(CONFIG_NUMA)            += numa_64.o
   obj-$(CONFIG_K8_NUMA)         += k8topology_64.o
- -obj-$(CONFIG_ACPI_NUMA)               += srat_64.o
   endif
+ +obj-$(CONFIG_ACPI_NUMA)               += srat_$(BITS).o
+ +
diff --combined arch/x86/mm/fault.c

index d0f5fce77d95b9f8afa7c0879a3e211b937365b8,0a778e3c43ee42df8b71405c9f5e35bd49a411e4..455f3fe67b42412c8acda3eb63e33678ebec1845
--- 1/arch/x86/mm/fault.c
--- 2/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@@ -10,6 -10,7 +10,7 @@@
   #include <linux/string.h>
   #include <linux/types.h>
   #include <linux/ptrace.h>
+ #include <linux/mmiotrace.h>
   #include <linux/mman.h>
   #include <linux/mm.h>
   #include <linux/smp.h>
@@@ -49,13 -50,27 +50,23 @@@
   #define PF_RSVD               (1<<3)
   #define PF_INSTR      (1<<4)
   
+ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+ {
+ #ifdef CONFIG_MMIOTRACE_HOOKS
+       if (unlikely(is_kmmio_active()))
+               if (kmmio_handler(regs, addr) == 1)
+                       return -1;
+ #endif
+       return 0;
+ }
+ 
   static inline int notify_page_fault(struct pt_regs *regs)
   {
   #ifdef CONFIG_KPROBES
         int ret = 0;
   
         /* kprobe_running() needs smp_processor_id() */
- -#ifdef CONFIG_X86_32
         if (!user_mode_vm(regs)) {
- -#else
- -      if (!user_mode(regs)) {
- -#endif
                 preempt_disable();
                 if (kprobe_running() && kprobe_fault_handler(regs, 14))
                         ret = 1;
@@@ -392,7 -407,11 +403,7 @@@ static void show_fault_oops(struct pt_r
                 printk(KERN_CONT "NULL pointer dereference");
         else
                 printk(KERN_CONT "paging request");
- -#ifdef CONFIG_X86_32
- -      printk(KERN_CONT " at %08lx\n", address);
- -#else
- -      printk(KERN_CONT " at %016lx\n", address);
- -#endif
+ +      printk(KERN_CONT " at %p\n", (void *) address);
         printk(KERN_ALERT "IP:");
         printk_address(regs->ip, 1);
         dump_pagetable(address);
@@@ -598,6 -617,8 +609,8 @@@ void __kprobes do_page_fault(struct pt_
   
         if (notify_page_fault(regs))
                 return;
+       if (unlikely(kmmio_fault(regs, address)))
+               return;
   
         /*
          * We fault-in kernel-space virtual memory on-demand. The
@@@ -792,10 -813,14 +805,10 @@@ bad_area_nosemaphore
                 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                     printk_ratelimit()) {
                         printk(
- -#ifdef CONFIG_X86_32
- -                      "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
- -#else
- -                      "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
- -#endif
+ +                      "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
                         task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- -                      tsk->comm, task_pid_nr(tsk), address, regs->ip,
- -                      regs->sp, error_code);
+ +                      tsk->comm, task_pid_nr(tsk), address,
+ +                      (void *) regs->ip, (void *) regs->sp, error_code);
                         print_vma_addr(" in ", regs->ip);
                         printk("\n");
                 }
@@@ -903,7 -928,14 +916,7 @@@ LIST_HEAD(pgd_list)
   void vmalloc_sync_all(void)
   {
   #ifdef CONFIG_X86_32
- -      /*
- -       * Note that races in the updates of insync and start aren't
- -       * problematic: insync can only get set bits added, and updates to
- -       * start are only improving performance (without affecting correctness
- -       * if undone).
- -       */
- -      static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- -      static unsigned long start = TASK_SIZE;
+ +      unsigned long start = VMALLOC_START & PGDIR_MASK;
         unsigned long address;
   
         if (SHARED_KERNEL_PMD)
@@@ -911,38 -943,56 +924,38 @@@
   
         BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
         for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
- -              if (!test_bit(pgd_index(address), insync)) {
- -                      unsigned long flags;
- -                      struct page *page;
- -
- -                      spin_lock_irqsave(&pgd_lock, flags);
- -                      list_for_each_entry(page, &pgd_list, lru) {
- -                              if (!vmalloc_sync_one(page_address(page),
- -                                                    address))
- -                                      break;
- -                      }
- -                      spin_unlock_irqrestore(&pgd_lock, flags);
- -                      if (!page)
- -                              set_bit(pgd_index(address), insync);
+ +              unsigned long flags;
+ +              struct page *page;
+ +
+ +              spin_lock_irqsave(&pgd_lock, flags);
+ +              list_for_each_entry(page, &pgd_list, lru) {
+ +                      if (!vmalloc_sync_one(page_address(page),
+ +                                            address))
+ +                              break;
                 }
- -              if (address == start && test_bit(pgd_index(address), insync))
- -                      start = address + PGDIR_SIZE;
+ +              spin_unlock_irqrestore(&pgd_lock, flags);
         }
   #else /* CONFIG_X86_64 */
- -      /*
- -       * Note that races in the updates of insync and start aren't
- -       * problematic: insync can only get set bits added, and updates to
- -       * start are only improving performance (without affecting correctness
- -       * if undone).
- -       */
- -      static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- -      static unsigned long start = VMALLOC_START & PGDIR_MASK;
+ +      unsigned long start = VMALLOC_START & PGDIR_MASK;
         unsigned long address;
   
         for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
- -              if (!test_bit(pgd_index(address), insync)) {
- -                      const pgd_t *pgd_ref = pgd_offset_k(address);
- -                      unsigned long flags;
- -                      struct page *page;
- -
- -                      if (pgd_none(*pgd_ref))
- -                              continue;
- -                      spin_lock_irqsave(&pgd_lock, flags);
- -                      list_for_each_entry(page, &pgd_list, lru) {
- -                              pgd_t *pgd;
- -                              pgd = (pgd_t *)page_address(page) + pgd_index(address);
- -                              if (pgd_none(*pgd))
- -                                      set_pgd(pgd, *pgd_ref);
- -                              else
- -                                      BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- -                      }
- -                      spin_unlock_irqrestore(&pgd_lock, flags);
- -                      set_bit(pgd_index(address), insync);
+ +              const pgd_t *pgd_ref = pgd_offset_k(address);
+ +              unsigned long flags;
+ +              struct page *page;
+ +
+ +              if (pgd_none(*pgd_ref))
+ +                      continue;
+ +              spin_lock_irqsave(&pgd_lock, flags);
+ +              list_for_each_entry(page, &pgd_list, lru) {
+ +                      pgd_t *pgd;
+ +                      pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ +                      if (pgd_none(*pgd))
+ +                              set_pgd(pgd, *pgd_ref);
+ +                      else
+ +                              BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                 }
- -              if (address == start)
- -                      start = address + PGDIR_SIZE;
+ +              spin_unlock_irqrestore(&pgd_lock, flags);
         }
   #endif
   }
diff --combined arch/x86/mm/init_32.c

index 029e8cffca9e11cf1794b0cad1c74029eaefa815,f96eca21ad8fe4adb65f24856cab2a4b62af7c74..9689a5138e6472e33c6d0862b3ae56194ffcedb4
--- 1/arch/x86/mm/init_32.c
--- 2/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@@ -50,7 -50,6 +50,7 @@@
   
   unsigned int __VMALLOC_RESERVE = 128 << 20;
   
+ +unsigned long max_low_pfn_mapped;
   unsigned long max_pfn_mapped;
   
   DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@@ -58,27 -57,6 +58,27 @@@ unsigned long highstart_pfn, highend_pf
   
   static noinline int do_test_wp_bit(void);
   
+ +
+ +static unsigned long __initdata table_start;
+ +static unsigned long __meminitdata table_end;
+ +static unsigned long __meminitdata table_top;
+ +
+ +static int __initdata after_init_bootmem;
+ +
+ +static __init void *alloc_low_page(unsigned long *phys)
+ +{
+ +      unsigned long pfn = table_end++;
+ +      void *adr;
+ +
+ +      if (pfn >= table_top)
+ +              panic("alloc_low_page: ran out of memory");
+ +
+ +      adr = __va(pfn * PAGE_SIZE);
+ +      memset(adr, 0, PAGE_SIZE);
+ +      *phys  = pfn * PAGE_SIZE;
+ +      return adr;
+ +}
+ +
   /*
    * Creates a middle page table and puts a pointer to it in the
    * given global directory entry. This only returns the gd entry
@@@ -90,12 -68,9 +90,12 @@@ static pmd_t * __init one_md_table_init
         pmd_t *pmd_table;
   
   #ifdef CONFIG_X86_PAE
+ +      unsigned long phys;
         if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
- -              pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- -
+ +              if (after_init_bootmem)
+ +                      pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ +              else
+ +                      pmd_table = (pmd_t *)alloc_low_page(&phys);
                 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
                 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                 pud = pud_offset(pgd, 0);
@@@ -117,16 -92,12 +117,16 @@@ static pte_t * __init one_page_table_in
         if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
                 pte_t *page_table = NULL;
   
+ +              if (after_init_bootmem) {
   #ifdef CONFIG_DEBUG_PAGEALLOC
- -              page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+ +                      page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
   #endif
- -              if (!page_table) {
- -                      page_table =
+ +                      if (!page_table)
+ +                              page_table =
                                 (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ +              } else {
+ +                      unsigned long phys;
+ +                      page_table = (pte_t *)alloc_low_page(&phys);
                 }
   
                 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
@@@ -184,44 -155,38 +184,44 @@@ static inline int is_kernel_text(unsign
    * of max_low_pfn pages, by creating page tables starting from address
    * PAGE_OFFSET:
    */
- -static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+ +static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
+ +                                              unsigned long start_pfn,
+ +                                              unsigned long end_pfn,
+ +                                              int use_pse)
   {
         int pgd_idx, pmd_idx, pte_ofs;
         unsigned long pfn;
         pgd_t *pgd;
         pmd_t *pmd;
         pte_t *pte;
+ +      unsigned pages_2m = 0, pages_4k = 0;
   
- -      pgd_idx = pgd_index(PAGE_OFFSET);
- -      pgd = pgd_base + pgd_idx;
- -      pfn = 0;
+ +      if (!cpu_has_pse)
+ +              use_pse = 0;
   
+ +      pfn = start_pfn;
+ +      pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ +      pgd = pgd_base + pgd_idx;
         for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
                 pmd = one_md_table_init(pgd);
- -              if (pfn >= max_low_pfn)
- -                      continue;
   
- -              for (pmd_idx = 0;
- -                   pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+ +              if (pfn >= end_pfn)
+ +                      continue;
+ +#ifdef CONFIG_X86_PAE
+ +              pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ +              pmd += pmd_idx;
+ +#else
+ +              pmd_idx = 0;
+ +#endif
+ +              for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
                      pmd++, pmd_idx++) {
                         unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
   
                         /*
                          * Map with big pages if possible, otherwise
                          * create normal page tables:
- -                       *
- -                       * Don't use a large page for the first 2/4MB of memory
- -                       * because there are often fixed size MTRRs in there
- -                       * and overlapping MTRRs into large pages can cause
- -                       * slowdowns.
                          */
- -                      if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
+ +                      if (use_pse) {
                                 unsigned int addr2;
                                 pgprot_t prot = PAGE_KERNEL_LARGE;
   
@@@ -232,30 -197,34 +232,30 @@@
                                     is_kernel_text(addr2))
                                         prot = PAGE_KERNEL_LARGE_EXEC;
   
+ +                              pages_2m++;
                                 set_pmd(pmd, pfn_pmd(pfn, prot));
   
                                 pfn += PTRS_PER_PTE;
- -                              max_pfn_mapped = pfn;
                                 continue;
                         }
                         pte = one_page_table_init(pmd);
   
- -                      for (pte_ofs = 0;
- -                           pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ +                      pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ +                      pte += pte_ofs;
+ +                      for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
                              pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
                                 pgprot_t prot = PAGE_KERNEL;
   
                                 if (is_kernel_text(addr))
                                         prot = PAGE_KERNEL_EXEC;
   
+ +                              pages_4k++;
                                 set_pte(pte, pfn_pte(pfn, prot));
                         }
- -                      max_pfn_mapped = pfn;
                 }
         }
- -}
- -
- -static inline int page_kills_ppro(unsigned long pagenr)
- -{
- -      if (pagenr >= 0x70000 && pagenr <= 0x7003F)
- -              return 1;
- -      return 0;
+ +      update_page_count(PG_LEVEL_2M, pages_2m);
+ +      update_page_count(PG_LEVEL_4K, pages_4k);
   }
   
   /*
@@@ -318,62 -287,29 +318,62 @@@ static void __init permanent_kmaps_init
         pkmap_page_table = pte;
   }
   
- -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
+ +static void __init add_one_highpage_init(struct page *page, int pfn)
   {
- -      if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
- -              ClearPageReserved(page);
- -              init_page_count(page);
- -              __free_page(page);
- -              totalhigh_pages++;
- -      } else
- -              SetPageReserved(page);
+ +      ClearPageReserved(page);
+ +      init_page_count(page);
+ +      __free_page(page);
+ +      totalhigh_pages++;
   }
   
- -#ifndef CONFIG_NUMA
- -static void __init set_highmem_pages_init(int bad_ppro)
+ +struct add_highpages_data {
+ +      unsigned long start_pfn;
+ +      unsigned long end_pfn;
+ +};
+ +
+ +static int __init add_highpages_work_fn(unsigned long start_pfn,
+ +                                       unsigned long end_pfn, void *datax)
   {
- -      int pfn;
+ +      int node_pfn;
+ +      struct page *page;
+ +      unsigned long final_start_pfn, final_end_pfn;
+ +      struct add_highpages_data *data;
   
- -      for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
- -              /*
- -               * Holes under sparsemem might not have no mem_map[]:
- -               */
- -              if (pfn_valid(pfn))
- -                      add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+ +      data = (struct add_highpages_data *)datax;
+ +
+ +      final_start_pfn = max(start_pfn, data->start_pfn);
+ +      final_end_pfn = min(end_pfn, data->end_pfn);
+ +      if (final_start_pfn >= final_end_pfn)
+ +              return 0;
+ +
+ +      for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+ +           node_pfn++) {
+ +              if (!pfn_valid(node_pfn))
+ +                      continue;
+ +              page = pfn_to_page(node_pfn);
+ +              add_one_highpage_init(page, node_pfn);
         }
+ +
+ +      return 0;
+ +
+ +}
+ +
+ +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ +                                            unsigned long end_pfn)
+ +{
+ +      struct add_highpages_data data;
+ +
+ +      data.start_pfn = start_pfn;
+ +      data.end_pfn = end_pfn;
+ +
+ +      work_with_active_regions(nid, add_highpages_work_fn, &data);
+ +}
+ +
+ +#ifndef CONFIG_NUMA
+ +static void __init set_highmem_pages_init(void)
+ +{
+ +      add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
+ +
         totalram_pages += totalhigh_pages;
   }
   #endif /* !CONFIG_NUMA */
@@@ -381,9 -317,14 +381,9 @@@
   #else
   # define kmap_init()                          do { } while (0)
   # define permanent_kmaps_init(pgd_base)               do { } while (0)
- -# define set_highmem_pages_init(bad_ppro)     do { } while (0)
+ +# define set_highmem_pages_init()     do { } while (0)
   #endif /* CONFIG_HIGHMEM */
   
- -pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
- -EXPORT_SYMBOL(__PAGE_KERNEL);
- -
- -pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
- -
   void __init native_pagetable_setup_start(pgd_t *base)
   {
         unsigned long pfn, va;
@@@ -439,10 -380,27 +439,10 @@@ void __init native_pagetable_setup_done
    * be partially populated, and so it avoids stomping on any existing
    * mappings.
    */
- -static void __init pagetable_init(void)
+ +static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
   {
- -      pgd_t *pgd_base = swapper_pg_dir;
         unsigned long vaddr, end;
   
- -      paravirt_pagetable_setup_start(pgd_base);
- -
- -      /* Enable PSE if available */
- -      if (cpu_has_pse)
- -              set_in_cr4(X86_CR4_PSE);
- -
- -      /* Enable PGE if available */
- -      if (cpu_has_pge) {
- -              set_in_cr4(X86_CR4_PGE);
- -              __PAGE_KERNEL |= _PAGE_GLOBAL;
- -              __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
- -      }
- -
- -      kernel_physical_mapping_init(pgd_base);
- -      remap_numa_kva();
- -
         /*
          * Fixed mappings, only the page table structure has to be
          * created - mappings will be set by set_fixmap():
@@@ -452,13 -410,6 +452,13 @@@
         end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
         page_table_range_init(vaddr, end, pgd_base);
         early_ioremap_reset();
+ +}
+ +
+ +static void __init pagetable_init(void)
+ +{
+ +      pgd_t *pgd_base = swapper_pg_dir;
+ +
+ +      paravirt_pagetable_setup_start(pgd_base);
   
         permanent_kmaps_init(pgd_base);
   
@@@ -505,7 -456,7 +505,7 @@@ void zap_low_mappings(void
   
   int nx_enabled;
   
- -pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+ +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
   EXPORT_SYMBOL_GPL(__supported_pte_mask);
   
   #ifdef CONFIG_X86_PAE
@@@ -558,318 -509,27 +558,318 @@@ static void __init set_nx(void
   }
   #endif
   
+ +/* user-defined highmem size */
+ +static unsigned int highmem_pages = -1;
+ +
   /*
- - * paging_init() sets up the page tables - note that the first 8MB are
- - * already mapped by head.S.
- - *
- - * This routines also unmaps the page at virtual kernel address 0, so
- - * that we can trap those pesky NULL-reference errors in the kernel.
+ + * highmem=size forces highmem to be exactly 'size' bytes.
+ + * This works even on boxes that have no highmem otherwise.
+ + * This also works to reduce highmem size on bigger boxes.
    */
- -void __init paging_init(void)
+ +static int __init parse_highmem(char *arg)
+ +{
+ +      if (!arg)
+ +              return -EINVAL;
+ +
+ +      highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ +      return 0;
+ +}
+ +early_param("highmem", parse_highmem);
+ +
+ +/*
+ + * Determine low and high memory ranges:
+ + */
+ +void __init find_low_pfn_range(void)
   {
+ +      /* it could update max_pfn */
+ +
+ +      /* max_low_pfn is 0, we already have early_res support */
+ +
+ +      max_low_pfn = max_pfn;
+ +      if (max_low_pfn > MAXMEM_PFN) {
+ +              if (highmem_pages == -1)
+ +                      highmem_pages = max_pfn - MAXMEM_PFN;
+ +              if (highmem_pages + MAXMEM_PFN < max_pfn)
+ +                      max_pfn = MAXMEM_PFN + highmem_pages;
+ +              if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ +                      printk(KERN_WARNING "only %luMB highmem pages "
+ +                              "available, ignoring highmem size of %uMB.\n",
+ +                              pages_to_mb(max_pfn - MAXMEM_PFN),
+ +                              pages_to_mb(highmem_pages));
+ +                      highmem_pages = 0;
+ +              }
+ +              max_low_pfn = MAXMEM_PFN;
+ +#ifndef CONFIG_HIGHMEM
+ +              /* Maximum memory usable is what is directly addressable */
+ +              printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+ +                                      MAXMEM>>20);
+ +              if (max_pfn > MAX_NONPAE_PFN)
+ +                      printk(KERN_WARNING
+ +                               "Use a HIGHMEM64G enabled kernel.\n");
+ +              else
+ +                      printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ +              max_pfn = MAXMEM_PFN;
+ +#else /* CONFIG_HIGHMEM */
+ +#ifndef CONFIG_HIGHMEM64G
+ +              if (max_pfn > MAX_NONPAE_PFN) {
+ +                      max_pfn = MAX_NONPAE_PFN;
+ +                      printk(KERN_WARNING "Warning only 4GB will be used."
+ +                              "Use a HIGHMEM64G enabled kernel.\n");
+ +              }
+ +#endif /* !CONFIG_HIGHMEM64G */
+ +#endif /* !CONFIG_HIGHMEM */
+ +      } else {
+ +              if (highmem_pages == -1)
+ +                      highmem_pages = 0;
+ +#ifdef CONFIG_HIGHMEM
+ +              if (highmem_pages >= max_pfn) {
+ +                      printk(KERN_ERR "highmem size specified (%uMB) is "
+ +                              "bigger than pages available (%luMB)!.\n",
+ +                              pages_to_mb(highmem_pages),
+ +                              pages_to_mb(max_pfn));
+ +                      highmem_pages = 0;
+ +              }
+ +              if (highmem_pages) {
+ +                      if (max_low_pfn - highmem_pages <
+ +                          64*1024*1024/PAGE_SIZE){
+ +                              printk(KERN_ERR "highmem size %uMB results in "
+ +                              "smaller than 64MB lowmem, ignoring it.\n"
+ +                                      , pages_to_mb(highmem_pages));
+ +                              highmem_pages = 0;
+ +                      }
+ +                      max_low_pfn -= highmem_pages;
+ +              }
+ +#else
+ +              if (highmem_pages)
+ +                      printk(KERN_ERR "ignoring highmem size on non-highmem"
+ +                                      " kernel!\n");
+ +#endif
+ +      }
+ +}
+ +
+ +#ifndef CONFIG_NEED_MULTIPLE_NODES
+ +void __init initmem_init(unsigned long start_pfn,
+ +                                unsigned long end_pfn)
+ +{
+ +#ifdef CONFIG_HIGHMEM
+ +      highstart_pfn = highend_pfn = max_pfn;
+ +      if (max_pfn > max_low_pfn)
+ +              highstart_pfn = max_low_pfn;
+ +      memory_present(0, 0, highend_pfn);
+ +      e820_register_active_regions(0, 0, highend_pfn);
+ +      printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ +              pages_to_mb(highend_pfn - highstart_pfn));
+ +      num_physpages = highend_pfn;
+ +      high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+ +#else
+ +      memory_present(0, 0, max_low_pfn);
+ +      e820_register_active_regions(0, 0, max_low_pfn);
+ +      num_physpages = max_low_pfn;
+ +      high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+ +#endif
+ +#ifdef CONFIG_FLATMEM
+ +      max_mapnr = num_physpages;
+ +#endif
+ +      printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ +                      pages_to_mb(max_low_pfn));
+ +
+ +      setup_bootmem_allocator();
+ +}
+ +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+ +
+ +static void __init zone_sizes_init(void)
+ +{
+ +      unsigned long max_zone_pfns[MAX_NR_ZONES];
+ +      memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ +      max_zone_pfns[ZONE_DMA] =
+ +              virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ +      max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+ +#ifdef CONFIG_HIGHMEM
+ +      max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+ +#endif
+ +
+ +      free_area_init_nodes(max_zone_pfns);
+ +}
+ +
+ +void __init setup_bootmem_allocator(void)
+ +{
+ +      int i;
+ +      unsigned long bootmap_size, bootmap;
+ +      /*
+ +       * Initialize the boot-time allocator (with low memory only):
+ +       */
+ +      bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+ +      bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ +                               max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ +                               PAGE_SIZE);
+ +      if (bootmap == -1L)
+ +              panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ +      reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+ +
+ +      /* don't touch min_low_pfn */
+ +      bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ +                                       min_low_pfn, max_low_pfn);
+ +      printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+ +               max_pfn_mapped<<PAGE_SHIFT);
+ +      printk(KERN_INFO "  low ram: %08lx - %08lx\n",
+ +               min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+ +      printk(KERN_INFO "  bootmap %08lx - %08lx\n",
+ +               bootmap, bootmap + bootmap_size);
+ +      for_each_online_node(i)
+ +              free_bootmem_with_active_regions(i, max_low_pfn);
+ +      early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+ +
+ +      after_init_bootmem = 1;
+ +}
+ +
+ +static void __init find_early_table_space(unsigned long end)
+ +{
+ +      unsigned long puds, pmds, ptes, tables, start;
+ +
+ +      puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+ +      tables = PAGE_ALIGN(puds * sizeof(pud_t));
+ +
+ +      pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ +      tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
+ +
+ +      if (cpu_has_pse) {
+ +              unsigned long extra;
+ +
+ +              extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ +              extra += PMD_SIZE;
+ +              ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ +      } else
+ +              ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ +
+ +      tables += PAGE_ALIGN(ptes * sizeof(pte_t));
+ +
+ +      /* for fixmap */
+ +      tables += PAGE_SIZE * 2;
+ +
+ +      /*
+ +       * RED-PEN putting page tables only on node 0 could
+ +       * cause a hotspot and fill up ZONE_DMA. The page tables
+ +       * need roughly 0.5KB per GB.
+ +       */
+ +      start = 0x7000;
+ +      table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ +                                      tables, PAGE_SIZE);
+ +      if (table_start == -1UL)
+ +              panic("Cannot find space for the kernel page tables");
+ +
+ +      table_start >>= PAGE_SHIFT;
+ +      table_end = table_start;
+ +      table_top = table_start + (tables>>PAGE_SHIFT);
+ +
+ +      printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ +              end, table_start << PAGE_SHIFT,
+ +              (table_start << PAGE_SHIFT) + tables);
+ +}
+ +
+ +unsigned long __init_refok init_memory_mapping(unsigned long start,
+ +                                              unsigned long end)
+ +{
+ +      pgd_t *pgd_base = swapper_pg_dir;
+ +      unsigned long start_pfn, end_pfn;
+ +      unsigned long big_page_start;
+ +
+ +      /*
+ +       * Find space for the kernel direct mapping tables.
+ +       */
+ +      if (!after_init_bootmem)
+ +              find_early_table_space(end);
+ +
   #ifdef CONFIG_X86_PAE
         set_nx();
         if (nx_enabled)
                 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
   #endif
- -      pagetable_init();
+ +
+ +      /* Enable PSE if available */
+ +      if (cpu_has_pse)
+ +              set_in_cr4(X86_CR4_PSE);
+ +
+ +      /* Enable PGE if available */
+ +      if (cpu_has_pge) {
+ +              set_in_cr4(X86_CR4_PGE);
+ +              __supported_pte_mask |= _PAGE_GLOBAL;
+ +      }
+ +
+ +      /*
+ +       * Don't use a large page for the first 2/4MB of memory
+ +       * because there are often fixed size MTRRs in there
+ +       * and overlapping MTRRs into large pages can cause
+ +       * slowdowns.
+ +       */
+ +      big_page_start = PMD_SIZE;
+ +
+ +      if (start < big_page_start) {
+ +              start_pfn = start >> PAGE_SHIFT;
+ +              end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
+ +      } else {
+ +              /* head is not big-page aligned? */
+ +              start_pfn = start >> PAGE_SHIFT;
+ +              end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ +                               << (PMD_SHIFT - PAGE_SHIFT);
+ +      }
+ +      if (start_pfn < end_pfn)
+ +              kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
+ +
+ +      /* big page range */
+ +      start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ +                       << (PMD_SHIFT - PAGE_SHIFT);
+ +      if (start_pfn < (big_page_start >> PAGE_SHIFT))
+ +              start_pfn =  big_page_start >> PAGE_SHIFT;
+ +      end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ +      if (start_pfn < end_pfn)
+ +              kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
+ +                                              cpu_has_pse);
+ +
+ +      /* tail is not big-page aligned? */
+ +      start_pfn = end_pfn;
+ +      if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
+ +              end_pfn = end >> PAGE_SHIFT;
+ +              if (start_pfn < end_pfn)
+ +                      kernel_physical_mapping_init(pgd_base, start_pfn,
+ +                                                       end_pfn, 0);
+ +      }
+ +
+ +      early_ioremap_page_table_range_init(pgd_base);
   
         load_cr3(swapper_pg_dir);
   
         __flush_tlb_all();
   
+ +      if (!after_init_bootmem)
+ +              reserve_early(table_start << PAGE_SHIFT,
+ +                               table_end << PAGE_SHIFT, "PGTABLE");
+ +
+ +      return end >> PAGE_SHIFT;
+ +}
+ +
+ +
+ +/*
+ + * paging_init() sets up the page tables - note that the first 8MB are
+ + * already mapped by head.S.
+ + *
+ + * This routine also unmaps the page at virtual kernel address 0, so
+ + * that we can trap those pesky NULL-reference errors in the kernel.
+ + */
+ +void __init paging_init(void)
+ +{
+ +      pagetable_init();
+ +
+ +      __flush_tlb_all();
+ +
         kmap_init();
+ +
+ +      /*
+ +       * NOTE: at this point the bootmem allocator is fully available.
+ +       */
+ +      sparse_init();
+ +      zone_sizes_init();
+ +
+ +      paravirt_post_allocator_init();
   }
   
   /*
@@@ -904,10 -564,23 +904,10 @@@ static struct kcore_list kcore_mem, kco
   void __init mem_init(void)
   {
         int codesize, reservedpages, datasize, initsize;
- -      int tmp, bad_ppro;
+ +      int tmp;
   
   #ifdef CONFIG_FLATMEM
         BUG_ON(!mem_map);
- -#endif
- -      bad_ppro = ppro_with_ram_bug();
- -
- -#ifdef CONFIG_HIGHMEM
- -      /* check that fixmap and pkmap do not overlap */
- -      if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
- -              printk(KERN_ERR
- -                      "fixmap and kmap areas overlap - this will crash\n");
- -              printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
- -                              PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
- -                              FIXADDR_START);
- -              BUG();
- -      }
   #endif
         /* this will put all low memory onto the freelists */
         totalram_pages += free_all_bootmem();
@@@ -920,7 -593,7 +920,7 @@@
                 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
                         reservedpages++;
   
- -      set_highmem_pages_init(bad_ppro);
+ +      set_highmem_pages_init();
   
         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
@@@ -941,6 -614,7 +941,6 @@@
                 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
                );
   
- -#if 1 /* double-sanity-check paranoia */
         printk(KERN_INFO "virtual kernel memory layout:\n"
                 "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
   #ifdef CONFIG_HIGHMEM
@@@ -981,6 -655,7 +981,6 @@@
   #endif
         BUG_ON(VMALLOC_START                            > VMALLOC_END);
         BUG_ON((unsigned long)high_memory               > VMALLOC_START);
- -#endif /* double-sanity-check paranoia */
   
         if (boot_cpu_data.wp_works_ok < 0)
                 test_wp_bit();
@@@ -1035,6 -710,8 +1035,8 @@@ void mark_rodata_ro(void
         unsigned long start = PFN_ALIGN(_text);
         unsigned long size = PFN_ALIGN(_etext) - start;
   
+ #ifndef CONFIG_DYNAMIC_FTRACE
+       /* Dynamic tracing modifies the kernel text section */
         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
         printk(KERN_INFO "Write protecting the kernel text: %luk\n",
                 size >> 10);
@@@ -1047,6 -724,8 +1049,8 @@@
         printk(KERN_INFO "Testing CPA: write protecting again\n");
         set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
   #endif
+ #endif /* CONFIG_DYNAMIC_FTRACE */
+ 
         start += size;
         size = (unsigned long)__end_rodata - start;
         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
@@@ -1109,9 -788,3 +1113,9 @@@ void free_initrd_mem(unsigned long star
         free_init_pages("initrd memory", start, end);
   }
   #endif
+ +
+ +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ +                                 int flags)
+ +{
+ +      return reserve_bootmem(phys, len, flags);
+ +}
diff --combined arch/x86/mm/init_64.c

index a25cc6fa2207262131dcec567f3b43ce861d0f3f,17c0a6138a53bd0bc84f7a2504bd85a612733c91..27de2435e0080f419e3c5ac7386f710b993a93cd
--- 1/arch/x86/mm/init_64.c
--- 2/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@@ -18,7 -18,6 +18,7 @@@
   #include <linux/swap.h>
   #include <linux/smp.h>
   #include <linux/init.h>
+ +#include <linux/initrd.h>
   #include <linux/pagemap.h>
   #include <linux/bootmem.h>
   #include <linux/proc_fs.h>
@@@ -48,14 -47,6 +48,14 @@@
   #include <asm/numa.h>
   #include <asm/cacheflush.h>
   
+ +/*
+ + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ + * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ + * apertures, ACPI and other tables without having to play with fixmaps.
+ + */
+ +unsigned long max_low_pfn_mapped;
+ +unsigned long max_pfn_mapped;
+ +
   static unsigned long dma_reserve __initdata;
   
   DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@@ -144,17 -135,26 +144,17 @@@ static __init void *spp_getpage(void
         return ptr;
   }
   
- -static __init void
- -set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
+ +void
+ +set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
   {
- -      pgd_t *pgd;
         pud_t *pud;
         pmd_t *pmd;
- -      pte_t *pte, new_pte;
- -
- -      pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
+ +      pte_t *pte;
   
- -      pgd = pgd_offset_k(vaddr);
- -      if (pgd_none(*pgd)) {
- -              printk(KERN_ERR
- -                      "PGD FIXMAP MISSING, it should be setup in head.S!\n");
- -              return;
- -      }
- -      pud = pud_offset(pgd, vaddr);
+ +      pud = pud_page + pud_index(vaddr);
         if (pud_none(*pud)) {
                 pmd = (pmd_t *) spp_getpage();
- -              set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
+ +              pud_populate(&init_mm, pud, pmd);
                 if (pmd != pmd_offset(pud, 0)) {
                         printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
                                 pmd, pmd_offset(pud, 0));
@@@ -164,12 -164,13 +164,12 @@@
         pmd = pmd_offset(pud, vaddr);
         if (pmd_none(*pmd)) {
                 pte = (pte_t *) spp_getpage();
- -              set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
+ +              pmd_populate_kernel(&init_mm, pmd, pte);
                 if (pte != pte_offset_kernel(pmd, 0)) {
                         printk(KERN_ERR "PAGETABLE BUG #02!\n");
                         return;
                 }
         }
- -      new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
   
         pte = pte_offset_kernel(pmd, vaddr);
         if (!pte_none(*pte) && pte_val(new_pte) &&
@@@ -184,64 -185,6 +184,64 @@@
         __flush_tlb_one(vaddr);
   }
   
+ +/*
+ + * Install @pteval as the kernel page-table entry for @vaddr.
+ + *
+ + * Looks up the kernel PGD slot covering @vaddr.  If that slot is empty an
+ + * error is logged and nothing is mapped; otherwise the PUD page behind the
+ + * slot is handed to set_pte_vaddr_pud() together with @vaddr and @pteval.
+ + */
+ +void
+ +set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+ +{
+ +      pgd_t *pgd;
+ +      pud_t *pud_page;
+ +
+ +      pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
+ +
+ +      pgd = pgd_offset_k(vaddr);
+ +      /* This function never allocates a PUD page; an empty slot is an error */
+ +      if (pgd_none(*pgd)) {
+ +              printk(KERN_ERR
+ +                      "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ +              return;
+ +      }
+ +      pud_page = (pud_t*)pgd_page_vaddr(*pgd);
+ +      set_pte_vaddr_pud(pud_page, vaddr, pteval);
+ +}
+ +
+ +/*
+ + * Create large page table mappings for a range of physical addresses.
+ + *
+ + * @phys: physical start address; must be PMD_SIZE aligned
+ + * @size: length in bytes; must be a multiple of PMD_SIZE
+ + * @prot: protection bits OR-ed into every PMD entry written
+ + *
+ + * One PMD entry is written per PMD_SIZE step, at the page-table position of
+ + * __va(phys).  Missing PUD/PMD tables are taken from spp_getpage().  The
+ + * function BUG()s on a misaligned @phys/@size and when a target PMD slot is
+ + * already populated.
+ + */
+ +static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+ +                                              pgprot_t prot)
+ +{
+ +      pgd_t *pgd;
+ +      pud_t *pud;
+ +      pmd_t *pmd;
+ +
+ +      BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+ +      for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+ +              pgd = pgd_offset_k((unsigned long)__va(phys));
+ +              if (pgd_none(*pgd)) {
+ +                      /* no PUD page behind this PGD slot yet: hook one up */
+ +                      pud = (pud_t *) spp_getpage();
+ +                      set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+ +                                              _PAGE_USER));
+ +              }
+ +              pud = pud_offset(pgd, (unsigned long)__va(phys));
+ +              if (pud_none(*pud)) {
+ +                      /* no PMD page behind this PUD slot yet: hook one up */
+ +                      pmd = (pmd_t *) spp_getpage();
+ +                      set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+ +                                              _PAGE_USER));
+ +              }
+ +              /*
+ +               * NOTE(review): indexed with phys, not __va(phys) as above;
+ +               * presumably equivalent for the PMD index -- confirm.
+ +               */
+ +              pmd = pmd_offset(pud, phys);
+ +              /* refuse to overwrite an existing mapping */
+ +              BUG_ON(!pmd_none(*pmd));
+ +              set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+ +      }
+ +}
+ +
+ +/*
+ + * Map @size bytes starting at physical address @phys through
+ + * __init_extra_mapping() using PAGE_KERNEL_LARGE protection.
+ + */
+ +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+ +{
+ +      __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+ +}
+ +
+ +/*
+ + * Map @size bytes starting at physical address @phys through
+ + * __init_extra_mapping() using PAGE_KERNEL_LARGE_NOCACHE protection.
+ + */
+ +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+ +{
+ +      __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
+ +}
+ +
   /*
    * The head.S code sets up the kernel high mapping:
    *
@@@ -270,9 -213,20 +270,9 @@@ void __init cleanup_highmap(void
         }
   }
   
- -/* NOTE: this is meant to be run only at boot */
- -void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
- -{
- -      unsigned long address = __fix_to_virt(idx);
- -
- -      if (idx >= __end_of_fixed_addresses) {
- -              printk(KERN_ERR "Invalid __set_fixmap\n");
- -              return;
- -      }
- -      set_pte_phys(address, phys, prot);
- -}
- -
   static unsigned long __initdata table_start;
   static unsigned long __meminitdata table_end;
+ +static unsigned long __meminitdata table_top;
   
   static __meminit void *alloc_low_page(unsigned long *phys)
   {
@@@ -286,7 -240,7 +286,7 @@@
                 return adr;
         }
   
- -      if (pfn >= end_pfn)
+ +      if (pfn >= table_top)
                 panic("alloc_low_page: ran out of memory");
   
         adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@@ -303,61 -257,65 +303,61 @@@ static __meminit void unmap_low_page(vo
         early_iounmap(adr, PAGE_SIZE);
   }
   
- -/* Must run before zap_low_mappings */
- -__meminit void *early_ioremap(unsigned long addr, unsigned long size)
+ +static unsigned long __meminit
+ +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
   {
- -      pmd_t *pmd, *last_pmd;
- -      unsigned long vaddr;
- -      int i, pmds;
+ +      unsigned pages = 0;
+ +      unsigned long last_map_addr = end;
+ +      int i;
+ +
+ +      pte_t *pte = pte_page + pte_index(addr);
   
- -      pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- -      vaddr = __START_KERNEL_map;
- -      pmd = level2_kernel_pgt;
- -      last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+ +      for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
   
- -      for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
- -              for (i = 0; i < pmds; i++) {
- -                      if (pmd_present(pmd[i]))
- -                              goto continue_outer_loop;
+ +              if (addr >= end) {
+ +                      if (!after_bootmem) {
+ +                              for(; i < PTRS_PER_PTE; i++, pte++)
+ +                                      set_pte(pte, __pte(0));
+ +                      }
+ +                      break;
                 }
- -              vaddr += addr & ~PMD_MASK;
- -              addr &= PMD_MASK;
   
- -              for (i = 0; i < pmds; i++, addr += PMD_SIZE)
- -                      set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- -              __flush_tlb_all();
+ +              if (pte_val(*pte))
+ +                      continue;
   
- -              return (void *)vaddr;
- -continue_outer_loop:
- -              ;
+ +              if (0)
+ +                      printk("   pte=%p addr=%lx pte=%016lx\n",
+ +                             pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+ +              set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
+ +              last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+ +              pages++;
         }
- -      printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
+ +      update_page_count(PG_LEVEL_4K, pages);
   
- -      return NULL;
+ +      return last_map_addr;
   }
   
- -/*
- - * To avoid virtual aliases later:
- - */
- -__meminit void early_iounmap(void *addr, unsigned long size)
+ +static unsigned long __meminit
+ +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end)
   {
- -      unsigned long vaddr;
- -      pmd_t *pmd;
- -      int i, pmds;
- -
- -      vaddr = (unsigned long)addr;
- -      pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- -      pmd = level2_kernel_pgt + pmd_index(vaddr);
+ +      pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
   
- -      for (i = 0; i < pmds; i++)
- -              pmd_clear(pmd + i);
- -
- -      __flush_tlb_all();
+ +      return phys_pte_init(pte, address, end);
   }
   
   static unsigned long __meminit
- -phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
+ +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+ +                       unsigned long page_size_mask)
   {
+ +      unsigned long pages = 0;
+ +      unsigned long last_map_addr = end;
+ +
         int i = pmd_index(address);
   
         for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
+ +              unsigned long pte_phys;
                 pmd_t *pmd = pmd_page + pmd_index(address);
+ +              pte_t *pte;
   
                 if (address >= end) {
                         if (!after_bootmem) {
@@@ -367,50 -325,31 +367,50 @@@
                         break;
                 }
   
- -              if (pmd_val(*pmd))
+ +              if (pmd_val(*pmd)) {
+ +                      if (!pmd_large(*pmd))
+ +                              last_map_addr = phys_pte_update(pmd, address,
+ +                                                               end);
+ +                      continue;
+ +              }
+ +
+ +              if (page_size_mask & (1<<PG_LEVEL_2M)) {
+ +                      pages++;
+ +                      set_pte((pte_t *)pmd,
+ +                              pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ +                      last_map_addr = (address & PMD_MASK) + PMD_SIZE;
                         continue;
+ +              }
   
- -              set_pte((pte_t *)pmd,
- -                      pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ +              pte = alloc_low_page(&pte_phys);
+ +              last_map_addr = phys_pte_init(pte, address, end);
+ +              unmap_low_page(pte);
+ +
+ +              pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
         }
- -      return address;
+ +      update_page_count(PG_LEVEL_2M, pages);
+ +      return last_map_addr;
   }
   
   static unsigned long __meminit
- -phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+ +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
+ +                       unsigned long page_size_mask)
   {
         pmd_t *pmd = pmd_offset(pud, 0);
         unsigned long last_map_addr;
   
         spin_lock(&init_mm.page_table_lock);
- -      last_map_addr = phys_pmd_init(pmd, address, end);
+ +      last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
         spin_unlock(&init_mm.page_table_lock);
         __flush_tlb_all();
         return last_map_addr;
   }
   
   static unsigned long __meminit
- -phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+ +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+ +                       unsigned long page_size_mask)
   {
+ +      unsigned long pages = 0;
         unsigned long last_map_addr = end;
         int i = pud_index(addr);
   
@@@ -430,13 -369,11 +430,13 @@@
   
                 if (pud_val(*pud)) {
                         if (!pud_large(*pud))
- -                              last_map_addr = phys_pmd_update(pud, addr, end);
+ +                              last_map_addr = phys_pmd_update(pud, addr, end,
+ +                                                       page_size_mask);
                         continue;
                 }
   
- -              if (direct_gbpages) {
+ +              if (page_size_mask & (1<<PG_LEVEL_1G)) {
+ +                      pages++;
                         set_pte((pte_t *)pud,
                                 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
                         last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
@@@ -446,50 -383,27 +446,50 @@@
                 pmd = alloc_low_page(&pmd_phys);
   
                 spin_lock(&init_mm.page_table_lock);
- -              set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- -              last_map_addr = phys_pmd_init(pmd, addr, end);
+ +              last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
+ +              unmap_low_page(pmd);
+ +              pud_populate(&init_mm, pud, __va(pmd_phys));
                 spin_unlock(&init_mm.page_table_lock);
   
- -              unmap_low_page(pmd);
         }
         __flush_tlb_all();
+ +      update_page_count(PG_LEVEL_1G, pages);
   
- -      return last_map_addr >> PAGE_SHIFT;
+ +      return last_map_addr;
+ +}
+ +
+ +/*
+ + * Extend the mappings below an already populated PGD entry.
+ + *
+ + * Resolves the PUD page that @pgd points to and runs phys_pud_init() on it
+ + * for [@addr, @end) with @page_size_mask; returns phys_pud_init()'s result.
+ + */
+ +static unsigned long __meminit
+ +phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
+ +               unsigned long page_size_mask)
+ +{
+ +      pud_t *pud;
+ +
+ +      pud = (pud_t *)pgd_page_vaddr(*pgd);
+ +
+ +      return phys_pud_init(pud, addr, end, page_size_mask);
   }
   
   static void __init find_early_table_space(unsigned long end)
   {
- -      unsigned long puds, pmds, tables, start;
+ +      unsigned long puds, pmds, ptes, tables, start;
   
         puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
         tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
- -      if (!direct_gbpages) {
+ +      if (direct_gbpages) {
+ +              unsigned long extra;
+ +              extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+ +              pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ +      } else
                 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- -              tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
- -      }
+ +      tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ +
+ +      if (cpu_has_pse) {
+ +              unsigned long extra;
+ +              extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ +              ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ +      } else
+ +              ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ +      tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE);
   
         /*
          * RED-PEN putting page tables only on node 0 could
@@@ -503,10 -417,10 +503,10 @@@
   
         table_start >>= PAGE_SHIFT;
         table_end = table_start;
+ +      table_top = table_start + (tables >> PAGE_SHIFT);
   
- -      early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
- -              end, table_start << PAGE_SHIFT,
- -              (table_start << PAGE_SHIFT) + tables);
+ +      printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+ +              end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
   }
   
   static void __init init_gbpages(void)
@@@ -517,7 -431,7 +517,7 @@@
                 direct_gbpages = 0;
   }
   
- -#ifdef CONFIG_MEMTEST_BOOTPARAM
+ +#ifdef CONFIG_MEMTEST
   
   static void __init memtest(unsigned long start_phys, unsigned long size,
                                  unsigned pattern)
@@@ -579,8 -493,7 +579,8 @@@
   
   }
   
- -static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+ +/* default is disabled */
+ +static int memtest_pattern __initdata;
   
   static int __init parse_memtest(char *arg)
   {
@@@ -629,85 -542,15 +629,85 @@@ static void __init early_memtest(unsign
   }
   #endif
   
+ +/*
+ + * Build the direct mapping for the physical range [start, end), handling
+ + * one top-level (PGD) slot per loop iteration.
+ + *
+ + * @start, @end:    physical byte addresses of the range to map
+ + * @page_size_mask: (1 << PG_LEVEL_*) bits passed down to the phys_pud_*()
+ + *                  helpers to select which large page sizes may be used
+ + *
+ + * Returns the value reported by the last phys_pud_*() call, or @end when
+ + * the range is empty.
+ + *
+ + * Marked __meminit (not __init) to match the phys_*_init() helpers and
+ + * alloc_low_page()/unmap_low_page() it calls, so it stays resident when
+ + * those do.
+ + */
+ +static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
+ +                                              unsigned long end,
+ +                                              unsigned long page_size_mask)
+ +{
+ +      unsigned long next, last_map_addr = end;
+ +
+ +      start = (unsigned long)__va(start);
+ +      end = (unsigned long)__va(end);
+ +
+ +      for (; start < end; start = next) {
+ +              pgd_t *pgd = pgd_offset_k(start);
+ +              unsigned long pud_phys;
+ +              pud_t *pud;
+ +
+ +              /*
+ +               * Clamp to the next PGD boundary.  A plain start + PGDIR_SIZE
+ +               * would straddle two PGD slots whenever start is not
+ +               * PGDIR_SIZE aligned, while only the slot for 'start' is
+ +               * populated in this iteration.
+ +               */
+ +              next = (start + PGDIR_SIZE) & PGDIR_MASK;
+ +              if (next > end)
+ +                      next = end;
+ +
+ +              if (pgd_val(*pgd)) {
+ +                      /* slot already has a PUD page: extend it in place */
+ +                      last_map_addr = phys_pud_update(pgd, __pa(start),
+ +                                               __pa(end), page_size_mask);
+ +                      continue;
+ +              }
+ +
+ +              /*
+ +               * The slot is empty, so there is no PUD page to look up with
+ +               * pud_offset(); always allocate a fresh one.  This also
+ +               * guarantees pud_phys is initialised before pgd_populate().
+ +               * NOTE(review): relies on alloc_low_page() handling the
+ +               * after_bootmem case itself and filling in *phys -- confirm
+ +               * against its definition.
+ +               */
+ +              pud = alloc_low_page(&pud_phys);
+ +
+ +              last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+ +                                               page_size_mask);
+ +              unmap_low_page(pud);
+ +              pgd_populate(&init_mm, pgd, __va(pud_phys));
+ +      }
+ +
+ +      return last_map_addr;
+ +}
+ +
+ +/*
+ + * One contiguous piece of the range handed to init_memory_mapping(),
+ + * together with the page sizes that may be used to map it.
+ + */
+ +struct map_range {
+ +      unsigned long start;            /* first byte address (pfn << PAGE_SHIFT) */
+ +      unsigned long end;              /* end byte address (pfn << PAGE_SHIFT) */
+ +      unsigned page_size_mask;        /* (1 << PG_LEVEL_*) bits; 0 = none set */
+ +};
+ +
+ +#define NR_RANGE_MR 5
+ +
+ +/*
+ + * Append the pfn range [start_pfn, end_pfn) to the table @mr.
+ + *
+ + * @mr:             table with room for NR_RANGE_MR entries
+ + * @nr_range:       number of entries already in use
+ + * @start_pfn:      first page frame of the range
+ + * @end_pfn:        page frame at which the range ends
+ + * @page_size_mask: stored verbatim in the new entry
+ + *
+ + * Empty ranges (start_pfn >= end_pfn) are ignored.  Panics when the table
+ + * is already full.  Returns the updated entry count.
+ + */
+ +static int save_mr(struct map_range *mr, int nr_range,
+ +                 unsigned long start_pfn, unsigned long end_pfn,
+ +                 unsigned long page_size_mask)
+ +{
+ +
+ +      if (start_pfn < end_pfn) {
+ +              if (nr_range >= NR_RANGE_MR)
+ +                      panic("run out of range for init_memory_mapping\n");
+ +              /* entries hold byte addresses, not pfns */
+ +              mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ +              mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
+ +              mr[nr_range].page_size_mask = page_size_mask;
+ +              nr_range++;
+ +      }
+ +
+ +      return nr_range;
+ +}
+ +
   /*
    * Setup the direct mapping of the physical memory at PAGE_OFFSET.
    * This runs before bootmem is initialized and gets pages directly from
    * the physical memory. To access them they are temporarily mapped.
    */
- -unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+ +unsigned long __init_refok init_memory_mapping(unsigned long start,
+ +                                             unsigned long end)
   {
- -      unsigned long next, last_map_addr = end;
- -      unsigned long start_phys = start, end_phys = end;
+ +      unsigned long last_map_addr = 0;
+ +      unsigned long page_size_mask = 0;
+ +      unsigned long start_pfn, end_pfn;
+ +
+ +      struct map_range mr[NR_RANGE_MR];
+ +      int nr_range, i;
   
         printk(KERN_INFO "init_memory_mapping\n");
   
@@@ -718,115 -561,48 +718,115 @@@
          * memory mapped. Unfortunately this is done currently before the
          * nodes are discovered.
          */
- -      if (!after_bootmem) {
+ +      if (!after_bootmem)
                 init_gbpages();
- -              find_early_table_space(end);
+ +
+ +      if (direct_gbpages)
+ +              page_size_mask |= 1 << PG_LEVEL_1G;
+ +      if (cpu_has_pse)
+ +              page_size_mask |= 1 << PG_LEVEL_2M;
+ +
+ +      memset(mr, 0, sizeof(mr));
+ +      nr_range = 0;
+ +
+ +      /* head if not big page alignment ?*/
+ +      start_pfn = start >> PAGE_SHIFT;
+ +      end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT)
+ +                      << (PMD_SHIFT - PAGE_SHIFT);
+ +      nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ +
+ +      /* big page (2M) range*/
+ +      start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
+ +                       << (PMD_SHIFT - PAGE_SHIFT);
+ +      end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT)
+ +                       << (PUD_SHIFT - PAGE_SHIFT);
+ +      if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)))
+ +              end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT));
+ +      nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ +                      page_size_mask & (1<<PG_LEVEL_2M));
+ +
+ +      /* big page (1G) range */
+ +      start_pfn = end_pfn;
+ +      end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+ +      nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ +                              page_size_mask &
+ +                               ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ +
+ +      /* tail is not big page (1G) alignment */
+ +      start_pfn = end_pfn;
+ +      end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+ +      nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ +                      page_size_mask & (1<<PG_LEVEL_2M));
+ +
+ +      /* tail is not big page (2M) alignment */
+ +      start_pfn = end_pfn;
+ +      end_pfn = end>>PAGE_SHIFT;
+ +      nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ +
+ +      /* try to merge same page size and continuous */
+ +      for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ +              unsigned long old_start;
+ +              if (mr[i].end != mr[i+1].start ||
+ +                  mr[i].page_size_mask != mr[i+1].page_size_mask)
+ +                      continue;
+ +              /* move it */
+ +              old_start = mr[i].start;
+ +              memmove(&mr[i], &mr[i+1],
+ +                       (nr_range - 1 - i) * sizeof (struct map_range));
+ +              mr[i].start = old_start;
+ +              nr_range--;
         }
   
- -      start = (unsigned long)__va(start);
- -      end = (unsigned long)__va(end);
+ +      for (i = 0; i < nr_range; i++)
+ +              printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+ +                              mr[i].start, mr[i].end,
+ +                      (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+ +                       (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
   
- -      for (; start < end; start = next) {
- -              pgd_t *pgd = pgd_offset_k(start);
- -              unsigned long pud_phys;
- -              pud_t *pud;
- -
- -              if (after_bootmem)
- -                      pud = pud_offset(pgd, start & PGDIR_MASK);
- -              else
- -                      pud = alloc_low_page(&pud_phys);
+ +      if (!after_bootmem)
+ +              find_early_table_space(end);
   
- -              next = start + PGDIR_SIZE;
- -              if (next > end)
- -                      next = end;
- -              last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
- -              if (!after_bootmem)
- -                      set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
- -              unmap_low_page(pud);
- -      }
+ +      for (i = 0; i < nr_range; i++)
+ +              last_map_addr = kernel_physical_mapping_init(
+ +                                      mr[i].start, mr[i].end,
+ +                                      mr[i].page_size_mask);
   
         if (!after_bootmem)
                 mmu_cr4_features = read_cr4();
         __flush_tlb_all();
   
- -      if (!after_bootmem)
+ +      if (!after_bootmem && table_end > table_start)
                 reserve_early(table_start << PAGE_SHIFT,
                                  table_end << PAGE_SHIFT, "PGTABLE");
   
+ +      printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
+ +                       last_map_addr, end);
+ +
         if (!after_bootmem)
- -              early_memtest(start_phys, end_phys);
+ +              early_memtest(start, end);
   
- -      return last_map_addr;
+ +      return last_map_addr >> PAGE_SHIFT;
   }
   
   #ifndef CONFIG_NUMA
+ +/*
+ + * Non-NUMA bootmem setup for the pfn range [start_pfn, end_pfn).
+ + *
+ + * Sizes the bootmem bitmap for @end_pfn pages, asks find_e820_area() for a
+ + * page-aligned spot below end_pfn << PAGE_SHIFT to hold it (panicking if
+ + * none is found), initialises node 0's bootmem allocator there, and then
+ + * runs the region registration / free / reserve calls below.
+ + */
+ +void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+ +{
+ +      unsigned long bootmap_size, bootmap;
+ +
+ +      /* bootmem_bootmap_pages() counts pages; convert to bytes */
+ +      bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ +      bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
+ +                               PAGE_SIZE);
+ +      /* -1L is treated as find_e820_area()'s "nothing found" result */
+ +      if (bootmap == -1L)
+ +              panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ +      /* don't touch min_low_pfn */
+ +      bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
+ +                                       0, end_pfn);
+ +      e820_register_active_regions(0, start_pfn, end_pfn);
+ +      free_bootmem_with_active_regions(0, end_pfn);
+ +      early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
+ +      /* finally reserve the bitmap's own storage */
+ +      reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+ +}
+ +
   void __init paging_init(void)
   {
         unsigned long max_zone_pfns[MAX_NR_ZONES];
@@@ -834,9 -610,9 +834,9 @@@
         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
         max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
         max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- -      max_zone_pfns[ZONE_NORMAL] = end_pfn;
+ +      max_zone_pfns[ZONE_NORMAL] = max_pfn;
   
- -      memory_present(0, 0, end_pfn);
+ +      memory_present(0, 0, max_pfn);
         sparse_init();
         free_area_init_nodes(max_zone_pfns);
   }
@@@ -918,8 -694,8 +918,8 @@@ void __init mem_init(void
   #else
         totalram_pages = free_all_bootmem();
   #endif
- -      reservedpages = end_pfn - totalram_pages -
- -                                      absent_pages_in_range(0, end_pfn);
+ +      reservedpages = max_pfn - totalram_pages -
+ +                                      absent_pages_in_range(0, max_pfn);
         after_bootmem = 1;
   
         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
@@@ -938,7 -714,7 +938,7 @@@
         printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
                                 "%ldk reserved, %ldk data, %ldk init)\n",
                 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
- -              end_pfn << (PAGE_SHIFT-10),
+ +              max_pfn << (PAGE_SHIFT-10),
                 codesize >> 10,
                 reservedpages << (PAGE_SHIFT-10),
                 datasize >> 10,
@@@ -991,6 -767,13 +991,13 @@@ EXPORT_SYMBOL_GPL(rodata_test_data)
   void mark_rodata_ro(void)
   {
         unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+       unsigned long rodata_start =
+               ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+ 
+ #ifdef CONFIG_DYNAMIC_FTRACE
+       /* Dynamic tracing modifies the kernel text section */
+       start = rodata_start;
+ #endif
   
         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
                (end - start) >> 10);
@@@ -1000,8 -783,7 +1007,7 @@@
          * The rodata section (but not the kernel text!) should also be
          * not-executable.
          */
-       start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
-       set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+       set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
   
         rodata_test();
   
@@@ -1023,26 -805,24 +1029,26 @@@ void free_initrd_mem(unsigned long star
   }
   #endif
   
- -void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+ +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ +                                 int flags)
   {
   #ifdef CONFIG_NUMA
         int nid, next_nid;
+ +      int ret;
   #endif
         unsigned long pfn = phys >> PAGE_SHIFT;
   
- -      if (pfn >= end_pfn) {
+ +      if (pfn >= max_pfn) {
                 /*
                  * This can happen with kdump kernels when accessing
                  * firmware tables:
                  */
                 if (pfn < max_pfn_mapped)
- -                      return;
+ +                      return -EFAULT;
   
- -              printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
+ +              printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
                                 phys, len);
- -              return;
+ +              return -EFAULT;
         }
   
         /* Should check here against the e820 map to avoid double free */
@@@ -1050,13 -830,9 +1056,13 @@@
         nid = phys_to_nid(phys);
         next_nid = phys_to_nid(phys + len - 1);
         if (nid == next_nid)
- -              reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ +              ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
         else
- -              reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
+ +              ret = reserve_bootmem(phys, len, flags);
+ +
+ +      if (ret != 0)
+ +              return ret;
+ +
   #else
         reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
   #endif
@@@ -1065,8 -841,6 +1071,8 @@@
                 dma_reserve += len / PAGE_SIZE;
                 set_dma_reserve(dma_reserve);
         }
+ +
+ +      return 0;
   }
   
   int kern_addr_valid(unsigned long addr)
@@@ -1171,7 -945,7 +1177,7 @@@ vmemmap_populate(struct page *start_pag
         pmd_t *pmd;
   
         for (; addr < end; addr = next) {
- -              next = pmd_addr_end(addr, end);
+ +              void *p = NULL;
   
                 pgd = vmemmap_pgd_populate(addr, node);
                 if (!pgd)
@@@ -1181,51 -955,33 +1187,51 @@@
                 if (!pud)
                         return -ENOMEM;
   
- -              pmd = pmd_offset(pud, addr);
- -              if (pmd_none(*pmd)) {
- -                      pte_t entry;
- -                      void *p;
+ +              if (!cpu_has_pse) {
+ +                      next = (addr + PAGE_SIZE) & PAGE_MASK;
+ +                      pmd = vmemmap_pmd_populate(pud, addr, node);
+ +
+ +                      if (!pmd)
+ +                              return -ENOMEM;
+ +
+ +                      p = vmemmap_pte_populate(pmd, addr, node);
   
- -                      p = vmemmap_alloc_block(PMD_SIZE, node);
                         if (!p)
                                 return -ENOMEM;
   
- -                      entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
- -                                                      PAGE_KERNEL_LARGE);
- -                      set_pmd(pmd, __pmd(pte_val(entry)));
- -
- -                      /* check to see if we have contiguous blocks */
- -                      if (p_end != p || node_start != node) {
- -                              if (p_start)
- -                                      printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
- -                                              addr_start, addr_end-1, p_start, p_end-1, node_start);
- -                              addr_start = addr;
- -                              node_start = node;
- -                              p_start = p;
- -                      }
- -                      addr_end = addr + PMD_SIZE;
- -                      p_end = p + PMD_SIZE;
+ +                      addr_end = addr + PAGE_SIZE;
+ +                      p_end = p + PAGE_SIZE;
                 } else {
- -                      vmemmap_verify((pte_t *)pmd, node, addr, next);
+ +                      next = pmd_addr_end(addr, end);
+ +
+ +                      pmd = pmd_offset(pud, addr);
+ +                      if (pmd_none(*pmd)) {
+ +                              pte_t entry;
+ +
+ +                              p = vmemmap_alloc_block(PMD_SIZE, node);
+ +                              if (!p)
+ +                                      return -ENOMEM;
+ +
+ +                              entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ +                                              PAGE_KERNEL_LARGE);
+ +                              set_pmd(pmd, __pmd(pte_val(entry)));
+ +
+ +                              /* check to see if we have contiguous blocks */
+ +                              if (p_end != p || node_start != node) {
+ +                                      if (p_start)
+ +                                              printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ +                                                     addr_start, addr_end-1, p_start, p_end-1, node_start);
+ +                                      addr_start = addr;
+ +                                      node_start = node;
+ +                                      p_start = p;
+ +                              }
+ +
+ +                              addr_end = addr + PMD_SIZE;
+ +                              p_end = p + PMD_SIZE;
+ +                      } else
+ +                              vmemmap_verify((pte_t *)pmd, node, addr, next);
                 }
+ +
         }
         return 0;
   }
diff --combined arch/x86/mm/ioremap.c

index 115f13ee40c9156fc1f954d5181b415e97c78ed3,e92aa461f4d6ddd62a77ea1cf5382a2ad2a0e0c7..24c1d3c30186c893c400eff0942b857207a833fa
--- 1/arch/x86/mm/ioremap.c
--- 2/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@@ -12,6 -12,7 +12,7 @@@
   #include <linux/module.h>
   #include <linux/slab.h>
   #include <linux/vmalloc.h>
+ #include <linux/mmiotrace.h>
   
   #include <asm/cacheflush.h>
   #include <asm/e820.h>
@@@ -122,10 -123,13 +123,13 @@@ static void __iomem *__ioremap_caller(r
   {
         unsigned long pfn, offset, vaddr;
         resource_size_t last_addr;
+       const resource_size_t unaligned_phys_addr = phys_addr;
+       const unsigned long unaligned_size = size;
         struct vm_struct *area;
         unsigned long new_prot_val;
         pgprot_t prot;
         int retval;
+       void __iomem *ret_addr;
   
         /* Don't allow wraparound or zero size */
         last_addr = phys_addr + size - 1;
@@@ -142,7 -146,7 +146,7 @@@
         /*
          * Don't remap the low PCI/ISA area, it's always mapped..
          */
- -      if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+ +      if (is_ISA_range(phys_addr, last_addr))
                 return (__force void __iomem *)phys_to_virt(phys_addr);
   
         /*
@@@ -233,7 -237,10 +237,10 @@@
                 return NULL;
         }
   
-       return (void __iomem *) (vaddr + offset);
+       ret_addr = (void __iomem *) (vaddr + offset);
+       mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+ 
+       return ret_addr;
   }
   
   /**
@@@ -261,7 -268,7 +268,7 @@@ void __iomem *ioremap_nocache(resource_
   {
         /*
          * Ideally, this should be:
- -       *      pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+ +       *      pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
          *
          * Till we fix all X drivers to use ioremap_wc(), we will use
          * UC MINUS.
@@@ -285,7 -292,7 +292,7 @@@ EXPORT_SYMBOL(ioremap_nocache)
    */
   void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
   {
- -      if (pat_wc_enabled)
+ +      if (pat_enabled)
                 return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
                                         __builtin_return_address(0));
         else
@@@ -300,29 -307,6 +307,29 @@@ void __iomem *ioremap_cache(resource_si
   }
   EXPORT_SYMBOL(ioremap_cache);
   
+ +static void __iomem *ioremap_default(resource_size_t phys_addr,
+ +                                      unsigned long size)
+ +{
+ +      unsigned long flags;
+ +      void *ret;
+ +      int err;
+ +
+ +      /*
+ +       * - WB for WB-able memory and no other conflicting mappings
+ +       * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ +       * - Inherit from conflicting mappings otherwise
+ +       */
+ +      err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags);
+ +      if (err < 0)
+ +              return NULL;
+ +
+ +      ret = (void *) __ioremap_caller(phys_addr, size, flags,
+ +                                      __builtin_return_address(0));
+ +
+ +      free_memtype(phys_addr, phys_addr + size);
+ +      return (void __iomem *)ret;
+ +}
+ +
   /**
    * iounmap - Free a IO remapping
    * @addr: virtual address from ioremap_*
@@@ -341,13 -325,15 +348,15 @@@ void iounmap(volatile void __iomem *add
          * vm_area and by simply returning an address into the kernel mapping
          * of ISA space.   So handle that here.
          */
- -      if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
- -          addr < phys_to_virt(ISA_END_ADDRESS))
+ +      if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
+ +          (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
                 return;
   
         addr = (volatile void __iomem *)
                 (PAGE_MASK & (unsigned long __force)addr);
   
+       mmiotrace_iounmap(addr);
+ 
         /* Use the vm area unlocked, assuming the caller
            ensures there isn't another iounmap for the same address
            in parallel. Reuse of the virtual address is prevented by
@@@ -355,7 -341,7 +364,7 @@@
            cpa takes care of the direct mappings. */
         read_lock(&vmlist_lock);
         for (p = vmlist; p; p = p->next) {
- -              if (p->addr == addr)
+ +              if (p->addr == (void __force *)addr)
                         break;
         }
         read_unlock(&vmlist_lock);
@@@ -369,7 -355,7 +378,7 @@@
         free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
   
         /* Finally remove it */
- -      o = remove_vm_area((void *)addr);
+ +      o = remove_vm_area((void __force *)addr);
         BUG_ON(p != o || o == NULL);
         kfree(p);
   }
@@@ -388,7 -374,7 +397,7 @@@ void *xlate_dev_mem_ptr(unsigned long p
         if (page_is_ram(start >> PAGE_SHIFT))
                 return __va(phys);
   
- -      addr = (void *)ioremap(start, PAGE_SIZE);
+ +      addr = (void __force *)ioremap_default(start, PAGE_SIZE);
         if (addr)
                 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
   
@@@ -404,6 -390,8 +413,6 @@@ void unxlate_dev_mem_ptr(unsigned long 
         return;
   }
   
- -#ifdef CONFIG_X86_32
- -
   int __initdata early_ioremap_debug;
   
   static int __init early_ioremap_debug_setup(char *str)
@@@ -415,7 -403,8 +424,7 @@@
   early_param("early_ioremap_debug", early_ioremap_debug_setup);
   
   static __initdata int after_paging_init;
- -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
- -              __section(.bss.page_aligned);
+ +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
   
   static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
   {
@@@ -504,11 -493,10 +513,11 @@@ static void __init __early_set_fixmap(e
                 return;
         }
         pte = early_ioremap_pte(addr);
+ +
         if (pgprot_val(flags))
                 set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
         else
- -              pte_clear(NULL, addr, pte);
+ +              pte_clear(&init_mm, addr, pte);
         __flush_tlb_one(addr);
   }
   
@@@ -646,3 -634,5 +655,3 @@@ void __this_fixmap_does_not_exist(void
   {
         WARN_ON(1);
   }
- -
- -#endif /* CONFIG_X86_32 */
diff --combined arch/x86/mm/pageattr.c

index fb6f2ab40dda092f5029b1e29542fb2d9accb6f7,57970f2935c0aaef818b27e9f8a94ef493c0738c..47f4e2e4a0968ca848c2d16358d6ae7e43a6f565
--- 1/arch/x86/mm/pageattr.c
--- 2/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@@ -34,41 -34,6 +34,41 @@@ struct cpa_data 
         unsigned        force_split : 1;
   };
   
+ +#ifdef CONFIG_PROC_FS
+ +static unsigned long direct_pages_count[PG_LEVEL_NUM];
+ +
+ +void update_page_count(int level, unsigned long pages)
+ +{
+ +      unsigned long flags;
+ +
+ +      /* Protect against CPA */
+ +      spin_lock_irqsave(&pgd_lock, flags);
+ +      direct_pages_count[level] += pages;
+ +      spin_unlock_irqrestore(&pgd_lock, flags);
+ +}
+ +
+ +static void split_page_count(int level)
+ +{
+ +      direct_pages_count[level]--;
+ +      direct_pages_count[level - 1] += PTRS_PER_PTE;
+ +}
+ +
+ +int arch_report_meminfo(char *page)
+ +{
+ +      int n = sprintf(page, "DirectMap4k:  %8lu\n"
+ +                      "DirectMap2M:  %8lu\n",
+ +                      direct_pages_count[PG_LEVEL_4K],
+ +                      direct_pages_count[PG_LEVEL_2M]);
+ +#ifdef CONFIG_X86_64
+ +      n += sprintf(page + n, "DirectMap1G:  %8lu\n",
+ +                   direct_pages_count[PG_LEVEL_1G]);
+ +#endif
+ +      return n;
+ +}
+ +#else
+ +static inline void split_page_count(int level) { }
+ +#endif
+ +
   #ifdef CONFIG_X86_64
   
   static inline unsigned long highmap_start_pfn(void)
@@@ -262,6 -227,7 +262,7 @@@ pte_t *lookup_address(unsigned long add
   
         return pte_offset_kernel(pmd, address);
   }
+ EXPORT_SYMBOL_GPL(lookup_address);
   
   /*
    * Set the new pmd in all the pgds we know about:
@@@ -535,16 -501,6 +536,16 @@@ static int split_large_page(pte_t *kpte
         for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
                 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
   
+ +      if (address >= (unsigned long)__va(0) &&
+ +              address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ +              split_page_count(level);
+ +
+ +#ifdef CONFIG_X86_64
+ +      if (address >= (unsigned long)__va(1UL<<32) &&
+ +              address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
+ +              split_page_count(level);
+ +#endif
+ +
         /*
          * Install the new, split up pagetable. Important details here:
          *
@@@ -658,24 -614,15 +659,24 @@@ static int cpa_process_alias(struct cpa
         struct cpa_data alias_cpa;
         int ret = 0;
   
- -      if (cpa->pfn > max_pfn_mapped)
+ +      if (cpa->pfn >= max_pfn_mapped)
                 return 0;
   
+ +#ifdef CONFIG_X86_64
+ +      if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+ +              return 0;
+ +#endif
         /*
          * No need to redo, when the primary call touched the direct
          * mapping already:
          */
- -      if (!within(cpa->vaddr, PAGE_OFFSET,
- -                  PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ +      if (!(within(cpa->vaddr, PAGE_OFFSET,
+ +                  PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
+ +#ifdef CONFIG_X86_64
+ +              || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ +                  PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
+ +#endif
+ +      )) {
   
                 alias_cpa = *cpa;
                 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
@@@ -859,7 -806,7 +860,7 @@@ int _set_memory_wc(unsigned long addr, 
   
   int set_memory_wc(unsigned long addr, int numpages)
   {
- -      if (!pat_wc_enabled)
+ +      if (!pat_enabled)
                 return set_memory_uc(addr, numpages);
   
         if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
diff --combined include/asm-x86/irqflags.h

index 17e7a1701c97be5c6a734e25e2add35347bb96b5,24d71b1eb18905e5d746b6e751a19b03cde44aa5..424acb48cd61baf681f7d71fb6b4ad56e1cfc3ee
--- 1/include/asm-x86/irqflags.h
--- 2/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@@ -111,35 -111,14 +111,35 @@@ static inline unsigned long __raw_local
   #define DISABLE_INTERRUPTS(x) cli
   
   #ifdef CONFIG_X86_64
+ +#define SWAPGS        swapgs
+ +/*
+ + * Currently paravirt can't handle swapgs nicely when we
+ + * don't have a stack we can rely on (such as a user space
+ + * stack).  So we either find a way around these or just fault
+ + * and emulate if a guest tries to call swapgs directly.
+ + *
+ + * Either way, this is a good way to document that we don't
+ + * have a reliable stack. x86_64 only.
+ + */
+ +#define SWAPGS_UNSAFE_STACK   swapgs
+ +
+ +#define PARAVIRT_ADJUST_EXCEPTION_FRAME       /*  */
+ +
   #define INTERRUPT_RETURN      iretq
- -#define ENABLE_INTERRUPTS_SYSCALL_RET                 \
- -                      movq    %gs:pda_oldrsp, %rsp;   \
- -                      swapgs;                         \
- -                      sysretq;
+ +#define USERGS_SYSRET64                               \
+ +      swapgs;                                 \
+ +      sysretq;
+ +#define USERGS_SYSRET32                               \
+ +      swapgs;                                 \
+ +      sysretl
+ +#define ENABLE_INTERRUPTS_SYSEXIT32           \
+ +      swapgs;                                 \
+ +      sti;                                    \
+ +      sysexit
+ +
   #else
   #define INTERRUPT_RETURN              iret
- -#define ENABLE_INTERRUPTS_SYSCALL_RET sti; sysexit
+ +#define ENABLE_INTERRUPTS_SYSEXIT     sti; sysexit
   #define GET_CR0_INTO_EAX              movl %cr0, %eax
   #endif
   
@@@ -190,8 -169,16 +190,6 @@@ static inline void trace_hardirqs_fixup
   #else
   
   #ifdef CONFIG_X86_64
- #define ARCH_TRACE_IRQS_ON            call trace_hardirqs_on_thunk
- #define ARCH_TRACE_IRQS_OFF           call trace_hardirqs_off_thunk
- -/*
- - * Currently paravirt can't handle swapgs nicely when we
- - * don't have a stack we can rely on (such as a user space
- - * stack).  So we either find a way around these or just fault
- - * and emulate if a guest tries to call swapgs directly.
- - *
- - * Either way, this is a good way to document that we don't
- - * have a reliable stack. x86_64 only.
- - */
- -#define SWAPGS_UNSAFE_STACK   swapgs
   #define ARCH_LOCKDEP_SYS_EXIT         call lockdep_sys_exit_thunk
   #define ARCH_LOCKDEP_SYS_EXIT_IRQ     \
         TRACE_IRQS_ON; \
@@@ -203,24 -190,6 +201,6 @@@
         TRACE_IRQS_OFF;
   
   #else
- #define ARCH_TRACE_IRQS_ON                    \
-       pushl %eax;                             \
-       pushl %ecx;                             \
-       pushl %edx;                             \
-       call trace_hardirqs_on;                 \
-       popl %edx;                              \
-       popl %ecx;                              \
-       popl %eax;
- 
- #define ARCH_TRACE_IRQS_OFF                   \
-       pushl %eax;                             \
-       pushl %ecx;                             \
-       pushl %edx;                             \
-       call trace_hardirqs_off;                \
-       popl %edx;                              \
-       popl %ecx;                              \
-       popl %eax;
- 
   #define ARCH_LOCKDEP_SYS_EXIT                 \
         pushl %eax;                             \
         pushl %ecx;                             \
@@@ -234,8 -203,8 +214,8 @@@
   #endif
   
   #ifdef CONFIG_TRACE_IRQFLAGS
- #  define TRACE_IRQS_ON               ARCH_TRACE_IRQS_ON
- #  define TRACE_IRQS_OFF      ARCH_TRACE_IRQS_OFF
+ #  define TRACE_IRQS_ON               call trace_hardirqs_on_thunk;
+ #  define TRACE_IRQS_OFF      call trace_hardirqs_off_thunk;
   #else
   #  define TRACE_IRQS_ON
   #  define TRACE_IRQS_OFF
diff --combined include/linux/linkage.h

index 9fd1f859021b5018baffdb440261379dab4a55b3,14f329c64ba8ee54cf2c4166d09272167c836bd7..56ba37394656c7f211eaebdadf5481eda844acca
--- 1/include/linux/linkage.h
--- 2/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@@ -1,9 -1,10 +1,11 @@@
   #ifndef _LINUX_LINKAGE_H
   #define _LINUX_LINKAGE_H
   
+ +#include <linux/compiler.h>
   #include <asm/linkage.h>
   
+ #define notrace __attribute__((no_instrument_function))
+ 
   #ifdef __cplusplus
   #define CPP_ASMLINKAGE extern "C"
   #else
@@@ -18,9 -19,6 +20,9 @@@
   # define asmregparm
   #endif
   
+ +#define __page_aligned_data   __section(.data.page_aligned) __aligned(PAGE_SIZE)
+ +#define __page_aligned_bss    __section(.bss.page_aligned) __aligned(PAGE_SIZE)
+ +
   /*
    * This is used by architectures to keep arguments on the stack
    * untouched by the compiler by keeping them live until the end.
diff --combined include/linux/sched.h

index f6cd60f2de63ba70cc127fcd1f05ebf4efdd7c45,aa609858aef07ee9dbada675f2beb5d98842883b..5d1af10b90c3fabe104ef21f82c5c4aaa5899e92
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -134,6 -134,7 +134,6 @@@ extern unsigned long nr_running(void)
   extern unsigned long nr_uninterruptible(void);
   extern unsigned long nr_active(void);
   extern unsigned long nr_iowait(void);
- -extern unsigned long weighted_cpuload(const int cpu);
   
   struct seq_file;
   struct cfs_rq;
@@@ -245,6 -246,8 +245,8 @@@ extern asmlinkage void schedule_tail(st
   extern void init_idle(struct task_struct *idle, int cpu);
   extern void init_idle_bootup_task(struct task_struct *idle);
   
+ extern int runqueue_is_locked(void);
+ 
   extern cpumask_t nohz_cpu_mask;
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
   extern int select_nohz_load_balancer(int cpu);
@@@ -783,8 -786,6 +785,8 @@@ struct sched_domain 
         unsigned int balance_interval;  /* initialise to 1. units in ms. */
         unsigned int nr_balance_failed; /* initialise to 0 */
   
+ +      u64 last_update;
+ +
   #ifdef CONFIG_SCHEDSTATS
         /* load_balance() stats */
         unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@@ -824,6 -825,23 +826,6 @@@ extern int arch_reinit_sched_domains(vo
   
   #endif        /* CONFIG_SMP */
   
- -/*
- - * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
- - * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
- - * task of nice 0 or enough lower priority tasks to bring up the
- - * weighted_cpuload
- - */
- -static inline int above_background_load(void)
- -{
- -      unsigned long cpu;
- -
- -      for_each_online_cpu(cpu) {
- -              if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
- -                      return 1;
- -      }
- -      return 0;
- -}
- -
   struct io_context;                    /* See blkdev.h */
   #define NGROUPS_SMALL         32
   #define NGROUPS_PER_BLOCK     ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
@@@ -905,8 -923,8 +907,8 @@@ struct sched_class 
         void (*set_cpus_allowed)(struct task_struct *p,
                                  const cpumask_t *newmask);
   
- -      void (*join_domain)(struct rq *rq);
- -      void (*leave_domain)(struct rq *rq);
+ +      void (*rq_online)(struct rq *rq);
+ +      void (*rq_offline)(struct rq *rq);
   
         void (*switched_from) (struct rq *this_rq, struct task_struct *task,
                                int running);
@@@ -1023,7 -1041,6 +1025,7 @@@ struct task_struct 
   #endif
   
         int prio, static_prio, normal_prio;
+ +      unsigned int rt_priority;
         const struct sched_class *sched_class;
         struct sched_entity se;
         struct sched_rt_entity rt;
@@@ -1107,6 -1124,7 +1109,6 @@@
         int __user *set_child_tid;              /* CLONE_CHILD_SETTID */
         int __user *clear_child_tid;            /* CLONE_CHILD_CLEARTID */
   
- -      unsigned int rt_priority;
         cputime_t utime, stime, utimescaled, stimescaled;
         cputime_t gtime;
         cputime_t prev_utime, prev_stime;
@@@ -1125,12 -1143,12 +1127,12 @@@
         gid_t gid,egid,sgid,fsgid;
         struct group_info *group_info;
         kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
- -      unsigned securebits;
         struct user_struct *user;
+ +      unsigned securebits;
   #ifdef CONFIG_KEYS
+ +      unsigned char jit_keyring;      /* default keyring to attach requested keys to */
         struct key *request_key_auth;   /* assumed request_key authority */
         struct key *thread_keyring;     /* keyring private to this thread */
- -      unsigned char jit_keyring;      /* default keyring to attach requested keys to */
   #endif
         char comm[TASK_COMM_LEN]; /* executable name excluding path
                                      - access with [gs]et_task_comm (which lock
@@@ -1217,8 -1235,8 +1219,8 @@@
   # define MAX_LOCK_DEPTH 48UL
         u64 curr_chain_key;
         int lockdep_depth;
- -      struct held_lock held_locks[MAX_LOCK_DEPTH];
         unsigned int lockdep_recursion;
+ +      struct held_lock held_locks[MAX_LOCK_DEPTH];
   #endif
   
   /* journalling filesystem info */
@@@ -1246,6 -1264,10 +1248,6 @@@
         u64 acct_vm_mem1;       /* accumulated virtual memory usage */
         cputime_t acct_stimexpd;/* stime since last update */
   #endif
- -#ifdef CONFIG_NUMA
- -      struct mempolicy *mempolicy;
- -      short il_next;
- -#endif
   #ifdef CONFIG_CPUSETS
         nodemask_t mems_allowed;
         int cpuset_mems_generation;
@@@ -1264,10 -1286,6 +1266,10 @@@
   #endif
         struct list_head pi_state_list;
         struct futex_pi_state *pi_state_cache;
+ +#endif
+ +#ifdef CONFIG_NUMA
+ +      struct mempolicy *mempolicy;
+ +      short il_next;
   #endif
         atomic_t fs_excl;       /* holding fs exclusive resources */
         struct rcu_head rcu;
@@@ -1488,7 -1506,6 +1490,7 @@@ static inline void put_task_struct(stru
   #define PF_SWAPWRITE  0x00800000      /* Allowed to write to swap */
   #define PF_SPREAD_PAGE        0x01000000      /* Spread page cache over cpuset */
   #define PF_SPREAD_SLAB        0x02000000      /* Spread some slab caches over cpuset */
+ +#define PF_THREAD_BOUND       0x04000000      /* Thread bound to specific cpu */
   #define PF_MEMPOLICY  0x10000000      /* Non-default NUMA mempolicy */
   #define PF_MUTEX_TESTER       0x20000000      /* Thread belongs to the rt mutex tester */
   #define PF_FREEZER_SKIP       0x40000000      /* Freezer should not count it as freezeable */
@@@ -1558,28 -1575,13 +1560,28 @@@ static inline void sched_clock_idle_sle
   static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
   {
   }
- -#else
+ +
+ +#ifdef CONFIG_NO_HZ
+ +static inline void sched_clock_tick_stop(int cpu)
+ +{
+ +}
+ +
+ +static inline void sched_clock_tick_start(int cpu)
+ +{
+ +}
+ +#endif
+ +
+ +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
   extern void sched_clock_init(void);
   extern u64 sched_clock_cpu(int cpu);
   extern void sched_clock_tick(void);
   extern void sched_clock_idle_sleep_event(void);
   extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+ +#ifdef CONFIG_NO_HZ
+ +extern void sched_clock_tick_stop(int cpu);
+ +extern void sched_clock_tick_start(int cpu);
   #endif
+ +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
   
   /*
    * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@@ -1622,7 -1624,6 +1624,7 @@@ extern unsigned int sysctl_sched_child_
   extern unsigned int sysctl_sched_features;
   extern unsigned int sysctl_sched_migration_cost;
   extern unsigned int sysctl_sched_nr_migrate;
+ +extern unsigned int sysctl_sched_shares_ratelimit;
   
   int sched_nr_latency_handler(struct ctl_table *table, int write,
                 struct file *file, void __user *buffer, size_t *length,
@@@ -2132,6 -2133,18 +2134,18 @@@ static inline void arch_pick_mmap_layou
   }
   #endif
   
+ #ifdef CONFIG_TRACING
+ extern void
+ __trace_special(void *__tr, void *__data,
+               unsigned long arg1, unsigned long arg2, unsigned long arg3);
+ #else
+ static inline void
+ __trace_special(void *__tr, void *__data,
+               unsigned long arg1, unsigned long arg2, unsigned long arg3)
+ {
+ }
+ #endif
+ 
   extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
   extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
   
@@@ -2226,6 -2239,8 +2240,8 @@@ static inline void mm_init_owner(struc
   }
   #endif /* CONFIG_MM_OWNER */
   
+ #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+ 
   #endif /* __KERNEL__ */
   
   #endif
diff --combined kernel/Makefile

index 6c55301112e064cf7b95795c899897739a6cfe14,480976275d98d40bff28d04d675fb663f358b253..f6328e16dfdde5749b05279e503f2172a2831544
--- 1/kernel/Makefile
--- 2/kernel/Makefile
+++ b/kernel/Makefile
@@@ -3,7 -3,7 +3,7 @@@
   #
   
   obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
- -          exit.o itimer.o time.o softirq.o resource.o \
+ +          cpu.o exit.o itimer.o time.o softirq.o resource.o \
             sysctl.o capability.o ptrace.o timer.o user.o \
             signal.o sys.o kmod.o workqueue.o pid.o \
             rcupdate.o extable.o params.o posix-timers.o \
@@@ -11,6 -11,18 +11,18 @@@
             hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
             notifier.o ksysfs.o pm_qos_params.o sched_clock.o
   
+ CFLAGS_REMOVE_sched.o = -mno-spe
+ 
+ ifdef CONFIG_FTRACE
+ # Do not trace debug files and internal ftrace files
+ CFLAGS_REMOVE_lockdep.o = -pg
+ CFLAGS_REMOVE_lockdep_proc.o = -pg
+ CFLAGS_REMOVE_mutex-debug.o = -pg
+ CFLAGS_REMOVE_rtmutex-debug.o = -pg
+ CFLAGS_REMOVE_cgroup-debug.o = -pg
+ CFLAGS_REMOVE_sched_clock.o = -pg
+ endif
+ 
   obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
   obj-$(CONFIG_STACKTRACE) += stacktrace.o
   obj-y += time/
@@@ -27,7 -39,7 +39,7 @@@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.
   obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
   obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
   obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
- -obj-$(CONFIG_SMP) += cpu.o spinlock.o
+ +obj-$(CONFIG_SMP) += spinlock.o
   obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
   obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
   obj-$(CONFIG_UID16) += uid16.o
@@@ -69,7 -81,8 +81,9 @@@ obj-$(CONFIG_TASK_DELAY_ACCT) += delaya
   obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
   obj-$(CONFIG_MARKERS) += marker.o
   obj-$(CONFIG_LATENCYTOP) += latencytop.o
+ obj-$(CONFIG_FTRACE) += trace/
+ obj-$(CONFIG_TRACING) += trace/
+ +obj-$(CONFIG_SMP) += sched_cpupri.o
   
   ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
   # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --combined kernel/printk.c

index 625d240d7ada4633de19e99659d481af68b59913,75ef3af39132e717a0ea905106fc454b3692ea67..5d81a11321fd72d4c956b677110589acf51379fd
--- 1/kernel/printk.c
--- 2/kernel/printk.c
+++ b/kernel/printk.c
@@@ -75,8 -75,6 +75,8 @@@ EXPORT_SYMBOL(oops_in_progress)
   static DECLARE_MUTEX(console_sem);
   static DECLARE_MUTEX(secondary_console_sem);
   struct console *console_drivers;
+ +EXPORT_SYMBOL_GPL(console_drivers);
+ +
   /*
    * This is used for debugging the mess that is the VT code by
    * keeping track if we have the console semaphore held. It's
@@@ -123,8 -121,6 +123,8 @@@ struct console_cmdlin
   static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
   static int selected_console = -1;
   static int preferred_console = -1;
+ +int console_set_on_cmdline;
+ +EXPORT_SYMBOL(console_set_on_cmdline);
   
   /* Flag: console code may call schedule() */
   static int console_may_schedule;
@@@ -894,7 -890,6 +894,7 @@@ static int __init console_setup(char *s
         *s = 0;
   
         __add_preferred_console(buf, idx, options, brl_options);
+ +      console_set_on_cmdline = 1;
         return 1;
   }
   __setup("console=", console_setup);
@@@ -1046,7 -1041,9 +1046,9 @@@ void release_console_sem(void
                 _log_end = log_end;
                 con_start = log_end;            /* Flush */
                 spin_unlock(&logbuf_lock);
+               stop_critical_timings();        /* don't trace print latency */
                 call_console_drivers(_con_start, _log_end);
+               start_critical_timings();
                 local_irq_restore(flags);
         }
         console_locked = 0;
diff --combined kernel/sched.c

index 591d5e7f757ad7438e9696bd3735f4b098dfda92,42899dce837d6f27a5f90db3d2037c2ff0a05489..c74b0d23c7525c1db91d73c5ebab56931bccae4a
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -70,12 -70,11 +70,13 @@@
   #include <linux/bootmem.h>
   #include <linux/debugfs.h>
   #include <linux/ctype.h>
+ #include <linux/ftrace.h>
   
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
   
+ +#include "sched_cpupri.h"
+ +
   /*
    * Convert user-nice values [ -20 ... 0 ... 19 ]
    * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@@ -291,15 -290,15 +292,15 @@@ struct task_group root_task_group
   static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
   /* Default task group's cfs_rq on each cpu */
   static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
- -#endif
+ +#endif /* CONFIG_FAIR_GROUP_SCHED */
   
   #ifdef CONFIG_RT_GROUP_SCHED
   static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
   static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
- -#endif
- -#else
+ +#endif /* CONFIG_RT_GROUP_SCHED */
+ +#else /* !CONFIG_FAIR_GROUP_SCHED */
   #define root_task_group init_task_group
- -#endif
+ +#endif /* CONFIG_FAIR_GROUP_SCHED */
   
   /* task_group_lock serializes add/remove of task groups and also changes to
    * a task group's cpu shares.
@@@ -309,9 -308,9 +310,9 @@@ static DEFINE_SPINLOCK(task_group_lock)
   #ifdef CONFIG_FAIR_GROUP_SCHED
   #ifdef CONFIG_USER_SCHED
   # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
- -#else
+ +#else /* !CONFIG_USER_SCHED */
   # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
- -#endif
+ +#endif /* CONFIG_USER_SCHED */
   
   /*
    * A weight of 0 or 1 can cause arithmetics problems.
@@@ -365,10 -364,6 +366,10 @@@ static inline void set_task_rq(struct t
   #else
   
   static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+ +static inline struct task_group *task_group(struct task_struct *p)
+ +{
+ +      return NULL;
+ +}
   
   #endif        /* CONFIG_GROUP_SCHED */
   
@@@ -379,7 -374,6 +380,7 @@@ struct cfs_rq 
   
         u64 exec_clock;
         u64 min_vruntime;
+ +      u64 pair_start;
   
         struct rb_root tasks_timeline;
         struct rb_node *rb_leftmost;
@@@ -408,31 -402,6 +409,31 @@@
          */
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg;  /* group that "owns" this runqueue */
+ +
+ +#ifdef CONFIG_SMP
+ +      /*
+ +       * the part of load.weight contributed by tasks
+ +       */
+ +      unsigned long task_weight;
+ +
+ +      /*
+ +       *   h_load = weight * f(tg)
+ +       *
+ +       * Where f(tg) is the recursive weight fraction assigned to
+ +       * this group.
+ +       */
+ +      unsigned long h_load;
+ +
+ +      /*
+ +       * this cpu's part of tg->shares
+ +       */
+ +      unsigned long shares;
+ +
+ +      /*
+ +       * load.weight at the time we set shares
+ +       */
+ +      unsigned long rq_weight;
+ +#endif
   #endif
   };
   
@@@ -484,9 -453,6 +485,9 @@@ struct root_domain 
          */
         cpumask_t rto_mask;
         atomic_t rto_count;
+ +#ifdef CONFIG_SMP
+ +      struct cpupri cpupri;
+ +#endif
   };
   
   /*
@@@ -561,9 -527,6 +562,9 @@@ struct rq 
         int push_cpu;
         /* cpu of this runqueue: */
         int cpu;
+ +      int online;
+ +
+ +      unsigned long avg_load_per_task;
   
         struct task_struct *migration_thread;
         struct list_head migration_queue;
@@@ -645,6 -608,24 +646,24 @@@ static inline void update_rq_clock(stru
   # define const_debug static const
   #endif
   
+ /**
+  * runqueue_is_locked
+  *
+  * Returns true if the current cpu runqueue is locked.
+  * This interface allows printk to be called with the runqueue lock
+  * held and know whether or not it is OK to wake up the klogd.
+  */
+ int runqueue_is_locked(void)
+ {
+       int cpu = get_cpu();
+       struct rq *rq = cpu_rq(cpu);
+       int ret;
+ 
+       ret = spin_is_locked(&rq->lock);
+       put_cpu();
+       return ret;
+ }
+ 
   /*
    * Debugging: various feature bits
    */
@@@ -786,12 -767,6 +805,12 @@@ late_initcall(sched_init_debug)
    */
   const_debug unsigned int sysctl_sched_nr_migrate = 32;
   
+ +/*
+ + * ratelimit for updating the group shares.
+ + * default: 0.5ms
+ + */
+ +const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+ +
   /*
    * period over which we measure -rt task cpu usage in us.
    * default: 1s
@@@ -819,6 -794,82 +838,6 @@@ static inline u64 global_rt_runtime(voi
         return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
   }
   
- -unsigned long long time_sync_thresh = 100000;
- -
- -static DEFINE_PER_CPU(unsigned long long, time_offset);
- -static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
- -
- -/*
- - * Global lock which we take every now and then to synchronize
- - * the CPUs time. This method is not warp-safe, but it's good
- - * enough to synchronize slowly diverging time sources and thus
- - * it's good enough for tracing:
- - */
- -static DEFINE_SPINLOCK(time_sync_lock);
- -static unsigned long long prev_global_time;
- -
- -static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
- -{
- -      /*
- -       * We want this inlined, to not get tracer function calls
- -       * in this critical section:
- -       */
- -      spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
- -      __raw_spin_lock(&time_sync_lock.raw_lock);
- -
- -      if (time < prev_global_time) {
- -              per_cpu(time_offset, cpu) += prev_global_time - time;
- -              time = prev_global_time;
- -      } else {
- -              prev_global_time = time;
- -      }
- -
- -      __raw_spin_unlock(&time_sync_lock.raw_lock);
- -      spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
- -
- -      return time;
- -}
- -
- -static unsigned long long __cpu_clock(int cpu)
- -{
- -      unsigned long long now;
- -
- -      /*
- -       * Only call sched_clock() if the scheduler has already been
- -       * initialized (some code might call cpu_clock() very early):
- -       */
- -      if (unlikely(!scheduler_running))
- -              return 0;
- -
- -      now = sched_clock_cpu(cpu);
- -
- -      return now;
- -}
- -
- -/*
- - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- - * clock constructed from sched_clock():
- - */
- -unsigned long long notrace cpu_clock(int cpu)
- -{
- -      unsigned long long prev_cpu_time, time, delta_time;
- -      unsigned long flags;
- -
- -      local_irq_save(flags);
- -      prev_cpu_time = per_cpu(prev_cpu_time, cpu);
- -      time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
- -      delta_time = time-prev_cpu_time;
- -
- -      if (unlikely(delta_time > time_sync_thresh)) {
- -              time = __sync_cpu_clock(time, cpu);
- -              per_cpu(prev_cpu_time, cpu) = time;
- -      }
- -      local_irq_restore(flags);
- -
- -      return time;
- -}
- -EXPORT_SYMBOL_GPL(cpu_clock);
- -
   #ifndef prepare_arch_switch
   # define prepare_arch_switch(next)    do { } while (0)
   #endif
@@@ -1281,15 -1332,15 +1300,15 @@@ void wake_up_idle_cpu(int cpu
         if (!tsk_is_polling(rq->idle))
                 smp_send_reschedule(cpu);
   }
- -#endif
+ +#endif /* CONFIG_NO_HZ */
   
- -#else
+ +#else /* !CONFIG_SMP */
   static void __resched_task(struct task_struct *p, int tif_bit)
   {
         assert_spin_locked(&task_rq(p)->lock);
         set_tsk_thread_flag(p, tif_bit);
   }
- -#endif
+ +#endif /* CONFIG_SMP */
   
   #if BITS_PER_LONG == 32
   # define WMULT_CONST  (~0UL)
@@@ -1304,9 -1355,6 +1323,9 @@@
    */
   #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
   
+ +/*
+ + * delta *= weight / lw
+ + */
   static unsigned long
   calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                 struct load_weight *lw)
@@@ -1334,6 -1382,12 +1353,6 @@@
         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
   }
   
- -static inline unsigned long
- -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
- -{
- -      return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
- -}
- -
   static inline void update_load_add(struct load_weight *lw, unsigned long inc)
   {
         lw->weight += inc;
@@@ -1444,211 -1498,17 +1463,211 @@@ static inline void dec_cpu_load(struct 
   #ifdef CONFIG_SMP
   static unsigned long source_load(int cpu, int type);
   static unsigned long target_load(int cpu, int type);
- -static unsigned long cpu_avg_load_per_task(int cpu);
   static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- -#else /* CONFIG_SMP */
+ +
+ +static unsigned long cpu_avg_load_per_task(int cpu)
+ +{
+ +      struct rq *rq = cpu_rq(cpu);
+ +
+ +      if (rq->nr_running)
+ +              rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+ +
+ +      return rq->avg_load_per_task;
+ +}
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ +
+ +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ +
+ +/*
+ + * Iterate the full tree, calling @down when first entering a node and @up when
+ + * leaving it for the final time.
+ + */
+ +static void
+ +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
   {
+ +      struct task_group *parent, *child;
+ +
+ +      rcu_read_lock();
+ +      parent = &root_task_group;
+ +down:
+ +      (*down)(parent, cpu, sd);
+ +      list_for_each_entry_rcu(child, &parent->children, siblings) {
+ +              parent = child;
+ +              goto down;
+ +
+ +up:
+ +              continue;
+ +      }
+ +      (*up)(parent, cpu, sd);
+ +
+ +      child = parent;
+ +      parent = parent->parent;
+ +      if (parent)
+ +              goto up;
+ +      rcu_read_unlock();
+ +}
+ +
+ +static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+ +
+ +/*
+ + * Calculate and set the cpu's group shares.
+ + */
+ +static void
+ +__update_group_shares_cpu(struct task_group *tg, int cpu,
+ +                        unsigned long sd_shares, unsigned long sd_rq_weight)
+ +{
+ +      int boost = 0;
+ +      unsigned long shares;
+ +      unsigned long rq_weight;
+ +
+ +      if (!tg->se[cpu])
+ +              return;
+ +
+ +      rq_weight = tg->cfs_rq[cpu]->load.weight;
+ +
+ +      /*
+ +       * If there are currently no tasks on the cpu pretend there is one of
+ +       * average load so that when a new task gets to run here it will not
+ +       * get delayed by group starvation.
+ +       */
+ +      if (!rq_weight) {
+ +              boost = 1;
+ +              rq_weight = NICE_0_LOAD;
+ +      }
+ +
+ +      if (unlikely(rq_weight > sd_rq_weight))
+ +              rq_weight = sd_rq_weight;
+ +
+ +      /*
+ +       *           \Sum shares * rq_weight
+ +       * shares =  -----------------------
+ +       *               \Sum rq_weight
+ +       *
+ +       */
+ +      shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ +
+ +      /*
+ +       * record the actual number of shares, not the boosted amount.
+ +       */
+ +      tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ +      tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ +
+ +      if (shares < MIN_SHARES)
+ +              shares = MIN_SHARES;
+ +      else if (shares > MAX_SHARES)
+ +              shares = MAX_SHARES;
+ +
+ +      __set_se_shares(tg->se[cpu], shares);
+ +}
+ +
+ +/*
+ + * Re-compute the task group their per cpu shares over the given domain.
+ + * This needs to be done in a bottom-up fashion because the rq weight of a
+ + * parent group depends on the shares of its child groups.
+ + */
+ +static void
+ +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ +{
+ +      unsigned long rq_weight = 0;
+ +      unsigned long shares = 0;
+ +      int i;
+ +
+ +      for_each_cpu_mask(i, sd->span) {
+ +              rq_weight += tg->cfs_rq[i]->load.weight;
+ +              shares += tg->cfs_rq[i]->shares;
+ +      }
+ +
+ +      if ((!shares && rq_weight) || shares > tg->shares)
+ +              shares = tg->shares;
+ +
+ +      if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+ +              shares = tg->shares;
+ +
+ +      if (!rq_weight)
+ +              rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+ +
+ +      for_each_cpu_mask(i, sd->span) {
+ +              struct rq *rq = cpu_rq(i);
+ +              unsigned long flags;
+ +
+ +              spin_lock_irqsave(&rq->lock, flags);
+ +              __update_group_shares_cpu(tg, i, shares, rq_weight);
+ +              spin_unlock_irqrestore(&rq->lock, flags);
+ +      }
+ +}
+ +
+ +/*
+ + * Compute the cpu's hierarchical load factor for each task group.
+ + * This needs to be done in a top-down fashion because the load of a child
+ + * group is a fraction of its parents load.
+ + */
+ +static void
+ +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ +{
+ +      unsigned long load;
+ +
+ +      if (!tg->parent) {
+ +              load = cpu_rq(cpu)->load.weight;
+ +      } else {
+ +              load = tg->parent->cfs_rq[cpu]->h_load;
+ +              load *= tg->cfs_rq[cpu]->shares;
+ +              load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+ +      }
+ +
+ +      tg->cfs_rq[cpu]->h_load = load;
+ +}
+ +
+ +static void
+ +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
+ +{
+ +}
+ +
+ +static void update_shares(struct sched_domain *sd)
+ +{
+ +      u64 now = cpu_clock(raw_smp_processor_id());
+ +      s64 elapsed = now - sd->last_update;
+ +
+ +      if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+ +              sd->last_update = now;
+ +              walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+ +      }
   }
+ +
+ +static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+ +{
+ +      spin_unlock(&rq->lock);
+ +      update_shares(sd);
+ +      spin_lock(&rq->lock);
+ +}
+ +
+ +static void update_h_load(int cpu)
+ +{
+ +      walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+ +}
+ +
+ +#else
+ +
+ +static inline void update_shares(struct sched_domain *sd)
+ +{
+ +}
+ +
+ +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+ +{
+ +}
+ +
   #endif
   
- -#endif /* CONFIG_SMP */
+ +#endif
+ +
+ +#ifdef CONFIG_FAIR_GROUP_SCHED
+ +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ +{
+ +#ifdef CONFIG_SMP
+ +      cfs_rq->shares = shares;
+ +#endif
+ +}
+ +#endif
   
   #include "sched_stats.h"
   #include "sched_idletask.c"
@@@ -1659,17 -1519,27 +1678,17 @@@
   #endif
   
   #define sched_class_highest (&rt_sched_class)
+ +#define for_each_class(class) \
+ +   for (class = sched_class_highest; class; class = class->next)
   
- -static inline void inc_load(struct rq *rq, const struct task_struct *p)
- -{
- -      update_load_add(&rq->load, p->se.load.weight);
- -}
- -
- -static inline void dec_load(struct rq *rq, const struct task_struct *p)
- -{
- -      update_load_sub(&rq->load, p->se.load.weight);
- -}
- -
- -static void inc_nr_running(struct task_struct *p, struct rq *rq)
+ +static void inc_nr_running(struct rq *rq)
   {
         rq->nr_running++;
- -      inc_load(rq, p);
   }
   
- -static void dec_nr_running(struct task_struct *p, struct rq *rq)
+ +static void dec_nr_running(struct rq *rq)
   {
         rq->nr_running--;
- -      dec_load(rq, p);
   }
   
   static void set_load_weight(struct task_struct *p)
@@@ -1693,12 -1563,6 +1712,12 @@@
         p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
   }
   
+ +static void update_avg(u64 *avg, u64 sample)
+ +{
+ +      s64 diff = sample - *avg;
+ +      *avg += diff >> 3;
+ +}
+ +
   static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
   {
         sched_info_queued(p);
@@@ -1708,13 -1572,6 +1727,13 @@@
   
   static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
   {
+ +      if (sleep && p->se.last_wakeup) {
+ +              update_avg(&p->se.avg_overlap,
+ +                         p->se.sum_exec_runtime - p->se.last_wakeup);
+ +              p->se.last_wakeup = 0;
+ +      }
+ +
+ +      sched_info_dequeued(p);
         p->sched_class->dequeue_task(rq, p, sleep);
         p->se.on_rq = 0;
   }
@@@ -1774,7 -1631,7 +1793,7 @@@ static void activate_task(struct rq *rq
                 rq->nr_uninterruptible--;
   
         enqueue_task(rq, p, wakeup);
- -      inc_nr_running(p, rq);
+ +      inc_nr_running(rq);
   }
   
   /*
@@@ -1786,7 -1643,7 +1805,7 @@@ static void deactivate_task(struct rq *
                 rq->nr_uninterruptible++;
   
         dequeue_task(rq, p, sleep);
- -      dec_nr_running(p, rq);
+ +      dec_nr_running(rq);
   }
   
   /**
@@@ -1798,6 -1655,12 +1817,6 @@@ inline int task_curr(const struct task_
         return cpu_curr(task_cpu(p)) == p;
   }
   
- -/* Used instead of source_load when we know the type == 0 */
- -unsigned long weighted_cpuload(const int cpu)
- -{
- -      return cpu_rq(cpu)->load.weight;
- -}
- -
   static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
   {
         set_task_rq(p, cpu);
@@@ -1826,12 -1689,6 +1845,12 @@@ static inline void check_class_changed(
   
   #ifdef CONFIG_SMP
   
+ +/* Used instead of source_load when we know the type == 0 */
+ +static unsigned long weighted_cpuload(const int cpu)
+ +{
+ +      return cpu_rq(cpu)->load.weight;
+ +}
+ +
   /*
    * Is this task likely cache-hot:
    */
@@@ -2042,7 -1899,7 +2061,7 @@@ static unsigned long source_load(int cp
         struct rq *rq = cpu_rq(cpu);
         unsigned long total = weighted_cpuload(cpu);
   
- -      if (type == 0)
+ +      if (type == 0 || !sched_feat(LB_BIAS))
                 return total;
   
         return min(rq->cpu_load[type-1], total);
@@@ -2057,12 -1914,24 +2076,12 @@@ static unsigned long target_load(int cp
         struct rq *rq = cpu_rq(cpu);
         unsigned long total = weighted_cpuload(cpu);
   
- -      if (type == 0)
+ +      if (type == 0 || !sched_feat(LB_BIAS))
                 return total;
   
         return max(rq->cpu_load[type-1], total);
   }
   
- -/*
- - * Return the average load per task on the cpu's run queue
- - */
- -static unsigned long cpu_avg_load_per_task(int cpu)
- -{
- -      struct rq *rq = cpu_rq(cpu);
- -      unsigned long total = weighted_cpuload(cpu);
- -      unsigned long n = rq->nr_running;
- -
- -      return n ? total / n : SCHED_LOAD_SCALE;
- -}
- -
   /*
    * find_idlest_group finds and returns the least busy CPU group within the
    * domain.
@@@ -2169,9 -2038,6 +2188,9 @@@ static int sched_balance_self(int cpu, 
                         sd = tmp;
         }
   
+ +      if (sd)
+ +              update_shares(sd);
+ +
         while (sd) {
                 cpumask_t span, tmpmask;
                 struct sched_group *group;
@@@ -2238,22 -2104,6 +2257,22 @@@ static int try_to_wake_up(struct task_s
         if (!sched_feat(SYNC_WAKEUPS))
                 sync = 0;
   
+ +#ifdef CONFIG_SMP
+ +      if (sched_feat(LB_WAKEUP_UPDATE)) {
+ +              struct sched_domain *sd;
+ +
+ +              this_cpu = raw_smp_processor_id();
+ +              cpu = task_cpu(p);
+ +
+ +              for_each_domain(this_cpu, sd) {
+ +                      if (cpu_isset(cpu, sd->span)) {
+ +                              update_shares(sd);
+ +                              break;
+ +                      }
+ +              }
+ +      }
+ +#endif
+ +
         smp_wmb();
         rq = task_rq_lock(p, &flags);
         old_state = p->state;
@@@ -2300,7 -2150,7 +2319,7 @@@
                         }
                 }
         }
- -#endif
+ +#endif /* CONFIG_SCHEDSTATS */
   
   out_activate:
   #endif /* CONFIG_SMP */
@@@ -2318,6 -2168,9 +2337,9 @@@
         success = 1;
   
   out_running:
+       trace_mark(kernel_sched_wakeup,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
         check_preempt_curr(rq, p);
   
         p->state = TASK_RUNNING;
@@@ -2326,8 -2179,6 +2348,8 @@@
                 p->sched_class->task_wake_up(rq, p);
   #endif
   out:
+ +      current->se.last_wakeup = current->se.sum_exec_runtime;
+ +
         task_rq_unlock(rq, &flags);
   
         return success;
@@@ -2448,8 -2299,11 +2470,11 @@@ void wake_up_new_task(struct task_struc
                  * management (if any):
                  */
                 p->sched_class->task_new(rq, p);
- -              inc_nr_running(p, rq);
+ +              inc_nr_running(rq);
         }
+       trace_mark(kernel_sched_wakeup_new,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
         check_preempt_curr(rq, p);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@@ -2502,7 -2356,7 +2527,7 @@@ fire_sched_out_preempt_notifiers(struc
                 notifier->ops->sched_out(notifier, next);
   }
   
- -#else
+ +#else /* !CONFIG_PREEMPT_NOTIFIERS */
   
   static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
   {
@@@ -2514,7 -2368,7 +2539,7 @@@ fire_sched_out_preempt_notifiers(struc
   {
   }
   
- -#endif
+ +#endif /* CONFIG_PREEMPT_NOTIFIERS */
   
   /**
    * prepare_task_switch - prepare to switch tasks
@@@ -2622,6 -2476,11 +2647,11 @@@ context_switch(struct rq *rq, struct ta
         struct mm_struct *mm, *oldmm;
   
         prepare_task_switch(rq, prev, next);
+       trace_mark(kernel_sched_schedule,
+               "prev_pid %d next_pid %d prev_state %ld "
+               "## rq %p prev %p next %p",
+               prev->pid, next->pid, prev->state,
+               rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@@ -2956,7 -2815,7 +2986,7 @@@ balance_tasks(struct rq *this_rq, int t
               enum cpu_idle_type idle, int *all_pinned,
               int *this_best_prio, struct rq_iterator *iterator)
   {
- -      int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+ +      int loops = 0, pulled = 0, pinned = 0;
         struct task_struct *p;
         long rem_load_move = max_load_move;
   
@@@ -2972,8 -2831,14 +3002,8 @@@
   next:
         if (!p || loops++ > sysctl_sched_nr_migrate)
                 goto out;
- -      /*
- -       * To help distribute high priority tasks across CPUs we don't
- -       * skip a task if it will be the highest priority task (i.e. smallest
- -       * prio value) on its new queue regardless of its load weight
- -       */
- -      skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
- -                                                       SCHED_LOAD_SCALE_FUZZ;
- -      if ((skip_for_load && p->prio >= *this_best_prio) ||
+ +
+ +      if ((p->se.load.weight >> 1) > rem_load_move ||
             !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
                 p = iterator->next(iterator->arg);
                 goto next;
@@@ -3028,10 -2893,6 +3058,10 @@@ static int move_tasks(struct rq *this_r
                                 max_load_move - total_load_moved,
                                 sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
+ +
+ +              if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+ +                      break;
+ +
         } while (class && max_load_move > total_load_moved);
   
         return total_load_moved > 0;
@@@ -3108,7 -2969,6 +3138,7 @@@ find_busiest_group(struct sched_domain 
         max_load = this_load = total_load = total_pwr = 0;
         busiest_load_per_task = busiest_nr_running = 0;
         this_load_per_task = this_nr_running = 0;
+ +
         if (idle == CPU_NOT_IDLE)
                 load_idx = sd->busy_idx;
         else if (idle == CPU_NEWLY_IDLE)
@@@ -3123,8 -2983,6 +3153,8 @@@
                 int __group_imb = 0;
                 unsigned int balance_cpu = -1, first_idle_cpu = 0;
                 unsigned long sum_nr_running, sum_weighted_load;
+ +              unsigned long sum_avg_load_per_task;
+ +              unsigned long avg_load_per_task;
   
                 local_group = cpu_isset(this_cpu, group->cpumask);
   
@@@ -3133,8 -2991,6 +3163,8 @@@
   
                 /* Tally up the load of all CPUs in the group */
                 sum_weighted_load = sum_nr_running = avg_load = 0;
+ +              sum_avg_load_per_task = avg_load_per_task = 0;
+ +
                 max_cpu_load = 0;
                 min_cpu_load = ~0UL;
   
@@@ -3168,8 -3024,6 +3198,8 @@@
                         avg_load += load;
                         sum_nr_running += rq->nr_running;
                         sum_weighted_load += weighted_cpuload(i);
+ +
+ +                      sum_avg_load_per_task += cpu_avg_load_per_task(i);
                 }
   
                 /*
@@@ -3191,20 -3045,7 +3221,20 @@@
                 avg_load = sg_div_cpu_power(group,
                                 avg_load * SCHED_LOAD_SCALE);
   
- -              if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+ +
+ +              /*
+ +               * Consider the group unbalanced when the imbalance is larger
+ +               * than the average weight of two tasks.
+ +               *
+ +               * APZ: with cgroup the avg task weight can vary wildly and
+ +               *      might not be a suitable number - should we keep a
+ +               *      normalized nr_running number somewhere that negates
+ +               *      the hierarchy?
+ +               */
+ +              avg_load_per_task = sg_div_cpu_power(group,
+ +                              sum_avg_load_per_task * SCHED_LOAD_SCALE);
+ +
+ +              if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                         __group_imb = 1;
   
                 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@@ -3345,9 -3186,9 +3375,9 @@@ small_imbalance
                         if (busiest_load_per_task > this_load_per_task)
                                 imbn = 1;
                 } else
- -                      this_load_per_task = SCHED_LOAD_SCALE;
+ +                      this_load_per_task = cpu_avg_load_per_task(this_cpu);
   
- -              if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+ +              if (max_load - this_load + 2*busiest_load_per_task >=
                                         busiest_load_per_task * imbn) {
                         *imbalance = busiest_load_per_task;
                         return busiest;
@@@ -3473,7 -3314,6 +3503,7 @@@ static int load_balance(int this_cpu, s
         schedstat_inc(sd, lb_count[idle]);
   
   redo:
+ +      update_shares(sd);
         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                    cpus, balance);
   
@@@ -3576,9 -3416,8 +3606,9 @@@
   
         if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -              return -1;
- -      return ld_moved;
+ +              ld_moved = -1;
+ +
+ +      goto out;
   
   out_balanced:
         schedstat_inc(sd, lb_balanced[idle]);
@@@ -3593,13 -3432,8 +3623,13 @@@ out_one_pinned
   
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- -              return -1;
- -      return 0;
+ +              ld_moved = -1;
+ +      else
+ +              ld_moved = 0;
+ +out:
+ +      if (ld_moved)
+ +              update_shares(sd);
+ +      return ld_moved;
   }
   
   /*
@@@ -3634,7 -3468,6 +3664,7 @@@ load_balance_newidle(int this_cpu, stru
   
         schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
   redo:
+ +      update_shares_locked(this_rq, sd);
         group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
                                    &sd_idle, cpus, NULL);
         if (!group) {
@@@ -3678,7 -3511,6 +3708,7 @@@
         } else
                 sd->nr_balance_failed = 0;
   
+ +      update_shares_locked(this_rq, sd);
         return ld_moved;
   
   out_balanced:
@@@ -3870,7 -3702,6 +3900,7 @@@ static void rebalance_domains(int cpu, 
         /* Earliest time when we have to do rebalance again */
         unsigned long next_balance = jiffies + 60*HZ;
         int update_next_balance = 0;
+ +      int need_serialize;
         cpumask_t tmp;
   
         for_each_domain(cpu, sd) {
@@@ -3888,9 -3719,8 +3918,9 @@@
                 if (interval > HZ*NR_CPUS/10)
                         interval = HZ*NR_CPUS/10;
   
+ +              need_serialize = sd->flags & SD_SERIALIZE;
   
- -              if (sd->flags & SD_SERIALIZE) {
+ +              if (need_serialize) {
                         if (!spin_trylock(&balancing))
                                 goto out;
                 }
@@@ -3906,7 -3736,7 +3936,7 @@@
                         }
                         sd->last_balance = jiffies;
                 }
- -              if (sd->flags & SD_SERIALIZE)
+ +              if (need_serialize)
                         spin_unlock(&balancing);
   out:
                 if (time_after(next_balance, sd->last_balance + interval)) {
@@@ -4221,26 -4051,44 +4251,44 @@@ void scheduler_tick(void
   #endif
   }
   
- #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+ #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+ 
+ static inline unsigned long get_parent_ip(unsigned long addr)
+ {
+       if (in_lock_functions(addr)) {
+               addr = CALLER_ADDR2;
+               if (in_lock_functions(addr))
+                       addr = CALLER_ADDR3;
+       }
+       return addr;
+ }
   
   void __kprobes add_preempt_count(int val)
   {
+ #ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Underflow?
          */
         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                 return;
+ #endif
         preempt_count() += val;
+ #ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Spinlock count overflowing soon?
          */
         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                 PREEMPT_MASK - 10);
+ #endif
+       if (preempt_count() == val)
+               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
   }
   EXPORT_SYMBOL(add_preempt_count);
   
   void __kprobes sub_preempt_count(int val)
   {
+ #ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Underflow?
          */
@@@ -4252,7 -4100,10 +4300,10 @@@
         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                         !(preempt_count() & PREEMPT_MASK)))
                 return;
+ #endif
   
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
         preempt_count() -= val;
   }
   EXPORT_SYMBOL(sub_preempt_count);
@@@ -4270,7 -4121,6 +4321,7 @@@ static noinline void __schedule_bug(str
                 prev->comm, prev->pid, preempt_count());
   
         debug_show_held_locks(prev);
+ +      print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
   
@@@ -4344,7 -4194,7 +4395,7 @@@ asmlinkage void __sched schedule(void
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
- -      int cpu;
+ +      int cpu, hrtick = sched_feat(HRTICK);
   
   need_resched:
         preempt_disable();
@@@ -4359,8 -4209,7 +4410,8 @@@ need_resched_nonpreemptible
   
         schedule_debug(prev);
   
- -      hrtick_clear(rq);
+ +      if (hrtick)
+ +              hrtick_clear(rq);
   
         /*
          * Do the rq-clock update outside the rq lock:
@@@ -4406,8 -4255,7 +4457,8 @@@
         } else
                 spin_unlock_irq(&rq->lock);
   
- -      hrtick_set(rq);
+ +      if (hrtick)
+ +              hrtick_set(rq);
   
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
@@@ -4789,8 -4637,10 +4840,8 @@@ void set_user_nice(struct task_struct *
                 goto out_unlock;
         }
         on_rq = p->se.on_rq;
- -      if (on_rq) {
+ +      if (on_rq)
                 dequeue_task(rq, p, 0);
- -              dec_load(rq, p);
- -      }
   
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
@@@ -4800,6 -4650,7 +4851,6 @@@
   
         if (on_rq) {
                 enqueue_task(rq, p, 0);
- -              inc_load(rq, p);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@@ -5270,6 -5121,24 +5321,6 @@@ asmlinkage long sys_sched_setaffinity(p
         return sched_setaffinity(pid, &new_mask);
   }
   
- -/*
- - * Represents all cpu's present in the system
- - * In systems capable of hotplug, this map could dynamically grow
- - * as new cpu's are detected in the system via any platform specific
- - * method, such as ACPI for e.g.
- - */
- -
- -cpumask_t cpu_present_map __read_mostly;
- -EXPORT_SYMBOL(cpu_present_map);
- -
- -#ifndef CONFIG_SMP
- -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
- -EXPORT_SYMBOL(cpu_online_map);
- -
- -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
- -EXPORT_SYMBOL(cpu_possible_map);
- -#endif
- -
   long sched_getaffinity(pid_t pid, cpumask_t *mask)
   {
         struct task_struct *p;
@@@ -5566,7 -5435,7 +5617,7 @@@ out_unlock
         return retval;
   }
   
- static const char stat_nam[] = "RSDTtZX";
+ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
   
   void sched_show_task(struct task_struct *p)
   {
@@@ -5753,12 -5622,6 +5804,12 @@@ int set_cpus_allowed_ptr(struct task_st
                 goto out;
         }
   
+ +      if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+ +                   !cpus_equal(p->cpus_allowed, *new_mask))) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
         if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);
         else {
@@@ -5810,10 -5673,10 +5861,10 @@@ static int __migrate_task(struct task_s
         double_rq_lock(rq_src, rq_dest);
         /* Already moved. */
         if (task_cpu(p) != src_cpu)
- -              goto out;
+ +              goto done;
         /* Affinity changed (again). */
         if (!cpu_isset(dest_cpu, p->cpus_allowed))
- -              goto out;
+ +              goto fail;
   
         on_rq = p->se.on_rq;
         if (on_rq)
@@@ -5824,9 -5687,8 +5875,9 @@@
                 activate_task(rq_dest, p, 0);
                 check_preempt_curr(rq_dest, p);
         }
+ +done:
         ret = 1;
- -out:
+ +fail:
         double_rq_unlock(rq_src, rq_dest);
         return ret;
   }
@@@ -6248,36 -6110,6 +6299,36 @@@ static void unregister_sched_domain_sys
   }
   #endif
   
+ +static void set_rq_online(struct rq *rq)
+ +{
+ +      if (!rq->online) {
+ +              const struct sched_class *class;
+ +
+ +              cpu_set(rq->cpu, rq->rd->online);
+ +              rq->online = 1;
+ +
+ +              for_each_class(class) {
+ +                      if (class->rq_online)
+ +                              class->rq_online(rq);
+ +              }
+ +      }
+ +}
+ +
+ +static void set_rq_offline(struct rq *rq)
+ +{
+ +      if (rq->online) {
+ +              const struct sched_class *class;
+ +
+ +              for_each_class(class) {
+ +                      if (class->rq_offline)
+ +                              class->rq_offline(rq);
+ +              }
+ +
+ +              cpu_clear(rq->cpu, rq->rd->online);
+ +              rq->online = 0;
+ +      }
+ +}
+ +
   /*
    * migration_call - callback that gets triggered when a CPU is added.
    * Here we can start up the necessary migration thread for the new CPU.
@@@ -6315,8 -6147,7 +6366,8 @@@ migration_call(struct notifier_block *n
                 spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
                         BUG_ON(!cpu_isset(cpu, rq->rd->span));
- -                      cpu_set(cpu, rq->rd->online);
+ +
+ +                      set_rq_online(rq);
                 }
                 spin_unlock_irqrestore(&rq->lock, flags);
                 break;
@@@ -6377,7 -6208,7 +6428,7 @@@
                 spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
                         BUG_ON(!cpu_isset(cpu, rq->rd->span));
- -                      cpu_clear(cpu, rq->rd->online);
+ +                      set_rq_offline(rq);
                 }
                 spin_unlock_irqrestore(&rq->lock, flags);
                 break;
@@@ -6411,28 -6242,6 +6462,28 @@@ void __init migration_init(void
   
   #ifdef CONFIG_SCHED_DEBUG
   
+ +static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+ +{
+ +      switch (lvl) {
+ +      case SD_LV_NONE:
+ +                      return "NONE";
+ +      case SD_LV_SIBLING:
+ +                      return "SIBLING";
+ +      case SD_LV_MC:
+ +                      return "MC";
+ +      case SD_LV_CPU:
+ +                      return "CPU";
+ +      case SD_LV_NODE:
+ +                      return "NODE";
+ +      case SD_LV_ALLNODES:
+ +                      return "ALLNODES";
+ +      case SD_LV_MAX:
+ +                      return "MAX";
+ +
+ +      }
+ +      return "MAX";
+ +}
+ +
   static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                   cpumask_t *groupmask)
   {
@@@ -6452,8 -6261,7 +6503,8 @@@
                 return -1;
         }
   
- -      printk(KERN_CONT "span %s\n", str);
+ +      printk(KERN_CONT "span %s level %s\n",
+ +              str, sd_level_to_string(sd->level));
   
         if (!cpu_isset(cpu, sd->span)) {
                 printk(KERN_ERR "ERROR: domain->span does not contain "
@@@ -6537,9 -6345,9 +6588,9 @@@ static void sched_domain_debug(struct s
         }
         kfree(groupmask);
   }
- -#else
+ +#else /* !CONFIG_SCHED_DEBUG */
   # define sched_domain_debug(sd, cpu) do { } while (0)
- -#endif
+ +#endif /* CONFIG_SCHED_DEBUG */
   
   static int sd_degenerate(struct sched_domain *sd)
   {
@@@ -6599,16 -6407,20 +6650,16 @@@ sd_parent_degenerate(struct sched_domai
   static void rq_attach_root(struct rq *rq, struct root_domain *rd)
   {
         unsigned long flags;
- -      const struct sched_class *class;
   
         spin_lock_irqsave(&rq->lock, flags);
   
         if (rq->rd) {
                 struct root_domain *old_rd = rq->rd;
   
- -              for (class = sched_class_highest; class; class = class->next) {
- -                      if (class->leave_domain)
- -                              class->leave_domain(rq);
- -              }
+ +              if (cpu_isset(rq->cpu, old_rd->online))
+ +                      set_rq_offline(rq);
   
                 cpu_clear(rq->cpu, old_rd->span);
- -              cpu_clear(rq->cpu, old_rd->online);
   
                 if (atomic_dec_and_test(&old_rd->refcount))
                         kfree(old_rd);
@@@ -6619,7 -6431,12 +6670,7 @@@
   
         cpu_set(rq->cpu, rd->span);
         if (cpu_isset(rq->cpu, cpu_online_map))
- -              cpu_set(rq->cpu, rd->online);
- -
- -      for (class = sched_class_highest; class; class = class->next) {
- -              if (class->join_domain)
- -                      class->join_domain(rq);
- -      }
+ +              set_rq_online(rq);
   
         spin_unlock_irqrestore(&rq->lock, flags);
   }
@@@ -6630,8 -6447,6 +6681,8 @@@ static void init_rootdomain(struct root
   
         cpus_clear(rd->span);
         cpus_clear(rd->online);
+ +
+ +      cpupri_init(&rd->cpupri);
   }
   
   static void init_defrootdomain(void)
@@@ -6774,9 -6589,9 +6825,9 @@@ static int find_next_best_node(int node
   
         min_val = INT_MAX;
   
- -      for (i = 0; i < MAX_NUMNODES; i++) {
+ +      for (i = 0; i < nr_node_ids; i++) {
                 /* Start at @node */
- -              n = (node + i) % MAX_NUMNODES;
+ +              n = (node + i) % nr_node_ids;
   
                 if (!nr_cpus_node(n))
                         continue;
@@@ -6826,7 -6641,7 +6877,7 @@@ static void sched_domain_node_span(int 
                 cpus_or(*span, *span, *nodemask);
         }
   }
- -#endif
+ +#endif /* CONFIG_NUMA */
   
   int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
   
@@@ -6845,7 -6660,7 +6896,7 @@@ cpu_to_cpu_group(int cpu, const cpumask
                 *sg = &per_cpu(sched_group_cpus, cpu);
         return cpu;
   }
- -#endif
+ +#endif /* CONFIG_SCHED_SMT */
   
   /*
    * multi-core sched-domains:
@@@ -6853,7 -6668,7 +6904,7 @@@
   #ifdef CONFIG_SCHED_MC
   static DEFINE_PER_CPU(struct sched_domain, core_domains);
   static DEFINE_PER_CPU(struct sched_group, sched_group_core);
- -#endif
+ +#endif /* CONFIG_SCHED_MC */
   
   #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
   static int
@@@ -6955,7 -6770,7 +7006,7 @@@ static void init_numa_sched_groups_powe
                 sg = sg->next;
         } while (sg != group_head);
   }
- -#endif
+ +#endif /* CONFIG_NUMA */
   
   #ifdef CONFIG_NUMA
   /* Free memory allocated for various sched_group structures */
@@@ -6970,7 -6785,7 +7021,7 @@@ static void free_sched_groups(const cpu
                 if (!sched_group_nodes)
                         continue;
   
- -              for (i = 0; i < MAX_NUMNODES; i++) {
+ +              for (i = 0; i < nr_node_ids; i++) {
                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
   
                         *nodemask = node_to_cpumask(i);
@@@ -6992,11 -6807,11 +7043,11 @@@ next_sg
                 sched_group_nodes_bycpu[cpu] = NULL;
         }
   }
- -#else
+ +#else /* !CONFIG_NUMA */
   static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
   {
   }
- -#endif
+ +#endif /* CONFIG_NUMA */
   
   /*
    * Initialize sched groups cpu_power.
@@@ -7163,7 -6978,7 +7214,7 @@@ static int __build_sched_domains(const 
         /*
          * Allocate the per-node list of sched groups
          */
- -      sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+ +      sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
                                     GFP_KERNEL);
         if (!sched_group_nodes) {
                 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@@ -7302,7 -7117,7 +7353,7 @@@
   #endif
   
         /* Set up physical groups */
- -      for (i = 0; i < MAX_NUMNODES; i++) {
+ +      for (i = 0; i < nr_node_ids; i++) {
                 SCHED_CPUMASK_VAR(nodemask, allmasks);
                 SCHED_CPUMASK_VAR(send_covered, allmasks);
   
@@@ -7326,7 -7141,7 +7377,7 @@@
                                         send_covered, tmpmask);
         }
   
- -      for (i = 0; i < MAX_NUMNODES; i++) {
+ +      for (i = 0; i < nr_node_ids; i++) {
                 /* Set up node groups */
                 struct sched_group *sg, *prev;
                 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@@ -7365,9 -7180,9 +7416,9 @@@
                 cpus_or(*covered, *covered, *nodemask);
                 prev = sg;
   
- -              for (j = 0; j < MAX_NUMNODES; j++) {
+ +              for (j = 0; j < nr_node_ids; j++) {
                         SCHED_CPUMASK_VAR(notcovered, allmasks);
- -                      int n = (i + j) % MAX_NUMNODES;
+ +                      int n = (i + j) % nr_node_ids;
                         node_to_cpumask_ptr(pnodemask, n);
   
                         cpus_complement(*notcovered, *covered);
@@@ -7420,7 -7235,7 +7471,7 @@@
         }
   
   #ifdef CONFIG_NUMA
- -      for (i = 0; i < MAX_NUMNODES; i++)
+ +      for (i = 0; i < nr_node_ids; i++)
                 init_numa_sched_groups_power(sched_group_nodes[i]);
   
         if (sd_allnodes) {
@@@ -7705,7 -7520,7 +7756,7 @@@ int sched_create_sysfs_power_savings_en
   #endif
         return err;
   }
- -#endif
+ +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
   
   /*
    * Force a reinitialization of the sched domains hierarchy. The domains
@@@ -7716,28 -7531,21 +7767,28 @@@
   static int update_sched_domains(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
   {
+ +      int cpu = (int)(long)hcpu;
+ +
         switch (action) {
- -      case CPU_UP_PREPARE:
- -      case CPU_UP_PREPARE_FROZEN:
         case CPU_DOWN_PREPARE:
         case CPU_DOWN_PREPARE_FROZEN:
+ +              disable_runtime(cpu_rq(cpu));
+ +              /* fall-through */
+ +      case CPU_UP_PREPARE:
+ +      case CPU_UP_PREPARE_FROZEN:
                 detach_destroy_domains(&cpu_online_map);
                 free_sched_domains();
                 return NOTIFY_OK;
   
- -      case CPU_UP_CANCELED:
- -      case CPU_UP_CANCELED_FROZEN:
+ +
         case CPU_DOWN_FAILED:
         case CPU_DOWN_FAILED_FROZEN:
         case CPU_ONLINE:
         case CPU_ONLINE_FROZEN:
+ +              enable_runtime(cpu_rq(cpu));
+ +              /* fall-through */
+ +      case CPU_UP_CANCELED:
+ +      case CPU_UP_CANCELED_FROZEN:
         case CPU_DEAD:
         case CPU_DEAD_FROZEN:
                 /*
@@@ -7937,8 -7745,8 +7988,8 @@@ void __init sched_init(void
   
                 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
- -#endif
- -#endif
+ +#endif /* CONFIG_USER_SCHED */
+ +#endif /* CONFIG_FAIR_GROUP_SCHED */
   #ifdef CONFIG_RT_GROUP_SCHED
                 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
@@@ -7952,8 -7760,8 +8003,8 @@@
   
                 root_task_group.rt_rq = (struct rt_rq **)ptr;
                 ptr += nr_cpu_ids * sizeof(void **);
- -#endif
- -#endif
+ +#endif /* CONFIG_USER_SCHED */
+ +#endif /* CONFIG_RT_GROUP_SCHED */
         }
   
   #ifdef CONFIG_SMP
@@@ -7969,8 -7777,8 +8020,8 @@@
   #ifdef CONFIG_USER_SCHED
         init_rt_bandwidth(&root_task_group.rt_bandwidth,
                         global_rt_period(), RUNTIME_INF);
- -#endif
- -#endif
+ +#endif /* CONFIG_USER_SCHED */
+ +#endif /* CONFIG_RT_GROUP_SCHED */
   
   #ifdef CONFIG_GROUP_SCHED
         list_add(&init_task_group.list, &task_groups);
@@@ -7980,8 -7788,8 +8031,8 @@@
         INIT_LIST_HEAD(&root_task_group.children);
         init_task_group.parent = &root_task_group;
         list_add(&init_task_group.siblings, &root_task_group.children);
- -#endif
- -#endif
+ +#endif /* CONFIG_USER_SCHED */
+ +#endif /* CONFIG_GROUP_SCHED */
   
         for_each_possible_cpu(i) {
                 struct rq *rq;
@@@ -8061,7 -7869,6 +8112,7 @@@
                 rq->next_balance = jiffies;
                 rq->push_cpu = 0;
                 rq->cpu = i;
+ +              rq->online = 0;
                 rq->migration_thread = NULL;
                 INIT_LIST_HEAD(&rq->migration_queue);
                 rq_attach_root(rq, &def_root_domain);
@@@ -8301,7 -8108,7 +8352,7 @@@ static inline void unregister_fair_sche
   {
         list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
   }
- -#else
+ +#else /* !CONFIG_FAIR_GROUP_SCHED */
   static inline void free_fair_sched_group(struct task_group *tg)
   {
   }
@@@ -8319,7 -8126,7 +8370,7 @@@ static inline void register_fair_sched_
   static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
   {
   }
- -#endif
+ +#endif /* CONFIG_FAIR_GROUP_SCHED */
   
   #ifdef CONFIG_RT_GROUP_SCHED
   static void free_rt_sched_group(struct task_group *tg)
@@@ -8390,7 -8197,7 +8441,7 @@@ static inline void unregister_rt_sched_
   {
         list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
   }
- -#else
+ +#else /* !CONFIG_RT_GROUP_SCHED */
   static inline void free_rt_sched_group(struct task_group *tg)
   {
   }
@@@ -8408,7 -8215,7 +8459,7 @@@ static inline void register_rt_sched_gr
   static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
   {
   }
- -#endif
+ +#endif /* CONFIG_RT_GROUP_SCHED */
   
   #ifdef CONFIG_GROUP_SCHED
   static void free_sched_group(struct task_group *tg)
@@@ -8519,14 -8326,17 +8570,14 @@@ void sched_move_task(struct task_struc
   
         task_rq_unlock(rq, &flags);
   }
- -#endif
+ +#endif /* CONFIG_GROUP_SCHED */
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
- -static void set_se_shares(struct sched_entity *se, unsigned long shares)
+ +static void __set_se_shares(struct sched_entity *se, unsigned long shares)
   {
         struct cfs_rq *cfs_rq = se->cfs_rq;
- -      struct rq *rq = cfs_rq->rq;
         int on_rq;
   
- -      spin_lock_irq(&rq->lock);
- -
         on_rq = se->on_rq;
         if (on_rq)
                 dequeue_entity(cfs_rq, se, 0);
@@@ -8536,17 -8346,8 +8587,17 @@@
   
         if (on_rq)
                 enqueue_entity(cfs_rq, se, 0);
+ +}
   
- -      spin_unlock_irq(&rq->lock);
+ +static void set_se_shares(struct sched_entity *se, unsigned long shares)
+ +{
+ +      struct cfs_rq *cfs_rq = se->cfs_rq;
+ +      struct rq *rq = cfs_rq->rq;
+ +      unsigned long flags;
+ +
+ +      spin_lock_irqsave(&rq->lock, flags);
+ +      __set_se_shares(se, shares);
+ +      spin_unlock_irqrestore(&rq->lock, flags);
   }
   
   static DEFINE_MUTEX(shares_mutex);
@@@ -8585,13 -8386,8 +8636,13 @@@ int sched_group_set_shares(struct task_
          * w/o tripping rebalance_share or load_balance_fair.
          */
         tg->shares = shares;
- -      for_each_possible_cpu(i)
+ +      for_each_possible_cpu(i) {
+ +              /*
+ +               * force a rebalance
+ +               */
+ +              cfs_rq_set_shares(tg->cfs_rq[i], 0);
                 set_se_shares(tg->se[i], shares);
+ +      }
   
         /*
          * Enable load balance activity on this group, by inserting it back on
@@@ -8630,7 -8426,7 +8681,7 @@@ static unsigned long to_ratio(u64 perio
   #ifdef CONFIG_CGROUP_SCHED
   static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
   {
- -      struct task_group *tgi, *parent = tg ? tg->parent : NULL;
+ +      struct task_group *tgi, *parent = tg->parent;
         unsigned long total = 0;
   
         if (!parent) {
@@@ -8654,7 -8450,7 +8705,7 @@@
         }
         rcu_read_unlock();
   
- -      return total + to_ratio(period, runtime) <
+ +      return total + to_ratio(period, runtime) <=
                 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
                                 parent->rt_bandwidth.rt_runtime);
   }
@@@ -8774,21 -8570,16 +8825,21 @@@ long sched_group_rt_period(struct task_
   
   static int sched_rt_global_constraints(void)
   {
+ +      struct task_group *tg = &root_task_group;
+ +      u64 rt_runtime, rt_period;
         int ret = 0;
   
+ +      rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ +      rt_runtime = tg->rt_bandwidth.rt_runtime;
+ +
         mutex_lock(&rt_constraints_mutex);
- -      if (!__rt_schedulable(NULL, 1, 0))
+ +      if (!__rt_schedulable(tg, rt_period, rt_runtime))
                 ret = -EINVAL;
         mutex_unlock(&rt_constraints_mutex);
   
         return ret;
   }
- -#else
+ +#else /* !CONFIG_RT_GROUP_SCHED */
   static int sched_rt_global_constraints(void)
   {
         unsigned long flags;
@@@ -8806,7 -8597,7 +8857,7 @@@
   
         return 0;
   }
- -#endif
+ +#endif /* CONFIG_RT_GROUP_SCHED */
   
   int sched_rt_handler(struct ctl_table *table, int write,
                 struct file *filp, void __user *buffer, size_t *lenp,
@@@ -8914,7 -8705,7 +8965,7 @@@ static u64 cpu_shares_read_u64(struct c
   
         return (u64) tg->shares;
   }
- -#endif
+ +#endif /* CONFIG_FAIR_GROUP_SCHED */
   
   #ifdef CONFIG_RT_GROUP_SCHED
   static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@@ -8938,7 -8729,7 +8989,7 @@@ static u64 cpu_rt_period_read_uint(stru
   {
         return sched_group_rt_period(cgroup_tg(cgrp));
   }
- -#endif
+ +#endif /* CONFIG_RT_GROUP_SCHED */
   
   static struct cftype cpu_files[] = {
   #ifdef CONFIG_FAIR_GROUP_SCHED
diff --combined kernel/sysctl.c

index fe8cdc80ff028ac5542e0a342c5a2c9c5a823833,efaf7c5500e97102c2cc90a68fb8b05997dc6e56..18943985ddee42f4163e80d4537b991886f0dad1
--- 1/kernel/sysctl.c
--- 2/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -46,6 -46,7 +46,7 @@@
   #include <linux/nfs_fs.h>
   #include <linux/acpi.h>
   #include <linux/reboot.h>
+ #include <linux/ftrace.h>
   
   #include <asm/uaccess.h>
   #include <asm/processor.h>
@@@ -264,14 -265,6 +265,14 @@@ static struct ctl_table kern_table[] = 
                 .extra1         = &min_wakeup_granularity_ns,
                 .extra2         = &max_wakeup_granularity_ns,
         },
+ +      {
+ +              .ctl_name       = CTL_UNNUMBERED,
+ +              .procname       = "sched_shares_ratelimit",
+ +              .data           = &sysctl_sched_shares_ratelimit,
+ +              .maxlen         = sizeof(unsigned int),
+ +              .mode           = 0644,
+ +              .proc_handler   = &proc_dointvec,
+ +      },
         {
                 .ctl_name       = CTL_UNNUMBERED,
                 .procname       = "sched_child_runs_first",
@@@ -463,6 -456,16 +464,16 @@@
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
+ #ifdef CONFIG_FTRACE
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "ftrace_enabled",
+               .data           = &ftrace_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &ftrace_enable_sysctl,
+       },
+ #endif
   #ifdef CONFIG_KMOD
         {
                 .ctl_name       = KERN_MODPROBE,
author	Ingo Molnar <mingo@elte.hu>
	Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)
		1	2
Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig.debug	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/entry_32.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/entry_64.S	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/machine_kexec_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/machine_kexec_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/vsyscall_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/x8664_ksyms_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/lib/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/fault.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/init_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/init_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/ioremap.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/pageattr.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/asm-x86/irqflags.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/linkage.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/printk.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history