
/*********************************************************
* Copyright (C) 1998-2022 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
/*
* vmx86.c --
*
* Platform independent routines for creating/destroying/running
* virtual machine monitors.
*/
#ifdef __linux__
/* Must come before any kernel header file --hpreg */
# include "driver-config.h"
# include <linux/string.h> /* memset() in the kernel */
# include <linux/sched.h> /* jiffies from the kernel */
#else
# include <string.h>
#endif
#include "vm_assert.h"
#include "vm_basic_math.h"
#include "vmx86.h"
#include "task.h"
#include "iocontrols.h"
#include "hostif.h"
#include "cpuid.h"
#include "vcpuset.h"
#include "memtrack.h"
#if defined(_WIN64)
#include "vmmon-asm-x86-64.h"
#endif
#include "x86vt.h"
#include "x86svm.h"
#include "x86cpuid_asm.h"
#if defined(__linux__)
#include <linux/timex.h>
#endif
#include "perfctr.h"
#include "x86vtinstr.h"
#include "bootstrap_vmm.h"
#include "monLoader.h"
#include "vmmblob.h"
#include "sharedAreaVmmon.h"
#include "statVarsVmmon.h"
#include "intelVT.h"
#include "cpu_defs.h"
#include "x86cet.h"
PseudoTSC pseudoTSC;
/*
* Keep track of the virtual machines that have been
* created using the following structures.
*/
static VMDriver *vmDriverList = NULL;
static LockedPageLimit lockedPageLimit = {
0, // host: does not need to be initialized.
0, // configured: must be set by some VM as it is powered on.
};
/* Percentage of guest "paged" memory that must fit within the hard limit. */
static Percent minVmMemPct;
/* Number of pages actually locked by all virtual machines */
static PageCnt numLockedPages;
/* Total virtual machines on this host */
static unsigned vmCount;
/* Total number of open vmmon file handles. */
static unsigned fdCount;
/*
* We implement a list of allocated VM ID's using an array.
* The array is initialized with the values 1...MAX_VMS-1, INVALID_VMID.
* vmIDsAllocated holds the last VM ID given out and vmIDsUnused
* holds the next VM ID to give out.
*/
#define INVALID_VMID (-1)
static int vmIDList[MAX_VMS];
static int vmIDsAllocated;
static int vmIDsUnused;
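/*
 * Illustrative trace (not part of the driver) of the ID free list,
 * assuming MAX_VMS == 4:
 *
 *    After Vmx86_InitIDList():
 *       vmIDList       = { 1, 2, 3, INVALID_VMID }
 *       vmIDsUnused    = 0            (head of the free list)
 *       vmIDsAllocated = INVALID_VMID (allocated list is empty)
 *
 *    After the first Vmx86AllocVMID() returns 0:
 *       vmIDList       = { INVALID_VMID, 2, 3, INVALID_VMID }
 *       vmIDsUnused    = 1
 *       vmIDsAllocated = 0
 *
 * Each slot of vmIDList holds the index of the next element of
 * whichever list (free or allocated) that slot currently belongs to.
 */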
/* Max rate requested for fast clock by any virtual machine. */
static unsigned globalFastClockRate;
/* 3 physically contiguous pages for the I/O bitmap. SVM only. */
HostIFContigMemMap *hvIOBitmap;
typedef struct {
Atomic_uint32 *index; // OUT: array of cpu counters for queries.
MSRQuery *query; // IN/OUT: array of query items
uint32 numItems; // IN
} Vmx86GetMSRData;
static Bool hostUsesNX;
typedef struct NXData {
Atomic_uint32 responded;
Atomic_uint32 hasNX;
} NXData;
/*
* A structure holding MSR indexes and values for MSR uniformity checks.
*/
typedef struct VMX86MSRCacheInfo {
uint32 msrIndex;
uint64 msrValue;
} VMX86MSRCacheInfo;
struct MSRCache {
Vmx86GetMSRData *queryCache;
uint32 nPCPUs;
};
static Vmx86GetMSRData msrCacheQueryData;
/*
* An MSR cache list for checking uniformity across physical CPUs and for
* generating least-common-denominator values across PCPUs.
* {MSR_Index, Member_Name}
*/
#define UNIFORMITY_CACHE_MSRS \
MSRNUM(IA32_MSR_ARCH_CAPABILITIES, ArchCap) \
MSRNUM(MSR_BIOS_SIGN_ID, BIOSSignID) \
MSRNUM(MSR_PLATFORM_INFO, Join) \
MSRNUM(MSR_TSX_CTRL, Join) \
MSRNUM(MSR_VM_CR, VMCR) \
MSRNUMVT(MSR_FEATCTL, FeatureCtl) \
MSRNUMVT(MSR_VMX_BASIC, Basic) \
MSRNUMVT(MSR_VMX_MISC, Misc) \
MSRNUMVT(MSR_VMX_VMCS_ENUM, Enum) \
MSRNUMVT(MSR_VMX_EPT_VPID, EPT) \
MSRNUMVT(MSR_VMX_VMFUNC, VMFunc) \
MSRNUMVT(MSR_VMX_3RD_CTLS, 3rd) \
MSRNUMVT2(MSR_VMX_PINBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_PROCBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_EXIT_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_ENTRY_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_2ND_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_PINBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_PROCBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_EXIT_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_ENTRY_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_CR0_FIXED0, Fixed0) \
MSRNUMVT2(MSR_VMX_CR4_FIXED0, Fixed0) \
MSRNUMVT2(MSR_VMX_CR0_FIXED1, Fixed1) \
MSRNUMVT2(MSR_VMX_CR4_FIXED1, Fixed1)
static VMX86MSRCacheInfo msrUniformityCacheInfo[] = {
#define MSRNUM(msr, member) {msr, CONST64(0)},
#define MSRNUMVT MSRNUM
#define MSRNUMVT2 MSRNUM
UNIFORMITY_CACHE_MSRS
};
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
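/*
 * A minimal sketch of how the X-macro list above expands here: with
 * MSRNUM(msr, member) defined as {msr, CONST64(0)},
 * UNIFORMITY_CACHE_MSRS produces one zero-initialized array entry per
 * MSR, e.g.
 *
 *    static VMX86MSRCacheInfo msrUniformityCacheInfo[] = {
 *       {IA32_MSR_ARCH_CAPABILITIES, CONST64(0)},
 *       {MSR_BIOS_SIGN_ID,           CONST64(0)},
 *       ...
 *    };
 *
 * The member names (ArchCap, BIOSSignID, ...) are ignored by this
 * expansion; other expansion sites can map them to structure fields.
 */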
/*
*----------------------------------------------------------------------
*
* Vmx86AdjustLimitForOverheads --
*
* This function adjusts an overall limit on the number of locked
* pages to take into account overhead for the vmx processes, etc.,
* since the host OS will also see this as overhead. We do this for
* all vmx processes, not just ones whose VMs have been admitted.
*
* If "vm" is NULL, we are allocating a global page and have no
* perVMOverhead term to take into account.
*
* Results:
* Number of remaining pages considered to be lockable on this host.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static INLINE PageCnt
Vmx86AdjustLimitForOverheads(const VMDriver* vm,
const PageCnt limit)
{
PageCnt extraCost = (vm != NULL) ? vmCount * vm->memInfo.perVMOverhead : 0;
ASSERT(HostIF_GlobalLockIsHeld());
return (extraCost < limit) ? (limit - extraCost) : 0;
}
/*
*----------------------------------------------------------------------
*
* Vmx86LockedPageLimit --
*
* There are three limits controlling how many pages we can lock on
* a host:
*
* lockedPageLimit.configured is controlled by the UI,
* lockedPageLimit.host is calculated dynamically by vmmon based on
* kernel stats, and
* MAX_LOCKED_PAGES is a static upper bound.
*
* We can lock the MIN of these values.
*
* Results:
* Number of pages to lock on this host.
*
* Side effects:
* Updates the host locked pages limit.
*
*----------------------------------------------------------------------
*/
static INLINE PageCnt
Vmx86LockedPageLimit(const VMDriver* vm) // IN:
{
PageCnt overallLimit;
ASSERT(HostIF_GlobalLockIsHeld());
lockedPageLimit.host = HostIF_EstimateLockedPageLimit(vm, numLockedPages);
overallLimit = MIN(MIN(lockedPageLimit.configured, lockedPageLimit.host),
MAX_LOCKED_PAGES);
return Vmx86AdjustLimitForOverheads(vm, overallLimit);
}
/*
*----------------------------------------------------------------------
*
* Vmx86HasFreePages --
*
* Returns TRUE if the vm can lock more pages. This is true if
* we are below the host's hard memory limit and this vm has not
* exceeded its maximum allocation.
* Callers must ensure driver-wide and VM serialization,
* typically by using HostIF_GlobalLock() and HostIF_VMLock().
*
* Results:
* TRUE if pages can be locked, FALSE otherwise
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static INLINE Bool
Vmx86HasFreePages(VMDriver *vm,
PageCnt numPages,
Bool checkVM)
{
/*
* 1) Be careful with overflow.
* 2) lockedPageLimit and vm->memInfo.maxAllocation can be decreased below
* the current numLockedPages and vm->memInfo.locked
* 3) lockedPageLimit.host can go lower than numLockedPages.
*/
ASSERT(HostIF_GlobalLockIsHeld() &&
(!checkVM || HostIF_VMLockIsHeld(vm)));
if (checkVM) {
/* Check the per-vm limit. */
ASSERT(HostIF_VMLockIsHeld(vm));
if (vm->memInfo.admitted) {
if (vm->memInfo.maxAllocation <= vm->memInfo.locked) {
return FALSE;
} else if (vm->memInfo.maxAllocation - vm->memInfo.locked < numPages) {
return FALSE;
}
}
} else {
/* Check the global limit. */
PageCnt limit = Vmx86LockedPageLimit(vm);
if (limit <= numLockedPages) {
return FALSE;
} else if (limit - numLockedPages < numPages) {
return FALSE;
}
}
return TRUE;
}
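/*
 * The two-step comparisons above are the overflow-safe idiom for
 * "locked + numPages > limit": the subtraction is only evaluated once
 * the limit is known to exceed the locked count, so no intermediate
 * sum can wrap around.
 */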
#ifdef VMX86_DEBUG
/*
*----------------------------------------------------------------------
*
* Vmx86VMIsRegistered --
*
* Check if "vm" is on the list of VMDrivers.
*
* Results:
* Return TRUE iff "vm" is on the list of VMDrivers.
*
* Side effects:
* None
*
*----------------------------------------------------------------
*/
static Bool
Vmx86VMIsRegistered(VMDriver *vm, Bool needsLock)
{
VMDriver *tmp;
Bool found = FALSE;
ASSERT(needsLock || HostIF_GlobalLockIsHeld());
if (needsLock) {
HostIF_GlobalLock(5);
}
for (tmp = vmDriverList; tmp != NULL; tmp = tmp->nextDriver) {
if (tmp == vm) {
found = TRUE;
break;
}
}
if (needsLock) {
HostIF_GlobalUnlock(5);
}
return found;
}
#endif
/*
*----------------------------------------------------------------------
*
* Vmx86_InitIDList --
*
* Called when the driver is initialized.
* Set up the list of available VM ID's.
*
* Results:
* None. Sets up global data.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_InitIDList(void)
{
int i;
HostIF_GlobalLock(32);
for (i = 0; i < MAX_VMS; i++) {
vmIDList[i] = i + 1;
}
vmIDList[MAX_VMS - 1] = INVALID_VMID;
vmIDsUnused = 0;
vmIDsAllocated = INVALID_VMID;
HostIF_GlobalUnlock(32);
}
/*
*----------------------------------------------------------------------
*
* Vmx86FreeVMID --
*
* Return a VM ID to the list of available VM ID's.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86FreeVMID(int vmID) // IN
{
int i;
ASSERT(HostIF_GlobalLockIsHeld());
/* Deleting head of the list. */
if (vmID == vmIDsAllocated) {
int tmp;
tmp = vmIDList[vmIDsAllocated];
vmIDList[vmIDsAllocated] = vmIDsUnused;
vmIDsAllocated = tmp;
vmIDsUnused = vmID;
return;
}
for (i = vmIDsAllocated; vmIDList[i] != INVALID_VMID; i = vmIDList[i]) {
if (vmIDList[i] == vmID) {
vmIDList[i] = vmIDList[vmID];
vmIDList[vmID] = vmIDsUnused;
vmIDsUnused = vmID;
return;
}
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86AllocVMID --
*
* Grab a VM ID from the list of available VM ID's.
*
* Results:
* The VM ID, in the range [ 0 ; MAX_VMS ).
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static int
Vmx86AllocVMID(void)
{
int vmID;
ASSERT(HostIF_GlobalLockIsHeld());
vmID = vmIDsUnused;
ASSERT(0 <= vmID && vmID < MAX_VMS);
vmIDsUnused = vmIDList[vmID];
vmIDList[vmID] = vmIDsAllocated;
vmIDsAllocated = vmID;
return vmID;
}
/*
*----------------------------------------------------------------------
*
* Vmx86RegisterVMOnList --
*
* Add a VM to the list of registered VMs and increment
* the count of VMs.
*
* Results:
* None
*
* Side effects:
* Add VM to linked list.
* Increment count of VMs.
*
*----------------------------------------------------------------
*/
static void
Vmx86RegisterVMOnList(VMDriver *vm) // IN
{
int vmID;
VMDriver **vmp;
ASSERT(HostIF_GlobalLockIsHeld());
vmCount++;
vmID = Vmx86AllocVMID();
ASSERT(vm->userID == 0);
vm->userID = vmID + 1;
ASSERT(vm->userID > 0);
for (vmp = &vmDriverList; *vmp != NULL; vmp = &(*vmp)->nextDriver) {
if (*vmp == vm) {
Warning("VM already registered on the list of VMs.\n");
return;
}
}
*vmp = vm;
}
/*
*----------------------------------------------------------------------
*
* Vmx86DeleteVMFromList --
*
* Delete a VM from the list of registered VMs and decrement
* the count of VMs. This function should be called on any
* VM registered on the VMDriverList before invoking
* Vmx86FreeAllVMResources to free its memory.
*
* Results:
* None
*
* Side effects:
* Remove VM from linked list.
* Decrement count of VMs.
*
*----------------------------------------------------------------
*/
static void
Vmx86DeleteVMFromList(VMDriver *vm)
{
VMDriver **vmp;
ASSERT(vm);
ASSERT(HostIF_GlobalLockIsHeld());
for (vmp = &vmDriverList; *vmp != vm; vmp = &(*vmp)->nextDriver) {
if (*vmp == NULL) {
Warning("VM is not on the list of registered VMs.\n");
return;
}
}
*vmp = vm->nextDriver;
vmCount--;
Vmx86FreeVMID(vm->userID - 1);
numLockedPages -= vm->memInfo.locked;
/*
* If no VM is running, reset the configured locked-page limit so
* that the next VM to power on sets it appropriately.
*/
if (vmCount == 0) {
lockedPageLimit.configured = 0;
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_Free --
*
* A wrapper around HostIF_FreeKernelMem that checks if the given
* pointer is NULL before freeing memory.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
void
Vmx86_Free(void *ptr)
{
if (ptr != NULL) {
HostIF_FreeKernelMem(ptr);
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_Calloc --
*
* A wrapper around HostIF_AllocKernelMem that zeroes memory and
* fails if integer overflow would occur in the computed
* allocation size.
*
* Results:
* Pointer to allocated memory or NULL on failure. Use
* HostIF_FreeKernelMem or Vmx86_Free to free.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
void *
Vmx86_Calloc(size_t numElements, // IN
size_t elementSize, // IN
Bool nonPageable) // IN
{
size_t numBytes = numElements * elementSize;
void *retval;
if (UNLIKELY(numBytes / numElements != elementSize)) { // Overflow.
return NULL;
}
retval = HostIF_AllocKernelMem(numBytes, nonPageable);
if (retval != NULL) {
memset(retval, 0, numBytes);
}
return retval;
}
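/*
 * Usage sketch (hypothetical caller, not from this file): sizing by
 * element count lets the overflow check protect the allocation, e.g.
 *
 *    MPN *mpns = Vmx86_Calloc(numPages, sizeof *mpns, TRUE);
 *    if (mpns == NULL) {
 *       return someErrorCode;  // hypothetical; alloc failed or size overflowed
 *    }
 *    ...
 *    Vmx86_Free(mpns);
 *
 * A huge numPages would make numElements * elementSize wrap; the
 * division test above catches that and fails the allocation rather
 * than returning a short buffer.
 */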
/*
*-----------------------------------------------------------------------------
*
* Vmx86AllocCrossPages --
*
* Allocate numVCPUs pages, one per VCPU, for use as each VCPU's
* crosspage area.
*
* Results:
* TRUE if the required crosspages are allocated successfully.
* FALSE otherwise.
*
*-----------------------------------------------------------------------------
*/
static Bool
Vmx86AllocCrossPages(VMDriver *vm)
{
Vcpuid v;
for (v = 0; v < vm->numVCPUs; v++) {
MPN unused;
UNUSED_VARIABLE(unused);
vm->crosspage[v] = HostIF_AllocKernelPages(1, &unused);
if (vm->crosspage[v] == NULL) {
return FALSE;
}
memset(vm->crosspage[v], 0, PAGE_SIZE);
}
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FreeCrossPages --
*
* Free the crosspages allocated for the given VM.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86FreeCrossPages(VMDriver *vm)
{
Vcpuid v;
if (vm->crosspage != NULL) {
for (v = 0; v < vm->numVCPUs; v++) {
if (vm->crosspage[v] != NULL) {
HostIF_FreeKernelPages(1, vm->crosspage[v]);
}
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FreeVMDriver --
*
* Release kernel memory allocated for the driver structure.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86FreeVMDriver(VMDriver *vm)
{
Vmx86_Free(vm->ptRootMpns);
Vmx86_Free(vm->crosspage);
Vmx86_Free(vm->crosscallWaitSet);
Vmx86_Free(vm->ptscOffsets);
Vmx86_Free(vm->currentHostCpu);
vm->ptRootMpns = NULL;
vm->crosspage = NULL;
vm->crosscallWaitSet = NULL;
vm->ptscOffsets = NULL;
vm->currentHostCpu = NULL;
HostIF_FreeKernelMem(vm);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86AllocVMDriver --
*
* Allocate the driver structure for a virtual machine.
*
* Results:
* Zeroed VMDriver structure or NULL on error.
*
* Side effects:
* May allocate kernel memory.
*
*-----------------------------------------------------------------------------
*/
static VMDriver *
Vmx86AllocVMDriver(uint32 numVCPUs)
{
VMDriver *vm = Vmx86_Calloc(1, sizeof *vm, TRUE);
if (vm == NULL) {
return NULL;
}
if ((vm->ptRootMpns =
Vmx86_Calloc(numVCPUs, sizeof *vm->ptRootMpns, TRUE)) != NULL &&
(vm->crosspage =
Vmx86_Calloc(numVCPUs, sizeof *vm->crosspage, TRUE)) != NULL &&
(vm->crosscallWaitSet =
Vmx86_Calloc(numVCPUs, sizeof *vm->crosscallWaitSet, TRUE)) != NULL &&
(vm->ptscOffsets =
Vmx86_Calloc(numVCPUs, sizeof *vm->ptscOffsets, TRUE)) != NULL &&
(vm->currentHostCpu =
Vmx86_Calloc(numVCPUs, sizeof *vm->currentHostCpu, TRUE)) != NULL) {
return vm;
}
Vmx86FreeVMDriver(vm);
return NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86VMMPageFree --
*
* Unmaps the VMM page corresponding to this entry from the host
* kernel. This function is used as a callback by MemTrack_Cleanup().
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86VMMPageFree(void *unused, MemTrackEntry *entry)
{
ASSERT(entry->vpn != 0 && entry->mpn != 0);
Vmx86_UnmapPage(entry->vpn);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CleanupVMMPages --
*
* Unmaps all VMM pages from the host kernel address space and frees
* the VMM MemTracker.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
void
Vmx86_CleanupVMMPages(VMDriver *vm)
{
MemTrack_Cleanup(vm->vmmTracker, Vmx86VMMPageFree, NULL);
vm->vmmTracker = NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86CleanupContigMappings --
*
* Frees all allocations from HostIF_AllocContigPages that are associated
* with the given vm.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86CleanupContigMappings(VMDriver *vm)
{
HostIFContigMemMap *m, *next;
HostIF_VMLock(vm, 48);
for (m = vm->contigMappings; m != NULL; m = next) {
next = m->next;
HostIF_FreeContigPages(vm, m);
}
HostIF_VMUnlock(vm, 48);
vm->contigMappings = NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86FreeAllVMResources --
*
* Free the resources allocated for a vm that is not registered
* on the VMDriverList. Except in the case of Vmx86_CreateVM(),
* this should be called only after a call to Vmx86DeleteVMFromList().
*
* Results:
* None
*
* Side effects:
* Memory freed.
*
*----------------------------------------------------------------------
*/
static void
Vmx86FreeAllVMResources(VMDriver *vm)
{
ASSERT(!HostIF_GlobalLockIsHeld());
if (vm) {
ASSERT(!Vmx86VMIsRegistered(vm, TRUE));
Vmx86_SetHostClockRate(vm, 0);
Vmx86FreeCrossPages(vm);
if (vm->ptpTracker != NULL) {
Task_SwitchPTPPageCleanup(vm);
}
if (vm->vmmTracker != NULL) {
Vmx86_CleanupVMMPages(vm);
}
if (vm->blobInfo != NULL) {
VmmBlob_Cleanup(vm->blobInfo);
vm->blobInfo = NULL;
}
if (vm->sharedArea != NULL) {
SharedAreaVmmon_Cleanup(vm->sharedArea);
vm->sharedArea = NULL;
}
if (vm->statVars != NULL) {
StatVarsVmmon_Cleanup(vm->statVars);
vm->statVars = NULL;
}
if (vm->contigMappings != NULL) {
Vmx86CleanupContigMappings(vm);
}
HostIF_FreeAllResources(vm);
Vmx86FreeVMDriver(vm);
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86ReserveFreePages --
*
* Returns TRUE and increases locked page counts if the vm can lock
* more pages. This is true if we are below the host's hard memory
* limit and this vm has not exceeded its maximum allocation.
* The function is thread-safe.
*
* If ignoreLimits is TRUE then additional pages may be reserved even
* if limits are violated. The request to ignore limits may come in
* cases of anonymous page allocations. Swapping is not always possible
* at those points but a swap target will have been posted so that the
* vmm will release memory shortly, allowing the excess reservation
* to be reduced.
*
* Results:
* TRUE if pages are reserved for locking, FALSE otherwise
*
* Side effects:
* The global lock and VM's lock are acquired and released.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx86ReserveFreePages(VMDriver *vm,
PageCnt numPages,
Bool ignoreLimits)
{
Bool retval = FALSE;
int retries = 3;
ASSERT(vm);
for (retries = 3; !retval && (retries > 0); retries--) {
HostIF_GlobalLock(17);
HostIF_VMLock(vm, 0);
/* Check VM's limit and don't wait. */
retval = Vmx86HasFreePages(vm, numPages, TRUE);
if (!retval) {
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
break;
} else {
/* Wait to satisfy the global limit. */
retval = Vmx86HasFreePages(vm, numPages, FALSE);
if (retval) {
numLockedPages += numPages;
vm->memInfo.locked += numPages;
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
break;
} else {
/*
* There are not enough pages -- drop the locks and wait for
* the host and/or other VMs to produce free pages.
*/
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
HostIF_WaitForFreePages(10);
}
}
}
if (!retval && ignoreLimits) {
HostIF_GlobalLock(17);
HostIF_VMLock(vm, 0);
numLockedPages += numPages;
vm->memInfo.locked += numPages;
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
retval = TRUE;
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86UnreserveFreePages --
*
* Decreases the global and VM's locked page counts.
* The function is thread-safe.
*
* Results:
* void
*
* Side effects:
* The global lock and VM's lock are acquired and released.
*
*----------------------------------------------------------------------
*/
static void
Vmx86UnreserveFreePages(VMDriver *vm,
PageCnt numPages)
{
ASSERT(vm);
HostIF_GlobalLock(18);
HostIF_VMLock(vm, 1);
ASSERT(numLockedPages >= numPages);
ASSERT(vm->memInfo.locked >= numPages);
numLockedPages -= numPages;
vm->memInfo.locked -= numPages;
HostIF_VMUnlock(vm, 1);
HostIF_GlobalUnlock(18);
}
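/*
 * Vmx86ReserveFreePages/Vmx86UnreserveFreePages form a reserve/commit
 * pattern used throughout this file: reserve the page count first,
 * attempt the real operation, and release the reservation on failure.
 * A minimal sketch (mirroring Vmx86_LockPage below):
 *
 *    if (!Vmx86ReserveFreePages(vm, 1, FALSE)) {
 *       return PAGE_LOCK_LIMIT_EXCEEDED;
 *    }
 *    status = <the actual locking operation>;
 *    if (status != PAGE_LOCK_SUCCESS) {
 *       Vmx86UnreserveFreePages(vm, 1);
 *    }
 */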
/*
*----------------------------------------------------------------------
*
* Vmx86GetNX --
*
* Checks whether NX is enabled on the current CPU.
*
* Results:
* None.
*
* Side effects:
* Increments responded-CPU counter, may increment NX CPU counter.
*
*----------------------------------------------------------------------
*/
static void
Vmx86GetNX(void *clientData) // IN/OUT: A NXData *
{
NXData *nxData = (NXData *)clientData;
uint64 efer = X86MSR_GetMSR(MSR_EFER);
Atomic_Inc32(&nxData->responded);
if ((efer & MSR_EFER_NXE) == MSR_EFER_NXE) {
Atomic_Inc32(&nxData->hasNX);
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CacheNXState --
*
* Checks whether every CPU on the host has NX/XD enabled and
* caches this value.
*
* Results:
* None.
*
* Side effects:
* Caches host NX value.
*
*----------------------------------------------------------------------
*/
void
Vmx86_CacheNXState(void)
{
NXData nxData;
Atomic_Write32(&nxData.responded, 0);
Atomic_Write32(&nxData.hasNX, 0);
HostIF_CallOnEachCPU(Vmx86GetNX, &nxData);
hostUsesNX = Atomic_Read32(&nxData.hasNX) ==
Atomic_Read32(&nxData.responded);
}
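/*
 * For example, on an 8-PCPU host, hostUsesNX ends up TRUE only if all
 * eight CPUs report EFER.NXE set (hasNX == responded == 8); a single
 * CPU with NX disabled makes the counters differ, and NX is treated
 * as unavailable host-wide.
 */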
/*
*----------------------------------------------------------------------
*
* Vmx865LvlPagingEnabled --
*
* Checks if 5-level paging is enabled on the current CPU. It is
* assumed that the host OS will not support a mix of 4- and 5-level
* paging.
*
* Results:
* TRUE if CR4.LA57 is set on the current CPU, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx865LvlPagingEnabled(void)
{
uintptr_t cr4;
GET_CR4(cr4);
return (cr4 & CR4_LA57) != 0;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_CreateVM --
*
* Allocate and initialize a driver structure for a virtual machine.
*
* Results:
* VMDriver structure or NULL on error.
* 'status' is populated with the status of the operation.
*
* Side effects:
* May allocate kernel memory.
*
*-----------------------------------------------------------------------------
*/
VMDriver *
Vmx86_CreateVM(VA64 bsBlob, // IN:
uint32 bsBlobSize, // IN:
uint32 numVCPUs, // IN:
VMCreateStatus *status) // OUT:
{
VMDriver *vm;
Vcpuid v;
void *bsBuf = NULL;
BSVMM_HostParams *bsParams;
/* Disallow VM creation if the vmx passes us an invalid number of vcpus. */
if (numVCPUs == 0) {
*status = VM_CREATE_ERR_NO_VCPUS;
return NULL;
}
if (numVCPUs > MAX_VCPUS) {
*status = VM_CREATE_ERR_TOO_MANY_VCPUS;
return NULL;
}
/* Disallow VM creation if NX is disabled on the host as VMM requires NX. */
if (!hostUsesNX) {
*status = VM_CREATE_ERR_NO_NX;
Log("NX/XD must be enabled. Cannot create VM.\n");
return NULL;
}
vm = Vmx86AllocVMDriver(numVCPUs);
if (vm == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
return NULL;
}
vm->userID = 0;
vm->numVCPUs = numVCPUs;
vm->memInfo.admitted = FALSE;
for (v = 0; v < numVCPUs; v++) {
Atomic_Write32(&vm->currentHostCpu[v], INVALID_PCPU);
vm->ptRootMpns[v] = INVALID_MPN;
}
if (!HostIF_Init(vm, numVCPUs)) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
/* If the BS blob exists then the VMM is in use. */
if (bsBlobSize != 0) {
/* Disallow VM creation if 5 level paging is enabled with the VMM. */
if (Vmx865LvlPagingEnabled()) {
Log("5 level paging must not be enabled. Cannot create VM.\n");
*status = VM_CREATE_ERR_5LP;
goto cleanup;
}
/* The ULM does not use the cross GDT. */
bsBuf = HostIF_AllocKernelMem(bsBlobSize, FALSE);
if (bsBuf == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
if (HostIF_CopyFromUser(bsBuf, bsBlob, bsBlobSize) != 0) {
*status = VM_CREATE_ERR_NO_BLOB;
goto cleanup;
}
bsParams = BSVMM_Validate(bsBuf, bsBlobSize);
if (bsParams == NULL) {
*status = VM_CREATE_ERR_INV_BLOB;
Warning("Could not validate the VMM bootstrap blob");
goto cleanup;
}
if (!Task_CreateCrossGDT(&bsParams->gdtInit)) {
*status = VM_CREATE_ERR_CROSS_GDT;
goto cleanup;
}
}
vm->ptpTracker = MemTrack_Init(vm);
if (vm->ptpTracker == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
vm->vmmTracker = MemTrack_Init(vm);
if (vm->vmmTracker == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
vm->sharedArea = SharedAreaVmmon_Init(vm);
if (vm->sharedArea == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
vm->statVars = StatVarsVmmon_Init(vm);
if (vm->statVars == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
HostIF_GlobalLock(0);
#ifdef _WIN32
if (vmCount >= MAX_VMS_WIN32) {
*status = VM_CREATE_ERR_TOO_MANY_VMS;
HostIF_GlobalUnlock(0);
goto cleanup;
}
#endif
if (vmCount >= MAX_VMS) {
*status = VM_CREATE_ERR_TOO_MANY_VMS;
HostIF_GlobalUnlock(0);
goto cleanup;
}
Vmx86RegisterVMOnList(vm);
HostIF_GlobalUnlock(0);
if (bsBuf != NULL) {
HostIF_FreeKernelMem(bsBuf);
}
*status = VM_CREATE_SUCCESS;
return vm;
cleanup:
ASSERT(*status != VM_CREATE_SUCCESS);
if (bsBuf != NULL) {
HostIF_FreeKernelMem(bsBuf);
}
/*
* The VM is not on a list, "vmCount" has not been incremented,
* "vm->cowID" is INVALID_VMID, and either the VM's mutex hasn't
* been initialized or we've only taken the global lock and checked
* a counter since, so we know that the VM has not yet locked any
* pages.
*/
ASSERT(vm->memInfo.locked == 0);
Vmx86FreeAllVMResources(vm);
return NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86SetPageTableRoots --
*
* Translates the user VA corresponding to the root page tables
* for all VCPUs into MPNs and stores them in VMDriver.
*
* Results:
* TRUE if successful, FALSE otherwise.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx86SetPageTableRoots(VMDriver *vm, PerVcpuPages *perVcpuPages,
uint16 numVCPUs)
{
uint16 vcpu;
if (numVCPUs > vm->numVCPUs) {
return FALSE;
}
for (vcpu = 0; vcpu < numVCPUs; vcpu++) {
VA64 ptRoot = perVcpuPages[vcpu].ptRoot;
if ((ptRoot & (PAGE_SIZE - 1)) != 0) {
Warning("Error: page table VA %"FMT64"x is not page-aligned\n",
ptRoot);
return FALSE;
}
ASSERT(vm->ptRootMpns[vcpu] == INVALID_MPN);
HostIF_VMLock(vm, 38);
if (HostIF_LookupUserMPN(vm, ptRoot, &vm->ptRootMpns[vcpu]) !=
PAGE_LOOKUP_SUCCESS) {
HostIF_VMUnlock(vm, 38);
Warning("Failure looking up page table root MPN for VCPU %d\n", vcpu);
return FALSE;
}
HostIF_VMUnlock(vm, 38);
}
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_LookupUserMPN --
*
* Look up the MPN of a locked user page by user VA under the VM lock.
*
* Results:
* A status code and the MPN on success.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
int
Vmx86_LookupUserMPN(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: user VA of the page
MPN *mpn) // OUT
{
int ret;
HostIF_VMLock(vm, 38);
ret = HostIF_LookupUserMPN(vm, uAddr, mpn);
HostIF_VMUnlock(vm, 38);
return ret;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ProcessBootstrap --
*
* Copies the VMM bootstrap blob header and processes it by invoking
* MonLoader.
*
* Results:
* TRUE if successful, FALSE otherwise.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_ProcessBootstrap(VMDriver *vm,
VA64 bsBlobAddr,
uint32 numBytes,
uint32 headerOffset,
uint16 numVCPUs,
PerVcpuPages *perVcpuPages,
VMSharedRegion *shRegions)
{
VmmBlobInfo *bi = NULL;
unsigned errLine;
Vcpuid errVcpu;
MonLoaderError ret;
MonLoaderArgs args;
MonLoaderHeader *header;
if (!VmmBlob_Load(bsBlobAddr, numBytes, headerOffset, &bi)) {
Warning("Error loading VMM bootstrap blob\n");
goto error;
}
vm->blobInfo = bi;
header = bi->header;
if (!Vmx86SetPageTableRoots(vm, perVcpuPages, numVCPUs)) {
goto error;
}
if (!pseudoTSC.initialized) {
Warning("%s: PseudoTSC has not been initialized\n", __FUNCTION__);
goto error;
}
if (!Vmx86AllocCrossPages(vm)) {
Warning("Failed to allocate cross pages.\n");
goto error;
}
/*
* Initialize the driver's part of the cross-over page used to
* talk to the monitor.
*/
if (!Task_InitCrosspage(vm, header->monStartLPN, header->monEndLPN,
perVcpuPages)) {
Warning("Error initializing crosspage\n");
goto error;
}
args.vm = vm;
args.shRegions = shRegions;
ret = MonLoader_Process(header, numVCPUs, &args, &errLine, &errVcpu);
if (ret != ML_OK) {
Warning("Error processing bootstrap: error %d at line %u, vcpu %u\n",
ret, errLine, errVcpu);
goto error;
}
return TRUE;
error:
if (bi != NULL) {
VmmBlob_Cleanup(bi);
vm->blobInfo = NULL;
}
return FALSE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ReleaseVM --
*
* Release a VM (either created here or from a bind).
*
* Results:
* zero if successful
*
* Side effects:
* Decrement VM reference count.
* Release resources (those that are left) when count reaches 0.
*
*----------------------------------------------------------------------
*/
int
Vmx86_ReleaseVM(VMDriver *vm) // IN:
{
ASSERT(vm);
HostIF_GlobalLock(1);
Vmx86DeleteVMFromList(vm);
HostIF_GlobalUnlock(1);
Vmx86FreeAllVMResources(vm);
return 0;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_Open --
*
* Called on open of the fd.
*
* Results:
* None.
*
* Side effects:
* Bumps fdCount.
*
*----------------------------------------------------------------------
*/
void
Vmx86_Open(void)
{
HostIF_GlobalLock(123);
ASSERT(fdCount < MAX_INT32);
if (fdCount < MAX_INT32) {
fdCount++;
}
HostIF_GlobalUnlock(123);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_Close --
*
* Called on close of the fd.
*
* Results:
* None.
*
* Side effects:
* Decrements fdCount
* May de-initialize ptsc.
*
*----------------------------------------------------------------------
*/
void
Vmx86_Close(void)
{
HostIF_GlobalLock(124);
/*
* If fdCount hits MAX_INT32, saturate the counter and leave it at
* MAX_INT32.
*/
ASSERT(fdCount > 0);
if (fdCount < MAX_INT32) {
fdCount--;
}
/*
* If no VMs are running and there are no open file handles, reset the
* pseudo TSC state so that the next VM to initialize is free to
* initialize the system wide PTSC however it wants. See PR 403505.
*/
if (fdCount == 0) {
ASSERT(vmCount == 0);
pseudoTSC.initialized = FALSE;
}
HostIF_GlobalUnlock(124);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ReadTSCAndUptime --
*
* Atomically read the TSC and the uptime.
*
* Results:
* The current TSC and uptime values.
*
* Side effects:
* none
*
*
*----------------------------------------------------------------------
*/
void
Vmx86_ReadTSCAndUptime(VmTimeStart *st) // OUT: return value
{
uintptr_t flags;
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
st->count = RDTSC();
st->time = HostIF_ReadUptime();
RESTORE_FLAGS(flags);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ComputekHz --
*
* Given aggregate cycles and system uptime, computes cycle rate as,
*
* khz = cycles / (uptime / HostIF_UptimeFrequency()) / 1000
*
* We need to do the computation carefully to avoid overflow or
* undue loss of precision. Also, on Linux we can't do a
* 64/64=64 bit division directly, as the gcc stub for that
* is not linked into the kernel.
*
* Results:
* Returns the computed khz value, or 0 if uptime == 0.
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
uint32
Vmx86_ComputekHz(uint64 cycles, uint64 uptime)
{
uint64 hz;
uint64 freq;
freq = HostIF_UptimeFrequency();
while (cycles > MAX_UINT64 / freq) {
cycles >>= 1;
uptime >>= 1;
}
if (uptime == 0) {
return 0;
}
hz = (cycles * freq) / uptime;
return (uint32) ((hz + 500) / 1000);
}
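/*
 * Worked example (assumed numbers): with HostIF_UptimeFrequency() ==
 * 1000000 (a 1 MHz uptime clock), cycles == 24e9 and uptime == 10e6
 * (10 seconds of uptime):
 *
 *    hz  = (24e9 * 1e6) / 10e6 = 2.4e9
 *    kHz = (2.4e9 + 500) / 1000 = 2400000, i.e. a 2.4 GHz TSC.
 *
 * The while loop halves cycles and uptime together until
 * cycles * freq can no longer overflow 64 bits, trading a little
 * precision for safety.
 */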
/*
*----------------------------------------------------------------------
*
* Vmx86GetkHzEstimate
*
* Return an estimate of the processor's kHz rating, based on
* the ratio of the cycle counter and system uptime since the
* driver was loaded.
* This function could be called (on Windows) at IRQL DISPATCH_LEVEL.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER uint32
Vmx86GetkHzEstimate(VmTimeStart *st) // IN: start time
{
uint64 cDiff, tDiff;
uintptr_t flags;
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
cDiff = RDTSC() - st->count;
tDiff = HostIF_ReadUptime() - st->time;
RESTORE_FLAGS(flags);
return Vmx86_ComputekHz(cDiff, tDiff);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetkHzEstimate
*
* Return an estimate of the processor's kHz rating, based on
* the ratio of the cycle counter and system uptime since the
* driver was loaded. Or based on a spinloop.
*
* This function could be called (on Windows) at IRQL DISPATCH_LEVEL.
*
* Results:
* Processor speed in kHz.
*
* Side effects:
* Result is cached.
*
*----------------------------------------------------------------------
*/
uint32
Vmx86_GetkHzEstimate(VmTimeStart *st) // IN: start time
{
static uint32 kHz;
/*
* Cache and return the first result for consistency.
* TSC values can be changed without notification.
* TSC frequency can vary too (SpeedStep, slowing clock on HALT, etc.)
*/
if (kHz != 0) {
return kHz;
}
return kHz = Vmx86GetkHzEstimate(st);
}
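/*
 * Usage sketch (hypothetical call site): the driver records a start
 * timestamp once, early, and derives the estimate much later:
 *
 *    VmTimeStart start;
 *    Vmx86_ReadTSCAndUptime(&start);             // at driver load
 *    ...
 *    uint32 khz = Vmx86_GetkHzEstimate(&start);  // later; cached
 *
 * The longer the interval between the two calls, the less the
 * estimate is perturbed by sampling jitter in RDTSC and the uptime
 * reading.
 */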
/*
*----------------------------------------------------------------------
*
* Vmx86_SetHostClockRate --
*
* The monitor wants to poll for events at the given rate. If no VM
* is specified, then 'rate' is ignored and the last set rate is set
* again.
*
* Results:
* 0 for success, host-specific error code for failure.
*
* Side effects:
* May increase the host timer interrupt rate, etc.
*
*----------------------------------------------------------------------
*/
int
Vmx86_SetHostClockRate(VMDriver *vm, // IN: VM instance pointer
unsigned rate) // IN: rate in Hz
{
unsigned newGlobalRate;
VMDriver *cur;
int retval = 0;
if (!vm) {
Log("Resetting last set host clock rate of %d\n", globalFastClockRate);
HostIF_FastClockLock(0);
retval = HostIF_SetFastClockRate(globalFastClockRate);
HostIF_FastClockUnlock(0);
return retval;
}
/* Quick test before locks are acquired. */
if (vm->fastClockRate == rate) {
return retval;
}
HostIF_FastClockLock(2);
if (vm->fastClockRate == rate) {
HostIF_FastClockUnlock(2);
return retval;
}
/*
* Loop through all vms to find new max rate.
*/
newGlobalRate = rate;
HostIF_GlobalLock(19);
for (cur = vmDriverList; cur != NULL; cur = cur->nextDriver) {
if (cur != vm && cur->fastClockRate > newGlobalRate) {
newGlobalRate = cur->fastClockRate;
}
}
HostIF_GlobalUnlock(19);
if (newGlobalRate != globalFastClockRate) {
retval = HostIF_SetFastClockRate(newGlobalRate);
if (!retval) {
globalFastClockRate = newGlobalRate;
}
}
if (!retval) {
vm->fastClockRate = rate;
}
HostIF_FastClockUnlock(2);
return retval;
}
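/*
 * For example, if three VMs last requested rates of 100, 500, and
 * 1000 Hz, globalFastClockRate is 1000. If the 1000 Hz VM then
 * requests 0, the loop above finds the new maximum (500) and the
 * host fast clock is lowered accordingly.
 */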
/*
*----------------------------------------------------------------------
*
* Vmx86_MonTimerIPI --
*
* Check for VCPUs that are in the monitor and need an IPI to fire
* their next MonTimer callback. Should be called once per fast
* timer interrupt if the fast timer is in use.
*
* Results:
* None.
*
* Side effects:
* May send IPIs.
*
*----------------------------------------------------------------------
*/
void
Vmx86_MonTimerIPI(void)
{
VMDriver *vm;
VmAbsoluteTS pNow, expiry;
/*
* Loop through all vms -- needs the global lock to protect vmDriverList.
*/
HostIF_GlobalLock(21);
pNow = Vmx86_GetPseudoTSC();
for (vm = vmDriverList; vm != NULL; vm = vm->nextDriver) {
Vcpuid v;
VCPUSet expiredVCPUs;
Bool hasWork = FALSE;
VCPUSet_Empty(&expiredVCPUs);
for (v = 0; v < vm->numVCPUs; v++) {
VMCrossPageData *crosspage = vm->crosspage[v];
if (crosspage == NULL) {
continue; // VCPU is not initialized yet
}
expiry = crosspage->monTimerExpiry;
if (expiry != 0 && expiry <= pNow) {
VCPUSet_Include(&expiredVCPUs, v);
hasWork = TRUE;
}
}
if (hasWork) {
HostIF_IPI(vm, &expiredVCPUs);
}
}
HostIF_GlobalUnlock(21);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetNumVMs --
*
* Return the number of VMs.
*
* Results:
* The number of VMs.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int32
Vmx86_GetNumVMs(void)
{
return vmCount;
}
static INLINE PageCnt
Vmx86MinAllocationFunc(PageCnt nonpaged,
PageCnt anonymous,
PageCnt mainmem,
Percent memPct)
{
return (memPct * mainmem) / 100 + nonpaged + anonymous;
}
/*
*----------------------------------------------------------------------
*
* Vmx86MinAllocation --
*
* Computes the minimum number of pages that must be allocated to a
* specific vm. The minAllocation for a vm is defined as
* some percentage of guest memory plus 100% of nonpageable (overhead)
* and anonymous memory.
*
* Results:
* The minAllocation for this vm.
*
*
* Side effects:
* Analyzes the vm info, requiring the vm lock.
*
*----------------------------------------------------------------------
*/
static INLINE PageCnt
Vmx86MinAllocation(VMDriver *vm,
Percent memPct)
{
ASSERT(HostIF_VMLockIsHeld(vm));
return Vmx86MinAllocationFunc(vm->memInfo.nonpaged, vm->memInfo.anonymous,
vm->memInfo.mainMemSize, memPct);
}
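/*
 * Worked example (assumed numbers): a VM with mainMemSize == 1024
 * pages, nonpaged == 100, anonymous == 50, at memPct == 50 gets
 *
 *    minAllocation = (50 * 1024) / 100 + 100 + 50 = 662 pages
 *
 * i.e. half of guest memory plus all overhead and anonymous pages.
 */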
/*
*----------------------------------------------------------------------
*
* Vmx86CalculateGlobalMinAllocation --
*
* Computes the sum of minimum allocations of each vm assuming a given
* percentage of guest memory must fit within host RAM.
*
* Results:
* Number of pages that must fit within host RAM for a given overcommit
* level.
*
*
* Side effects:
* None. The actual minAllocations of each vm are NOT updated during
* this computation.
*
*----------------------------------------------------------------------
*/
static PageCnt
Vmx86CalculateGlobalMinAllocation(Percent memPct)
{
VMDriver *vm;
PageCnt minAllocation = 0;
ASSERT(HostIF_GlobalLockIsHeld());
/* Pages of other vms required to fit inside the hard limit. */
for (vm = vmDriverList; vm; vm = vm->nextDriver) {
HostIF_VMLock(vm, 2);
if (vm->memInfo.admitted) {
minAllocation += Vmx86MinAllocation(vm, memPct);
}
HostIF_VMUnlock(vm, 2);
}
return minAllocation;
}
/*
*----------------------------------------------------------------------
*
* Vmx86UpdateMinAllocations --
*
* Updates the minimum allocation for each vm based on the global
* overcommitment percentage.
*
* Results:
* None.
*
* Side effects:
* minAllocations of admitted VMs are updated.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
Vmx86UpdateMinAllocations(Percent memPct) // IN:
{
VMDriver *vm;
ASSERT(HostIF_GlobalLockIsHeld());
/* Pages of other vms required to fit inside the hard limit. */
for (vm = vmDriverList; vm; vm = vm->nextDriver) {
HostIF_VMLock(vm, 3);
if (vm->memInfo.admitted) {
vm->memInfo.minAllocation = Vmx86MinAllocation(vm, memPct);
}
HostIF_VMUnlock(vm, 3);
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86_SetConfiguredLockedPagesLimit --
*
* Set the user defined limit on the number of pages that can
* be locked. This limit can be raised at any time but not lowered.
* This avoids having a user lower the limit while VMs are running
* and inadvertently cause them to crash because of memory starvation.
*
* Results:
* Returns TRUE on success and FALSE on failure to set the limit
*
* Side effects:
* Hard limit may be changed.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_SetConfiguredLockedPagesLimit(PageCnt limit) // IN:
{
Bool retval = FALSE;
HostIF_GlobalLock(4);
if (limit >= lockedPageLimit.configured) {
lockedPageLimit.configured = limit;
retval = TRUE;
}
HostIF_GlobalUnlock(4);
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_LockPage --
*
* Lock a page.
*
* Results:
* A PAGE_LOCK_* status code and the MPN of the locked page on success.
*
* Side effects:
* Number of global and per-VM locked pages increased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_LockPage(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: VA of the page to lock
Bool allowMultipleMPNsPerVA, // IN: allow locking many pages with the same VA
MPN *mpn) // OUT
{
int retval;
/* Atomically check and reserve locked memory */
if (!Vmx86ReserveFreePages(vm, 1, FALSE)) {
return PAGE_LOCK_LIMIT_EXCEEDED;
}
HostIF_VMLock(vm, 4);
retval = HostIF_LockPage(vm, uAddr, allowMultipleMPNsPerVA, mpn);
HostIF_VMUnlock(vm, 4);
if (retval != PAGE_LOCK_SUCCESS) {
Vmx86UnreserveFreePages(vm, 1);
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UnlockPage --
*
* Unlock a page.
*
* Results:
* A PAGE_UNLOCK_* status code.
*
* Side effects:
* Number of global and per-VM locked pages decreased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_UnlockPage(VMDriver *vm, // IN
VA64 uAddr) // IN
{
int retval;
HostIF_VMLock(vm, 5);
retval = HostIF_UnlockPage(vm, uAddr);
HostIF_VMUnlock(vm, 5);
if (retval == PAGE_UNLOCK_SUCCESS) {
Vmx86UnreserveFreePages(vm, 1);
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UnlockPageByMPN --
*
* Unlock a page.
*
* Results:
* A PAGE_UNLOCK_* status code.
*
* Side effects:
* Number of global and per-VM locked pages decreased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_UnlockPageByMPN(VMDriver *vm, // IN: VMDriver
MPN mpn, // IN: the page to unlock
VA64 uAddr) // IN: optional valid VA for this MPN
{
int retval;
HostIF_VMLock(vm, 6);
retval = HostIF_UnlockPageByMPN(vm, mpn, uAddr);
HostIF_VMUnlock(vm, 6);
if (retval == PAGE_UNLOCK_SUCCESS) {
Vmx86UnreserveFreePages(vm, 1);
}
return retval;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_AllocLockedPages --
*
* Allocate physical locked pages from the kernel.
*
* Initially the pages are not mapped to any user or kernel
* address space.
*
* Results:
* Non-negative value on partial/full completion: actual number of
* allocated MPNs. MPNs of the allocated pages are copied to the
* caller's buffer at 'addr'.
*
* Negative system specific error code on error (NTSTATUS on Windows,
* etc.)
*
* Side effects:
* Number of global and per-VM locked pages is increased.
*
*-----------------------------------------------------------------------------
*/
int64
Vmx86_AllocLockedPages(VMDriver *vm, // IN: VMDriver
VA64 addr, // OUT: VA of an array for
// allocated MPNs.
PageCnt numPages, // IN: number of pages to allocate
Bool kernelMPNBuffer, // IN: is the MPN buffer in kernel
// or user address space?
Bool ignoreLimits) // IN: should limits be ignored?
{
int64 allocatedPages;
if (!Vmx86ReserveFreePages(vm, numPages, ignoreLimits)) {
// XXX What kind of system-specific error code is that? --hpreg
return PAGE_LOCK_LIMIT_EXCEEDED;
}
HostIF_VMLock(vm, 7);
allocatedPages = HostIF_AllocLockedPages(vm, addr, numPages,
kernelMPNBuffer);
HostIF_VMUnlock(vm, 7);
if (allocatedPages < 0) {
Vmx86UnreserveFreePages(vm, numPages);
} else if (allocatedPages < numPages) {
Vmx86UnreserveFreePages(vm, numPages - allocatedPages);
}
return allocatedPages;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_FreeLockedPages --
*
* Frees physical locked pages from the kernel previously allocated
* by Vmx86_AllocLockedPages().
*
* Results:
* 0 on success,
* non-0 system specific error code on error (NTSTATUS on Windows, etc.)
*
* Side effects:
* Number of global and per-VM locked pages is decreased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_FreeLockedPages(VMDriver *vm, // IN: VM instance pointer
MPN *mpns, // IN: MPNs to free
PageCnt numPages) // IN: number of pages to free
{
int ret;
HostIF_VMLock(vm, 8);
ret = HostIF_FreeLockedPages(vm, mpns, numPages);
HostIF_VMUnlock(vm, 8);
if (ret == 0) {
Vmx86UnreserveFreePages(vm, numPages);
}
return ret;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_AllocLowPage --
*
* Allocate a zeroed locked low page.
*
* Results:
* Allocated MPN on success. INVALID_MPN on failure.
*
* Side effects:
* Number of global and per-VM locked pages is increased.
*
*-----------------------------------------------------------------------------
*/
MPN
Vmx86_AllocLowPage(VMDriver *vm, // IN: VMDriver
Bool ignoreLimits) // IN: should limits be ignored?
{
MPN mpn;
if (!Vmx86ReserveFreePages(vm, 1, ignoreLimits)) {
return INVALID_MPN;
}
HostIF_VMLock(vm, 49);
mpn = HostIF_AllocLowPage(vm);
HostIF_VMUnlock(vm, 49);
if (mpn == INVALID_MPN) {
Vmx86UnreserveFreePages(vm, 1);
}
return mpn;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetNextAnonPage --
*
* Queries the driver to retrieve the list of anonymous pages.
* A supplied value of INVALID_MPN will start the query from
* the head of the list. Callers supply the previously received
* mpn to retrieve the next in the chain. Note: There is no
* guarantee of coherency.
*
* Results:
* A valid mpn or INVALID_MPN if the list has been exhausted.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
MPN
Vmx86_GetNextAnonPage(VMDriver *vm, // IN: VM instance pointer
MPN mpn) // IN: MPN
{
MPN ret;
HostIF_VMLock(vm, 22);
ret = HostIF_GetNextAnonPage(vm, mpn);
HostIF_VMUnlock(vm, 22);
return ret;
}
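/*
 * Iteration sketch (hypothetical caller): a full walk of the
 * anonymous page list follows the INVALID_MPN start convention
 * described above:
 *
 *    MPN mpn = Vmx86_GetNextAnonPage(vm, INVALID_MPN);  // list head
 *    while (mpn != INVALID_MPN) {
 *       <use mpn>;
 *       mpn = Vmx86_GetNextAnonPage(vm, mpn);
 *    }
 *
 * Because there is no coherency guarantee, pages allocated or freed
 * concurrently may be missed or visited twice by such a walk.
 */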
/*
*----------------------------------------------------------------------
*
* Vmx86_GetNumAnonPages --
*
* Queries the driver for the total number of anonymous pages.
*
* Results:
* Total number of anonymous pages
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
PageCnt
Vmx86_GetNumAnonPages(VMDriver *vm) // IN: VM instance pointer
{
PageCnt ret;
HostIF_VMLock(vm, 45);
ret = HostIF_GetNumAnonPages(vm);
HostIF_VMUnlock(vm, 45);
return ret;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetMemInfo --
*
* Return the info about all VMs.
*
* Results:
* TRUE if all info was successfully copied.
*
* Side effects:
* VMMemInfoArgs is filled in. If the supplied curVM is null
* then only the baseline information will be returned. Calling
* with a null curVM may return results for maxLockedPages
* that differ from those when the vm is passed if huge pages
* are in use.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_GetMemInfo(VMDriver *curVM,
Bool curVMOnly,
VMMemInfoArgs *outArgs,
int outArgsLength)
{
VMDriver *vm;
int outSize;
int wantedVMs;
HostIF_GlobalLock(7);
if (curVMOnly) {
wantedVMs = 1;
} else {
wantedVMs = vmCount;
}
outSize = VM_GET_MEM_INFO_SIZE(wantedVMs);
if (outSize > outArgsLength) {
HostIF_GlobalUnlock(7);
return FALSE;
}
outArgs->numVMs = wantedVMs;
outArgs->numLockedPages = numLockedPages;
outArgs->maxLockedPages = Vmx86LockedPageLimit(curVM);
outArgs->lockedPageLimit = lockedPageLimit;
outArgs->globalMinAllocation = Vmx86CalculateGlobalMinAllocation(minVmMemPct);
outArgs->minVmMemPct = minVmMemPct;
outArgs->callerIndex = (uint32)-1;
outArgs->currentTime = HostIF_ReadUptime() / HostIF_UptimeFrequency();
if (curVM == NULL) {
HostIF_GlobalUnlock(7);
return TRUE;
}
curVM->memInfo.timestamp = outArgs->currentTime;
if (wantedVMs == 1) {
outArgs->memInfo[0] = curVM->memInfo;
outArgs->callerIndex = 0;
} else {
int i;
for (i = 0, vm = vmDriverList;
vm != NULL && i < vmCount;
i++, vm = vm->nextDriver) {
if (vm == curVM) {
outArgs->callerIndex = i;
}
HostIF_VMLock(vm, 10);
outArgs->memInfo[i] = vm->memInfo;
HostIF_VMUnlock(vm, 10);
}
}
HostIF_GlobalUnlock(7);
if (outArgs->callerIndex == -1) {
return FALSE;
}
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86SetMemoryUsage --
*
* Updates the paged, nonpaged, and anonymous reserved-memory
* values for the vm.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86SetMemoryUsage(VMDriver *curVM, // IN/OUT
PageCnt paged, // IN
PageCnt nonpaged, // IN
PageCnt anonymous, // IN
Percent aminVmMemPct) // IN
{
ASSERT(HostIF_VMLockIsHeld(curVM));
curVM->memInfo.paged = paged;
curVM->memInfo.nonpaged = nonpaged;
curVM->memInfo.anonymous = anonymous;
curVM->memInfo.minAllocation = Vmx86MinAllocation(curVM, aminVmMemPct);
curVM->memInfo.maxAllocation = curVM->memInfo.mainMemSize + nonpaged +
anonymous;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_Admit --
*
* Set the memory management information about this VM and handles
* admission control. We allow a VM to power on if there is room in
* memory for the minimum allocation of all running VMs. Note that
* the hard memory limit can change dynamically on Windows, so
* admission control does not provide a hard guarantee.
*
* Results:
* Returns global information about the memory state in args as well
* as a value indicating whether or not the virtual machine was
* started.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
void
Vmx86_Admit(VMDriver *curVM, // IN
VMMemInfoArgs *args) // IN/OUT
{
Bool allowAdmissionCheck = FALSE;
PageCnt globalMinAllocation;
HostIF_GlobalLock(9);
/*
* Update the overcommitment level and minimums for all VMs if they
* can fit under the new minimum limit. If they do not fit, do
* nothing; if the existing VMs cannot fit under the limit, a new VM
* certainly will not fit either.
*/
globalMinAllocation = Vmx86CalculateGlobalMinAllocation(args->minVmMemPct);
if (globalMinAllocation <= Vmx86LockedPageLimit(NULL)) {
allowAdmissionCheck = TRUE;
minVmMemPct = args->minVmMemPct;
Vmx86UpdateMinAllocations(args->minVmMemPct);
}
HostIF_VMLock(curVM, 12);
curVM->memInfo.shares = args->memInfo->shares;
curVM->memInfo.touchedPct = 100;
curVM->memInfo.dirtiedPct = 100;
curVM->memInfo.mainMemSize = args->memInfo->mainMemSize;
curVM->memInfo.perVMOverhead = args->memInfo->perVMOverhead;
/*
* Always set the allocations required for the current configuration
* so that the user will know how bad the situation really is with
* the suggested percentage.
*/
curVM->memInfo.admitted = FALSE;
Vmx86SetMemoryUsage(curVM, args->memInfo->paged, args->memInfo->nonpaged,
args->memInfo->anonymous, args->minVmMemPct);
if (allowAdmissionCheck &&
globalMinAllocation + curVM->memInfo.minAllocation <=
Vmx86LockedPageLimit(curVM)) {
curVM->memInfo.admitted = TRUE;
}
#if defined _WIN32
if (curVM->memInfo.admitted) {
PageCnt allocatedPages, nonpaged;
int64 pages;
MPN *mpns;
/*
* More admission control: Get enough memory for the nonpaged portion
* of the VM. Drop locks for this long operation.
* XXX Timeout?
*/
HostIF_VMUnlock(curVM, 12);
HostIF_GlobalUnlock(9);
#define ALLOCATE_CHUNK_SIZE 64
allocatedPages = 0;
nonpaged = args->memInfo->nonpaged + args->memInfo->anonymous;
mpns = HostIF_AllocKernelMem(nonpaged * sizeof *mpns, FALSE);
if (mpns == NULL) {
goto undoAdmission;
}
while (allocatedPages < nonpaged) {
pages = Vmx86_AllocLockedPages(curVM,
PtrToVA64(mpns + allocatedPages),
MIN(ALLOCATE_CHUNK_SIZE, nonpaged - allocatedPages),
TRUE,
FALSE);
if (pages <= 0) {
break;
}
allocatedPages += pages;
}
/*
* Free the allocated pages.
* XXX Do not free the pages but hand them directly to the admitted VM.
*/
Vmx86_FreeLockedPages(curVM, mpns, allocatedPages);
HostIF_FreeKernelMem(mpns);
#undef ALLOCATE_CHUNK_SIZE
undoAdmission:
if (allocatedPages != nonpaged) {
curVM->memInfo.admitted = FALSE; // undo admission
}
HostIF_GlobalLock(9);
HostIF_VMLock(curVM, 12);
}
#endif
/* Return global state to the caller. */
args->memInfo[0] = curVM->memInfo;
args->numVMs = vmCount;
args->numLockedPages = numLockedPages;
args->maxLockedPages = Vmx86LockedPageLimit(curVM);
args->lockedPageLimit = lockedPageLimit;
args->globalMinAllocation = globalMinAllocation;
HostIF_VMUnlock(curVM, 12);
HostIF_GlobalUnlock(9);
}
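/*
 *----------------------------------------------------------------------
 *
 * Vmx86_Readmit --
 *
 * Re-runs admission control for a VM whose overhead memory usage
 * has changed by the amounts in "delta". The new usage is accepted
 * if the resulting global minimum allocation still fits under the
 * locked-page limit, or unconditionally if every delta is
 * non-positive (usage is shrinking).
 *
 * Results:
 * TRUE if the new memory usage was accepted and recorded,
 * FALSE otherwise.
 *
 * Side effects:
 * May update the VM's memory usage and allocation values.
 *
 *----------------------------------------------------------------------
 */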
Bool
Vmx86_Readmit(VMDriver *curVM, OvhdMem_Deltas *delta)
{
PageCnt globalMinAllocation, newMinAllocation;
Bool retval = FALSE;
int64 paged;
int64 nonpaged;
int64 anonymous;
HostIF_GlobalLock(31);
globalMinAllocation = Vmx86CalculateGlobalMinAllocation(minVmMemPct);
HostIF_VMLock(curVM, 31);
paged = curVM->memInfo.paged + delta->paged;
nonpaged = curVM->memInfo.nonpaged + delta->nonpaged;
anonymous = curVM->memInfo.anonymous + delta->anonymous;
if (nonpaged >= 0 && paged >= 0 && anonymous >= 0) {
globalMinAllocation -= Vmx86MinAllocation(curVM, minVmMemPct);
newMinAllocation = Vmx86MinAllocationFunc(nonpaged, anonymous,
curVM->memInfo.mainMemSize,
minVmMemPct);
if (globalMinAllocation + newMinAllocation <= Vmx86LockedPageLimit(curVM) ||
(delta->paged <= 0 && delta->nonpaged <= 0 && delta->anonymous <= 0)) {
Vmx86SetMemoryUsage(curVM, paged, nonpaged, anonymous, minVmMemPct);
retval = TRUE;
}
}
HostIF_VMUnlock(curVM, 31);
HostIF_GlobalUnlock(31);
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UpdateMemInfo --
*
* Updates information about this VM with the new data supplied in
* a patch.
*
* Results:
* Sets the memory usage by this vm based on its memSample data.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
void
Vmx86_UpdateMemInfo(VMDriver *curVM,
const VMMemMgmtInfoPatch *patch)
{
ASSERT(patch->touchedPct <= 100 && patch->dirtiedPct <= 100);
HostIF_VMLock(curVM, 13);
curVM->memInfo.touchedPct = AsPercent(patch->touchedPct);
curVM->memInfo.dirtiedPct = AsPercent(patch->dirtiedPct);
curVM->memInfo.hugePageBytes = patch->hugePageBytes;
HostIF_VMUnlock(curVM, 13);
}
/*
*----------------------------------------------------------------------
*
* Vmx86VMXEnabled --
*
* Test the VMXE bit as an easy proxy for whether VMX operation
* is enabled.
*
* Results:
* TRUE if the CPU supports VT and CR4.VMXE is set.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx86VMXEnabled(void)
{
if (VT_CapableCPU()) {
uintptr_t cr4;
GET_CR4(cr4);
return (cr4 & CR4_VMXE) != 0;
} else {
return FALSE;
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86EnableHVOnCPU --
*
* Enable HV on the current CPU, if possible.
*
* Results:
* None.
*
* Side effects:
* HV will be enabled, if possible.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86EnableHVOnCPU(void)
{
if (CPUID_HostSupportsSVM()) {
uint64 vmCR = X86MSR_GetMSR(MSR_VM_CR);
if (!SVM_LockedFromFeatures(vmCR)) {
CPUIDRegs regs;
__GET_CPUID(0x8000000A, &regs);
if (CPUID_GET(0x8000000A, EDX, SVM_LOCK, regs.edx) != 0) {
X86MSR_SetMSR(MSR_VM_CR, (vmCR & ~MSR_VM_CR_SVME_DISABLE) |
MSR_VM_CR_SVM_LOCK);
}
}
} else if (CPUID_HostSupportsVT()) {
uint64 featCtl = X86MSR_GetMSR(MSR_FEATCTL);
if (!VT_LockedFromFeatures(featCtl)) {
X86MSR_SetMSR(MSR_FEATCTL,
featCtl | MSR_FEATCTL_LOCK | MSR_FEATCTL_VMXE);
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86RefClockInCycles --
*
* Convert the reference clock (HostIF_Uptime) to cycle units.
*
*-----------------------------------------------------------------------------
*/
static INLINE uint64
Vmx86RefClockInCycles(uint64 uptime)
{
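   /* Scale by mult / 2^shift, i.e. (uptime * mult) >> shift. */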
return Mul64x3264(uptime,
pseudoTSC.refClockToPTSC.ratio.mult,
pseudoTSC.refClockToPTSC.ratio.shift);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86RefClockToPTSC --
*
* Convert from the reference clock (HostIF_Uptime) time to pseudo TSC.
*
*-----------------------------------------------------------------------------
*/
static INLINE uint64
Vmx86RefClockToPTSC(uint64 uptime)
{
return Vmx86RefClockInCycles(uptime) +
Atomic_Read64(&pseudoTSC.refClockToPTSC.add);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_InitPseudoTSC --
*
* Initialize the pseudo TSC state if it is not already initialized.
* If another vmx has initialized the pseudo TSC, then we continue to
* use the parameters specified by the first vmx.
*
* Results:
* None
*
* Side effects:
* - Updates tscHz, the frequency of the PTSC in Hz. That frequency may
* differ from the value passed in if another VM is already running.
* - Updates the refClkToTSC parameters to be consistent with the tscHz
* value that's in use.
*
*-----------------------------------------------------------------------------
*/
void
Vmx86_InitPseudoTSC(PTSCInitParams *params) // IN/OUT
{
VmTimeStart startTime;
uint64 tsc, uptime;
HostIF_GlobalLock(36);
if (!pseudoTSC.initialized) {
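      /*
       * pseudoTSC is still zero-initialized here, so these comparisons
       * normally make logParams TRUE on first initialization.
       */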
Bool logParams = pseudoTSC.hz != params->tscHz ||
pseudoTSC.hwTSCsSynced != params->hwTSCsSynced ||
pseudoTSC.useRefClock != params->forceRefClock;
pseudoTSC.hz = params->tscHz;
pseudoTSC.refClockToPTSC.ratio.mult = params->refClockToPTSC.mult;
pseudoTSC.refClockToPTSC.ratio.shift = params->refClockToPTSC.shift;
Vmx86_ReadTSCAndUptime(&startTime);
tsc = startTime.count;
uptime = startTime.time;
/* Start Pseudo TSC at initialPTSC (usually 0). */
pseudoTSC.tscOffset = params->initialPTSC - tsc;
Atomic_Write64(&pseudoTSC.refClockToPTSC.add,
params->initialPTSC - Vmx86RefClockInCycles(uptime));
/* forceRefClock gets priority. */
pseudoTSC.useRefClock = params->forceRefClock;
pseudoTSC.neverSwitchToRefClock = params->forceTSC;
pseudoTSC.hwTSCsSynced = params->hwTSCsSynced;
if (logParams) {
Log("PTSC: initialized at %"FMT64"u Hz using %s, TSCs are "
"%ssynchronized.\n", pseudoTSC.hz,
pseudoTSC.useRefClock ? "reference clock" : "TSC",
pseudoTSC.hwTSCsSynced ? "" : "not ");
}
pseudoTSC.initialized = TRUE;
}
/*
* Allow the calling vmx to respect ptsc.noTSC=TRUE config option
* even if another vmx is already running (pseudoTSC was already
* initialized). Useful for testing.
*/
if (params->forceRefClock) {
Vmx86_SetPseudoTSCUseRefClock();
}
params->refClockToPTSC.mult = pseudoTSC.refClockToPTSC.ratio.mult;
params->refClockToPTSC.shift = pseudoTSC.refClockToPTSC.ratio.shift;
params->refClockToPTSC.add = Atomic_Read64(&pseudoTSC.refClockToPTSC.add);
params->tscOffset = pseudoTSC.tscOffset;
params->tscHz = pseudoTSC.hz;
params->hwTSCsSynced = pseudoTSC.hwTSCsSynced;
HostIF_GlobalUnlock(36);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_GetPseudoTSC --
*
* Read the pseudo TSC. We prefer to implement the pseudo TSC using
* TSC. On systems where the TSC varies its rate (e.g. Pentium M),
* stops advancing when the core is in deep sleep (e.g. Core 2 Duo),
* or the TSCs can get out of sync across cores (e.g. Opteron due to
* halt clock ramping, Core 2 Duo due to independent core deep sleep
* states; though WinXP does handle the Core 2 Duo out of sync case;
* and on IBM x-Series NUMA machines), we use a reference clock
* (HostIF_ReadUptime()) as the basis for pseudo TSC.
*
* Note that we depend on HostIF_ReadUptime being a high resolution
* timer that is synchronized across all cores.
*
* Results:
* Current value of the PTSC.
*
*-----------------------------------------------------------------------------
*/
uint64
Vmx86_GetPseudoTSC(void)
{
if (Vmx86_PseudoTSCUsesRefClock()) {
return Vmx86RefClockToPTSC(HostIF_ReadUptime());
}
return RDTSC() + pseudoTSC.tscOffset;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_CheckPseudoTSC --
*
* Periodically called by userspace to check whether the TSC is
* reliable, using the reference clock as the trusted time source.
* If the TSC is unreliable, switch the basis of the PTSC from the
* TSC to the reference clock.
*
* Also, recompute the "add" component of the reference clock to PTSC
* conversion, to periodically eliminate the drift between the two
* clocks. That way, if the PTSC switches from using the TSC to the
* reference clock, PTSC will remain (roughly) continuous. See PR
* 547055.
*
* Note that we might be executing concurrently with other threads,
* but it doesn't matter since we only ever go from using the TSC to
* using the reference clock, never the other direction.
*
* Results:
* TRUE if the PTSC is implemented by the reference clock.
* FALSE if the PTSC is implemented by the TSC.
*
* Side effects:
* May switch the basis of the PTSC from the TSC to the reference clock.
*
*-----------------------------------------------------------------------------
*/
Bool
Vmx86_CheckPseudoTSC(uint64 *lastTSC, // IN/OUT: last/current value of the TSC
uint64 *lastRC) // IN/OUT: last/current value of the reference clock
{
VmTimeStart curTime;
Vmx86_ReadTSCAndUptime(&curTime);
if (pseudoTSC.initialized && *lastTSC && !Vmx86_PseudoTSCUsesRefClock()) {
uint64 tsc, refClkTS, refClkLastTS;
uint64 tscDiff, refClkDiff;
tsc = curTime.count;
refClkTS = Vmx86RefClockInCycles(curTime.time);
refClkLastTS = Vmx86RefClockInCycles(*lastRC);
tscDiff = tsc - *lastTSC;
refClkDiff = refClkTS - refClkLastTS;
if (((int64)tscDiff < 0) ||
(tscDiff * 100 < refClkDiff * 95) ||
(tscDiff * 95 > refClkDiff * 100)) {
/*
* TSC went backwards or drifted from the reference clock by
* more than 5% over the last poll period.
*/
Vmx86_SetPseudoTSCUseRefClock();
} else {
uint64 ptscFromTSC = tsc + pseudoTSC.tscOffset;
Atomic_Write64(&pseudoTSC.refClockToPTSC.add, ptscFromTSC - refClkTS);
}
}
*lastTSC = curTime.count;
*lastRC = curTime.time;
return Vmx86_PseudoTSCUsesRefClock();
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86GetMSR --
*
 * Collect the requested MSR values on the current logical CPU.
*
* Function must not block (it is invoked from interrupt context).
* Only VT MSRs are supported on VT-capable processors.
*
* Results:
* None.
*
* Side effects:
* 'data->index' is atomically incremented by one.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86GetMSR(void *clientData) // IN/OUT: A Vmx86GetMSRData *
{
   uint32 i;
   Vmx86GetMSRData *data = (Vmx86GetMSRData *)clientData;
   uint32 numPCPUs;
   size_t offset;
   ASSERT(data && data->index && data->query);
   numPCPUs = data->query->numLogicalCPUs;
   offset = sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs;
for (i = 0; i < data->numItems; ++i) {
uint32 index;
int err;
Atomic_uint32 *cpus = &data->index[i];
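      /*
       * Query slots are contiguous: each MSRQuery header is followed by
       * numPCPUs MSRReply entries, hence the byte stride 'offset'.
       */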
MSRQuery *query = (MSRQuery *) ((uint8 *)&data->query[0] + i * offset);
index = Atomic_ReadInc32(cpus);
if (index >= numPCPUs) {
continue;
}
query->logicalCPUs[index].tag = HostIF_GetCurrentPCPU();
/*
* We treat BIOS_SIGN_ID (microcode version) specially on Intel,
* where the preferred read sequence involves a macro.
*/
if (CPUID_GetVendor() == CPUID_VENDOR_INTEL &&
query->msrNum == MSR_BIOS_SIGN_ID) {
/* safe to read: MSR_BIOS_SIGN_ID architectural since Pentium Pro */
query->logicalCPUs[index].msrVal = INTEL_MICROCODE_VERSION();
err = 0;
} else {
/*
* Try to enable HV any time these MSRs are queried. We have seen
* buggy firmware that forgets to re-enable HV after waking from
* deep sleep. [PR 1020692]
*/
if (query->msrNum == MSR_FEATCTL || query->msrNum == MSR_VM_CR) {
Vmx86EnableHVOnCPU();
}
err =
HostIF_SafeRDMSR(query->msrNum, &query->logicalCPUs[index].msrVal);
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_GetAllMSRs --
*
 * Collect an MSR's value on the number of logical CPUs requested.
*
* The caller is responsible for ensuring that the requested MSR is valid
* on all logical CPUs.
*
* 'query->numLogicalCPUs' is the size of the 'query->logicalCPUs' output
* array.
*
* Results:
* On success: TRUE. 'query->logicalCPUs' is filled and
* 'query->numLogicalCPUs' is adjusted accordingly.
* On failure: FALSE. Happens if 'query->numLogicalCPUs' was too small.
*
* Side effects:
* None
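 *
 * Example (a sketch, not code from this module; it assumes the caller
 * sizes the query for two logical CPUs and that the MSR is valid on
 * both):
 *
 *    MSRQuery *q = Vmx86_Calloc(1, sizeof *q + 2 * sizeof(MSRReply),
 *                               FALSE);
 *    if (q != NULL) {
 *       q->msrNum = MSR_FEATCTL;
 *       q->numLogicalCPUs = 2;
 *       if (Vmx86_GetAllMSRs(q)) {
 *          // q->logicalCPUs[0 .. q->numLogicalCPUs - 1] hold values
 *       }
 *       Vmx86_Free(q);
 *    }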
*
*-----------------------------------------------------------------------------
*/
Bool
Vmx86_GetAllMSRs(MSRQuery *query) // IN/OUT
{
unsigned i, cpu;
Atomic_uint32 index;
Vmx86GetMSRData data;
data.index = &index;
data.numItems = 1;
/* Check MSR uniformity cache first. */
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
if (msrUniformityCacheInfo[i].msrIndex == query->msrNum) {
for (cpu = 0; cpu < query->numLogicalCPUs; cpu++) {
query->logicalCPUs[cpu].msrVal = msrUniformityCacheInfo[i].msrValue;
query->logicalCPUs[cpu].tag = cpu;
}
return TRUE;
}
}
Atomic_Write32(data.index, 0);
data.query = query;
HostIF_CallOnEachCPU(Vmx86GetMSR, &data);
   /*
    * At this point, Atomic_Read32(data.index) is the number of logical
    * CPUs that replied.
    */
if (Atomic_Read32(data.index) > query->numLogicalCPUs) {
return FALSE;
}
ASSERT(Atomic_Read32(data.index) <= query->numLogicalCPUs);
query->numLogicalCPUs = Atomic_Read32(data.index);
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86CheckVMXStatus --
*
* Checks the status of the given operation and issues a warning if it was
* not successful. If it is a valid failure, the error code will be read
* and logged.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86CheckVMXStatus(const char *operation, // IN: Operation string
VMXStatus status) // IN: Status to check
{
if (status != VMX_Success) {
Warning("%s failed with status %s.\n", operation,
status == VMX_FailValid ? "VMX_FailValid" :
status == VMX_FailInvalid ? "VMX_FailInvalid" : "UNKNOWN");
      /*
       * Our in-house build of binutils (2.16.1-vt), used with gcc 4.3,
       * doesn't handle VMREAD/VMWRITE operands properly, so only read
       * the error code when building with a newer compiler.
       */
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 3)
if (status == VMX_FailValid) {
size_t errorCode;
VMREAD_2_STATUS(VT_VMCS_VMINSTR_ERR, &errorCode);
Log("VM-instruction error: Error %"FMTSZ"d\n", errorCode);
}
#endif
#endif
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FlushVMCSPage --
*
 * VMCLEAR the given VMCS page on the current logical CPU. We first enable
 * HV if necessary and execute VMXON using this PCPU's VMXON region. If HV
 * was already enabled, it remains enabled. If we enabled HV or our VMXON
 * succeeded, we restore each to its original state after the VMCLEAR.
*
* Results:
* None.
*
* Side effects:
* The hardware VMCS cache will be flushed.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86FlushVMCSPage(void *clientData) // IN: The MA of the VMCS to VMCLEAR
{
MA vmxonRegion;
Bool hvWasEnabled;
MA vmcs = (MA)clientData;
Bool vmxWasInRootOperation = FALSE;
VMXStatus vmxonStatus, vmclearStatus, vmxoffStatus;
ASSERT(vmcs);
/* Enable HV if it isn't already enabled. */
hvWasEnabled = Vmx86VMXEnabled();
if (!hvWasEnabled) {
uintptr_t cr4reg;
ASSERT(VT_CapableCPU());
Vmx86EnableHVOnCPU();
GET_CR4(cr4reg);
SET_CR4(cr4reg | CR4_VMXE);
}
   /* VMXON using this CPU's VMXON region. */
vmxonRegion = MPN_2_MA(Task_GetHVRootPageForPCPU(HostIF_GetCurrentPCPU()));
vmxonStatus = VMXON_2_STATUS(&vmxonRegion);
if (vmxonStatus != VMX_Success) {
/* VMXON failed, we must already be in VMX root operation. */
vmxWasInRootOperation = TRUE;
}
/* VMCLEAR the given VMCS page. */
vmclearStatus = VMCLEAR_2_STATUS(&vmcs);
Vmx86CheckVMXStatus("VMCLEAR", vmclearStatus);
   /* VMXOFF if our VMXON above succeeded, i.e. we entered VMX operation here. */
if (!vmxWasInRootOperation) {
vmxoffStatus = VMXOFF_2_STATUS();
Vmx86CheckVMXStatus("VMXOFF", vmxoffStatus);
}
/* Disable HV if it was initially disabled. */
if (!hvWasEnabled) {
uintptr_t cr4reg;
GET_CR4(cr4reg);
SET_CR4(cr4reg & ~CR4_VMXE);
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_FlushVMCSAllCPUs --
*
* Enable HV (if necessary) and VMCLEAR a VMCS page on all logical CPUs.
* This will prevent stale data from surfacing out of the VMCS cache when
* executing VMREADs.
*
* Results:
* None.
*
* Side effects:
* HV will be enabled and hardware VMCS caches will be flushed across all
* CPUs.
*
*-----------------------------------------------------------------------------
*/
void
Vmx86_FlushVMCSAllCPUs(MA vmcs) // IN: The MA of the VMCS to VMCLEAR
{
HostIF_CallOnEachCPU(Vmx86FlushVMCSPage, (void *)vmcs);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_YieldToSet --
*
* Yield the CPU until a vCPU from the requested set has run.
*
 * usecs is the total spin time in the monitor. A very low value
 * indicates we detected a vCPU thread that was not in the monitor,
 * so we didn't spin. In that case, simply nudge the threads we
 * want and return.
*
* Results:
* The current CPU yields whenever possible.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_YieldToSet(VMDriver *vm, // IN:
Vcpuid currVcpu, // IN:
const VCPUSet *req, // IN:
uint32 usecs, // IN:
Bool skew) // IN:
{
VCPUSet vcpus;
ASSERT(currVcpu < vm->numVCPUs);
if (VCPUSet_IsEmpty(req)) {
return;
}
/* Crosscalls should spin a few times before blocking */
if (!skew && usecs < CROSSCALL_SPIN_SHORT_US) {
HostIF_WakeUpYielders(vm, currVcpu);
return;
}
if (HostIF_PrepareWaitForThreads(vm, currVcpu)) {
return;
}
VCPUSet_Empty(&vcpus);
FOR_EACH_VCPU_IN_SET_WITH_MAX(req, vcpuid, vm->numVCPUs) {
if (vcpuid == currVcpu) {
continue;
}
/*
* First assume the vCPU we want to have wake up the current vCPU
* is out of the monitor, so set its wakeup bit corresponding to
* the current vCPU. It may or may not actually be on the vmmon side.
*/
VCPUSet_AtomicInclude(&vm->crosscallWaitSet[vcpuid], currVcpu);
/*
* Now that the bit is set, check whether the vCPU is in vmmon. If
* it was previously in vmmon, and then took a trip to the monitor
* and back before we got here, then the wakeup has already been sent.
* If it is in the monitor, either it started in vmmon and sent the
* wakeup, or it was there the entire time. In either case we can
* clear the bit. This is safe because the bit is directed solely
* at the current vCPU.
*/
if (Atomic_Read32(&vm->currentHostCpu[vcpuid]) != INVALID_PCPU) {
VCPUSet_AtomicRemove(&vm->crosscallWaitSet[vcpuid], currVcpu);
} else {
if (VCPUSet_AtomicIsMember(&vm->crosscallWaitSet[vcpuid], currVcpu)) {
VCPUSet_Include(&vcpus, vcpuid);
}
}
} ROF_EACH_VCPU_IN_SET_WITH_MAX();
/*
* Wake up any threads that had previously yielded the processor to
* let this one run.
*/
HostIF_WakeUpYielders(vm, currVcpu);
/*
* If this thread has other threads to wait for, and no other threads
* are waiting for this thread, block until one of the threads we're
* waiting for has run.
*/
if (!VCPUSet_IsEmpty(&vcpus) &&
VCPUSet_IsEmpty(&vm->crosscallWaitSet[currVcpu])) {
HostIF_WaitForThreads(vm, currVcpu);
}
/*
* Tell other vcpus that they no longer have to wake this one.
* This is optional, the other threads will eventually clear their
* bits anyway.
*/
FOR_EACH_VCPU_IN_SET_WITH_MAX(&vcpus, vcpuid, vm->numVCPUs) {
VCPUSet_AtomicRemove(&vm->crosscallWaitSet[vcpuid], currVcpu);
} ROF_EACH_VCPU_IN_SET_WITH_MAX();
HostIF_CancelWaitForThreads(vm, currVcpu);
}
/*
*----------------------------------------------------------------------
*
* Vmx86PerfCtrInUse --
*
* Determine which performance counters are already in use by the
* host on the current PCPU. A performance counter is considered
* in use if its event select enable bit is set or if this method
* is unable to count events with the performance counter.
*
* Results:
* Return TRUE if counter is in use.
*
* Side effects:
* None.
*----------------------------------------------------------------------
*/
static Bool
Vmx86PerfCtrInUse(Bool isGen, unsigned pmcNum, unsigned ctrlMSR,
unsigned cntMSR, Bool hasPGC)
{
volatile unsigned delay;
uint64 origPGC = hasPGC ? X86MSR_GetMSR(PERFCTR_CORE_GLOBAL_CTRL_ADDR) : 0;
uint64 pmcCtrl;
uint64 pmcCount, count;
uint64 ctrlEna, pgcEna;
pmcCtrl = X86MSR_GetMSR(ctrlMSR);
if (isGen) {
ASSERT(pmcNum < 32);
if ((pmcCtrl & PERFCTR_CPU_ENABLE) != 0) {
return TRUE;
}
ctrlEna = PERFCTR_CPU_ENABLE | PERFCTR_CPU_KERNEL_MODE |
PERFCTR_CORE_INST_RETIRED;
pgcEna = CONST64U(1) << pmcNum;
} else {
ASSERT(pmcNum < PERFCTR_CORE_NUM_FIXED_COUNTERS);
if ((pmcCtrl & PERFCTR_CORE_FIXED_ENABLE_MASKn(pmcNum)) != 0) {
return TRUE;
}
ctrlEna = pmcCtrl | PERFCTR_CORE_FIXED_KERNEL_MASKn(pmcNum);
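      /* Fixed-counter enable bits start at bit 32 of IA32_PERF_GLOBAL_CTRL. */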
pgcEna = CONST64U(1) << (pmcNum + 32);
}
pmcCount = X86MSR_GetMSR(cntMSR);
/* Enable the counter. */
X86MSR_SetMSR(ctrlMSR, ctrlEna);
if (hasPGC) {
X86MSR_SetMSR(PERFCTR_CORE_GLOBAL_CTRL_ADDR, pgcEna | origPGC);
}
/* Retire some instructions and wait a few cycles. */
for (delay = 0; delay < 100; delay++) ;
/* Disable the counter. */
if (hasPGC) {
X86MSR_SetMSR(PERFCTR_CORE_GLOBAL_CTRL_ADDR, origPGC);
}
count = X86MSR_GetMSR(cntMSR);
X86MSR_SetMSR(ctrlMSR, pmcCtrl);
X86MSR_SetMSR(cntMSR, pmcCount);
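   /* If the count did not advance, the counter is unusable and considered in use. */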
return count == pmcCount;
}
/*
*----------------------------------------------------------------------
*
* Vmx86GetUnavailPerfCtrsOnCPU --
*
* Determine which performance counters are already in use by the
* host on the current PCPU.
*
 * Results:
 * None.
 *
 * Side effects:
 * Sets bits in the shared bitset pointed to by 'data' for counters
 * that are in use. Bits 0-31 represent general purpose counters, and
 * bits 32-63 represent fixed counters.
*
*----------------------------------------------------------------------
*/
static void
Vmx86GetUnavailPerfCtrsOnCPU(void *data)
{
CPUIDRegs regs;
unsigned i, numGen = 0, numFix = 0, stride = 1;
uint32 selBase = 0;
uint32 ctrBase = 0;
Bool hasPGC = FALSE;
Atomic_uint64 *ctrs = (Atomic_uint64 *)data;
uintptr_t flags;
if (CPUID_GetVendor() == CPUID_VENDOR_INTEL) {
unsigned version;
if (__GET_EAX_FROM_CPUID(0) < 0xA) {
return;
}
__GET_CPUID(0xA, &regs);
version = CPUID_GET(0xA, EAX, PMC_VERSION, regs.eax);
if (version == 0) {
return;
}
numGen = CPUID_GET(0xA, EAX, PMC_NUM_GEN, regs.eax);
if (version >= 2) {
numFix = CPUID_GET(0xA, EDX, PMC_NUM_FIXED, regs.edx);
hasPGC = TRUE;
}
selBase = PERFCTR_CORE_PERFEVTSEL0_ADDR;
ctrBase = PERFCTR_CORE_PERFCTR0_ADDR;
} else if (CPUID_GetVendor() == CPUID_VENDOR_AMD ||
CPUID_GetVendor() == CPUID_VENDOR_HYGON) {
if (CPUID_ISSET(0x80000001, ECX, PERFCORE,
__GET_ECX_FROM_CPUID(0x80000001))) {
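      /*
       * PERFCORE: six extended counters whose event-select and counter
       * MSRs are interleaved, hence the stride of two.
       */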
numGen = 6;
selBase = PERFCTR_AMD_EXT_BASE_ADDR + PERFCTR_AMD_EXT_EVENTSEL;
ctrBase = PERFCTR_AMD_EXT_BASE_ADDR + PERFCTR_AMD_EXT_CTR;
stride = 2;
} else {
numGen = 4;
selBase = PERFCTR_AMD_PERFEVTSEL0_ADDR;
ctrBase = PERFCTR_AMD_PERFCTR0_ADDR;
}
}
ASSERT(numGen <= 32 && numFix <= 32);
   /*
    * Vmx86PerfCtrInUse modifies performance counters to determine
    * whether they are usable, so disable interrupts to avoid racing
    * with interrupt handlers.
    */
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
for (i = 0; i < numGen; i++) {
if (Vmx86PerfCtrInUse(TRUE, i, selBase + i * stride,
ctrBase + i * stride, hasPGC)) {
Atomic_SetBit64(ctrs, i);
}
}
if (numFix > 0) {
numFix = MIN(numFix, PERFCTR_CORE_NUM_FIXED_COUNTERS);
for (i = 0; i < numFix; i++) {
if (Vmx86PerfCtrInUse(FALSE, i, PERFCTR_CORE_FIXED_CTR_CTRL_ADDR,
PERFCTR_CORE_FIXED_CTR0_ADDR + i, hasPGC)) {
Atomic_SetBit64(ctrs, i + 32);
}
}
}
RESTORE_FLAGS(flags);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetUnavailablePerfCtrs --
*
 * Determine which performance counters are already in use by the
 * host across all PCPUs, and therefore unavailable for use by the
 * monitor. A performance counter is considered in use if its event
 * select enable bit is set on any PCPU.
 *
 * Results:
 * A bitset representing unavailable performance counters.
 * Bits 0-31 represent general purpose counters, and bits 32-63
 * represent fixed counters.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
uint64
Vmx86_GetUnavailablePerfCtrs(void)
{
Atomic_uint64 unavailCtrs;
Atomic_Write64(&unavailCtrs, 0);
HostIF_CallOnEachCPU(Vmx86GetUnavailPerfCtrsOnCPU, &unavailCtrs);
return Atomic_Read64(&unavailCtrs);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetPageRoot --
*
* Get the page root MPN for the specified VCPU.
*
* Results:
* TRUE and an MPN on success, FALSE on failure.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_GetPageRoot(VMDriver *vm, // IN:
Vcpuid vcpuid, // IN:
MPN *mpn) // OUT:
{
if (vcpuid >= vm->numVCPUs) {
return FALSE;
}
*mpn = vm->ptRootMpns[vcpuid];
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_MapPage --
*
 * Maps the specified MPN into the host kernel address space and
 * returns the VPN of the mapping.
*
* Results:
* The VPN in the kernel address space of the new mapping, or 0 if
* the mapping failed.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
VPN
Vmx86_MapPage(MPN mpn) // IN:
{
return HostIF_MapPage(mpn);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UnmapPage --
*
* Unmaps the specified VPN from the host kernel address space.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_UnmapPage(VPN vpn) // IN:
{
HostIF_UnmapPage(vpn);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetMonitorContext --
*
* Gets most of the monitor's saved context (as of the last world switch)
* from a given VCPU's crosspage. CR3 is omitted as it is privileged,
* while DS/SS/ES are returned due to their potential utility in debugging.
*
* Results:
* On success, TRUE and context is (partially) populated. FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_GetMonitorContext(VMDriver *vm, // IN: The VM instance.
Vcpuid vcpuid, // IN: VCPU in question.
Context64 *context) // OUT: context.
{
VMCrossPageData *cpData;
if (vcpuid >= vm->numVCPUs || vm->crosspage[vcpuid] == NULL) {
return FALSE;
}
cpData = vm->crosspage[vcpuid];
memset(context, 0, sizeof *context);
context->es = cpData->monES;
context->ss = cpData->monSS;
context->ds = cpData->monDS;
context->rbx = cpData->monRBX;
context->rsp = cpData->monRSP;
context->rbp = cpData->monRBP;
context->r12 = cpData->monR12;
context->r13 = cpData->monR13;
context->r14 = cpData->monR14;
context->r15 = cpData->monR15;
context->rip = cpData->monRIP;
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CleanupHVIOBitmap --
*
* Free any resources that were allocated for the HV I/O bitmap.
*
* Results:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_CleanupHVIOBitmap(void)
{
if (hvIOBitmap != NULL) {
HostIF_FreeContigPages(NULL, hvIOBitmap);
hvIOBitmap = NULL;
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CreateHVIOBitmap --
*
* Called on driver load to create and initialize the host wide SVM I/O
* bitmap. This item is a physically contiguous region of
* SVM_VMCB_IO_BITMAP_PAGES pages and is initialized to all-bits-set.
*
* Results:
* TRUE on success or FALSE on failure.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_CreateHVIOBitmap(void)
{
if (!CPUID_HostSupportsSVM()) {
return TRUE;
}
hvIOBitmap = HostIF_AllocContigPages(NULL, SVM_VMCB_IO_BITMAP_PAGES);
if (hvIOBitmap == NULL) {
Warning("Failed to allocate SVM I/O bitmap.\n");
return FALSE;
}
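   /* A set bitmap bit means the corresponding I/O port access is intercepted. */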
memset(hvIOBitmap->addr, 0xff, SVM_VMCB_IO_BITMAP_SIZE);
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86RegisterCPU --
*
* Registers each logical CPU by incrementing a counter.
*
* Results:
* None.
*
* Side effects:
 * The counter pointed to by 'data' is incremented by one.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86RegisterCPU(void *data) // IN: *data
{
Atomic_uint32 *numLogicalCPUs = data;
ASSERT(numLogicalCPUs);
Atomic_Inc32(numLogicalCPUs);
}
/*
*----------------------------------------------------------------------
*
* Vmx86VTMSRCacheGet --
*
* Retrieve the requested VT MSR value from the cache. Returns zero
* for uncached values.
*
*----------------------------------------------------------------------
*/
static uint64
Vmx86VTMSRCacheGet(const MSRCache *cache, uint32 msrNum, unsigned cpu)
{
ASSERT((msrNum >= MSR_VMX_BASIC && msrNum < MSR_VMX_BASIC + NUM_VMX_MSRS) ||
msrNum == MSR_FEATCTL);
if (cache != NULL && cache->queryCache != NULL) {
size_t offset = sizeof(MSRQuery) + sizeof(MSRReply) * cache->nPCPUs;
MSRQuery *query;
unsigned ix;
ASSERT(cpu < cache->nPCPUs);
for (ix = 0; ix < cache->queryCache->numItems; ix++) {
query = (MSRQuery *) ((uint8 *)&cache->queryCache->query[0] +
ix * offset);
if (query->msrNum == msrNum) {
return query->logicalCPUs[cpu].msrVal;
}
}
}
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86AllocMSRUniformityCache --
* Vmx86FreeMSRUniformityCache --
*
* Allocate/populate and cleanup MSR uniformity cache.
*
*-----------------------------------------------------------------------------
*/
static Bool
Vmx86AllocMSRUniformityCache(uint32 numPCPUs)
{
MSRQuery *query = NULL;
uint32 i;
uint32 numQueries = ARRAYSIZE(msrUniformityCacheInfo);
Atomic_uint32 *cpuCounters;
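   /* Each slot holds an MSRQuery header followed by one MSRReply per PCPU. */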
MSRQuery *multMSRQueryAllPcpus = Vmx86_Calloc(numQueries,
sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs, FALSE);
if (multMSRQueryAllPcpus == NULL) {
return FALSE;
}
cpuCounters = Vmx86_Calloc(numQueries, sizeof(Atomic_uint32), FALSE);
if (cpuCounters == NULL) {
Vmx86_Free(multMSRQueryAllPcpus);
return FALSE;
}
msrCacheQueryData.query = multMSRQueryAllPcpus;
msrCacheQueryData.index = cpuCounters;
msrCacheQueryData.numItems = numQueries;
   /*
    * Enumerate the MSR list and initialize the msrCacheQueryData
    * structure before the actual (safe) MSR query takes place.
    */
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
query = (MSRQuery *) ((uint8 *)&msrCacheQueryData.query[0] +
i * (sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs));
Atomic_Write32(&msrCacheQueryData.index[i], 0);
query->msrNum = msrUniformityCacheInfo[i].msrIndex;
query->numLogicalCPUs = numPCPUs;
}
   /* Perform a single query for all of the MSRs in the uniformity check list. */
HostIF_CallOnEachCPU(Vmx86GetMSR, &msrCacheQueryData);
return TRUE;
}
static void
Vmx86FreeMSRUniformityCache(void)
{
Vmx86_Free(msrCacheQueryData.index);
Vmx86_Free(msrCacheQueryData.query);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86CheckMSRUniformity --
*
* Iterate MSR uniformity cache and test uniformity of each MSR across all
* physical cpu(s).
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86CheckMSRUniformity(uint32 numPCPUs)
{
uint32 i, j;
MSRQuery *query = NULL;
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
uint32 msrIndex = msrUniformityCacheInfo[i].msrIndex;
query = (MSRQuery *)((uint8 *)&msrCacheQueryData.query[0] +
i * (sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs));
ASSERT(Atomic_Read32(&msrCacheQueryData.index[i]) == numPCPUs);
for (j = 1; j < numPCPUs; j++) {
uint64 msrValuePCPU = query->logicalCPUs[j].msrVal;
if (msrValuePCPU != query->logicalCPUs[0].msrVal) {
Warning("Found a mismatch on MSR feature 0x%x; logical cpu%u "
"value = 0x%llx, but logical cpu%u value = 0x%llx\n",
msrIndex, j, msrValuePCPU, 0, query->logicalCPUs[0].msrVal);
}
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FindMSRQueryFromCache --
*
* Iterate MSR uniformity cache and find query position for the given msr.
*
*-----------------------------------------------------------------------------
*/
static MSRQuery*
Vmx86FindMSRQueryFromCache(uint32 msrIndex, uint32 numPCPUs)
{
uint32 i;
MSRQuery *query = NULL;
size_t offset = sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs;
MSRQuery *first = &msrCacheQueryData.query[0];
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
if (msrIndex == msrUniformityCacheInfo[i].msrIndex) {
query = (MSRQuery *)((uint8 *)first + i * offset);
break;
}
}
return query;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FindCommonMSRArchCap --
* Vmx86FindCommonMSRBIOSSignID --
* Vmx86FindCommonMSRVMCR --
* Vmx86FindCommonMSRJoin --
*
* Calculate least common denominator for IA32_MSR_ARCH_CAPABILITIES,
* MSR_BIOS_SIGN_ID, MSR_VM_CR, and general case respectively.
*
*-----------------------------------------------------------------------------
*/
static uint64
Vmx86FindCommonMSRArchCap(uint32 msrIndex, uint32 numPCPUs)
{
uint32 j;
uint64 msrCommonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
ASSERT(msrIndex == IA32_MSR_ARCH_CAPABILITIES);
msrCommonVal = query->logicalCPUs[0].msrVal;
   /*
    * A set MSR_ARCH_CAPABILITIES_RSBA bit represents the lack of a
    * feature while a clear bit represents its presence. The bit is
    * therefore flipped before computing the least common set and
    * flipped back in the final value.
    */
msrCommonVal ^= MSR_ARCH_CAPABILITIES_RSBA;
for (j = 1; j < numPCPUs; j++) {
uint64 msrValuePCPU = query->logicalCPUs[j].msrVal;
if (msrValuePCPU != query->logicalCPUs[0].msrVal) {
msrValuePCPU ^= MSR_ARCH_CAPABILITIES_RSBA;
msrCommonVal &= msrValuePCPU;
}
}
msrCommonVal ^= MSR_ARCH_CAPABILITIES_RSBA;
return msrCommonVal;
}
static uint64
Vmx86FindCommonMSRBIOSSignID(uint32 msrIndex, uint32 numPCPUs)
{
unsigned cpu;
uint64 commonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
commonVal = ~0ULL;
for (cpu = 0; cpu < numPCPUs; cpu++) {
if (query->logicalCPUs[cpu].msrVal < commonVal) {
commonVal = query->logicalCPUs[cpu].msrVal;
}
}
return commonVal;
}
static uint64
Vmx86FindCommonMSRVMCR(uint32 msrIndex, uint32 numPCPUs)
{
unsigned cpu;
uint64 commonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
commonVal = query->logicalCPUs[0].msrVal;
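   /*
    * AND the R_INIT bit across all CPUs; the remaining bits take the
    * value seen on the last CPU (they are presumed uniform).
    */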
for (cpu = 1; cpu < numPCPUs; cpu++) {
uint64 msrValuePCPU = query->logicalCPUs[cpu].msrVal;
commonVal &= msrValuePCPU & MSR_VM_CR_R_INIT;
commonVal |= msrValuePCPU & ~MSR_VM_CR_R_INIT;
}
return commonVal;
}
static uint64
Vmx86FindCommonMSRJoin(uint32 msrIndex, uint32 numPCPUs)
{
uint32 j;
uint64 msrCommonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
msrCommonVal = query->logicalCPUs[0].msrVal;
for (j = 1; j < numPCPUs; j++) {
msrCommonVal &= query->logicalCPUs[j].msrVal;
}
return msrCommonVal;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86GenFindCommonCap --
* Vmx86GenFindCommonIntelVTCap --
* Vmx86FindCommonMSR --
*
* Generate common MSR calculation routines by deriving appropriate
* function with 'member' name.
*
*-----------------------------------------------------------------------------
*/
static uint64
Vmx86GenFindCommonCap(uint32 msrIndex, uint32 numPCPUs)
{
#define MSRNUMVT(msr, member)
#define MSRNUMVT2 MSRNUMVT
#define MSRNUM(msr, member) \
if (msrIndex == msr) { \
return Vmx86FindCommonMSR##member(msrIndex, numPCPUs); \
} else { \
return Vmx86FindCommonMSRJoin(msrIndex, numPCPUs); \
}
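   /*
    * Each MSRNUM(msr, member) entry expands to a dispatch that returns
    * the specialized Vmx86FindCommonMSR<member> on a match and falls
    * back to Vmx86FindCommonMSRJoin otherwise.
    */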
UNIFORMITY_CACHE_MSRS
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
return CONST64(0);
}
static uint64
Vmx86GenFindCommonIntelVTCap(uint32 msrIndex, uint32 numPCPUs)
{
MSRCache vt;
IntelVTMSRGet_Fn fn = Vmx86VTMSRCacheGet;
/* Prepare a special cache for VT MSR uniformity checks. */
vt.queryCache = &msrCacheQueryData;
vt.nPCPUs = numPCPUs;
#define MSRNUM(msr, member)
#define MSRNUMVT(msr, member) \
if (msrIndex == msr) { \
return IntelVT_FindCommon##member(&vt, fn, numPCPUs); \
}
#define MSRNUMVT2(msr, member) \
if (msrIndex == msr) { \
return IntelVT_FindCommon##member(&vt, fn, numPCPUs, msr); \
}
UNIFORMITY_CACHE_MSRS
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
return CONST64(0);
}
static uint64
Vmx86FindCommonMSR(uint32 msrIndex, uint32 numPCPUs)
{
#define MSRNUM(msr, member) \
if (msrIndex == msr) { \
return Vmx86GenFindCommonCap(msrIndex, numPCPUs); \
}
#define MSRNUMVT(msr, member) \
if (msrIndex == msr) { \
return Vmx86GenFindCommonIntelVTCap(msrIndex, numPCPUs); \
}
#define MSRNUMVT2 MSRNUMVT
UNIFORMITY_CACHE_MSRS
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
return CONST64(0);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_CheckMSRUniformity --
*
 * Provides basic hardware MSR feature checks for hosted x86 platforms.
 * The VMM requires, or at least prefers, uniformity of certain MSRs.
 * This function iterates through a list of MSR features (i.e.
 * msrUniformityCacheInfo), checking that each MSR's value is uniform
 * across all logical CPUs. Uniformity checks are skipped for MSRs that
 * are not available on the target architecture or CPU family. If an
 * MSR is non-uniform, a common bit field is calculated by taking the
 * intersection of its values across CPUs.
*
* Results:
* Returns TRUE if MSR uniformity checks complete successfully, FALSE
* otherwise.
*
* Side effects:
 * Updates the msrUniformityCacheInfo cache with the common MSR values.
*
*-----------------------------------------------------------------------------
*/
Bool
Vmx86_CheckMSRUniformity(void)
{
uint32 i;
Atomic_uint32 numLogicalCPUs;
uint32 numPCPUs = 0;
Atomic_Write32(&numLogicalCPUs, 0);
   /*
    * Count the logical CPUs, then use that count to size the MSR
    * queries that will be executed on each logical CPU.
    */
HostIF_CallOnEachCPU(Vmx86RegisterCPU, &numLogicalCPUs);
numPCPUs = Atomic_Read32(&numLogicalCPUs);
ASSERT(numPCPUs > 0);
if (!Vmx86AllocMSRUniformityCache(numPCPUs)) {
Warning("Fatal, not enough memory for MSR feature uniformity checks");
return FALSE;
}
Vmx86CheckMSRUniformity(numPCPUs);
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
uint32 msrIndex = msrUniformityCacheInfo[i].msrIndex;
msrUniformityCacheInfo[i].msrValue = Vmx86FindCommonMSR(msrIndex,
numPCPUs);
}
Vmx86FreeMSRUniformityCache();
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_KernelCETEnabled --
*
* Check if kernel mode shadow stacks are enabled by examining
* the current shadow stack pointer.
*
* Results:
* TRUE if any CPU has kernel mode shadow stacks enabled.
*
*----------------------------------------------------------------------
*/
static void
Vmx86KernelCETEnabledOnCPU(void *data)
{
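   /* GET_SSP yields INVALID_SSP when shadow stacks are not active on this CPU. */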
uint64 ssp = GET_SSP();
if (ssp != INVALID_SSP) {
Atomic_Bool *kernelCETEnabled = (Atomic_Bool *)data;
Atomic_WriteBool(kernelCETEnabled, TRUE);
}
}
Bool
Vmx86_KernelCETEnabled(void)
{
Atomic_Bool kernelCETEnabled;
Atomic_WriteBool(&kernelCETEnabled, FALSE);
HostIF_CallOnEachCPU(Vmx86KernelCETEnabledOnCPU, &kernelCETEnabled);
return Atomic_ReadBool(&kernelCETEnabled);
}