
/*********************************************************
* Copyright (C) 1998-2022 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
/*
* vmx86.c --
*
* Platform independent routines for creating/destroying/running
* virtual machine monitors.
*/
#ifdef __linux__
/* Must come before any kernel header file --hpreg */
# include "driver-config.h"
# include <linux/string.h> /* memset() in the kernel */
# include <linux/sched.h> /* jiffies from the kernel */
#else
# include <string.h>
#endif
#include "vm_assert.h"
#include "vm_basic_math.h"
#include "vmx86.h"
#include "task.h"
#include "iocontrols.h"
#include "hostif.h"
#include "cpuid.h"
#include "vcpuset.h"
#include "memtrack.h"
#if defined(_WIN64)
#include "vmmon-asm-x86-64.h"
#endif
#include "x86vt.h"
#include "x86svm.h"
#include "x86cpuid_asm.h"
#if defined(__linux__)
#include <linux/timex.h>
#endif
#include "perfctr.h"
#include "x86vtinstr.h"
#include "bootstrap_vmm.h"
#include "monLoader.h"
#include "vmmblob.h"
#include "sharedAreaVmmon.h"
#include "statVarsVmmon.h"
#include "intelVT.h"
#include "cpu_defs.h"
#include "x86cet.h"
PseudoTSC pseudoTSC;
/*
* Keep track of the virtual machines that have been
* created using the following structures.
*/
static VMDriver *vmDriverList = NULL;
static LockedPageLimit lockedPageLimit = {
0, // host: does not need to be initialized.
0, // configured: must be set by some VM as it is powered on.
};
/* Percentage of guest "paged" memory that must fit within the hard limit. */
static Percent minVmMemPct;
/* Number of pages actually locked by all virtual machines */
static PageCnt numLockedPages;
/* Total virtual machines on this host */
static unsigned vmCount;
/* Total number of open vmmon file handles. */
static unsigned fdCount;
/*
* We implement a list of allocated VM ID's using an array.
* The array is initialized with the values 1...MAX_VMS-1, INVALID_VMID.
* vmIDsAllocated holds the last VM ID given out and vmIDsUnused
* holds the next VM ID to give out.
*/
#define INVALID_VMID (-1)
static int vmIDList[MAX_VMS];
static int vmIDsAllocated;
static int vmIDsUnused;
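/*
 * Illustrative trace (not part of the driver) of the ID free list,
 * assuming MAX_VMS == 4:
 *
 *    After Vmx86_InitIDList():
 *       vmIDList       = { 1, 2, 3, INVALID_VMID }
 *       vmIDsUnused    = 0            (head of the free list)
 *       vmIDsAllocated = INVALID_VMID (allocated list is empty)
 *
 *    After the first Vmx86AllocVMID() returns 0:
 *       vmIDList       = { INVALID_VMID, 2, 3, INVALID_VMID }
 *       vmIDsUnused    = 1
 *       vmIDsAllocated = 0
 *
 * Each slot of vmIDList holds the index of the next element of
 * whichever list (free or allocated) that slot currently belongs to.
 */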
/* Max rate requested for fast clock by any virtual machine. */
static unsigned globalFastClockRate;
/* 3 physically contiguous pages for the I/O bitmap. SVM only. */
HostIFContigMemMap *hvIOBitmap;
typedef struct {
Atomic_uint32 *index; // OUT: array of cpu counters for queries.
MSRQuery *query; // IN/OUT: array of query items
uint32 numItems; // IN
} Vmx86GetMSRData;
static Bool hostUsesNX;
typedef struct NXData {
Atomic_uint32 responded;
Atomic_uint32 hasNX;
} NXData;
/*
* A structure holding MSR indexes and values for MSR uniformity checks.
*/
typedef struct VMX86MSRCacheInfo {
uint32 msrIndex;
uint64 msrValue;
} VMX86MSRCacheInfo;
struct MSRCache {
Vmx86GetMSRData *queryCache;
uint32 nPCPUs;
};
static Vmx86GetMSRData msrCacheQueryData;
/*
* An MSR cache list for checking uniformity across physical CPUs and for
* generating least-common-denominator values across PCPUs.
* {MSR_Index, Member_Name}
*/
#define UNIFORMITY_CACHE_MSRS \
MSRNUM(IA32_MSR_ARCH_CAPABILITIES, ArchCap) \
MSRNUM(MSR_BIOS_SIGN_ID, BIOSSignID) \
MSRNUM(MSR_PLATFORM_INFO, Join) \
MSRNUM(MSR_TSX_CTRL, Join) \
MSRNUM(MSR_VM_CR, VMCR) \
MSRNUMVT(MSR_FEATCTL, FeatureCtl) \
MSRNUMVT(MSR_VMX_BASIC, Basic) \
MSRNUMVT(MSR_VMX_MISC, Misc) \
MSRNUMVT(MSR_VMX_VMCS_ENUM, Enum) \
MSRNUMVT(MSR_VMX_EPT_VPID, EPT) \
MSRNUMVT(MSR_VMX_VMFUNC, VMFunc) \
MSRNUMVT(MSR_VMX_3RD_CTLS, 3rd) \
MSRNUMVT2(MSR_VMX_PINBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_PROCBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_EXIT_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_ENTRY_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_2ND_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_PINBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_PROCBASED_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_EXIT_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_TRUE_ENTRY_CTLS, Ctls) \
MSRNUMVT2(MSR_VMX_CR0_FIXED0, Fixed0) \
MSRNUMVT2(MSR_VMX_CR4_FIXED0, Fixed0) \
MSRNUMVT2(MSR_VMX_CR0_FIXED1, Fixed1) \
MSRNUMVT2(MSR_VMX_CR4_FIXED1, Fixed1)
static VMX86MSRCacheInfo msrUniformityCacheInfo[] = {
#define MSRNUM(msr, member) {msr, CONST64(0)},
#define MSRNUMVT MSRNUM
#define MSRNUMVT2 MSRNUM
UNIFORMITY_CACHE_MSRS
};
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
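/*
 * A minimal sketch of how the X-macro list above expands here: with
 * MSRNUM(msr, member) defined as {msr, CONST64(0)},
 * UNIFORMITY_CACHE_MSRS produces one zero-initialized array entry per
 * MSR, e.g.
 *
 *    static VMX86MSRCacheInfo msrUniformityCacheInfo[] = {
 *       {IA32_MSR_ARCH_CAPABILITIES, CONST64(0)},
 *       {MSR_BIOS_SIGN_ID,           CONST64(0)},
 *       ...
 *    };
 *
 * The member names (ArchCap, BIOSSignID, ...) are ignored by this
 * expansion; other expansion sites can map them to structure fields.
 */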
/*
*----------------------------------------------------------------------
*
* Vmx86AdjustLimitForOverheads --
*
* This function adjusts an overall limit on the number of locked
* pages to take into account overhead for the vmx processes, etc.,
* since the host OS will also see this as overhead. We do this for
* all vmx processes, not just ones whose VMs have been admitted.
*
* If "vm" is NULL, we are allocating a global page and have no
* perVMOverhead term to take into account.
*
* Results:
* Number of remaining pages considered to be lockable on this host.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static INLINE PageCnt
Vmx86AdjustLimitForOverheads(const VMDriver* vm,
const PageCnt limit)
{
PageCnt extraCost = (vm != NULL) ? vmCount * vm->memInfo.perVMOverhead : 0;
ASSERT(HostIF_GlobalLockIsHeld());
return (extraCost < limit) ? (limit - extraCost) : 0;
}
/*
*----------------------------------------------------------------------
*
* Vmx86LockedPageLimit --
*
* There are three limits controlling how many pages we can lock on
* a host:
*
* lockedPageLimit.configured is controlled by the UI,
* lockedPageLimit.host is calculated dynamically by vmmon based on
* kernel stats, and
* MAX_LOCKED_PAGES is a static upper bound.
*
* We can lock the MIN of these values.
*
* Results:
* Number of pages to lock on this host.
*
* Side effects:
* Updates the host locked pages limit.
*
*----------------------------------------------------------------------
*/
static INLINE PageCnt
Vmx86LockedPageLimit(const VMDriver* vm) // IN:
{
PageCnt overallLimit;
ASSERT(HostIF_GlobalLockIsHeld());
lockedPageLimit.host = HostIF_EstimateLockedPageLimit(vm, numLockedPages);
overallLimit = MIN(MIN(lockedPageLimit.configured, lockedPageLimit.host),
MAX_LOCKED_PAGES);
return Vmx86AdjustLimitForOverheads(vm, overallLimit);
}
/*
*----------------------------------------------------------------------
*
* Vmx86HasFreePages --
*
* Returns TRUE if the vm can lock more pages. This is true if
* we are below the host's hard memory limit and this vm has not
* exceeded its maximum allocation.
* Callers must ensure driver-wide and VM serialization,
* typically by using HostIF_GlobalLock() and HostIF_VMLock().
*
* Results:
* TRUE if pages can be locked, FALSE otherwise
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static INLINE Bool
Vmx86HasFreePages(VMDriver *vm,
PageCnt numPages,
Bool checkVM)
{
/*
* 1) Be careful with overflow.
* 2) lockedPageLimit and vm->memInfo.maxAllocation can be decreased below
* the current numLockedPages and vm->memInfo.locked
* 3) lockedPageLimit.host can go lower than numLockedPages.
*/
ASSERT(HostIF_GlobalLockIsHeld() &&
(!checkVM || HostIF_VMLockIsHeld(vm)));
if (checkVM) {
/* Check the per-vm limit. */
ASSERT(HostIF_VMLockIsHeld(vm));
if (vm->memInfo.admitted) {
if (vm->memInfo.maxAllocation <= vm->memInfo.locked) {
return FALSE;
} else if (vm->memInfo.maxAllocation - vm->memInfo.locked < numPages) {
return FALSE;
}
}
} else {
/* Check the global limit. */
PageCnt limit = Vmx86LockedPageLimit(vm);
if (limit <= numLockedPages) {
return FALSE;
} else if (limit - numLockedPages < numPages) {
return FALSE;
}
}
return TRUE;
}
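/*
 * The two-step comparisons above are the overflow-safe idiom for
 * "locked + numPages > limit": the subtraction is only evaluated once
 * the limit is known to exceed the locked count, so no intermediate
 * sum can wrap around.
 */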
#ifdef VMX86_DEBUG
/*
*----------------------------------------------------------------------
*
* Vmx86VMIsRegistered --
*
* Check if "vm" is on the list of VMDrivers.
*
* Results:
* Return TRUE iff "vm" is on the list of VMDrivers.
*
* Side effects:
* None
*
*----------------------------------------------------------------
*/
static Bool
Vmx86VMIsRegistered(VMDriver *vm, Bool needsLock)
{
VMDriver *tmp;
Bool found = FALSE;
ASSERT(needsLock || HostIF_GlobalLockIsHeld());
if (needsLock) {
HostIF_GlobalLock(5);
}
for (tmp = vmDriverList; tmp != NULL; tmp = tmp->nextDriver) {
if (tmp == vm) {
found = TRUE;
break;
}
}
if (needsLock) {
HostIF_GlobalUnlock(5);
}
return found;
}
#endif
/*
*----------------------------------------------------------------------
*
* Vmx86_InitIDList --
*
* Called when the driver is initialized.
* Set up the list of available VM ID's.
*
* Results:
* None. Sets up global data.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_InitIDList(void)
{
int i;
HostIF_GlobalLock(32);
for (i = 0; i < MAX_VMS; i++) {
vmIDList[i] = i + 1;
}
vmIDList[MAX_VMS - 1] = INVALID_VMID;
vmIDsUnused = 0;
vmIDsAllocated = INVALID_VMID;
HostIF_GlobalUnlock(32);
}
/*
*----------------------------------------------------------------------
*
* Vmx86FreeVMID --
*
* Return a VM ID to the list of available VM ID's.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86FreeVMID(int vmID) // IN
{
int i;
ASSERT(HostIF_GlobalLockIsHeld());
/* Deleting head of the list. */
if (vmID == vmIDsAllocated) {
int tmp;
tmp = vmIDList[vmIDsAllocated];
vmIDList[vmIDsAllocated] = vmIDsUnused;
vmIDsAllocated = tmp;
vmIDsUnused = vmID;
return;
}
for (i = vmIDsAllocated; vmIDList[i] != INVALID_VMID; i = vmIDList[i]) {
if (vmIDList[i] == vmID) {
vmIDList[i] = vmIDList[vmID];
vmIDList[vmID] = vmIDsUnused;
vmIDsUnused = vmID;
return;
}
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86AllocVMID --
*
* Grab a VM ID from the list of available VM ID's.
*
* Results:
* The VM ID, in the range [ 0 ; MAX_VMS ).
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static int
Vmx86AllocVMID(void)
{
int vmID;
ASSERT(HostIF_GlobalLockIsHeld());
vmID = vmIDsUnused;
ASSERT(0 <= vmID && vmID < MAX_VMS);
vmIDsUnused = vmIDList[vmID];
vmIDList[vmID] = vmIDsAllocated;
vmIDsAllocated = vmID;
return vmID;
}
/*
*----------------------------------------------------------------------
*
* Vmx86RegisterVMOnList --
*
* Add a VM to the list of registered VMs and increment
* the count of VMs.
*
* Results:
* None
*
* Side effects:
* Add VM to linked list.
* Increment count of VMs.
*
*----------------------------------------------------------------
*/
static void
Vmx86RegisterVMOnList(VMDriver *vm) // IN
{
int vmID;
VMDriver **vmp;
ASSERT(HostIF_GlobalLockIsHeld());
vmCount++;
vmID = Vmx86AllocVMID();
ASSERT(vm->userID == 0);
vm->userID = vmID + 1;
ASSERT(vm->userID > 0);
for (vmp = &vmDriverList; *vmp != NULL; vmp = &(*vmp)->nextDriver) {
if (*vmp == vm) {
Warning("VM already registered on the list of VMs.\n");
return;
}
}
*vmp = vm;
}
/*
*----------------------------------------------------------------------
*
* Vmx86DeleteVMFromList --
*
* Delete a VM from the list of registered VMs and decrement
* the count of VMs. This function should be called on any
* VM registered on the VMDriverList before invoking
* Vmx86FreeAllVMResources to free its memory.
*
* Results:
* None
*
* Side effects:
* Remove VM from linked list.
* Decrement count of VMs.
*
*----------------------------------------------------------------
*/
static void
Vmx86DeleteVMFromList(VMDriver *vm)
{
VMDriver **vmp;
ASSERT(vm);
ASSERT(HostIF_GlobalLockIsHeld());
for (vmp = &vmDriverList; *vmp != vm; vmp = &(*vmp)->nextDriver) {
if (*vmp == NULL) {
Warning("VM is not on the list of registered VMs.\n");
return;
}
}
*vmp = vm->nextDriver;
vmCount--;
Vmx86FreeVMID(vm->userID - 1);
numLockedPages -= vm->memInfo.locked;
/*
* If no VM is running, reset the configured locked-page limit so
* that the next VM to power on sets it appropriately.
*/
if (vmCount == 0) {
lockedPageLimit.configured = 0;
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_Free --
*
* A wrapper around HostIF_FreeKernelMem that checks if the given
* pointer is NULL before freeing memory.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
void
Vmx86_Free(void *ptr)
{
if (ptr != NULL) {
HostIF_FreeKernelMem(ptr);
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_Calloc --
*
* A wrapper around HostIF_AllocKernelMem that zeroes memory and
* fails if integer overflow would occur in the computed
* allocation size.
*
* Results:
* Pointer to allocated memory or NULL on failure. Use
* HostIF_FreeKernelMem or Vmx86_Free to free.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
void *
Vmx86_Calloc(size_t numElements, // IN
size_t elementSize, // IN
Bool nonPageable) // IN
{
size_t numBytes = numElements * elementSize;
void *retval;
if (UNLIKELY(numBytes / numElements != elementSize)) { // Overflow.
return NULL;
}
retval = HostIF_AllocKernelMem(numBytes, nonPageable);
if (retval != NULL) {
memset(retval, 0, numBytes);
}
return retval;
}
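/*
 * Usage sketch (hypothetical caller, not from this file): sizing by
 * element count lets the overflow check protect the allocation, e.g.
 *
 *    MPN *mpns = Vmx86_Calloc(numPages, sizeof *mpns, TRUE);
 *    if (mpns == NULL) {
 *       return someErrorCode;  // hypothetical; alloc failed or size overflowed
 *    }
 *    ...
 *    Vmx86_Free(mpns);
 *
 * A huge numPages would make numElements * elementSize wrap; the
 * division test above catches that and fails the allocation rather
 * than returning a short buffer.
 */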
/*
*-----------------------------------------------------------------------------
*
* Vmx86AllocCrossPages --
*
* Allocate numVCPUs pages, one per VCPU, for use as each VCPU's
* crosspage area.
*
* Results:
* TRUE if the required crosspages are allocated successfully.
* FALSE otherwise.
*
*-----------------------------------------------------------------------------
*/
static Bool
Vmx86AllocCrossPages(VMDriver *vm)
{
Vcpuid v;
for (v = 0; v < vm->numVCPUs; v++) {
MPN unused;
UNUSED_VARIABLE(unused);
vm->crosspage[v] = HostIF_AllocKernelPages(1, &unused);
if (vm->crosspage[v] == NULL) {
return FALSE;
}
memset(vm->crosspage[v], 0, PAGE_SIZE);
}
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FreeCrossPages --
*
* Free the crosspages allocated for the given VM.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86FreeCrossPages(VMDriver *vm)
{
Vcpuid v;
if (vm->crosspage != NULL) {
for (v = 0; v < vm->numVCPUs; v++) {
if (vm->crosspage[v] != NULL) {
HostIF_FreeKernelPages(1, vm->crosspage[v]);
}
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FreeVMDriver --
*
* Release kernel memory allocated for the driver structure.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86FreeVMDriver(VMDriver *vm)
{
Vmx86_Free(vm->ptRootMpns);
Vmx86_Free(vm->crosspage);
Vmx86_Free(vm->crosscallWaitSet);
Vmx86_Free(vm->ptscOffsets);
Vmx86_Free(vm->currentHostCpu);
vm->ptRootMpns = NULL;
vm->crosspage = NULL;
vm->crosscallWaitSet = NULL;
vm->ptscOffsets = NULL;
vm->currentHostCpu = NULL;
HostIF_FreeKernelMem(vm);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86AllocVMDriver --
*
* Allocate the driver structure for a virtual machine.
*
* Results:
* Zeroed VMDriver structure or NULL on error.
*
* Side effects:
* May allocate kernel memory.
*
*-----------------------------------------------------------------------------
*/
static VMDriver *
Vmx86AllocVMDriver(uint32 numVCPUs)
{
VMDriver *vm = Vmx86_Calloc(1, sizeof *vm, TRUE);
if (vm == NULL) {
return NULL;
}
if ((vm->ptRootMpns =
Vmx86_Calloc(numVCPUs, sizeof *vm->ptRootMpns, TRUE)) != NULL &&
(vm->crosspage =
Vmx86_Calloc(numVCPUs, sizeof *vm->crosspage, TRUE)) != NULL &&
(vm->crosscallWaitSet =
Vmx86_Calloc(numVCPUs, sizeof *vm->crosscallWaitSet, TRUE)) != NULL &&
(vm->ptscOffsets =
Vmx86_Calloc(numVCPUs, sizeof *vm->ptscOffsets, TRUE)) != NULL &&
(vm->currentHostCpu =
Vmx86_Calloc(numVCPUs, sizeof *vm->currentHostCpu, TRUE)) != NULL) {
return vm;
}
Vmx86FreeVMDriver(vm);
return NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86VMMPageFree --
*
* Unmaps the VMM page corresponding to this entry from the host
* kernel. This function is used as a callback by MemTrack_Cleanup().
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86VMMPageFree(void *unused, MemTrackEntry *entry)
{
ASSERT(entry->vpn != 0 && entry->mpn != 0);
Vmx86_UnmapPage(entry->vpn);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CleanupVMMPages --
*
* Unmaps all VMM pages from the host kernel address space and frees
* the VMM MemTracker.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
void
Vmx86_CleanupVMMPages(VMDriver *vm)
{
MemTrack_Cleanup(vm->vmmTracker, Vmx86VMMPageFree, NULL);
vm->vmmTracker = NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86CleanupContigMappings --
*
* Frees all allocations from HostIF_AllocContigPages that are associated
* with the given vm.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86CleanupContigMappings(VMDriver *vm)
{
HostIFContigMemMap *m, *next;
HostIF_VMLock(vm, 48);
for (m = vm->contigMappings; m != NULL; m = next) {
next = m->next;
HostIF_FreeContigPages(vm, m);
}
HostIF_VMUnlock(vm, 48);
vm->contigMappings = NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86FreeAllVMResources --
*
* Free the resources allocated for a vm that is not registered
* on the VMDriverList. Except in the case of Vmx86_CreateVM(),
* this should be called only after a call to Vmx86DeleteVMFromList().
*
* Results:
* None
*
* Side effects:
* Memory freed.
*
*----------------------------------------------------------------------
*/
static void
Vmx86FreeAllVMResources(VMDriver *vm)
{
ASSERT(!HostIF_GlobalLockIsHeld());
if (vm) {
ASSERT(!Vmx86VMIsRegistered(vm, TRUE));
Vmx86_SetHostClockRate(vm, 0);
Vmx86FreeCrossPages(vm);
if (vm->ptpTracker != NULL) {
Task_SwitchPTPPageCleanup(vm);
}
if (vm->vmmTracker != NULL) {
Vmx86_CleanupVMMPages(vm);
}
if (vm->blobInfo != NULL) {
VmmBlob_Cleanup(vm->blobInfo);
vm->blobInfo = NULL;
}
if (vm->sharedArea != NULL) {
SharedAreaVmmon_Cleanup(vm->sharedArea);
vm->sharedArea = NULL;
}
if (vm->statVars != NULL) {
StatVarsVmmon_Cleanup(vm->statVars);
vm->statVars = NULL;
}
if (vm->contigMappings != NULL) {
Vmx86CleanupContigMappings(vm);
}
HostIF_FreeAllResources(vm);
Vmx86FreeVMDriver(vm);
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86ReserveFreePages --
*
* Returns TRUE and increases locked page counts if the vm can lock
* more pages. This is true if we are below the host's hard memory
* limit and this vm has not exceeded its maximum allocation.
* The function is thread-safe.
*
* If ignoreLimits is TRUE then additional pages may be reserved even
* if limits are violated. The request to ignore limits may come in
* cases of anonymous page allocations. Swapping is not always possible
* at those points but a swap target will have been posted so that the
* vmm will release memory shortly, allowing the excess reservation
* to be reduced.
*
* Results:
* TRUE if pages are reserved for locking, FALSE otherwise
*
* Side effects:
* The global lock and VM's lock are acquired and released.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx86ReserveFreePages(VMDriver *vm,
PageCnt numPages,
Bool ignoreLimits)
{
Bool retval = FALSE;
int retries = 3;
ASSERT(vm);
for (retries = 3; !retval && (retries > 0); retries--) {
HostIF_GlobalLock(17);
HostIF_VMLock(vm, 0);
/* Check VM's limit and don't wait. */
retval = Vmx86HasFreePages(vm, numPages, TRUE);
if (!retval) {
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
break;
} else {
/* Wait to satisfy the global limit. */
retval = Vmx86HasFreePages(vm, numPages, FALSE);
if (retval) {
numLockedPages += numPages;
vm->memInfo.locked += numPages;
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
break;
} else {
/*
* There are not enough pages -- drop the locks and wait for
* the host and/or other VMs to produce free pages.
*/
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
HostIF_WaitForFreePages(10);
}
}
}
if (!retval && ignoreLimits) {
HostIF_GlobalLock(17);
HostIF_VMLock(vm, 0);
numLockedPages += numPages;
vm->memInfo.locked += numPages;
HostIF_VMUnlock(vm, 0);
HostIF_GlobalUnlock(17);
retval = TRUE;
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86UnreserveFreePages --
*
* Decreases the global and VM's locked page counts.
* The function is thread-safe.
*
* Results:
* void
*
* Side effects:
* The global lock and VM's lock are acquired and released.
*
*----------------------------------------------------------------------
*/
static void
Vmx86UnreserveFreePages(VMDriver *vm,
PageCnt numPages)
{
ASSERT(vm);
HostIF_GlobalLock(18);
HostIF_VMLock(vm, 1);
ASSERT(numLockedPages >= numPages);
ASSERT(vm->memInfo.locked >= numPages);
numLockedPages -= numPages;
vm->memInfo.locked -= numPages;
HostIF_VMUnlock(vm, 1);
HostIF_GlobalUnlock(18);
}
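/*
 * Vmx86ReserveFreePages/Vmx86UnreserveFreePages form a reserve/commit
 * pattern used throughout this file: reserve the page count first,
 * attempt the real operation, and release the reservation on failure.
 * A minimal sketch (mirroring Vmx86_LockPage below):
 *
 *    if (!Vmx86ReserveFreePages(vm, 1, FALSE)) {
 *       return PAGE_LOCK_LIMIT_EXCEEDED;
 *    }
 *    status = <the actual locking operation>;
 *    if (status != PAGE_LOCK_SUCCESS) {
 *       Vmx86UnreserveFreePages(vm, 1);
 *    }
 */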
/*
*----------------------------------------------------------------------
*
* Vmx86GetNX --
*
* Checks whether NX is enabled on the current CPU.
*
* Results:
* None.
*
* Side effects:
* Increments responded-CPU counter, may increment NX CPU counter.
*
*----------------------------------------------------------------------
*/
static void
Vmx86GetNX(void *clientData) // IN/OUT: A NXData *
{
NXData *nxData = (NXData *)clientData;
uint64 efer = X86MSR_GetMSR(MSR_EFER);
Atomic_Inc32(&nxData->responded);
if ((efer & MSR_EFER_NXE) == MSR_EFER_NXE) {
Atomic_Inc32(&nxData->hasNX);
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CacheNXState --
*
* Checks whether every CPU on the host has NX/XD enabled and
* caches this value.
*
* Results:
* None.
*
* Side effects:
* Caches host NX value.
*
*----------------------------------------------------------------------
*/
void
Vmx86_CacheNXState(void)
{
NXData nxData;
Atomic_Write32(&nxData.responded, 0);
Atomic_Write32(&nxData.hasNX, 0);
HostIF_CallOnEachCPU(Vmx86GetNX, &nxData);
hostUsesNX = Atomic_Read32(&nxData.hasNX) ==
Atomic_Read32(&nxData.responded);
}
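/*
 * For example, on an 8-PCPU host, hostUsesNX ends up TRUE only if all
 * eight CPUs report EFER.NXE set (hasNX == responded == 8); a single
 * CPU with NX disabled makes the counters differ, and NX is treated
 * as unavailable host-wide.
 */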
/*
*----------------------------------------------------------------------
*
* Vmx865LvlPagingEnabled --
*
* Checks if 5-level paging is enabled on the current CPU. It is
* assumed that the host OS will not support a mix of 4- and 5-level
* paging.
*
* Results:
* TRUE if CR4.LA57 is set on the current CPU, FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx865LvlPagingEnabled(void)
{
uintptr_t cr4;
GET_CR4(cr4);
return (cr4 & CR4_LA57) != 0;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_CreateVM --
*
* Allocate and initialize a driver structure for a virtual machine.
*
* Results:
* VMDriver structure or NULL on error.
* 'status' is populated with the status of the operation.
*
* Side effects:
* May allocate kernel memory.
*
*-----------------------------------------------------------------------------
*/
VMDriver *
Vmx86_CreateVM(VA64 bsBlob, // IN:
uint32 bsBlobSize, // IN:
uint32 numVCPUs, // IN:
VMCreateStatus *status) // OUT:
{
VMDriver *vm;
Vcpuid v;
void *bsBuf = NULL;
BSVMM_HostParams *bsParams;
/* Disallow VM creation if the vmx passes us an invalid number of vcpus. */
if (numVCPUs == 0) {
*status = VM_CREATE_ERR_NO_VCPUS;
return NULL;
}
if (numVCPUs > MAX_VCPUS) {
*status = VM_CREATE_ERR_TOO_MANY_VCPUS;
return NULL;
}
/* Disallow VM creation if NX is disabled on the host as VMM requires NX. */
if (!hostUsesNX) {
*status = VM_CREATE_ERR_NO_NX;
Log("NX/XD must be enabled. Cannot create VM.\n");
return NULL;
}
vm = Vmx86AllocVMDriver(numVCPUs);
if (vm == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
return NULL;
}
vm->userID = 0;
vm->numVCPUs = numVCPUs;
vm->memInfo.admitted = FALSE;
for (v = 0; v < numVCPUs; v++) {
Atomic_Write32(&vm->currentHostCpu[v], INVALID_PCPU);
vm->ptRootMpns[v] = INVALID_MPN;
}
if (!HostIF_Init(vm, numVCPUs)) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
/* If the BS blob exists then the VMM is in use. */
if (bsBlobSize != 0) {
/* Disallow VM creation if 5 level paging is enabled with the VMM. */
if (Vmx865LvlPagingEnabled()) {
Log("5 level paging must not be enabled. Cannot create VM.\n");
*status = VM_CREATE_ERR_5LP;
goto cleanup;
}
/* The ULM does not use the cross GDT. */
bsBuf = HostIF_AllocKernelMem(bsBlobSize, FALSE);
if (bsBuf == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
if (HostIF_CopyFromUser(bsBuf, bsBlob, bsBlobSize) != 0) {
*status = VM_CREATE_ERR_NO_BLOB;
goto cleanup;
}
bsParams = BSVMM_Validate(bsBuf, bsBlobSize);
if (bsParams == NULL) {
*status = VM_CREATE_ERR_INV_BLOB;
Warning("Could not validate the VMM bootstrap blob");
goto cleanup;
}
if (!Task_CreateCrossGDT(&bsParams->gdtInit)) {
*status = VM_CREATE_ERR_CROSS_GDT;
goto cleanup;
}
}
vm->ptpTracker = MemTrack_Init(vm);
if (vm->ptpTracker == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
vm->vmmTracker = MemTrack_Init(vm);
if (vm->vmmTracker == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
vm->sharedArea = SharedAreaVmmon_Init(vm);
if (vm->sharedArea == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
vm->statVars = StatVarsVmmon_Init(vm);
if (vm->statVars == NULL) {
*status = VM_CREATE_ERR_NO_MEM;
goto cleanup;
}
HostIF_GlobalLock(0);
#ifdef _WIN32
if (vmCount >= MAX_VMS_WIN32) {
*status = VM_CREATE_ERR_TOO_MANY_VMS;
HostIF_GlobalUnlock(0);
goto cleanup;
}
#endif
if (vmCount >= MAX_VMS) {
*status = VM_CREATE_ERR_TOO_MANY_VMS;
HostIF_GlobalUnlock(0);
goto cleanup;
}
Vmx86RegisterVMOnList(vm);
HostIF_GlobalUnlock(0);
if (bsBuf != NULL) {
HostIF_FreeKernelMem(bsBuf);
}
*status = VM_CREATE_SUCCESS;
return vm;
cleanup:
ASSERT(*status != VM_CREATE_SUCCESS);
if (bsBuf != NULL) {
HostIF_FreeKernelMem(bsBuf);
}
/*
* The VM is not on a list, "vmCount" has not been incremented,
* "vm->cowID" is INVALID_VMID, and either the VM's mutex hasn't
* been initialized or we've only taken the global lock and checked
* a counter since, so we know that the VM has not yet locked any
* pages.
*/
ASSERT(vm->memInfo.locked == 0);
Vmx86FreeAllVMResources(vm);
return NULL;
}
/*
*----------------------------------------------------------------------
*
* Vmx86SetPageTableRoots --
*
* Translates the user VA corresponding to the root page tables
* for all VCPUs into MPNs and stores them in VMDriver.
*
* Results:
* TRUE if successful, FALSE otherwise.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx86SetPageTableRoots(VMDriver *vm, PerVcpuPages *perVcpuPages,
uint16 numVCPUs)
{
uint16 vcpu;
if (numVCPUs > vm->numVCPUs) {
return FALSE;
}
for (vcpu = 0; vcpu < numVCPUs; vcpu++) {
VA64 ptRoot = perVcpuPages[vcpu].ptRoot;
if ((ptRoot & (PAGE_SIZE - 1)) != 0) {
Warning("Error: page table VA %"FMT64"x is not page-aligned\n",
ptRoot);
return FALSE;
}
ASSERT(vm->ptRootMpns[vcpu] == INVALID_MPN);
HostIF_VMLock(vm, 38);
if (HostIF_LookupUserMPN(vm, ptRoot, &vm->ptRootMpns[vcpu]) !=
PAGE_LOOKUP_SUCCESS) {
HostIF_VMUnlock(vm, 38);
Warning("Failure looking up page table root MPN for VCPU %d\n", vcpu);
return FALSE;
}
HostIF_VMUnlock(vm, 38);
}
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_LookupUserMPN --
*
* Look up the MPN of a locked user page by user VA under the VM lock.
*
* Results:
* A status code and the MPN on success.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
int
Vmx86_LookupUserMPN(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: user VA of the page
MPN *mpn) // OUT
{
int ret;
HostIF_VMLock(vm, 38);
ret = HostIF_LookupUserMPN(vm, uAddr, mpn);
HostIF_VMUnlock(vm, 38);
return ret;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ProcessBootstrap --
*
* Copies the VMM bootstrap blob header and processes it by invoking
* MonLoader.
*
* Results:
* TRUE if successful, FALSE otherwise.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_ProcessBootstrap(VMDriver *vm,
VA64 bsBlobAddr,
uint32 numBytes,
uint32 headerOffset,
uint16 numVCPUs,
PerVcpuPages *perVcpuPages,
VMSharedRegion *shRegions)
{
VmmBlobInfo *bi = NULL;
unsigned errLine;
Vcpuid errVcpu;
MonLoaderError ret;
MonLoaderArgs args;
MonLoaderHeader *header;
if (!VmmBlob_Load(bsBlobAddr, numBytes, headerOffset, &bi)) {
Warning("Error loading VMM bootstrap blob\n");
goto error;
}
vm->blobInfo = bi;
header = bi->header;
if (!Vmx86SetPageTableRoots(vm, perVcpuPages, numVCPUs)) {
goto error;
}
if (!pseudoTSC.initialized) {
Warning("%s: PseudoTSC has not been initialized\n", __FUNCTION__);
goto error;
}
if (!Vmx86AllocCrossPages(vm)) {
Warning("Failed to allocate cross pages.\n");
goto error;
}
/*
* Initialize the driver's part of the cross-over page used to
* talk to the monitor.
*/
if (!Task_InitCrosspage(vm, header->monStartLPN, header->monEndLPN,
perVcpuPages)) {
Warning("Error initializing crosspage\n");
goto error;
}
args.vm = vm;
args.shRegions = shRegions;
ret = MonLoader_Process(header, numVCPUs, &args, &errLine, &errVcpu);
if (ret != ML_OK) {
Warning("Error processing bootstrap: error %d at line %u, vcpu %u\n",
ret, errLine, errVcpu);
goto error;
}
return TRUE;
error:
if (bi != NULL) {
VmmBlob_Cleanup(bi);
vm->blobInfo = NULL;
}
return FALSE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ReleaseVM --
*
* Release a VM (either created here or from a bind).
*
* Results:
* zero if successful
*
* Side effects:
* Decrement VM reference count.
* Release resources (those that are left) when count reaches 0.
*
*----------------------------------------------------------------------
*/
int
Vmx86_ReleaseVM(VMDriver *vm) // IN:
{
ASSERT(vm);
HostIF_GlobalLock(1);
Vmx86DeleteVMFromList(vm);
HostIF_GlobalUnlock(1);
Vmx86FreeAllVMResources(vm);
return 0;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_Open --
*
* Called on open of the fd.
*
* Results:
* None.
*
* Side effects:
* Bumps fdCount.
*
*----------------------------------------------------------------------
*/
void
Vmx86_Open(void)
{
HostIF_GlobalLock(123);
ASSERT(fdCount < MAX_INT32);
if (fdCount < MAX_INT32) {
fdCount++;
}
HostIF_GlobalUnlock(123);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_Close --
*
* Called on close of the fd.
*
* Results:
* None.
*
* Side effects:
* Decrements fdCount
* May de-initialize ptsc.
*
*----------------------------------------------------------------------
*/
void
Vmx86_Close(void)
{
HostIF_GlobalLock(124);
/*
* If fdCount hits MAX_INT32, saturate the counter and leave it at
* MAX_INT32.
*/
ASSERT(fdCount > 0);
if (fdCount < MAX_INT32) {
fdCount--;
}
/*
* If no VMs are running and there are no open file handles, reset the
* pseudo TSC state so that the next VM to initialize is free to
* initialize the system wide PTSC however it wants. See PR 403505.
*/
if (fdCount == 0) {
ASSERT(vmCount == 0);
pseudoTSC.initialized = FALSE;
}
HostIF_GlobalUnlock(124);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ReadTSCAndUptime --
*
* Atomically read the TSC and the uptime.
*
* Results:
* The current TSC and uptime values.
*
* Side effects:
* none
*
*
*----------------------------------------------------------------------
*/
void
Vmx86_ReadTSCAndUptime(VmTimeStart *st) // OUT: return value
{
uintptr_t flags;
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
st->count = RDTSC();
st->time = HostIF_ReadUptime();
RESTORE_FLAGS(flags);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_ComputekHz --
*
* Given aggregate cycles and system uptime, computes cycle rate as,
*
* khz = cycles / (uptime / HostIF_UptimeFrequency()) / 1000
*
* We need to do the computation carefully to avoid overflow or
* undue loss of precision. Also, on Linux we can't do a
* 64/64=64 bit division directly, as the gcc stub for that
* is not linked into the kernel.
*
* Results:
* Returns the computed khz value, or 0 if uptime == 0.
*
* Side effects:
* none
*
*----------------------------------------------------------------------
*/
uint32
Vmx86_ComputekHz(uint64 cycles, uint64 uptime)
{
uint64 hz;
uint64 freq;
freq = HostIF_UptimeFrequency();
while (cycles > MAX_UINT64 / freq) {
cycles >>= 1;
uptime >>= 1;
}
if (uptime == 0) {
return 0;
}
hz = (cycles * freq) / uptime;
return (uint32) ((hz + 500) / 1000);
}
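/*
 * Worked example (assumed numbers): with HostIF_UptimeFrequency() ==
 * 1000000 (a 1 MHz uptime clock), cycles == 24e9 and uptime == 10e6
 * (10 seconds of uptime):
 *
 *    hz  = (24e9 * 1e6) / 10e6 = 2.4e9
 *    kHz = (2.4e9 + 500) / 1000 = 2400000, i.e. a 2.4 GHz TSC.
 *
 * The while loop halves cycles and uptime together until
 * cycles * freq can no longer overflow 64 bits, trading a little
 * precision for safety.
 */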
/*
*----------------------------------------------------------------------
*
* Vmx86GetkHzEstimate
*
* Return an estimate of the processor's kHz rating, based on
* the ratio of the cycle counter and system uptime since the
* driver was loaded.
* This function could be called (on Windows) at IRQL DISPATCH_LEVEL.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER uint32
Vmx86GetkHzEstimate(VmTimeStart *st) // IN: start time
{
uint64 cDiff, tDiff;
uintptr_t flags;
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
cDiff = RDTSC() - st->count;
tDiff = HostIF_ReadUptime() - st->time;
RESTORE_FLAGS(flags);
return Vmx86_ComputekHz(cDiff, tDiff);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetkHzEstimate
*
* Return an estimate of the processor's kHz rating, based on
* the ratio of the cycle counter and system uptime since the
* driver was loaded. Or based on a spinloop.
*
* This function could be called (on Windows) at IRQL DISPATCH_LEVEL.
*
* Results:
* Processor speed in kHz.
*
* Side effects:
* Result is cached.
*
*----------------------------------------------------------------------
*/
uint32
Vmx86_GetkHzEstimate(VmTimeStart *st) // IN: start time
{
static uint32 kHz;
/*
* Cache and return the first result for consistency.
* TSC values can be changed without notification.
* TSC frequency can vary too (SpeedStep, slowing clock on HALT, etc.)
*/
if (kHz != 0) {
return kHz;
}
return kHz = Vmx86GetkHzEstimate(st);
}
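/*
 * Usage sketch (hypothetical call site): the driver records a start
 * timestamp once, early, and derives the estimate much later:
 *
 *    VmTimeStart start;
 *    Vmx86_ReadTSCAndUptime(&start);             // at driver load
 *    ...
 *    uint32 khz = Vmx86_GetkHzEstimate(&start);  // later; cached
 *
 * The longer the interval between the two calls, the less the
 * estimate is perturbed by sampling jitter in RDTSC and the uptime
 * reading.
 */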
/*
*----------------------------------------------------------------------
*
* Vmx86_SetHostClockRate --
*
* The monitor wants to poll for events at the given rate. If no VM
* is specified, then 'rate' is ignored and the last set rate is set
* again.
*
* Results:
* 0 for success, host-specific error code for failure.
*
* Side effects:
* May increase the host timer interrupt rate, etc.
*
*----------------------------------------------------------------------
*/
int
Vmx86_SetHostClockRate(VMDriver *vm, // IN: VM instance pointer
unsigned rate) // IN: rate in Hz
{
unsigned newGlobalRate;
VMDriver *cur;
int retval = 0;
if (!vm) {
Log("Resetting last set host clock rate of %d\n", globalFastClockRate);
HostIF_FastClockLock(0);
retval = HostIF_SetFastClockRate(globalFastClockRate);
HostIF_FastClockUnlock(0);
return retval;
}
/* Quick test before locks are acquired. */
if (vm->fastClockRate == rate) {
return retval;
}
HostIF_FastClockLock(2);
if (vm->fastClockRate == rate) {
HostIF_FastClockUnlock(2);
return retval;
}
/*
* Loop through all vms to find new max rate.
*/
newGlobalRate = rate;
HostIF_GlobalLock(19);
for (cur = vmDriverList; cur != NULL; cur = cur->nextDriver) {
if (cur != vm && cur->fastClockRate > newGlobalRate) {
newGlobalRate = cur->fastClockRate;
}
}
HostIF_GlobalUnlock(19);
if (newGlobalRate != globalFastClockRate) {
retval = HostIF_SetFastClockRate(newGlobalRate);
if (!retval) {
globalFastClockRate = newGlobalRate;
}
}
if (!retval) {
vm->fastClockRate = rate;
}
HostIF_FastClockUnlock(2);
return retval;
}
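/*
 * For example, if three VMs last requested rates of 100, 500, and
 * 1000 Hz, globalFastClockRate is 1000. If the 1000 Hz VM then
 * requests 0, the loop above finds the new maximum (500) and the
 * host fast clock is lowered accordingly.
 */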
/*
*----------------------------------------------------------------------
*
* Vmx86_MonTimerIPI --
*
* Check for VCPUs that are in the monitor and need an IPI to fire
* their next MonTimer callback. Should be called once per fast
* timer interrupt if the fast timer is in use.
*
* Results:
* None.
*
* Side effects:
* May send IPIs.
*
*----------------------------------------------------------------------
*/
void
Vmx86_MonTimerIPI(void)
{
VMDriver *vm;
VmAbsoluteTS pNow, expiry;
/*
* Loop through all vms -- needs the global lock to protect vmDriverList.
*/
HostIF_GlobalLock(21);
pNow = Vmx86_GetPseudoTSC();
for (vm = vmDriverList; vm != NULL; vm = vm->nextDriver) {
Vcpuid v;
VCPUSet expiredVCPUs;
Bool hasWork = FALSE;
VCPUSet_Empty(&expiredVCPUs);
for (v = 0; v < vm->numVCPUs; v++) {
VMCrossPageData *crosspage = vm->crosspage[v];
if (crosspage == NULL) {
continue; // VCPU is not initialized yet
}
expiry = crosspage->monTimerExpiry;
if (expiry != 0 && expiry <= pNow) {
VCPUSet_Include(&expiredVCPUs, v);
hasWork = TRUE;
}
}
if (hasWork) {
HostIF_IPI(vm, &expiredVCPUs);
}
}
HostIF_GlobalUnlock(21);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetNumVMs --
*
* Return the number of VMs.
*
* Results:
* The number of VMs.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
int32
Vmx86_GetNumVMs(void)
{
return vmCount;
}
static INLINE PageCnt
Vmx86MinAllocationFunc(PageCnt nonpaged,
PageCnt anonymous,
PageCnt mainmem,
Percent memPct)
{
return (memPct * mainmem) / 100 + nonpaged + anonymous;
}
/*
*----------------------------------------------------------------------
*
* Vmx86MinAllocation --
*
* Computes the minimum number of pages that must be allocated to a
* specific vm. The minAllocation for a vm is defined as
* some percentage of guest memory plus 100% of nonpageable (overhead)
* and anonymous memory.
*
* Results:
* The minAllocation for this vm.
*
*
* Side effects:
* Analyzes the vm info, requiring the vm lock.
*
*----------------------------------------------------------------------
*/
static INLINE PageCnt
Vmx86MinAllocation(VMDriver *vm,
Percent memPct)
{
ASSERT(HostIF_VMLockIsHeld(vm));
return Vmx86MinAllocationFunc(vm->memInfo.nonpaged, vm->memInfo.anonymous,
vm->memInfo.mainMemSize, memPct);
}
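/*
 * Worked example (assumed numbers): a VM with mainMemSize == 1024
 * pages, nonpaged == 100, anonymous == 50, at memPct == 50 gets
 *
 *    minAllocation = (50 * 1024) / 100 + 100 + 50 = 662 pages
 *
 * i.e. half of guest memory plus all overhead and anonymous pages.
 */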
/*
*----------------------------------------------------------------------
*
* Vmx86CalculateGlobalMinAllocation --
*
* Computes the sum of minimum allocations of each vm assuming a given
* percentage of guest memory must fit within host RAM.
*
* Results:
* Number of pages that must fit within host RAM for a given overcommit
* level.
*
*
* Side effects:
* None. The actual minAllocations of each vm are NOT updated during
* this computation.
*
*----------------------------------------------------------------------
*/
static PageCnt
Vmx86CalculateGlobalMinAllocation(Percent memPct)
{
VMDriver *vm;
PageCnt minAllocation = 0;
ASSERT(HostIF_GlobalLockIsHeld());
/* Pages of other vms required to fit inside the hard limit. */
for (vm = vmDriverList; vm; vm = vm->nextDriver) {
HostIF_VMLock(vm, 2);
if (vm->memInfo.admitted) {
minAllocation += Vmx86MinAllocation(vm, memPct);
}
HostIF_VMUnlock(vm, 2);
}
return minAllocation;
}
/*
*----------------------------------------------------------------------
*
* Vmx86UpdateMinAllocations --
*
* Updates the minimum allocation for each vm based on the global
* overcommitment percentage.
*
* Results:
* None.
*
* Side effects:
* minAllocations of admitted VMs are updated.
*
*----------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
Vmx86UpdateMinAllocations(Percent memPct) // IN:
{
VMDriver *vm;
ASSERT(HostIF_GlobalLockIsHeld());
/* Pages of other vms required to fit inside the hard limit. */
for (vm = vmDriverList; vm; vm = vm->nextDriver) {
HostIF_VMLock(vm, 3);
if (vm->memInfo.admitted) {
vm->memInfo.minAllocation = Vmx86MinAllocation(vm, memPct);
}
HostIF_VMUnlock(vm, 3);
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86_SetConfiguredLockedPagesLimit --
*
* Set the user defined limit on the number of pages that can
* be locked. This limit can be raised at any time but not lowered.
* This avoids having a user lower the limit while VMs are running
* and inadvertently cause them to crash because of memory starvation.
*
* Results:
* Returns TRUE on success and FALSE on failure to set the limit
*
* Side effects:
* Hard limit may be changed.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_SetConfiguredLockedPagesLimit(PageCnt limit) // IN:
{
Bool retval = FALSE;
HostIF_GlobalLock(4);
if (limit >= lockedPageLimit.configured) {
lockedPageLimit.configured = limit;
retval = TRUE;
}
HostIF_GlobalUnlock(4);
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_LockPage --
*
* Lock a page.
*
* Results:
* A PAGE_LOCK_* status code and the MPN of the locked page on success.
*
* Side effects:
* Number of global and per-VM locked pages increased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_LockPage(VMDriver *vm, // IN: VMDriver
VA64 uAddr, // IN: VA of the page to lock
Bool allowMultipleMPNsPerVA, // IN: allow locking many pages with the same VA
MPN *mpn) // OUT
{
int retval;
/* Atomically check and reserve locked memory */
if (!Vmx86ReserveFreePages(vm, 1, FALSE)) {
return PAGE_LOCK_LIMIT_EXCEEDED;
}
HostIF_VMLock(vm, 4);
retval = HostIF_LockPage(vm, uAddr, allowMultipleMPNsPerVA, mpn);
HostIF_VMUnlock(vm, 4);
if (retval != PAGE_LOCK_SUCCESS) {
Vmx86UnreserveFreePages(vm, 1);
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UnlockPage --
*
* Unlock a page.
*
* Results:
* A PAGE_UNLOCK_* status code.
*
* Side effects:
* Number of global and per-VM locked pages decreased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_UnlockPage(VMDriver *vm, // IN
VA64 uAddr) // IN
{
int retval;
HostIF_VMLock(vm, 5);
retval = HostIF_UnlockPage(vm, uAddr);
HostIF_VMUnlock(vm, 5);
if (retval == PAGE_UNLOCK_SUCCESS) {
Vmx86UnreserveFreePages(vm, 1);
}
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UnlockPageByMPN --
*
* Unlock a page.
*
* Results:
* A PAGE_UNLOCK_* status code.
*
* Side effects:
* Number of global and per-VM locked pages decreased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_UnlockPageByMPN(VMDriver *vm, // IN: VMDriver
MPN mpn, // IN: the page to unlock
VA64 uAddr) // IN: optional valid VA for this MPN
{
int retval;
HostIF_VMLock(vm, 6);
retval = HostIF_UnlockPageByMPN(vm, mpn, uAddr);
HostIF_VMUnlock(vm, 6);
if (retval == PAGE_UNLOCK_SUCCESS) {
Vmx86UnreserveFreePages(vm, 1);
}
return retval;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_AllocLockedPages --
*
* Allocate physical locked pages from the kernel.
*
* Initially the pages are not mapped to any user or kernel
* address space.
*
* Results:
* Non-negative value on partial/full completion: actual number of
* allocated MPNs. MPNs of the allocated pages are copied to the
* caller's buffer at 'addr'.
*
* Negative system specific error code on error (NTSTATUS on Windows,
* etc.)
*
* Side effects:
* Number of global and per-VM locked pages is increased.
*
*-----------------------------------------------------------------------------
*/
int64
Vmx86_AllocLockedPages(VMDriver *vm, // IN: VMDriver
VA64 addr, // OUT: VA of an array for
// allocated MPNs.
PageCnt numPages, // IN: number of pages to allocate
Bool kernelMPNBuffer, // IN: is the MPN buffer in kernel
// or user address space?
Bool ignoreLimits) // IN: should limits be ignored?
{
int64 allocatedPages;
if (!Vmx86ReserveFreePages(vm, numPages, ignoreLimits)) {
// XXX What kind of system-specific error code is that? --hpreg
return PAGE_LOCK_LIMIT_EXCEEDED;
}
HostIF_VMLock(vm, 7);
allocatedPages = HostIF_AllocLockedPages(vm, addr, numPages,
kernelMPNBuffer);
HostIF_VMUnlock(vm, 7);
if (allocatedPages < 0) {
Vmx86UnreserveFreePages(vm, numPages);
} else if (allocatedPages < numPages) {
Vmx86UnreserveFreePages(vm, numPages - allocatedPages);
}
return allocatedPages;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_FreeLockedPages --
*
* Frees physical locked pages from the kernel previously allocated
* by Vmx86_AllocLockedPages().
*
* Results:
* 0 on success,
* non-0 system specific error code on error (NTSTATUS on Windows, etc.)
*
* Side effects:
* Number of global and per-VM locked pages is decreased.
*
*----------------------------------------------------------------------
*/
int
Vmx86_FreeLockedPages(VMDriver *vm, // IN: VM instance pointer
MPN *mpns, // IN: MPNs to free
PageCnt numPages) // IN: number of pages to free
{
int ret;
HostIF_VMLock(vm, 8);
ret = HostIF_FreeLockedPages(vm, mpns, numPages);
HostIF_VMUnlock(vm, 8);
if (ret == 0) {
Vmx86UnreserveFreePages(vm, numPages);
}
return ret;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_AllocLowPage --
*
* Allocate a zeroed locked low page.
*
* Results:
* Allocated MPN on success. INVALID_MPN on failure.
*
* Side effects:
* Number of global and per-VM locked pages is increased.
*
*-----------------------------------------------------------------------------
*/
MPN
Vmx86_AllocLowPage(VMDriver *vm, // IN: VMDriver
Bool ignoreLimits) // IN: should limits be ignored?
{
MPN mpn;
if (!Vmx86ReserveFreePages(vm, 1, ignoreLimits)) {
return INVALID_MPN;
}
HostIF_VMLock(vm, 49);
mpn = HostIF_AllocLowPage(vm);
HostIF_VMUnlock(vm, 49);
if (mpn == INVALID_MPN) {
Vmx86UnreserveFreePages(vm, 1);
}
return mpn;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetNextAnonPage --
*
* Queries the driver to retrieve the list of anonymous pages.
* A supplied value of INVALID_MPN will start the query from
* the head of the list. Callers supply the previously received
* mpn to retrieve the next in the chain. Note: There is no
* guarantee of coherency.
*
* Results:
* A valid mpn or INVALID_MPN if the list has been exhausted.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
MPN
Vmx86_GetNextAnonPage(VMDriver *vm, // IN: VM instance pointer
MPN mpn) // IN: MPN
{
MPN ret;
HostIF_VMLock(vm, 22);
ret = HostIF_GetNextAnonPage(vm, mpn);
HostIF_VMUnlock(vm, 22);
return ret;
}
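/*
 * Iteration sketch (hypothetical caller): a full walk of the
 * anonymous page list follows the INVALID_MPN start convention
 * described above:
 *
 *    MPN mpn = Vmx86_GetNextAnonPage(vm, INVALID_MPN);  // list head
 *    while (mpn != INVALID_MPN) {
 *       <use mpn>;
 *       mpn = Vmx86_GetNextAnonPage(vm, mpn);
 *    }
 *
 * Because there is no coherency guarantee, pages allocated or freed
 * concurrently may be missed or visited twice by such a walk.
 */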
/*
*----------------------------------------------------------------------
*
* Vmx86_GetNumAnonPages --
*
* Queries the driver for the total number of anonymous pages.
*
* Results:
* Total number of anonymous pages
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
PageCnt
Vmx86_GetNumAnonPages(VMDriver *vm) // IN: VM instance pointer
{
PageCnt ret;
HostIF_VMLock(vm, 45);
ret = HostIF_GetNumAnonPages(vm);
HostIF_VMUnlock(vm, 45);
return ret;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetMemInfo --
*
* Return the info about all VMs.
*
* Results:
* TRUE if all info was successfully copied.
*
* Side effects:
* VMMemInfoArgs is filled in. If the supplied curVM is null
* then only the baseline information will be returned. Calling
* with a null curVM may return results for maxLockedPages
* that differ from those when the vm is passed if huge pages
* are in use.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_GetMemInfo(VMDriver *curVM,
Bool curVMOnly,
VMMemInfoArgs *outArgs,
int outArgsLength)
{
VMDriver *vm;
int outSize;
int wantedVMs;
HostIF_GlobalLock(7);
if (curVMOnly) {
wantedVMs = 1;
} else {
wantedVMs = vmCount;
}
outSize = VM_GET_MEM_INFO_SIZE(wantedVMs);
if (outSize > outArgsLength) {
HostIF_GlobalUnlock(7);
return FALSE;
}
outArgs->numVMs = wantedVMs;
outArgs->numLockedPages = numLockedPages;
outArgs->maxLockedPages = Vmx86LockedPageLimit(curVM);
outArgs->lockedPageLimit = lockedPageLimit;
outArgs->globalMinAllocation = Vmx86CalculateGlobalMinAllocation(minVmMemPct);
outArgs->minVmMemPct = minVmMemPct;
outArgs->callerIndex = (uint32)-1;
outArgs->currentTime = HostIF_ReadUptime() / HostIF_UptimeFrequency();
if (curVM == NULL) {
HostIF_GlobalUnlock(7);
return TRUE;
}
curVM->memInfo.timestamp = outArgs->currentTime;
if (wantedVMs == 1) {
outArgs->memInfo[0] = curVM->memInfo;
outArgs->callerIndex = 0;
} else {
int i;
for (i = 0, vm = vmDriverList;
vm != NULL && i < vmCount;
i++, vm = vm->nextDriver) {
if (vm == curVM) {
outArgs->callerIndex = i;
}
HostIF_VMLock(vm, 10);
outArgs->memInfo[i] = vm->memInfo;
HostIF_VMUnlock(vm, 10);
}
}
HostIF_GlobalUnlock(7);
if (outArgs->callerIndex == -1) {
return FALSE;
}
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86SetMemoryUsage --
*
* Updates the paged, nonpaged, and anonymous reserved-memory
* values for the vm.
*
* Results:
* None
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
static void
Vmx86SetMemoryUsage(VMDriver *curVM, // IN/OUT
PageCnt paged, // IN
PageCnt nonpaged, // IN
PageCnt anonymous, // IN
Percent aminVmMemPct) // IN
{
ASSERT(HostIF_VMLockIsHeld(curVM));
curVM->memInfo.paged = paged;
curVM->memInfo.nonpaged = nonpaged;
curVM->memInfo.anonymous = anonymous;
curVM->memInfo.minAllocation = Vmx86MinAllocation(curVM, aminVmMemPct);
curVM->memInfo.maxAllocation = curVM->memInfo.mainMemSize + nonpaged +
anonymous;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_Admit --
*
* Set the memory management information about this VM and handles
* admission control. We allow a VM to power on if there is room in
* memory for the minimum allocation of all running VMs. Note that
* the hard memory limit can change dynamically on Windows, so
* admission control does not provide a hard guarantee.
*
* Results:
* Returns global information about the memory state in args as well
* as a value indicating whether or not the virtual machine was
* started.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
void
Vmx86_Admit(VMDriver *curVM, // IN
VMMemInfoArgs *args) // IN/OUT
{
Bool allowAdmissionCheck = FALSE;
PageCnt globalMinAllocation;
HostIF_GlobalLock(9);
/*
* Update the overcommitment level and minimums for all VMs if they
* can fit under the new minimum limit. If they do not fit, do
* nothing; if the existing VMs cannot fit under the limit, a new VM
* certainly will not fit either.
*/
globalMinAllocation = Vmx86CalculateGlobalMinAllocation(args->minVmMemPct);
if (globalMinAllocation <= Vmx86LockedPageLimit(NULL)) {
allowAdmissionCheck = TRUE;
minVmMemPct = args->minVmMemPct;
Vmx86UpdateMinAllocations(args->minVmMemPct);
}
HostIF_VMLock(curVM, 12);
curVM->memInfo.shares = args->memInfo->shares;
curVM->memInfo.touchedPct = 100;
curVM->memInfo.dirtiedPct = 100;
curVM->memInfo.mainMemSize = args->memInfo->mainMemSize;
curVM->memInfo.perVMOverhead = args->memInfo->perVMOverhead;
/*
* Always set the allocations required for the current configuration
* so that the user will know how bad the situation really is with
* the suggested percentage.
*/
curVM->memInfo.admitted = FALSE;
Vmx86SetMemoryUsage(curVM, args->memInfo->paged, args->memInfo->nonpaged,
args->memInfo->anonymous, args->minVmMemPct);
if (allowAdmissionCheck &&
globalMinAllocation + curVM->memInfo.minAllocation <=
Vmx86LockedPageLimit(curVM)) {
curVM->memInfo.admitted = TRUE;
}
#if defined _WIN32
if (curVM->memInfo.admitted) {
PageCnt allocatedPages, nonpaged;
int64 pages;
MPN *mpns;
/*
* More admission control: Get enough memory for the nonpaged portion
* of the VM. Drop locks for this long operation.
* XXX Timeout?
*/
HostIF_VMUnlock(curVM, 12);
HostIF_GlobalUnlock(9);
#define ALLOCATE_CHUNK_SIZE 64
allocatedPages = 0;
nonpaged = args->memInfo->nonpaged + args->memInfo->anonymous;
mpns = HostIF_AllocKernelMem(nonpaged * sizeof *mpns, FALSE);
if (mpns == NULL) {
goto undoAdmission;
}
while (allocatedPages < nonpaged) {
pages = Vmx86_AllocLockedPages(curVM,
PtrToVA64(mpns + allocatedPages),
MIN(ALLOCATE_CHUNK_SIZE, nonpaged - allocatedPages),
TRUE,
FALSE);
if (pages <= 0) {
break;
}
allocatedPages += pages;
}
/*
* Free the allocated pages.
* XXX Do not free the pages but hand them directly to the admitted VM.
*/
Vmx86_FreeLockedPages(curVM, mpns, allocatedPages);
HostIF_FreeKernelMem(mpns);
#undef ALLOCATE_CHUNK_SIZE
undoAdmission:
if (allocatedPages != nonpaged) {
curVM->memInfo.admitted = FALSE; // undo admission
}
HostIF_GlobalLock(9);
HostIF_VMLock(curVM, 12);
}
#endif
/* Return global state to the caller. */
args->memInfo[0] = curVM->memInfo;
args->numVMs = vmCount;
args->numLockedPages = numLockedPages;
args->maxLockedPages = Vmx86LockedPageLimit(curVM);
args->lockedPageLimit = lockedPageLimit;
args->globalMinAllocation = globalMinAllocation;
HostIF_VMUnlock(curVM, 12);
HostIF_GlobalUnlock(9);
}
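/*
 *----------------------------------------------------------------------
 *
 * Vmx86_Readmit --
 *
 * Re-runs admission control for a VM whose overhead memory usage
 * has changed by the amounts in "delta". The new usage is accepted
 * if the resulting global minimum allocation still fits under the
 * locked-page limit, or unconditionally if every delta is
 * non-positive (usage is shrinking).
 *
 * Results:
 * TRUE if the new memory usage was accepted and recorded,
 * FALSE otherwise.
 *
 * Side effects:
 * May update the VM's memory usage and allocation values.
 *
 *----------------------------------------------------------------------
 */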
Bool
Vmx86_Readmit(VMDriver *curVM, OvhdMem_Deltas *delta)
{
PageCnt globalMinAllocation, newMinAllocation;
Bool retval = FALSE;
int64 paged;
int64 nonpaged;
int64 anonymous;
HostIF_GlobalLock(31);
globalMinAllocation = Vmx86CalculateGlobalMinAllocation(minVmMemPct);
HostIF_VMLock(curVM, 31);
paged = curVM->memInfo.paged + delta->paged;
nonpaged = curVM->memInfo.nonpaged + delta->nonpaged;
anonymous = curVM->memInfo.anonymous + delta->anonymous;
if (nonpaged >= 0 && paged >= 0 && anonymous >= 0) {
globalMinAllocation -= Vmx86MinAllocation(curVM, minVmMemPct);
newMinAllocation = Vmx86MinAllocationFunc(nonpaged, anonymous,
curVM->memInfo.mainMemSize,
minVmMemPct);
if (globalMinAllocation + newMinAllocation <= Vmx86LockedPageLimit(curVM) ||
(delta->paged <= 0 && delta->nonpaged <= 0 && delta->anonymous <= 0)) {
Vmx86SetMemoryUsage(curVM, paged, nonpaged, anonymous, minVmMemPct);
retval = TRUE;
}
}
HostIF_VMUnlock(curVM, 31);
HostIF_GlobalUnlock(31);
return retval;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UpdateMemInfo --
*
* Updates information about this VM with the new data supplied in
* a patch.
*
* Results:
* Sets the memory usage by this vm based on its memSample data.
*
* Side effects:
* None
*
*----------------------------------------------------------------------
*/
void
Vmx86_UpdateMemInfo(VMDriver *curVM,
const VMMemMgmtInfoPatch *patch)
{
ASSERT(patch->touchedPct <= 100 && patch->dirtiedPct <= 100);
HostIF_VMLock(curVM, 13);
curVM->memInfo.touchedPct = AsPercent(patch->touchedPct);
curVM->memInfo.dirtiedPct = AsPercent(patch->dirtiedPct);
curVM->memInfo.hugePageBytes = patch->hugePageBytes;
HostIF_VMUnlock(curVM, 13);
}
/*
*----------------------------------------------------------------------
*
* Vmx86VMXEnabled --
*
* Test the VMXE bit as an easy proxy for whether VMX operation
* is enabled.
*
* Results:
* TRUE if the CPU supports VT and CR4.VMXE is set.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static Bool
Vmx86VMXEnabled(void)
{
if (VT_CapableCPU()) {
uintptr_t cr4;
GET_CR4(cr4);
return (cr4 & CR4_VMXE) != 0;
} else {
return FALSE;
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86EnableHVOnCPU --
*
* Enable HV on the current CPU, if possible.
*
* Results:
* None.
*
* Side effects:
* HV will be enabled, if possible.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86EnableHVOnCPU(void)
{
if (CPUID_HostSupportsSVM()) {
uint64 vmCR = X86MSR_GetMSR(MSR_VM_CR);
if (!SVM_LockedFromFeatures(vmCR)) {
CPUIDRegs regs;
__GET_CPUID(0x8000000A, &regs);
if (CPUID_GET(0x8000000A, EDX, SVM_LOCK, regs.edx) != 0) {
X86MSR_SetMSR(MSR_VM_CR, (vmCR & ~MSR_VM_CR_SVME_DISABLE) |
MSR_VM_CR_SVM_LOCK);
}
}
} else if (CPUID_HostSupportsVT()) {
uint64 featCtl = X86MSR_GetMSR(MSR_FEATCTL);
if (!VT_LockedFromFeatures(featCtl)) {
X86MSR_SetMSR(MSR_FEATCTL,
featCtl | MSR_FEATCTL_LOCK | MSR_FEATCTL_VMXE);
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86RefClockInCycles --
*
* Convert the reference clock (HostIF_Uptime) to cycle units.
*
*-----------------------------------------------------------------------------
*/
static INLINE uint64
Vmx86RefClockInCycles(uint64 uptime)
{
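   /* Scale by mult / 2^shift, i.e. (uptime * mult) >> shift. */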
return Mul64x3264(uptime,
pseudoTSC.refClockToPTSC.ratio.mult,
pseudoTSC.refClockToPTSC.ratio.shift);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86RefClockToPTSC --
*
* Convert from the reference clock (HostIF_Uptime) time to pseudo TSC.
*
*-----------------------------------------------------------------------------
*/
static INLINE uint64
Vmx86RefClockToPTSC(uint64 uptime)
{
return Vmx86RefClockInCycles(uptime) +
Atomic_Read64(&pseudoTSC.refClockToPTSC.add);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_InitPseudoTSC --
*
* Initialize the pseudo TSC state if it is not already initialized.
* If another vmx has initialized the pseudo TSC, then we continue to
* use the parameters specified by the first vmx.
*
* Results:
* None
*
* Side effects:
* - Updates tscHz, the frequency of the PTSC in Hz. That frequency may
* differ from the value passed in if another VM is already running.
* - Updates the refClkToTSC parameters to be consistent with the tscHz
* value that's in use.
*
*-----------------------------------------------------------------------------
*/
void
Vmx86_InitPseudoTSC(PTSCInitParams *params) // IN/OUT
{
VmTimeStart startTime;
uint64 tsc, uptime;
HostIF_GlobalLock(36);
if (!pseudoTSC.initialized) {
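      /*
       * pseudoTSC is still zero-initialized here, so these comparisons
       * normally make logParams TRUE on first initialization.
       */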
Bool logParams = pseudoTSC.hz != params->tscHz ||
pseudoTSC.hwTSCsSynced != params->hwTSCsSynced ||
pseudoTSC.useRefClock != params->forceRefClock;
pseudoTSC.hz = params->tscHz;
pseudoTSC.refClockToPTSC.ratio.mult = params->refClockToPTSC.mult;
pseudoTSC.refClockToPTSC.ratio.shift = params->refClockToPTSC.shift;
Vmx86_ReadTSCAndUptime(&startTime);
tsc = startTime.count;
uptime = startTime.time;
/* Start Pseudo TSC at initialPTSC (usually 0). */
pseudoTSC.tscOffset = params->initialPTSC - tsc;
Atomic_Write64(&pseudoTSC.refClockToPTSC.add,
params->initialPTSC - Vmx86RefClockInCycles(uptime));
/* forceRefClock gets priority. */
pseudoTSC.useRefClock = params->forceRefClock;
pseudoTSC.neverSwitchToRefClock = params->forceTSC;
pseudoTSC.hwTSCsSynced = params->hwTSCsSynced;
if (logParams) {
Log("PTSC: initialized at %"FMT64"u Hz using %s, TSCs are "
"%ssynchronized.\n", pseudoTSC.hz,
pseudoTSC.useRefClock ? "reference clock" : "TSC",
pseudoTSC.hwTSCsSynced ? "" : "not ");
}
pseudoTSC.initialized = TRUE;
}
/*
* Allow the calling vmx to respect ptsc.noTSC=TRUE config option
* even if another vmx is already running (pseudoTSC was already
* initialized). Useful for testing.
*/
if (params->forceRefClock) {
Vmx86_SetPseudoTSCUseRefClock();
}
params->refClockToPTSC.mult = pseudoTSC.refClockToPTSC.ratio.mult;
params->refClockToPTSC.shift = pseudoTSC.refClockToPTSC.ratio.shift;
params->refClockToPTSC.add = Atomic_Read64(&pseudoTSC.refClockToPTSC.add);
params->tscOffset = pseudoTSC.tscOffset;
params->tscHz = pseudoTSC.hz;
params->hwTSCsSynced = pseudoTSC.hwTSCsSynced;
HostIF_GlobalUnlock(36);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_GetPseudoTSC --
*
* Read the pseudo TSC. We prefer to implement the pseudo TSC using
* TSC. On systems where the TSC varies its rate (e.g. Pentium M),
* stops advancing when the core is in deep sleep (e.g. Core 2 Duo),
* or the TSCs can get out of sync across cores (e.g. Opteron due to
* halt clock ramping, Core 2 Duo due to independent core deep sleep
* states; though WinXP does handle the Core 2 Duo out of sync case;
* and on IBM x-Series NUMA machines), we use a reference clock
* (HostIF_ReadUptime()) as the basis for pseudo TSC.
*
* Note that we depend on HostIF_ReadUptime being a high resolution
* timer that is synchronized across all cores.
*
* Results:
* Current value of the PTSC.
*
*-----------------------------------------------------------------------------
*/
uint64
Vmx86_GetPseudoTSC(void)
{
if (Vmx86_PseudoTSCUsesRefClock()) {
return Vmx86RefClockToPTSC(HostIF_ReadUptime());
}
return RDTSC() + pseudoTSC.tscOffset;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_CheckPseudoTSC --
*
* Periodically called by userspace to check whether the TSC is
* reliable, using the reference clock as the trusted time source.
* If the TSC is unreliable, switch the basis of the PTSC from the
* TSC to the reference clock.
*
* Also, recompute the "add" component of the reference clock to PTSC
* conversion, to periodically eliminate the drift between the two
* clocks. That way, if the PTSC switches from using the TSC to the
* reference clock, PTSC will remain (roughly) continuous. See PR
* 547055.
*
* Note that we might be executing concurrently with other threads,
* but it doesn't matter since we only ever go from using the TSC to
* using the reference clock, never the other direction.
*
* Results:
* TRUE if the PTSC is implemented by the reference clock.
* FALSE if the PTSC is implemented by the TSC.
*
* Side effects:
* May switch the basis of the PTSC from the TSC to the reference clock.
*
*-----------------------------------------------------------------------------
*/
Bool
Vmx86_CheckPseudoTSC(uint64 *lastTSC, // IN/OUT: last/current value of the TSC
uint64 *lastRC) // IN/OUT: last/current value of the reference clock
{
VmTimeStart curTime;
Vmx86_ReadTSCAndUptime(&curTime);
if (pseudoTSC.initialized && *lastTSC && !Vmx86_PseudoTSCUsesRefClock()) {
uint64 tsc, refClkTS, refClkLastTS;
uint64 tscDiff, refClkDiff;
tsc = curTime.count;
refClkTS = Vmx86RefClockInCycles(curTime.time);
refClkLastTS = Vmx86RefClockInCycles(*lastRC);
tscDiff = tsc - *lastTSC;
refClkDiff = refClkTS - refClkLastTS;
if (((int64)tscDiff < 0) ||
(tscDiff * 100 < refClkDiff * 95) ||
(tscDiff * 95 > refClkDiff * 100)) {
/*
* TSC went backwards or drifted from the reference clock by
* more than 5% over the last poll period.
*/
Vmx86_SetPseudoTSCUseRefClock();
} else {
uint64 ptscFromTSC = tsc + pseudoTSC.tscOffset;
Atomic_Write64(&pseudoTSC.refClockToPTSC.add, ptscFromTSC - refClkTS);
}
}
*lastTSC = curTime.count;
*lastRC = curTime.time;
return Vmx86_PseudoTSCUsesRefClock();
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86GetMSR --
*
 * Collect the requested MSR values on the current logical CPU.
*
* Function must not block (it is invoked from interrupt context).
* Only VT MSRs are supported on VT-capable processors.
*
* Results:
* None.
*
* Side effects:
* 'data->index' is atomically incremented by one.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86GetMSR(void *clientData) // IN/OUT: A Vmx86GetMSRData *
{
   uint32 i;
   Vmx86GetMSRData *data = (Vmx86GetMSRData *)clientData;
   uint32 numPCPUs;
   size_t offset;
   ASSERT(data && data->index && data->query);
   numPCPUs = data->query->numLogicalCPUs;
   offset = sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs;
for (i = 0; i < data->numItems; ++i) {
uint32 index;
int err;
Atomic_uint32 *cpus = &data->index[i];
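      /*
       * Query slots are contiguous: each MSRQuery header is followed by
       * numPCPUs MSRReply entries, hence the byte stride 'offset'.
       */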
MSRQuery *query = (MSRQuery *) ((uint8 *)&data->query[0] + i * offset);
index = Atomic_ReadInc32(cpus);
if (index >= numPCPUs) {
continue;
}
query->logicalCPUs[index].tag = HostIF_GetCurrentPCPU();
/*
* We treat BIOS_SIGN_ID (microcode version) specially on Intel,
* where the preferred read sequence involves a macro.
*/
if (CPUID_GetVendor() == CPUID_VENDOR_INTEL &&
query->msrNum == MSR_BIOS_SIGN_ID) {
/* safe to read: MSR_BIOS_SIGN_ID architectural since Pentium Pro */
query->logicalCPUs[index].msrVal = INTEL_MICROCODE_VERSION();
err = 0;
} else {
/*
* Try to enable HV any time these MSRs are queried. We have seen
* buggy firmware that forgets to re-enable HV after waking from
* deep sleep. [PR 1020692]
*/
if (query->msrNum == MSR_FEATCTL || query->msrNum == MSR_VM_CR) {
Vmx86EnableHVOnCPU();
}
err =
HostIF_SafeRDMSR(query->msrNum, &query->logicalCPUs[index].msrVal);
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_GetAllMSRs --
*
 * Collect an MSR's value on the number of logical CPUs requested.
*
* The caller is responsible for ensuring that the requested MSR is valid
* on all logical CPUs.
*
* 'query->numLogicalCPUs' is the size of the 'query->logicalCPUs' output
* array.
*
* Results:
* On success: TRUE. 'query->logicalCPUs' is filled and
* 'query->numLogicalCPUs' is adjusted accordingly.
* On failure: FALSE. Happens if 'query->numLogicalCPUs' was too small.
*
* Side effects:
* None
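 *
 * Example (a sketch, not code from this module; it assumes the caller
 * sizes the query for two logical CPUs and that the MSR is valid on
 * both):
 *
 *    MSRQuery *q = Vmx86_Calloc(1, sizeof *q + 2 * sizeof(MSRReply),
 *                               FALSE);
 *    if (q != NULL) {
 *       q->msrNum = MSR_FEATCTL;
 *       q->numLogicalCPUs = 2;
 *       if (Vmx86_GetAllMSRs(q)) {
 *          // q->logicalCPUs[0 .. q->numLogicalCPUs - 1] hold values
 *       }
 *       Vmx86_Free(q);
 *    }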
*
*-----------------------------------------------------------------------------
*/
Bool
Vmx86_GetAllMSRs(MSRQuery *query) // IN/OUT
{
unsigned i, cpu;
Atomic_uint32 index;
Vmx86GetMSRData data;
data.index = &index;
data.numItems = 1;
/* Check MSR uniformity cache first. */
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
if (msrUniformityCacheInfo[i].msrIndex == query->msrNum) {
for (cpu = 0; cpu < query->numLogicalCPUs; cpu++) {
query->logicalCPUs[cpu].msrVal = msrUniformityCacheInfo[i].msrValue;
query->logicalCPUs[cpu].tag = cpu;
}
return TRUE;
}
}
Atomic_Write32(data.index, 0);
data.query = query;
HostIF_CallOnEachCPU(Vmx86GetMSR, &data);
   /*
    * At this point, Atomic_Read32(data.index) is the number of logical
    * CPUs that replied.
    */
if (Atomic_Read32(data.index) > query->numLogicalCPUs) {
return FALSE;
}
ASSERT(Atomic_Read32(data.index) <= query->numLogicalCPUs);
query->numLogicalCPUs = Atomic_Read32(data.index);
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86CheckVMXStatus --
*
* Checks the status of the given operation and issues a warning if it was
* not successful. If it is a valid failure, the error code will be read
* and logged.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86CheckVMXStatus(const char *operation, // IN: Operation string
VMXStatus status) // IN: Status to check
{
if (status != VMX_Success) {
Warning("%s failed with status %s.\n", operation,
status == VMX_FailValid ? "VMX_FailValid" :
status == VMX_FailInvalid ? "VMX_FailInvalid" : "UNKNOWN");
      /*
       * Our in-house build of binutils (2.16.1-vt), used with gcc 4.3,
       * doesn't handle VMREAD/VMWRITE operands properly, so only read
       * the error code when building with a newer compiler.
       */
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 3)
if (status == VMX_FailValid) {
size_t errorCode;
VMREAD_2_STATUS(VT_VMCS_VMINSTR_ERR, &errorCode);
Log("VM-instruction error: Error %"FMTSZ"d\n", errorCode);
}
#endif
#endif
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FlushVMCSPage --
*
 * VMCLEAR the given VMCS page on the current logical CPU. We first enable
 * HV if necessary and execute VMXON using this PCPU's VMXON region. If HV
 * was already enabled, it remains enabled. If we enabled HV or our VMXON
 * succeeded, we restore each to its original state after the VMCLEAR.
*
* Results:
* None.
*
* Side effects:
* The hardware VMCS cache will be flushed.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86FlushVMCSPage(void *clientData) // IN: The MA of the VMCS to VMCLEAR
{
MA vmxonRegion;
Bool hvWasEnabled;
MA vmcs = (MA)clientData;
Bool vmxWasInRootOperation = FALSE;
VMXStatus vmxonStatus, vmclearStatus, vmxoffStatus;
ASSERT(vmcs);
/* Enable HV if it isn't already enabled. */
hvWasEnabled = Vmx86VMXEnabled();
if (!hvWasEnabled) {
uintptr_t cr4reg;
ASSERT(VT_CapableCPU());
Vmx86EnableHVOnCPU();
GET_CR4(cr4reg);
SET_CR4(cr4reg | CR4_VMXE);
}
   /* VMXON using this CPU's VMXON region. */
vmxonRegion = MPN_2_MA(Task_GetHVRootPageForPCPU(HostIF_GetCurrentPCPU()));
vmxonStatus = VMXON_2_STATUS(&vmxonRegion);
if (vmxonStatus != VMX_Success) {
/* VMXON failed, we must already be in VMX root operation. */
vmxWasInRootOperation = TRUE;
}
/* VMCLEAR the given VMCS page. */
vmclearStatus = VMCLEAR_2_STATUS(&vmcs);
Vmx86CheckVMXStatus("VMCLEAR", vmclearStatus);
   /* VMXOFF if our VMXON above succeeded, i.e. we entered VMX operation here. */
if (!vmxWasInRootOperation) {
vmxoffStatus = VMXOFF_2_STATUS();
Vmx86CheckVMXStatus("VMXOFF", vmxoffStatus);
}
/* Disable HV if it was initially disabled. */
if (!hvWasEnabled) {
uintptr_t cr4reg;
GET_CR4(cr4reg);
SET_CR4(cr4reg & ~CR4_VMXE);
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_FlushVMCSAllCPUs --
*
* Enable HV (if necessary) and VMCLEAR a VMCS page on all logical CPUs.
* This will prevent stale data from surfacing out of the VMCS cache when
* executing VMREADs.
*
* Results:
* None.
*
* Side effects:
* HV will be enabled and hardware VMCS caches will be flushed across all
* CPUs.
*
*-----------------------------------------------------------------------------
*/
void
Vmx86_FlushVMCSAllCPUs(MA vmcs) // IN: The MA of the VMCS to VMCLEAR
{
HostIF_CallOnEachCPU(Vmx86FlushVMCSPage, (void *)vmcs);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_YieldToSet --
*
* Yield the CPU until a vCPU from the requested set has run.
*
 * usecs is the total spin time in the monitor. A very low value
 * indicates we detected a vCPU thread that was not in the monitor,
 * so we didn't spin. In that case, simply nudge the threads we
 * want and return.
*
* Results:
* The current CPU yields whenever possible.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_YieldToSet(VMDriver *vm, // IN:
Vcpuid currVcpu, // IN:
const VCPUSet *req, // IN:
uint32 usecs, // IN:
Bool skew) // IN:
{
VCPUSet vcpus;
ASSERT(currVcpu < vm->numVCPUs);
if (VCPUSet_IsEmpty(req)) {
return;
}
/* Crosscalls should spin a few times before blocking */
if (!skew && usecs < CROSSCALL_SPIN_SHORT_US) {
HostIF_WakeUpYielders(vm, currVcpu);
return;
}
if (HostIF_PrepareWaitForThreads(vm, currVcpu)) {
return;
}
VCPUSet_Empty(&vcpus);
FOR_EACH_VCPU_IN_SET_WITH_MAX(req, vcpuid, vm->numVCPUs) {
if (vcpuid == currVcpu) {
continue;
}
/*
* First assume the vCPU we want to have wake up the current vCPU
* is out of the monitor, so set its wakeup bit corresponding to
* the current vCPU. It may or may not actually be on the vmmon side.
*/
VCPUSet_AtomicInclude(&vm->crosscallWaitSet[vcpuid], currVcpu);
/*
* Now that the bit is set, check whether the vCPU is in vmmon. If
* it was previously in vmmon, and then took a trip to the monitor
* and back before we got here, then the wakeup has already been sent.
* If it is in the monitor, either it started in vmmon and sent the
* wakeup, or it was there the entire time. In either case we can
* clear the bit. This is safe because the bit is directed solely
* at the current vCPU.
*/
if (Atomic_Read32(&vm->currentHostCpu[vcpuid]) != INVALID_PCPU) {
VCPUSet_AtomicRemove(&vm->crosscallWaitSet[vcpuid], currVcpu);
} else {
if (VCPUSet_AtomicIsMember(&vm->crosscallWaitSet[vcpuid], currVcpu)) {
VCPUSet_Include(&vcpus, vcpuid);
}
}
} ROF_EACH_VCPU_IN_SET_WITH_MAX();
/*
* Wake up any threads that had previously yielded the processor to
* let this one run.
*/
HostIF_WakeUpYielders(vm, currVcpu);
/*
* If this thread has other threads to wait for, and no other threads
* are waiting for this thread, block until one of the threads we're
* waiting for has run.
*/
if (!VCPUSet_IsEmpty(&vcpus) &&
VCPUSet_IsEmpty(&vm->crosscallWaitSet[currVcpu])) {
HostIF_WaitForThreads(vm, currVcpu);
}
/*
* Tell other vcpus that they no longer have to wake this one.
* This is optional, the other threads will eventually clear their
* bits anyway.
*/
FOR_EACH_VCPU_IN_SET_WITH_MAX(&vcpus, vcpuid, vm->numVCPUs) {
VCPUSet_AtomicRemove(&vm->crosscallWaitSet[vcpuid], currVcpu);
} ROF_EACH_VCPU_IN_SET_WITH_MAX();
HostIF_CancelWaitForThreads(vm, currVcpu);
}
/*
*----------------------------------------------------------------------
*
* Vmx86PerfCtrInUse --
*
* Determine which performance counters are already in use by the
* host on the current PCPU. A performance counter is considered
* in use if its event select enable bit is set or if this method
* is unable to count events with the performance counter.
*
* Results:
* Return TRUE if counter is in use.
*
* Side effects:
* None.
*----------------------------------------------------------------------
*/
static Bool
Vmx86PerfCtrInUse(Bool isGen, unsigned pmcNum, unsigned ctrlMSR,
unsigned cntMSR, Bool hasPGC)
{
volatile unsigned delay;
uint64 origPGC = hasPGC ? X86MSR_GetMSR(PERFCTR_CORE_GLOBAL_CTRL_ADDR) : 0;
uint64 pmcCtrl;
uint64 pmcCount, count;
uint64 ctrlEna, pgcEna;
pmcCtrl = X86MSR_GetMSR(ctrlMSR);
if (isGen) {
ASSERT(pmcNum < 32);
if ((pmcCtrl & PERFCTR_CPU_ENABLE) != 0) {
return TRUE;
}
ctrlEna = PERFCTR_CPU_ENABLE | PERFCTR_CPU_KERNEL_MODE |
PERFCTR_CORE_INST_RETIRED;
pgcEna = CONST64U(1) << pmcNum;
} else {
ASSERT(pmcNum < PERFCTR_CORE_NUM_FIXED_COUNTERS);
if ((pmcCtrl & PERFCTR_CORE_FIXED_ENABLE_MASKn(pmcNum)) != 0) {
return TRUE;
}
ctrlEna = pmcCtrl | PERFCTR_CORE_FIXED_KERNEL_MASKn(pmcNum);
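      /* Fixed-counter enable bits start at bit 32 of IA32_PERF_GLOBAL_CTRL. */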
pgcEna = CONST64U(1) << (pmcNum + 32);
}
pmcCount = X86MSR_GetMSR(cntMSR);
/* Enable the counter. */
X86MSR_SetMSR(ctrlMSR, ctrlEna);
if (hasPGC) {
X86MSR_SetMSR(PERFCTR_CORE_GLOBAL_CTRL_ADDR, pgcEna | origPGC);
}
/* Retire some instructions and wait a few cycles. */
for (delay = 0; delay < 100; delay++) ;
/* Disable the counter. */
if (hasPGC) {
X86MSR_SetMSR(PERFCTR_CORE_GLOBAL_CTRL_ADDR, origPGC);
}
count = X86MSR_GetMSR(cntMSR);
X86MSR_SetMSR(ctrlMSR, pmcCtrl);
X86MSR_SetMSR(cntMSR, pmcCount);
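   /* If the count did not advance, the counter is unusable and considered in use. */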
return count == pmcCount;
}
/*
*----------------------------------------------------------------------
*
* Vmx86GetUnavailPerfCtrsOnCPU --
*
* Determine which performance counters are already in use by the
* host on the current PCPU.
*
 * Results:
 * None.
 *
 * Side effects:
 * Sets bits in the shared bitset pointed to by 'data' for counters
 * that are in use. Bits 0-31 represent general purpose counters, and
 * bits 32-63 represent fixed counters.
*
*----------------------------------------------------------------------
*/
static void
Vmx86GetUnavailPerfCtrsOnCPU(void *data)
{
CPUIDRegs regs;
unsigned i, numGen = 0, numFix = 0, stride = 1;
uint32 selBase = 0;
uint32 ctrBase = 0;
Bool hasPGC = FALSE;
Atomic_uint64 *ctrs = (Atomic_uint64 *)data;
uintptr_t flags;
if (CPUID_GetVendor() == CPUID_VENDOR_INTEL) {
unsigned version;
if (__GET_EAX_FROM_CPUID(0) < 0xA) {
return;
}
__GET_CPUID(0xA, &regs);
version = CPUID_GET(0xA, EAX, PMC_VERSION, regs.eax);
if (version == 0) {
return;
}
numGen = CPUID_GET(0xA, EAX, PMC_NUM_GEN, regs.eax);
if (version >= 2) {
numFix = CPUID_GET(0xA, EDX, PMC_NUM_FIXED, regs.edx);
hasPGC = TRUE;
}
selBase = PERFCTR_CORE_PERFEVTSEL0_ADDR;
ctrBase = PERFCTR_CORE_PERFCTR0_ADDR;
} else if (CPUID_GetVendor() == CPUID_VENDOR_AMD ||
CPUID_GetVendor() == CPUID_VENDOR_HYGON) {
if (CPUID_ISSET(0x80000001, ECX, PERFCORE,
__GET_ECX_FROM_CPUID(0x80000001))) {
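      /*
       * PERFCORE: six extended counters whose event-select and counter
       * MSRs are interleaved, hence the stride of two.
       */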
numGen = 6;
selBase = PERFCTR_AMD_EXT_BASE_ADDR + PERFCTR_AMD_EXT_EVENTSEL;
ctrBase = PERFCTR_AMD_EXT_BASE_ADDR + PERFCTR_AMD_EXT_CTR;
stride = 2;
} else {
numGen = 4;
selBase = PERFCTR_AMD_PERFEVTSEL0_ADDR;
ctrBase = PERFCTR_AMD_PERFCTR0_ADDR;
}
}
ASSERT(numGen <= 32 && numFix <= 32);
   /*
    * Vmx86PerfCtrInUse modifies performance counters to determine
    * whether they are usable, so disable interrupts to avoid racing
    * with interrupt handlers.
    */
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
for (i = 0; i < numGen; i++) {
if (Vmx86PerfCtrInUse(TRUE, i, selBase + i * stride,
ctrBase + i * stride, hasPGC)) {
Atomic_SetBit64(ctrs, i);
}
}
if (numFix > 0) {
numFix = MIN(numFix, PERFCTR_CORE_NUM_FIXED_COUNTERS);
for (i = 0; i < numFix; i++) {
if (Vmx86PerfCtrInUse(FALSE, i, PERFCTR_CORE_FIXED_CTR_CTRL_ADDR,
PERFCTR_CORE_FIXED_CTR0_ADDR + i, hasPGC)) {
Atomic_SetBit64(ctrs, i + 32);
}
}
}
RESTORE_FLAGS(flags);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetUnavailablePerfCtrs --
*
 * Determine which performance counters are already in use by the
 * host across all PCPUs, and therefore unavailable for use by the
 * monitor. A performance counter is considered in use if its event
 * select enable bit is set on any PCPU.
 *
 * Results:
 * A bitset representing unavailable performance counters.
 * Bits 0-31 represent general purpose counters, and bits 32-63
 * represent fixed counters.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
uint64
Vmx86_GetUnavailablePerfCtrs(void)
{
Atomic_uint64 unavailCtrs;
Atomic_Write64(&unavailCtrs, 0);
HostIF_CallOnEachCPU(Vmx86GetUnavailPerfCtrsOnCPU, &unavailCtrs);
return Atomic_Read64(&unavailCtrs);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetPageRoot --
*
* Get the page root MPN for the specified VCPU.
*
* Results:
* TRUE and an MPN on success, FALSE on failure.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_GetPageRoot(VMDriver *vm, // IN:
Vcpuid vcpuid, // IN:
MPN *mpn) // OUT:
{
if (vcpuid >= vm->numVCPUs) {
return FALSE;
}
*mpn = vm->ptRootMpns[vcpuid];
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_MapPage --
*
 * Maps the specified MPN into the host kernel address space and
 * returns the VPN of the mapping.
*
* Results:
* The VPN in the kernel address space of the new mapping, or 0 if
* the mapping failed.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
VPN
Vmx86_MapPage(MPN mpn) // IN:
{
return HostIF_MapPage(mpn);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_UnmapPage --
*
* Unmaps the specified VPN from the host kernel address space.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_UnmapPage(VPN vpn) // IN:
{
HostIF_UnmapPage(vpn);
}
/*
*----------------------------------------------------------------------
*
* Vmx86_GetMonitorContext --
*
* Gets most of the monitor's saved context (as of the last world switch)
* from a given VCPU's crosspage. CR3 is omitted as it is privileged,
* while DS/SS/ES are returned due to their potential utility in debugging.
*
* Results:
* On success, TRUE and context is (partially) populated. FALSE otherwise.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_GetMonitorContext(VMDriver *vm, // IN: The VM instance.
Vcpuid vcpuid, // IN: VCPU in question.
Context64 *context) // OUT: context.
{
VMCrossPageData *cpData;
if (vcpuid >= vm->numVCPUs || vm->crosspage[vcpuid] == NULL) {
return FALSE;
}
cpData = vm->crosspage[vcpuid];
memset(context, 0, sizeof *context);
context->es = cpData->monES;
context->ss = cpData->monSS;
context->ds = cpData->monDS;
context->rbx = cpData->monRBX;
context->rsp = cpData->monRSP;
context->rbp = cpData->monRBP;
context->r12 = cpData->monR12;
context->r13 = cpData->monR13;
context->r14 = cpData->monR14;
context->r15 = cpData->monR15;
context->rip = cpData->monRIP;
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CleanupHVIOBitmap --
*
* Free any resources that were allocated for the HV I/O bitmap.
*
* Results:
* None.
*
*----------------------------------------------------------------------
*/
void
Vmx86_CleanupHVIOBitmap(void)
{
if (hvIOBitmap != NULL) {
HostIF_FreeContigPages(NULL, hvIOBitmap);
hvIOBitmap = NULL;
}
}
/*
*----------------------------------------------------------------------
*
* Vmx86_CreateHVIOBitmap --
*
* Called on driver load to create and initialize the host wide SVM I/O
* bitmap. This item is a physically contiguous region of
* SVM_VMCB_IO_BITMAP_PAGES pages and is initialized to all-bits-set.
*
* Results:
* TRUE on success or FALSE on failure.
*
*----------------------------------------------------------------------
*/
Bool
Vmx86_CreateHVIOBitmap(void)
{
if (!CPUID_HostSupportsSVM()) {
return TRUE;
}
hvIOBitmap = HostIF_AllocContigPages(NULL, SVM_VMCB_IO_BITMAP_PAGES);
if (hvIOBitmap == NULL) {
Warning("Failed to allocate SVM I/O bitmap.\n");
return FALSE;
}
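   /* A set bitmap bit means the corresponding I/O port access is intercepted. */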
memset(hvIOBitmap->addr, 0xff, SVM_VMCB_IO_BITMAP_SIZE);
return TRUE;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86RegisterCPU --
*
* Registers each logical CPU by incrementing a counter.
*
* Results:
* None.
*
* Side effects:
 * The counter pointed to by 'data' is incremented by one.
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86RegisterCPU(void *data) // IN: *data
{
Atomic_uint32 *numLogicalCPUs = data;
ASSERT(numLogicalCPUs);
Atomic_Inc32(numLogicalCPUs);
}
/*
*----------------------------------------------------------------------
*
* Vmx86VTMSRCacheGet --
*
* Retrieve the requested VT MSR value from the cache. Returns zero
* for uncached values.
*
*----------------------------------------------------------------------
*/
static uint64
Vmx86VTMSRCacheGet(const MSRCache *cache, uint32 msrNum, unsigned cpu)
{
ASSERT((msrNum >= MSR_VMX_BASIC && msrNum < MSR_VMX_BASIC + NUM_VMX_MSRS) ||
msrNum == MSR_FEATCTL);
if (cache != NULL && cache->queryCache != NULL) {
size_t offset = sizeof(MSRQuery) + sizeof(MSRReply) * cache->nPCPUs;
MSRQuery *query;
unsigned ix;
ASSERT(cpu < cache->nPCPUs);
for (ix = 0; ix < cache->queryCache->numItems; ix++) {
query = (MSRQuery *) ((uint8 *)&cache->queryCache->query[0] +
ix * offset);
if (query->msrNum == msrNum) {
return query->logicalCPUs[cpu].msrVal;
}
}
}
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86AllocMSRUniformityCache --
* Vmx86FreeMSRUniformityCache --
*
* Allocate/populate and cleanup MSR uniformity cache.
*
*-----------------------------------------------------------------------------
*/
static Bool
Vmx86AllocMSRUniformityCache(uint32 numPCPUs)
{
MSRQuery *query = NULL;
uint32 i;
uint32 numQueries = ARRAYSIZE(msrUniformityCacheInfo);
Atomic_uint32 *cpuCounters;
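   /* Each slot holds an MSRQuery header followed by one MSRReply per PCPU. */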
MSRQuery *multMSRQueryAllPcpus = Vmx86_Calloc(numQueries,
sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs, FALSE);
if (multMSRQueryAllPcpus == NULL) {
return FALSE;
}
cpuCounters = Vmx86_Calloc(numQueries, sizeof(Atomic_uint32), FALSE);
if (cpuCounters == NULL) {
Vmx86_Free(multMSRQueryAllPcpus);
return FALSE;
}
msrCacheQueryData.query = multMSRQueryAllPcpus;
msrCacheQueryData.index = cpuCounters;
msrCacheQueryData.numItems = numQueries;
   /*
    * Enumerate the MSR list and initialize the msrCacheQueryData
    * structure before the actual (safe) MSR query takes place.
    */
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
query = (MSRQuery *) ((uint8 *)&msrCacheQueryData.query[0] +
i * (sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs));
Atomic_Write32(&msrCacheQueryData.index[i], 0);
query->msrNum = msrUniformityCacheInfo[i].msrIndex;
query->numLogicalCPUs = numPCPUs;
}
   /* Perform a single query for all of the MSRs in the uniformity check list. */
HostIF_CallOnEachCPU(Vmx86GetMSR, &msrCacheQueryData);
return TRUE;
}
static void
Vmx86FreeMSRUniformityCache(void)
{
Vmx86_Free(msrCacheQueryData.index);
Vmx86_Free(msrCacheQueryData.query);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86CheckMSRUniformity --
*
* Iterate MSR uniformity cache and test uniformity of each MSR across all
* physical cpu(s).
*
*-----------------------------------------------------------------------------
*/
static void
Vmx86CheckMSRUniformity(uint32 numPCPUs)
{
uint32 i, j;
MSRQuery *query = NULL;
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
uint32 msrIndex = msrUniformityCacheInfo[i].msrIndex;
query = (MSRQuery *)((uint8 *)&msrCacheQueryData.query[0] +
i * (sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs));
ASSERT(Atomic_Read32(&msrCacheQueryData.index[i]) == numPCPUs);
for (j = 1; j < numPCPUs; j++) {
uint64 msrValuePCPU = query->logicalCPUs[j].msrVal;
if (msrValuePCPU != query->logicalCPUs[0].msrVal) {
Warning("Found a mismatch on MSR feature 0x%x; logical cpu%u "
"value = 0x%llx, but logical cpu%u value = 0x%llx\n",
msrIndex, j, msrValuePCPU, 0, query->logicalCPUs[0].msrVal);
}
}
}
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FindMSRQueryFromCache --
*
* Iterate MSR uniformity cache and find query position for the given msr.
*
*-----------------------------------------------------------------------------
*/
static MSRQuery*
Vmx86FindMSRQueryFromCache(uint32 msrIndex, uint32 numPCPUs)
{
uint32 i;
MSRQuery *query = NULL;
size_t offset = sizeof(MSRQuery) + sizeof(MSRReply) * numPCPUs;
MSRQuery *first = &msrCacheQueryData.query[0];
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
if (msrIndex == msrUniformityCacheInfo[i].msrIndex) {
query = (MSRQuery *)((uint8 *)first + i * offset);
break;
}
}
return query;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86FindCommonMSRArchCap --
* Vmx86FindCommonMSRBIOSSignID --
* Vmx86FindCommonMSRVMCR --
* Vmx86FindCommonMSRJoin --
*
* Calculate least common denominator for IA32_MSR_ARCH_CAPABILITIES,
* MSR_BIOS_SIGN_ID, MSR_VM_CR, and general case respectively.
*
*-----------------------------------------------------------------------------
*/
static uint64
Vmx86FindCommonMSRArchCap(uint32 msrIndex, uint32 numPCPUs)
{
uint32 j;
uint64 msrCommonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
ASSERT(msrIndex == IA32_MSR_ARCH_CAPABILITIES);
msrCommonVal = query->logicalCPUs[0].msrVal;
   /*
    * A set MSR_ARCH_CAPABILITIES_RSBA bit represents the lack of a
    * feature while a clear bit represents its presence. The bit is
    * therefore flipped before computing the least common set and
    * flipped back in the final value.
    */
msrCommonVal ^= MSR_ARCH_CAPABILITIES_RSBA;
for (j = 1; j < numPCPUs; j++) {
uint64 msrValuePCPU = query->logicalCPUs[j].msrVal;
if (msrValuePCPU != query->logicalCPUs[0].msrVal) {
msrValuePCPU ^= MSR_ARCH_CAPABILITIES_RSBA;
msrCommonVal &= msrValuePCPU;
}
}
msrCommonVal ^= MSR_ARCH_CAPABILITIES_RSBA;
return msrCommonVal;
}
static uint64
Vmx86FindCommonMSRBIOSSignID(uint32 msrIndex, uint32 numPCPUs)
{
unsigned cpu;
uint64 commonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
commonVal = ~0ULL;
for (cpu = 0; cpu < numPCPUs; cpu++) {
if (query->logicalCPUs[cpu].msrVal < commonVal) {
commonVal = query->logicalCPUs[cpu].msrVal;
}
}
return commonVal;
}
static uint64
Vmx86FindCommonMSRVMCR(uint32 msrIndex, uint32 numPCPUs)
{
unsigned cpu;
uint64 commonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
commonVal = query->logicalCPUs[0].msrVal;
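   /*
    * AND the R_INIT bit across all CPUs; the remaining bits take the
    * value seen on the last CPU (they are presumed uniform).
    */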
for (cpu = 1; cpu < numPCPUs; cpu++) {
uint64 msrValuePCPU = query->logicalCPUs[cpu].msrVal;
commonVal &= msrValuePCPU & MSR_VM_CR_R_INIT;
commonVal |= msrValuePCPU & ~MSR_VM_CR_R_INIT;
}
return commonVal;
}
static uint64
Vmx86FindCommonMSRJoin(uint32 msrIndex, uint32 numPCPUs)
{
uint32 j;
uint64 msrCommonVal;
MSRQuery *query = Vmx86FindMSRQueryFromCache(msrIndex, numPCPUs);
ASSERT(query != NULL);
msrCommonVal = query->logicalCPUs[0].msrVal;
for (j = 1; j < numPCPUs; j++) {
msrCommonVal &= query->logicalCPUs[j].msrVal;
}
return msrCommonVal;
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86GenFindCommonCap --
* Vmx86GenFindCommonIntelVTCap --
* Vmx86FindCommonMSR --
*
* Generate common MSR calculation routines by deriving appropriate
* function with 'member' name.
*
*-----------------------------------------------------------------------------
*/
static uint64
Vmx86GenFindCommonCap(uint32 msrIndex, uint32 numPCPUs)
{
#define MSRNUMVT(msr, member)
#define MSRNUMVT2 MSRNUMVT
#define MSRNUM(msr, member) \
if (msrIndex == msr) { \
return Vmx86FindCommonMSR##member(msrIndex, numPCPUs); \
} else { \
return Vmx86FindCommonMSRJoin(msrIndex, numPCPUs); \
}
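   /*
    * Each MSRNUM(msr, member) entry expands to a dispatch that returns
    * the specialized Vmx86FindCommonMSR<member> on a match and falls
    * back to Vmx86FindCommonMSRJoin otherwise.
    */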
UNIFORMITY_CACHE_MSRS
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
return CONST64(0);
}
static uint64
Vmx86GenFindCommonIntelVTCap(uint32 msrIndex, uint32 numPCPUs)
{
MSRCache vt;
IntelVTMSRGet_Fn fn = Vmx86VTMSRCacheGet;
/* Prepare a special cache for VT MSR uniformity checks. */
vt.queryCache = &msrCacheQueryData;
vt.nPCPUs = numPCPUs;
#define MSRNUM(msr, member)
#define MSRNUMVT(msr, member) \
if (msrIndex == msr) { \
return IntelVT_FindCommon##member(&vt, fn, numPCPUs); \
}
#define MSRNUMVT2(msr, member) \
if (msrIndex == msr) { \
return IntelVT_FindCommon##member(&vt, fn, numPCPUs, msr); \
}
UNIFORMITY_CACHE_MSRS
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
return CONST64(0);
}
static uint64
Vmx86FindCommonMSR(uint32 msrIndex, uint32 numPCPUs)
{
#define MSRNUM(msr, member) \
if (msrIndex == msr) { \
return Vmx86GenFindCommonCap(msrIndex, numPCPUs); \
}
#define MSRNUMVT(msr, member) \
if (msrIndex == msr) { \
return Vmx86GenFindCommonIntelVTCap(msrIndex, numPCPUs); \
}
#define MSRNUMVT2 MSRNUMVT
UNIFORMITY_CACHE_MSRS
#undef MSRNUM
#undef MSRNUMVT
#undef MSRNUMVT2
return CONST64(0);
}
/*
*-----------------------------------------------------------------------------
*
* Vmx86_CheckMSRUniformity --
*
 * Provides basic hardware MSR feature checks for hosted x86 platforms.
 * The VMM requires, or at least prefers, uniformity of certain MSRs.
 * This function iterates through a list of MSR features (i.e.
 * msrUniformityCacheInfo), checking that each MSR's value is uniform
 * across all logical CPUs. Uniformity checks are skipped for MSRs that
 * are not available on the target architecture or CPU family. If an
 * MSR is non-uniform, a common bit field is calculated by taking the
 * intersection of its values across CPUs.
*
* Results:
* Returns TRUE if MSR uniformity checks complete successfully, FALSE
* otherwise.
*
* Side effects:
 * Updates the msrUniformityCacheInfo cache with the common MSR values.
*
*-----------------------------------------------------------------------------
*/
Bool
Vmx86_CheckMSRUniformity(void)
{
uint32 i;
Atomic_uint32 numLogicalCPUs;
uint32 numPCPUs = 0;
Atomic_Write32(&numLogicalCPUs, 0);
   /*
    * Count the logical CPUs, then use that count to size the MSR
    * queries that will be executed on each logical CPU.
    */
HostIF_CallOnEachCPU(Vmx86RegisterCPU, &numLogicalCPUs);
numPCPUs = Atomic_Read32(&numLogicalCPUs);
ASSERT(numPCPUs > 0);
if (!Vmx86AllocMSRUniformityCache(numPCPUs)) {
Warning("Fatal, not enough memory for MSR feature uniformity checks");
return FALSE;
}
Vmx86CheckMSRUniformity(numPCPUs);
for (i = 0; i < ARRAYSIZE(msrUniformityCacheInfo); ++i) {
uint32 msrIndex = msrUniformityCacheInfo[i].msrIndex;
msrUniformityCacheInfo[i].msrValue = Vmx86FindCommonMSR(msrIndex,
numPCPUs);
}
Vmx86FreeMSRUniformityCache();
return TRUE;
}
/*
*----------------------------------------------------------------------
*
* Vmx86_KernelCETEnabled --
*
* Check if kernel mode shadow stacks are enabled by examining
* the current shadow stack pointer.
*
* Results:
* TRUE if any CPU has kernel mode shadow stacks enabled.
*
*----------------------------------------------------------------------
*/
static void
Vmx86KernelCETEnabledOnCPU(void *data)
{
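   /* GET_SSP yields INVALID_SSP when shadow stacks are not active on this CPU. */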
uint64 ssp = GET_SSP();
if (ssp != INVALID_SSP) {
Atomic_Bool *kernelCETEnabled = (Atomic_Bool *)data;
Atomic_WriteBool(kernelCETEnabled, TRUE);
}
}
Bool
Vmx86_KernelCETEnabled(void)
{
Atomic_Bool kernelCETEnabled;
Atomic_WriteBool(&kernelCETEnabled, FALSE);
HostIF_CallOnEachCPU(Vmx86KernelCETEnabledOnCPU, &kernelCETEnabled);
return Atomic_ReadBool(&kernelCETEnabled);
}