/* vmware-host-modules/vmmon-only/linux/driver.c */
/*********************************************************
* Copyright (C) 1998-2017 VMware, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation version 2 and no later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*********************************************************/
/* Must come before any kernel header file */
#include "driver-config.h"
#define EXPORT_SYMTAB
#include "compat_timer.h"
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/poll.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/wait.h>
#include <asm/hw_irq.h> /* for CALL_FUNCTION_VECTOR */
#include "compat_version.h"
#include "compat_module.h"
#include "compat_page.h"
#include "usercalldefs.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 16)
#error Linux before 2.6.16 is not supported
#endif
#include <asm/io.h>
#include "vmware.h"
#include "driverLog.h"
#include "driver.h"
#include "modulecall.h"
#include "vm_asm.h"
#include "vmx86.h"
#include "initblock.h"
#include "task.h"
#include "memtrack.h"
#include "task.h"
#include "cpuid.h"
#include "cpuid_info.h"
#include "circList.h"
#include "x86msr.h"
#ifdef VMX86_DEVEL
#include "private.h"
#endif
#include "hostif.h"
#include "hostif_priv.h"
#include "vmhost.h"
#include "vmmonInt.h"
static void LinuxDriverQueue(VMLinux *vmLinux);
static void LinuxDriverDequeue(VMLinux *vmLinux);
static Bool LinuxDriverCheckPadding(void);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
#define VMW_NOPAGE_2624
#endif
#define VMMON_UNKNOWN_SWAP_SIZE (-1ULL)
struct VMXLinuxState linuxState;
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0)
typedef int vm_fault_t;
#endif
/*
*----------------------------------------------------------------------
*
* Device Driver Interface --
*
* Runs the VM by implementing open/close/ioctl functions
*
*----------------------------------------------------------------------
*/
static int LinuxDriver_Open(struct inode *inode, struct file *filp);
/*
* gcc-4.5+ can name-mangle LinuxDriver_Ioctl, but our stack-size
* script needs to find it. So it shouldn't be static. ("hidden"
* visibility would be OK.)
*/
long LinuxDriver_Ioctl(struct file *filp, u_int iocmd,
unsigned long ioarg);
static int LinuxDriver_Close(struct inode *inode, struct file *filp);
static unsigned int LinuxDriverPoll(struct file *file, poll_table *wait);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
static vm_fault_t LinuxDriverFault(struct vm_fault *fault);
#elif defined(VMW_NOPAGE_2624)
static vm_fault_t LinuxDriverFault(struct vm_area_struct *vma, struct vm_fault *fault);
#else
static struct page *LinuxDriverNoPage(struct vm_area_struct *vma,
unsigned long address,
int *type);
#endif
static int LinuxDriverMmap(struct file *filp, struct vm_area_struct *vma);
static void LinuxDriverPollTimeout(compat_timer_arg_t unused);
static unsigned int LinuxDriverEstimateTSCkHz(void);
static struct vm_operations_struct vmuser_mops = {
#ifdef VMW_NOPAGE_2624
.fault = LinuxDriverFault
#else
.nopage = LinuxDriverNoPage
#endif
};
static struct file_operations vmuser_fops;
static struct timer_list tscTimer;
static Atomic_uint32 tsckHz;
static VmTimeStart tsckHzStartTime;
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHzWork --
*
* Estimates TSC frequency in terms of cycles and system uptime
* elapsed since module init. At module init, the starting cycle
* count and uptime are recorded (in tsckHzStartTime) and a timer
* is scheduled to call this function after 4 seconds.
*
* It is possible that vmx queries the TSC rate after module init
* but before the 4s timer expires. In that case, we just go ahead
* and compute the rate for the duration since the driver loaded.
* When the timer expires, the new computed value is dropped. If the
* query races with the timer, the first thread to write to 'tsckHz'
* wins.
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverEstimateTSCkHzWork(void *data)
{
VmTimeStart curTime;
uint64 cycles;
uint64 uptime;
unsigned int khz;
ASSERT(tsckHzStartTime.count != 0 && tsckHzStartTime.time != 0);
Vmx86_ReadTSCAndUptime(&curTime);
cycles = curTime.count - tsckHzStartTime.count;
uptime = curTime.time - tsckHzStartTime.time;
khz = Vmx86_ComputekHz(cycles, uptime);
if (khz != 0) {
if (Atomic_ReadIfEqualWrite(&tsckHz, 0, khz) == 0) {
Log("TSC frequency estimated using system uptime: %u\n", khz);
}
} else if (Atomic_ReadIfEqualWrite(&tsckHz, 0, cpu_khz) == 0) {
Log("Failed to compute TSC frequency, using cpu_khz: %u\n", cpu_khz);
}
}
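/*
* Worked example of the computation above (illustrative only: the exact
* units of 'uptime' are defined by Vmx86_ReadTSCAndUptime and
* Vmx86_ComputekHz, so the tick rate below is an assumption). If the
* uptime counter ran at 1,000,000 ticks/second and we observed
* cycles = 12,000,000,000 over uptime = 4,000,000 ticks (4 seconds):
*
*    12,000,000,000 cycles / 4 s = 3,000,000,000 Hz = 3,000,000 kHz
*
* i.e. a 3 GHz TSC.
*/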
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHz --
*
* Returns the estimated TSC kHz, cached in tsckHz. If tsckHz is
* 0, the routine kicks off estimation work on CPU 0.
*
* Results:
*
* Returns the estimated TSC kHz value.
*
*----------------------------------------------------------------------
*/
static unsigned int
LinuxDriverEstimateTSCkHz(void)
{
int err;
uint32 khz;
khz = Atomic_Read(&tsckHz);
if (khz != 0) {
return khz;
}
err = compat_smp_call_function_single(0, LinuxDriverEstimateTSCkHzWork,
NULL, 1);
/*
* The smp function call may fail for two reasons: either
* the function is not supported by the kernel, or the cpu
* went offline. In this unlikely event, we just perform
* the work wherever we can.
*/
if (err != 0) {
LinuxDriverEstimateTSCkHzWork(NULL);
}
return Atomic_Read(&tsckHz);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverEstimateTSCkHzDeferred --
*
* Timer callback for deferred TSC rate estimation.
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverEstimateTSCkHzDeferred(compat_timer_arg_t unused)
{
LinuxDriverEstimateTSCkHz();
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverInitTSCkHz --
*
* Initialize TSC khz rate.
*
* We rely on the kernel estimated cycle rate in the exported
* variable tsc_khz. If the kernel has disabled tsc, tsc_khz
* will be 0, and we fall back on our own estimation routines.
*
* Side effects:
*
* If tsc_khz is unusable, schedules a 4s timer for deferred
* khz estimation (see LinuxDriverEstimateTSCkHz).
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverInitTSCkHz(void)
{
unsigned int khz;
khz = compat_tsc_khz();
if (khz != 0) {
Atomic_Write(&tsckHz, khz);
Log("Using tsc_khz as TSC frequency: %u\n", khz);
return;
}
Vmx86_ReadTSCAndUptime(&tsckHzStartTime);
tscTimer.expires = jiffies + 4 * HZ;
add_timer(&tscTimer);
}
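/*
* A note on the arithmetic above: 'jiffies' advances HZ times per
* second, so 'jiffies + 4 * HZ' expires the timer four seconds from
* now, which is the 4s estimation window described at
* LinuxDriverEstimateTSCkHzWork.
*/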
/*
*----------------------------------------------------------------------
*
* LinuxDriverInit --
*
* Linux module entry point, called by /sbin/insmod.
*
* Results:
* Registers a device driver. Devel builds register a chrdev whose
* major number comes from the list in private/driver-private.c;
* release builds register a misc device (major 10, minor 165).
*
*----------------------------------------------------------------------
*/
static int
LinuxDriverInit(void)
{
int retval;
DriverLog_Init("/dev/vmmon");
HostIF_InitGlobalLock();
if (!LinuxDriverCheckPadding()) {
return -ENOEXEC;
}
CPUID_Init();
if (!Task_Initialize()) {
return -ENOEXEC;
}
/*
* Initialize LinuxDriverPoll state
*/
init_waitqueue_head(&linuxState.pollQueue);
compat_timer_setup(&linuxState.pollTimer, LinuxDriverPollTimeout, 0);
linuxState.fastClockThread = NULL;
linuxState.fastClockFile = NULL;
linuxState.fastClockRate = 0;
linuxState.fastClockPriority = -20;
linuxState.swapSize = VMMON_UNKNOWN_SWAP_SIZE;
/*
* Initialize the file_operations structure. Because this code is always
* compiled as a module, it is fine to do this here rather than in a
* static initializer.
*/
memset(&vmuser_fops, 0, sizeof vmuser_fops);
vmuser_fops.owner = THIS_MODULE;
vmuser_fops.poll = LinuxDriverPoll;
vmuser_fops.unlocked_ioctl = LinuxDriver_Ioctl;
vmuser_fops.compat_ioctl = LinuxDriver_Ioctl;
vmuser_fops.open = LinuxDriver_Open;
vmuser_fops.release = LinuxDriver_Close;
vmuser_fops.mmap = LinuxDriverMmap;
#ifdef VMX86_DEVEL
devel_init_module();
linuxState.minor = 0;
retval = register_chrdev(linuxState.major, linuxState.deviceName,
&vmuser_fops);
#else
sprintf(linuxState.deviceName, "vmmon");
linuxState.major = 10;
linuxState.minor = 165;
linuxState.misc.minor = linuxState.minor;
linuxState.misc.name = linuxState.deviceName;
linuxState.misc.fops = &vmuser_fops;
retval = misc_register(&linuxState.misc);
#endif
if (retval) {
Warning("Module %s: error registering with major=%d minor=%d\n",
linuxState.deviceName, linuxState.major, linuxState.minor);
return -ENOENT;
}
Log("Module %s: registered with major=%d minor=%d\n",
linuxState.deviceName, linuxState.major, linuxState.minor);
HostIF_InitUptime();
compat_timer_setup(&tscTimer, LinuxDriverEstimateTSCkHzDeferred, 0);
LinuxDriverInitTSCkHz();
Vmx86_InitIDList();
Log("Module %s: initialized\n", linuxState.deviceName);
return 0;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverExit --
*
* Called by /sbin/rmmod
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverExit(void)
{
/*
* XXX smp race?
*/
#ifdef VMX86_DEVEL
unregister_chrdev(linuxState.major, linuxState.deviceName);
#else
misc_deregister(&linuxState.misc);
#endif
Log("Module %s: unloaded\n", linuxState.deviceName);
del_timer_sync(&linuxState.pollTimer);
del_timer_sync(&tscTimer);
Task_Terminate();
// Make sure fastClockThread is dead
HostIF_FastClockLock(1);
HostIF_SetFastClockRate(0);
HostIF_FastClockUnlock(1);
HostIF_CleanupUptime();
}
/*
*----------------------------------------------------------------------
*
* LinuxDriver_Open --
*
* Called on open of /dev/vmmon or /dev/vmx86.$USER.
*
* Side effects:
* Increments the use count used to determine eventual deallocation
* of the module.
*
*----------------------------------------------------------------------
*/
static int
LinuxDriver_Open(struct inode *inode, // IN
struct file *filp) // IN
{
VMLinux *vmLinux;
vmLinux = kmalloc(sizeof *vmLinux, GFP_KERNEL);
if (vmLinux == NULL) {
return -ENOMEM;
}
memset(vmLinux, 0, sizeof *vmLinux);
sema_init(&vmLinux->lock4Gb, 1);
init_waitqueue_head(&vmLinux->pollQueue);
filp->private_data = vmLinux;
LinuxDriverQueue(vmLinux);
Vmx86_Open();
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverAllocPages --
*
* Allocate physically contiguous block of memory with specified order.
* Pages in the allocated block are configured so that caller can pass
* independent pages to the VM.
*
* Results:
* Zero on success, non-zero (error code) on failure.
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static int
LinuxDriverAllocPages(unsigned int gfpFlag, // IN
unsigned int order, // IN
struct page **pg, // OUT
unsigned int size) // IN
{
struct page* page;
page = alloc_pages(gfpFlag, order);
if (page) {
unsigned int i;
/*
* Grab an extra reference on all pages except the first one - the
* first one was already refcounted by alloc_pages.
*
* Normally all pages in the block except the first have a refcount
* of zero. Since we pass these pages to the VM, we must bump their
* counts; otherwise the kernel would release them each time they
* were unmapped from the user's process, causing a crash.
*
* Note that this depends on Linux VM internals. It works on all
* kernels we care about.
*/
order = 1 << order;
for (i = 0; i < order; i++) {
if (i) {
/*
* Debug kernels assert that page->_count is not zero when
* calling get_page. We use init_page_count as a temporary
* workaround. PR 894174
*/
#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 16)
ASSERT(page_count(page) == 0);
init_page_count(page);
#else
get_page(page);
#endif
}
if (i >= size) {
put_page(page);
} else {
void *addr = kmap(page);
memset(addr, 0, PAGE_SIZE);
kunmap(page);
*pg++ = page;
}
page++;
}
return 0;
}
return -ENOMEM;
}
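/*
* Worked example of the loop above: with order == 2, alloc_pages
* returns a physically contiguous block of 1 << 2 == 4 pages. With
* size == 3, pages 0..2 are zeroed and stored into 'pg', while page 3
* (i >= size) is immediately handed back to the kernel via put_page.
*/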
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverDestructor4Gb --
*
* Deallocate all directly mappable memory.
*
* Results:
* None
*
* Side effects:
* None
*
*-----------------------------------------------------------------------------
*/
static void
LinuxDriverDestructor4Gb(VMLinux *vmLinux) // IN
{
unsigned int pg;
if (!vmLinux->size4Gb) {
return;
}
for (pg = 0; pg < vmLinux->size4Gb; pg++) {
put_page(vmLinux->pages4Gb[pg]);
}
vmLinux->size4Gb = 0;
}
/*
*----------------------------------------------------------------------
*
* LinuxDriver_Close --
*
* called on close of /dev/vmmon or /dev/vmx86.$USER, most often when the
* process exits. Decrement use count, allowing for possible uninstalling
* of the module.
*
*----------------------------------------------------------------------
*/
static int
LinuxDriver_Close(struct inode *inode, // IN
struct file *filp) // IN
{
VMLinux *vmLinux;
vmLinux = (VMLinux *)filp->private_data;
ASSERT(vmLinux);
LinuxDriverDequeue(vmLinux);
if (vmLinux->vm != NULL) {
Vmx86_ReleaseVM(vmLinux->vm);
vmLinux->vm = NULL;
}
Vmx86_Close();
/*
* Destroy all low memory allocations.
* We are closing the struct file here, so clearly no other process
* uses it anymore, and we do not need to hold the semaphore.
*/
LinuxDriverDestructor4Gb(vmLinux);
/*
* Clean up poll state.
*/
HostIF_PollListLock(0);
if (vmLinux->pollBack != NULL) {
if ((*vmLinux->pollBack = vmLinux->pollForw) != NULL) {
vmLinux->pollForw->pollBack = vmLinux->pollBack;
}
}
HostIF_PollListUnlock(0);
// XXX call wake_up()?
HostIF_UnmapUserMem(vmLinux->pollTimeoutHandle);
kfree(vmLinux);
filp->private_data = NULL;
return 0;
}
#define POLLQUEUE_MAX_TASK 1000
static DEFINE_SPINLOCK(pollQueueLock);
static void *pollQueue[POLLQUEUE_MAX_TASK];
static unsigned int pollQueueCount = 0;
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverQueuePoll --
*
* Record that the current process is waiting for the next timer event.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
LinuxDriverQueuePoll(void)
{
unsigned long flags;
spin_lock_irqsave(&pollQueueLock, flags);
/*
* Under normal circumstances every process should be listed
* only once in this array. If duplicate entries ever become a
* problem, walk the array before inserting; keeping it sorted
* by 'current' would then also make LinuxDriverIsPollQueued
* a bit faster...
*/
if (pollQueueCount < POLLQUEUE_MAX_TASK) {
pollQueue[pollQueueCount++] = current;
}
spin_unlock_irqrestore(&pollQueueLock, flags);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverIsPollQueued --
*
* Determine whether timer event occurred since we queued for it using
* LinuxDriverQueuePoll.
*
* Results:
* 0 Event already occurred.
* 1 Event did not occur yet.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER int
LinuxDriverIsPollQueued(void)
{
unsigned long flags;
unsigned int i;
int retval = 0;
spin_lock_irqsave(&pollQueueLock, flags);
for (i = 0; i < pollQueueCount; i++) {
if (current == pollQueue[i]) {
retval = 1;
break;
}
}
spin_unlock_irqrestore(&pollQueueLock, flags);
return retval;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverFlushPollQueue --
*
* Signal to queue that timer event occurred.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static INLINE_SINGLE_CALLER void
LinuxDriverFlushPollQueue(void)
{
unsigned long flags;
spin_lock_irqsave(&pollQueueLock, flags);
pollQueueCount = 0;
spin_unlock_irqrestore(&pollQueueLock, flags);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverWakeUp --
*
* Wake up processes waiting on timer event.
*
* Results:
* None.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
void
LinuxDriverWakeUp(Bool selective) // IN:
{
if (selective && linuxState.pollList != NULL) {
VmTimeType now;
VMLinux *p;
VMLinux *next;
HostIF_PollListLock(1);
now = ktime_get_ns() / NSEC_PER_USEC;
for (p = linuxState.pollList; p != NULL; p = next) {
next = p->pollForw;
if (p->pollTime <= now) {
if ((*p->pollBack = next) != NULL) {
next->pollBack = p->pollBack;
}
p->pollForw = NULL;
p->pollBack = NULL;
wake_up(&p->pollQueue);
}
}
HostIF_PollListUnlock(1);
}
LinuxDriverFlushPollQueue();
wake_up(&linuxState.pollQueue);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverPoll --
*
* This is used to wake up the VMX when a user call arrives, or
* to wake up select() or poll() at the next clock tick.
*
*----------------------------------------------------------------------
*/
static unsigned int
LinuxDriverPoll(struct file *filp, // IN:
poll_table *wait) // IN:
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
unsigned int mask = 0;
/*
* Set up or check the timeout for fast wakeup.
*
* Thanks to Petr for this simple and correct implementation:
*
* There are four cases in which wait == NULL:
*  - another file descriptor in the same poll() is ready
*  - the caller just slept and woke up
*  - nonblocking poll()
*  - the caller did not sleep due to memory allocation on 2.4.21-9.EL
* In the first three cases it is okay to return POLLIN.
* Unfortunately, for the fourth case we have to do some
* bookkeeping so that we do not return POLLIN when the timer
* has not expired yet.
*
* We may schedule a timer unnecessarily if an existing
* timer fires between poll_wait() and timer_pending().
*
* -- edward
*/
if (wait == NULL) {
if (vmLinux->pollBack == NULL && !LinuxDriverIsPollQueued()) {
mask = POLLIN;
}
} else {
if (linuxState.fastClockThread && vmLinux->pollTimeoutPtr != NULL) {
u64 now = ktime_get_ns() / NSEC_PER_USEC;
poll_wait(filp, &vmLinux->pollQueue, wait);
vmLinux->pollTime = *vmLinux->pollTimeoutPtr + now;
if (vmLinux->pollBack == NULL) {
HostIF_PollListLock(2);
if (vmLinux->pollBack == NULL) {
if ((vmLinux->pollForw = linuxState.pollList) != NULL) {
vmLinux->pollForw->pollBack = &vmLinux->pollForw;
}
linuxState.pollList = vmLinux;
vmLinux->pollBack = &linuxState.pollList;
}
HostIF_PollListUnlock(2);
}
} else {
LinuxDriverQueuePoll();
poll_wait(filp, &linuxState.pollQueue, wait);
if (!timer_pending(&linuxState.pollTimer)) {
mod_timer(&linuxState.pollTimer, jiffies + 1);
}
}
}
return mask;
}
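/*
* A minimal userspace sketch of blocking on this poll handler
* (illustrative only; error handling is omitted and the wakeup
* semantics are as implemented above):
*
*    #include <fcntl.h>
*    #include <poll.h>
*
*    int fd = open("/dev/vmmon", O_RDWR);
*    struct pollfd pfd = { .fd = fd, .events = POLLIN };
*    poll(&pfd, 1, -1);   // wakes on the next timer tick or wakeup
*/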
/*
*----------------------------------------------------------------------
*
* LinuxDriverPollTimeout --
*
* Wake up a process waiting in poll/select. This is called from
* the timer, and hence processed in the bottom half
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverPollTimeout(compat_timer_arg_t unused) // IN:
{
LinuxDriverWakeUp(FALSE);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverNoPage/LinuxDriverFault --
*
* Callback for returning allocated page for memory mapping
*
* Results:
* NoPage:
* Page or page address on success, NULL or 0 on failure.
* Fault:
* 0 on success (minor page fault); an error code such as
* VM_FAULT_SIGBUS on failure.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
static vm_fault_t
LinuxDriverFault(struct vm_fault *fault) //IN/OUT
#elif defined(VMW_NOPAGE_2624)
static vm_fault_t LinuxDriverFault(struct vm_area_struct *vma, //IN
struct vm_fault *fault) //IN/OUT
#else
static struct page *LinuxDriverNoPage(struct vm_area_struct *vma, //IN
unsigned long address, //IN
int *type) //OUT: Fault type
#endif
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
struct vm_area_struct *vma = fault->vma;
#endif
VMLinux *vmLinux = (VMLinux *) vma->vm_file->private_data;
unsigned long pg;
struct page* page;
#ifdef VMW_NOPAGE_2624
pg = fault->pgoff;
#else
pg = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
#endif
pg = VMMON_MAP_OFFSET(pg);
if (pg >= vmLinux->size4Gb) {
#ifdef VMW_NOPAGE_2624
return VM_FAULT_SIGBUS;
#else
return 0;
#endif
}
page = vmLinux->pages4Gb[pg];
get_page(page);
#ifdef VMW_NOPAGE_2624
fault->page = page;
return 0;
#else
*type = VM_FAULT_MINOR;
return page;
#endif
}
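/*
* Worked example of the index computation in the nopage path above,
* assuming 4 KiB pages (PAGE_SHIFT == 12): with vm_pgoff == 0, a
* fault at vma->vm_start + 0x2000 gives
*
*    pg = (0x2000 >> 12) + 0 = 2
*
* i.e. the third entry of pages4Gb, once VMMON_MAP_OFFSET has masked
* off the encoding bits.
*/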
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverAllocContig --
*
* Create mapping for contiguous memory areas.
*
* Results:
*
* 0 on success,
* -EINVAL on invalid arguments or
* -ENOMEM on out of memory
*
* Side effects:
* Pages for mapping are allocated.
*
*-----------------------------------------------------------------------------
*/
static int LinuxDriverAllocContig(VMLinux *vmLinux,
struct vm_area_struct *vma,
unsigned long off,
unsigned long size)
{
unsigned long vmaOrder = VMMON_MAP_ORDER(off);
unsigned long vmaAllocSize;
unsigned int gfpFlag;
unsigned long i;
if (VMMON_MAP_RSVD(off)) {
/* Reserved bits set... */
return -EINVAL;
}
if (VMMON_MAP_OFFSET(off)) {
/* We do not need non-zero offsets... */
return -EINVAL;
}
switch (VMMON_MAP_MT(off)) {
case VMMON_MAP_MT_LOW4GB:
#ifdef GFP_DMA32
gfpFlag = GFP_USER | GFP_DMA32;
#else
gfpFlag = GFP_USER | GFP_DMA;
#endif
break;
case VMMON_MAP_MT_LOW16MB:
gfpFlag = GFP_USER | GFP_DMA;
break;
case VMMON_MAP_MT_ANY:
gfpFlag = GFP_HIGHUSER;
break;
default:
/* Invalid memory type */
return -EINVAL;
}
if (size > VMMON_MAP_OFFSET_MASK + 1) {
/* Size is too big to fit in our window. */
return -ENOMEM;
}
/* 16 pages looks like a good limit... */
if (size > VMMON_MAX_LOWMEM_PAGES) {
return -ENOMEM;
}
/* Sorry. Only one mmap per open. */
down(&vmLinux->lock4Gb);
if (vmLinux->size4Gb) {
up(&vmLinux->lock4Gb);
return -EINVAL;
}
vmaAllocSize = 1 << vmaOrder;
for (i = 0; i < size; i += vmaAllocSize) {
int err;
err = LinuxDriverAllocPages(gfpFlag, vmaOrder,
vmLinux->pages4Gb + i, size - i);
if (err) {
while (i > 0) {
put_page(vmLinux->pages4Gb[--i]);
}
up(&vmLinux->lock4Gb);
return err;
}
}
vmLinux->size4Gb = size;
up(&vmLinux->lock4Gb);
vma->vm_ops = &vmuser_mops;
return 0;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverMmap --
*
* Create mapping for lowmem or locked memory.
*
* Results:
*
* 0 on success,
* -EINVAL on invalid arguments or
* -ENOMEM on out of memory
*
* Side effects:
* Pages for mapping are allocated.
*
*-----------------------------------------------------------------------------
*/
static int
LinuxDriverMmap(struct file *filp,
struct vm_area_struct *vma)
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
unsigned long size;
int err;
/* Only shared mappings */
if (!(vma->vm_flags & VM_SHARED)) {
return -EINVAL;
}
if ((vma->vm_end | vma->vm_start) & (PAGE_SIZE - 1)) {
return -EINVAL;
}
size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (size < 1) {
return -EINVAL;
}
if (vmLinux->vm) {
err = -EINVAL;
} else {
err = LinuxDriverAllocContig(vmLinux, vma, vma->vm_pgoff, size);
}
if (err) {
return err;
}
/* Clear VM_IO, otherwise SuSE's kernels refuse to do get_user_pages */
#if COMPAT_LINUX_VERSION_CHECK_LT(6, 3, 0)
vma->vm_flags &= ~VM_IO;
#else
vm_flags_clear(vma, VM_IO);
#endif
return 0;
}
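/*
* A minimal userspace sketch of mapping lowmem through this handler
* (illustrative only: the memory type must be encoded into the mmap
* offset in the layout the VMMON_MAP_* macros expect, which lives in
* the shared vmmon headers, so 'encodedOffset' here is left abstract):
*
*    size_t len = 4 * 4096;          // at most VMMON_MAX_LOWMEM_PAGES
*    off_t encodedOffset = ...;      // VMMON_MAP_MT_LOW4GB, offset 0
*    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
*                   MAP_SHARED, fd, encodedOffset);
*/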
typedef Bool (*SyncFunc)(void *data, unsigned cpu);
typedef struct {
Atomic_uint32 numCPUs;
Atomic_uint32 ready;
Atomic_uint32 failures;
Atomic_uint32 done;
SyncFunc func;
void *data;
} SyncFuncArgs;
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncCallHook --
*
* Called on each CPU, waits for them all to show up, and executes
* the callback.
*
* Results:
*
* Side effects:
* Whatever side effects the callback has.
*
*-----------------------------------------------------------------------------
*/
static void
LinuxDriverSyncCallHook(void *data) // IN:
{
Bool success;
uint32 numCPUs;
volatile unsigned iterations = 1000 * 1000;
SyncFuncArgs *args = (SyncFuncArgs *)data;
unsigned cpu = smp_processor_id();
/*
* We need to be careful about reading cpu_online_map on kernels that
* have hot add/remove cpu support. The kernel's smp_call_function
* blocks hot add from occurring between the time it computes the set
* of cpus it will IPI and when all those cpus have entered their IPI
* handlers. Additionally, we disable preemption on the initiating
* cpu during the entire sync call sequence. So, since a cpu hot add
* is initiated from process context, a cpu cannot be hot added until
* at least one cpu has exited this code, and therefore it is safe
* for the first cpu to reach this point to read cpu_online_map.
*
* Hot remove works by stopping the entire machine, which is done by
* waiting for a set of kernel threads to be scheduled on all cpus.
* This cannot happen until all cpus are preemptible. Since the
* initiating cpu has preemption disabled during this entire
* sequence, this code is also safe from cpu hot remove.
*
* So, the first cpu to reach this code will read the same value of
* cpu_online_map that was used by smp_call_function, and therefore
* we can safely assume that numCPUs cpus will execute this routine.
*/
Atomic_CMPXCHG32(&args->numCPUs, 0, num_online_cpus());
numCPUs = Atomic_Read(&args->numCPUs);
Atomic_Inc(&args->ready);
/*
* Wait for all CPUs, but not forever since we could deadlock. The
* potential deadlock scenario is this: cpu0 has IF=1 and holds a
* lock. cpu1 has IF=0 and is spinning waiting for the lock.
*/
while (Atomic_Read(&args->ready) != numCPUs && --iterations) ;
/* Now simultaneously call the routine. */
success = args->func(args->data, cpu);
if (!iterations || !success) {
/* Indicate that we either timed out or the callback failed. */
Atomic_Inc(&args->failures);
}
/* Indicate that we are finished. */
Atomic_Inc(&args->done);
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncCallOnEachCPU --
*
* Calls func on each cpu at (nearly) the same time.
*
* Results:
* TRUE if func was called at the same time on all cpus. Note that
* func is called regardless of whether all cpus showed up in time.
*
* Side effects:
* func's side effects, on all cpus.
*
*-----------------------------------------------------------------------------
*/
static Bool
LinuxDriverSyncCallOnEachCPU(SyncFunc func, // IN:
void *data) // IN:
{
SyncFuncArgs args;
uintptr_t flags;
ASSERT(HostIF_GlobalLockIsHeld());
args.func = func;
args.data = data;
Atomic_Write(&args.numCPUs, 0); // Must be calculated inside the callback.
Atomic_Write(&args.ready, 0);
Atomic_Write(&args.failures, 0);
Atomic_Write(&args.done, 0);
preempt_disable();
/*
* Call all other CPUs, but do not wait so we can enter the callback
* on this CPU too.
*/
compat_smp_call_function(LinuxDriverSyncCallHook, &args, 0);
/*
* smp_call_function doesn't return until all cpus have been
* interrupted. It's safe to disable interrupts now that all other
* cpus are in their IPI handlers.
*/
SAVE_FLAGS(flags);
CLEAR_INTERRUPTS();
LinuxDriverSyncCallHook(&args);
RESTORE_FLAGS(flags);
preempt_enable();
/*
* Wait for everyone else to finish so we can get an accurate
* failures count.
*/
while (Atomic_Read(&args.done) != Atomic_Read(&args.numCPUs)) ;
/*
* This routine failed if any CPU bailed out early to avoid deadlock,
* or the callback routine failed on any CPU. Both conditions are
* recorded in the failures field.
*/
return Atomic_Read(&args.failures) == 0;
}
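/*
* The hook above is an instance of a common lock-free rendezvous
* pattern. A self-contained userspace analogue using C11 atomics (a
* sketch, not driver code):
*
*    #include <stdatomic.h>
*
*    static atomic_uint ready;
*
*    void Hook(unsigned numThreads)
*    {
*       unsigned spins = 1000 * 1000;
*       atomic_fetch_add(&ready, 1);
*       // Spin until everyone arrives, but give up eventually so one
*       // stuck participant cannot deadlock the whole group.
*       while (atomic_load(&ready) != numThreads && --spins) {
*       }
*       // All arrived (or we timed out): do the simultaneous work.
*    }
*/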
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverReadTSC --
*
* Callback that is executed simultaneously on all cpus to read the TSCs.
*
* Results:
* TRUE.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
static Bool
LinuxDriverReadTSC(void *data, // OUT: TSC values
unsigned cpu) // IN: the pcpu number
{
TSCDelta *tscDelta = (TSCDelta *)data;
uint64 tsc, old;
if (LIKELY(CPUID_SSE2Supported())) {
RDTSC_BARRIER();
}
tsc = RDTSC();
/* Any looping means another CPU changed min/max. */
do {
old = Atomic_Read64(&tscDelta->min);
} while (old > tsc && !Atomic_CMPXCHG64(&tscDelta->min, &old, &tsc));
do {
old = Atomic_Read64(&tscDelta->max);
} while (old < tsc && !Atomic_CMPXCHG64(&tscDelta->max, &old, &tsc));
return TRUE;
}
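/*
* The min/max updates above use the classic compare-and-swap retry
* loop. The same technique in self-contained C11 form (a sketch, not
* driver code):
*
*    #include <stdatomic.h>
*    #include <stdint.h>
*
*    void AtomicMin64(_Atomic uint64_t *min, uint64_t val)
*    {
*       uint64_t old = atomic_load(min);
*       // A failed exchange reloads 'old'; loop only while another
*       // thread changes *min underneath us.
*       while (old > val &&
*              !atomic_compare_exchange_weak(min, &old, val)) {
*       }
*    }
*/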
/*
*-----------------------------------------------------------------------------
*
* LinuxDriverSyncReadTSCs --
*
* Simultaneously read the TSCs on all cpus.
*
* Results:
* The set of all TSCs.
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
__always_inline static Bool
LinuxDriverSyncReadTSCs(uint64 *delta) // OUT: TSC max - TSC min
{
TSCDelta tscDelta;
unsigned i;
Bool okay = FALSE;
/* Take the global lock to block concurrent calls. */
HostIF_GlobalLock(14);
/* Loop to warm up the cache. */
for (i = 0; i < 3; i++) {
Atomic_Write64(&tscDelta.min, ~CONST64U(0));
Atomic_Write64(&tscDelta.max, CONST64U(0));
if (LinuxDriverSyncCallOnEachCPU(LinuxDriverReadTSC, &tscDelta)) {
/* We return the last successful simultaneous read of the TSCs. */
*delta = Atomic_Read64(&tscDelta.max) - Atomic_Read64(&tscDelta.min);
okay = TRUE;
}
}
HostIF_GlobalUnlock(14);
return okay;
}
/*
*-----------------------------------------------------------------------------
*
* LinuxDriver_Ioctl --
*
* Main path for UserRPC
*
* Be VERY careful with stack usage; gcc's stack allocation is iffy
* and allocations from individual "case" statements do not overlap,
* so it is easy to use kilobytes of stack space here.
*
* Results:
*
* Side effects:
* None.
*
*-----------------------------------------------------------------------------
*/
long
LinuxDriver_Ioctl(struct file *filp, // IN:
u_int iocmd, // IN:
unsigned long ioarg) // IN:
{
VMLinux *vmLinux = (VMLinux *) filp->private_data;
int retval = 0;
Vcpuid vcpuid;
VMDriver *vm;
if (vmLinux == NULL) {
return -EINVAL;
}
vm = vmLinux->vm;
/*
* Validate the VM pointer for those IOCTLs that require it.
*/
switch (iocmd) {
case IOCTL_VMX86_VERSION:
case IOCTL_VMX86_CREATE_VM:
case IOCTL_VMX86_INIT_CROSSGDT:
case IOCTL_VMX86_SET_UID:
case IOCTL_VMX86_GET_NUM_VMS:
case IOCTL_VMX86_GET_TOTAL_MEM_USAGE:
case IOCTL_VMX86_SET_HARD_LIMIT:
case IOCTL_VMX86_PAE_ENABLED:
case IOCTL_VMX86_VMX_ENABLED:
case IOCTL_VMX86_GET_IPI_VECTORS:
case IOCTL_VMX86_GET_KHZ_ESTIMATE:
case IOCTL_VMX86_GET_ALL_CPUID:
case IOCTL_VMX86_GET_ALL_MSRS:
case IOCTL_VMX86_SET_POLL_TIMEOUT_PTR:
case IOCTL_VMX86_GET_KERNEL_CLOCK_RATE:
case IOCTL_VMX86_GET_REFERENCE_CLOCK_HZ:
case IOCTL_VMX86_INIT_PSEUDO_TSC:
case IOCTL_VMX86_CHECK_PSEUDO_TSC:
case IOCTL_VMX86_GET_PSEUDO_TSC:
case IOCTL_VMX86_SET_HOST_CLOCK_PRIORITY:
case IOCTL_VMX86_SYNC_GET_TSCS:
case IOCTL_VMX86_GET_UNAVAIL_PERF_CTRS:
break;
default:
if (vm == NULL) {
retval = -EINVAL;
goto exit;
}
}
/*
* Perform the IOCTL operation.
*/
switch (iocmd) {
case IOCTL_VMX86_VERSION:
retval = VMMON_VERSION;
break;
case IOCTL_VMX86_CREATE_VM:
if (vm != NULL) {
retval = -EINVAL;
break;
}
vm = Vmx86_CreateVM();
if (vm == NULL) {
retval = -ENOMEM;
} else {
vmLinux->vm = vm;
retval = vm->userID;
}
break;
case IOCTL_VMX86_RELEASE_VM:
vmLinux->vm = NULL;
Vmx86_ReleaseVM(vm);
break;
case IOCTL_VMX86_ALLOC_CROSSGDT: {
InitBlock initBlock;
if (Task_AllocCrossGDT(&initBlock)) {
retval = HostIF_CopyToUser((char *)ioarg, &initBlock,
sizeof initBlock);
} else {
retval = -EINVAL;
}
break;
}
case IOCTL_VMX86_INIT_VM: {
InitBlock initParams;
retval = HostIF_CopyFromUser(&initParams, (char *)ioarg,
sizeof initParams);
if (retval != 0) {
break;
}
if (Vmx86_InitVM(vm, &initParams)) {
retval = -EINVAL;
break;
}
retval = HostIF_CopyToUser((char *)ioarg, &initParams,
sizeof initParams);
break;
}
case IOCTL_VMX86_INIT_CROSSGDT: {
InitCrossGDT initCrossGDT;
retval = HostIF_CopyFromUser(&initCrossGDT, (char *)ioarg,
sizeof initCrossGDT);
if ((retval == 0) && Task_InitCrossGDT(&initCrossGDT)) {
retval = -EIO;
}
break;
}
case IOCTL_VMX86_RUN_VM:
vcpuid = ioarg;
if (vcpuid >= vm->numVCPUs) {
retval = -EINVAL;
break;
}
retval = Vmx86_RunVM(vm, vcpuid);
break;
case IOCTL_VMX86_SET_UID:
#ifdef VMX86_DEVEL
devel_suid();
#else
retval = -EPERM;
#endif
break;
case IOCTL_VMX86_LOCK_PAGE: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LockPage(vm, args.uAddr, FALSE, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_LOCK_PAGE_NEW: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LockPage(vm, args.uAddr, TRUE, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_UNLOCK_PAGE: {
VA64 uAddr;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
retval = Vmx86_UnlockPage(vm, uAddr);
break;
}
case IOCTL_VMX86_UNLOCK_PAGE_BY_MPN: {
VMMUnlockPageByMPN args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
retval = Vmx86_UnlockPageByMPN(vm, args.mpn, args.uAddr);
break;
}
case IOCTL_VMX86_LOOK_UP_MPN: {
VMLockPage args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval) {
break;
}
args.ret.status = Vmx86_LookupUserMPN(vm, args.uAddr, &args.ret.mpn);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_GET_NUM_VMS:
retval = Vmx86_GetNumVMs();
break;
case IOCTL_VMX86_GET_TOTAL_MEM_USAGE:
retval = Vmx86_GetTotalMemUsage();
break;
case IOCTL_VMX86_SET_HARD_LIMIT: {
int32 limit;
retval = HostIF_CopyFromUser(&limit, (void *)ioarg, sizeof limit);
if (retval != 0) {
break;
}
if (!Vmx86_SetConfiguredLockedPagesLimit(limit)) {
retval = -EINVAL;
}
break;
}
case IOCTL_VMX86_ADMIT: {
VMMemInfoArgs args;
retval = HostIF_CopyFromUser(&args, (void *)ioarg, sizeof args);
if (retval != 0) {
break;
}
Vmx86_Admit(vm, &args);
retval = HostIF_CopyToUser((void *)ioarg, &args, sizeof args);
break;
}
case IOCTL_VMX86_READMIT: {
OvhdMem_Deltas delta;
retval = HostIF_CopyFromUser(&delta, (void *)ioarg, sizeof delta);
if (retval != 0) {
break;
}
if (!Vmx86_Readmit(vm, &delta)) {
retval = -1;
}
break;
}
case IOCTL_VMX86_UPDATE_MEM_INFO: {
VMMemMgmtInfoPatch patch;
retval = HostIF_CopyFromUser(&patch, (void *)ioarg, sizeof patch);
if (retval == 0) {
Vmx86_UpdateMemInfo(vm, &patch);
}
break;
}
case IOCTL_VMX86_GET_MEM_INFO: {
VA64 uAddr;
VMMemInfoArgs *userVA;
VMMemInfoArgs in;
VMMemInfoArgs *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
if (in.numVMs < 1 || in.numVMs > MAX_VMS) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(VM_GET_MEM_INFO_SIZE(in.numVMs), TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!Vmx86_GetMemInfo(vm, FALSE, out, VM_GET_MEM_INFO_SIZE(in.numVMs))) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser(userVA, out,
VM_GET_MEM_INFO_SIZE(out->numVMs));
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_PAE_ENABLED:
retval = Vmx86_PAEEnabled();
break;
case IOCTL_VMX86_VMX_ENABLED:
retval = Vmx86_VMXEnabled();
break;
case IOCTL_VMX86_APIC_INIT: {
VMAPICInfo info;
Bool setVMPtr;
Bool probe;
retval = HostIF_CopyFromUser(&info, (VMAPICInfo *)ioarg, sizeof info);
if (retval != 0) {
break;
}
setVMPtr = ((info.flags & APIC_FLAG_DISABLE_NMI) != 0);
probe = ((info.flags & APIC_FLAG_PROBE) != 0);
/*
* Kernel uses NMIs for deadlock detection - set APIC VMptr so that
* NMIs get disabled in the monitor.
*/
setVMPtr = TRUE;
retval = HostIF_APICInit(vm, setVMPtr, probe) ? 0 : -ENODEV;
break;
}
case IOCTL_VMX86_SET_HOST_CLOCK_RATE:
retval = -Vmx86_SetHostClockRate(vm, (unsigned)ioarg);
break;
case IOCTL_VMX86_SEND_IPI: {
VCPUSet ipiTargets;
retval = HostIF_CopyFromUser(&ipiTargets, (VCPUSet *) ioarg,
sizeof ipiTargets);
if (retval == 0) {
HostIF_IPI(vm, &ipiTargets);
}
break;
}
case IOCTL_VMX86_GET_IPI_VECTORS: {
IPIVectors ipiVectors;
ipiVectors.hostIPIVectors[0] = CALL_FUNCTION_VECTOR;
#ifdef CALL_FUNCTION_SINGLE_VECTOR
ipiVectors.hostIPIVectors[1] = CALL_FUNCTION_SINGLE_VECTOR;
#else
ipiVectors.hostIPIVectors[1] = 0;
#endif
ipiVectors.monitorIPIVector = monitorIPIVector;
ipiVectors.hvIPIVector = hvIPIVector;
retval = HostIF_CopyToUser((void *)ioarg, &ipiVectors,
sizeof ipiVectors);
break;
}
case IOCTL_VMX86_GET_KHZ_ESTIMATE:
retval = LinuxDriverEstimateTSCkHz();
break;
case IOCTL_VMX86_GET_ALL_CPUID: {
VA64 uAddr;
CPUIDQuery *userVA;
CPUIDQuery in;
CPUIDQuery *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
/*
* Some kernels panic on a kmalloc request larger than 128KB.
* XXX This test should really go inside HostIF_AllocKernelMem().
*/
if (in.numLogicalCPUs >
(131072 - sizeof *out) / sizeof out->logicalCPUs[0]) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(
sizeof *out + in.numLogicalCPUs * sizeof out->logicalCPUs[0],
TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!HostIF_GetAllCpuInfo(out)) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser((int8 *)userVA + sizeof *userVA,
&out->logicalCPUs[0],
out->numLogicalCPUs * sizeof out->logicalCPUs[0]);
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_GET_ALL_MSRS: {
VA64 uAddr;
MSRQuery *userVA;
MSRQuery in;
MSRQuery *out;
retval = HostIF_CopyFromUser(&uAddr, (void *)ioarg, sizeof uAddr);
if (retval) {
break;
}
userVA = VA64ToPtr(uAddr);
retval = HostIF_CopyFromUser(&in, userVA, sizeof in);
if (retval) {
break;
}
/*
* Some kernels panic on a kmalloc request larger than 128KB.
* XXX This test should really go inside HostIF_AllocKernelMem().
*/
if (in.numLogicalCPUs >
(131072 - sizeof *out) / sizeof out->logicalCPUs[0]) {
retval = -EINVAL;
break;
}
out = HostIF_AllocKernelMem(
sizeof *out + in.numLogicalCPUs * sizeof out->logicalCPUs[0],
TRUE);
if (!out) {
retval = -ENOMEM;
break;
}
*out = in;
if (!Vmx86_GetAllMSRs(out)) {
HostIF_FreeKernelMem(out);
retval = -ENOBUFS;
break;
}
retval = HostIF_CopyToUser((int8 *)userVA + sizeof *userVA,
&out->logicalCPUs[0],
out->numLogicalCPUs * sizeof out->logicalCPUs[0]);
HostIF_FreeKernelMem(out);
break;
}
case IOCTL_VMX86_ALLOC_LOCKED_PAGES:
case IOCTL_VMX86_FREE_LOCKED_PAGES: {
VMMPNList req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
if (iocmd == IOCTL_VMX86_ALLOC_LOCKED_PAGES) {
retval = Vmx86_AllocLockedPages(vm, req.mpnList,
req.mpnCount, FALSE,
req.ignoreLimits);
} else {
retval = Vmx86_FreeLockedPages(vm, req.mpnList,
req.mpnCount, FALSE);
}
break;
}
case IOCTL_VMX86_GET_NEXT_ANON_PAGE: {
VMMPNNext req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
req.outMPN = INVALID_MPN;
} else {
req.outMPN = Vmx86_GetNextAnonPage(vm, req.inMPN);
}
retval = HostIF_CopyToUser((void *)ioarg, &req, sizeof req);
break;
}
case IOCTL_VMX86_GET_LOCKED_PAGES_LIST: {
VMMPNList req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = Vmx86_GetLockedPageList(vm, req.mpnList, req.mpnCount);
break;
}
case IOCTL_VMX86_READ_PAGE: {
VMMReadWritePage req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = HostIF_ReadPage(vm, req.mpn, req.uAddr, FALSE);
break;
}
case IOCTL_VMX86_WRITE_PAGE: {
VMMReadWritePage req;
retval = HostIF_CopyFromUser(&req, (void *)ioarg, sizeof req);
if (retval) {
break;
}
retval = HostIF_WritePage(vm, req.mpn, req.uAddr, FALSE);
break;
}
case IOCTL_VMX86_SET_POLL_TIMEOUT_PTR: {
vmLinux->pollTimeoutPtr = NULL;
HostIF_UnmapUserMem(vmLinux->pollTimeoutHandle);
if (ioarg != 0) {
vmLinux->pollTimeoutPtr = HostIF_MapUserMem((VA)ioarg,
sizeof *vmLinux->pollTimeoutPtr,
&vmLinux->pollTimeoutHandle);
if (vmLinux->pollTimeoutPtr == NULL) {
retval = -EINVAL;
break;
}
}
break;
}
case IOCTL_VMX86_GET_KERNEL_CLOCK_RATE:
retval = HZ;
break;
case IOCTL_VMX86_FAST_SUSP_RES_SET_OTHER_FLAG:
retval = Vmx86_FastSuspResSetOtherFlag(vm, ioarg);
break;
case IOCTL_VMX86_FAST_SUSP_RES_GET_MY_FLAG:
retval = Vmx86_FastSuspResGetMyFlag(vm, ioarg);
break;
case IOCTL_VMX86_GET_REFERENCE_CLOCK_HZ: {
uint64 refClockHz = HostIF_UptimeFrequency();
retval = HostIF_CopyToUser((void *)ioarg, &refClockHz,
sizeof refClockHz);
break;
}
case IOCTL_VMX86_INIT_PSEUDO_TSC: {
PTSCInitParams params;
retval = HostIF_CopyFromUser(&params, (void *)ioarg, sizeof params);
if (retval != 0) {
break;
}
Vmx86_InitPseudoTSC(&params);
retval = HostIF_CopyToUser((void *)ioarg, &params, sizeof params);
break;
}
case IOCTL_VMX86_CHECK_PSEUDO_TSC: {
PTSCCheckParams params;
retval = HostIF_CopyFromUser(&params, (void *)ioarg, sizeof params);
if (retval != 0) {
break;
}
params.usingRefClock = Vmx86_CheckPseudoTSC(&params.lastTSC,
&params.lastRC);
retval = HostIF_CopyToUser((void *)ioarg, &params, sizeof params);
break;
}
case IOCTL_VMX86_GET_PSEUDO_TSC: {
uint64 ptsc = Vmx86_GetPseudoTSC();
retval = HostIF_CopyToUser((void *)ioarg, &ptsc, sizeof ptsc);
break;
}
case IOCTL_VMX86_SET_HOST_CLOCK_PRIORITY:
/*
* This affects the global fast clock priority, and it only
* takes effect when the fast clock rate transitions from zero
* to a non-zero value.
*
* This is used to allow VMs to optionally work around
* bug 218750 by disabling our default priority boost. If any
* VM chooses to apply this workaround, the effect is permanent
* until vmmon is reloaded!
*/
HostIF_FastClockLock(3);
linuxState.fastClockPriority = MAX(-20, MIN(19, (int)ioarg));
HostIF_FastClockUnlock(3);
retval = 0;
break;
case IOCTL_VMX86_SYNC_GET_TSCS: {
uint64 delta;
if (LinuxDriverSyncReadTSCs(&delta)) {
retval = HostIF_CopyToUser((void *)ioarg, &delta, sizeof delta);
} else {
retval = -EBUSY;
}
break;
}
case IOCTL_VMX86_SET_HOST_SWAP_SIZE: {
uint64 swapSize;
retval = HostIF_CopyFromUser(&swapSize, (void *)ioarg, sizeof swapSize);
if (retval != 0) {
Warning("Could not copy swap size from user, status %d\n", retval);
break;
}
linuxState.swapSize = swapSize;
break;
}
case IOCTL_VMX86_GET_UNAVAIL_PERF_CTRS: {
uint64 ctrs = Vmx86_GetUnavailablePerfCtrs();
retval = HostIF_CopyToUser((void *)ioarg, &ctrs, sizeof ctrs);
break;
}
default:
Warning("Unknown ioctl %d\n", iocmd);
retval = -EINVAL;
}
exit:
return retval;
}
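/*
* A minimal userspace sketch of driving this dispatcher (illustrative
* only; IOCTL_VMX86_VERSION needs no VM and its encoding comes from
* the shared vmmon headers):
*
*    #include <fcntl.h>
*    #include <sys/ioctl.h>
*
*    int fd = open("/dev/vmmon", O_RDWR);
*    int version = ioctl(fd, IOCTL_VMX86_VERSION, 0);
*/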
/*
*----------------------------------------------------------------------
*
* LinuxDriverQueue --
*
* Add the vmLinux to the global queue.
*
* Results:
* None.
*
* Side effects:
* None.
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverQueue(VMLinux *vmLinux) // IN/OUT:
{
/*
* insert in global vm queue
*/
HostIF_GlobalLock(12);
vmLinux->next = linuxState.head;
linuxState.head = vmLinux;
HostIF_GlobalUnlock(12);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverDequeue --
*
* Remove the vmLinux from the active list.
*
* Results:
* None.
*
* Side effects:
* Asserts if it is not in the list (error condition).
*
*----------------------------------------------------------------------
*/
static void
LinuxDriverDequeue(VMLinux *vmLinux) // IN/OUT:
{
VMLinux **p;
HostIF_GlobalLock(13);
for (p = &linuxState.head; *p != vmLinux; p = &(*p)->next) {
ASSERT(*p != NULL);
}
*p = vmLinux->next;
vmLinux->next = NULL;
HostIF_GlobalUnlock(13);
}
/*
*----------------------------------------------------------------------
*
* LinuxDriverCheckPadding --
*
* Check for expected structure padding;
* this check currently fails on the egcs compiler.
*
* Results:
* TRUE if the check succeeds -- the module will be loaded.
*
* Side effects:
* output to kernel log on error
*
*----------------------------------------------------------------------
*/
static Bool
LinuxDriverCheckPadding(void)
{
DTRWords32 dtr;
uint16 *x;
memset(&dtr, 0, sizeof dtr);
dtr.dtr.limit = 0x1111;
dtr.dtr.offset = 0x22223333;
x = (uint16 *) &dtr;
if (x[0] != 0x1111 || x[1] != 0x3333 || x[2] != 0x2222) {
Warning("DTR padding\n");
goto error;
}
return TRUE;
error:
printk("/dev/vmmon: Cannot load module. Use standard gcc compiler\n");
return FALSE;
}
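/*
* Worked example of the layout being verified above: an x86
* descriptor-table register image is a 16-bit limit immediately
* followed by the offset, with no padding. Viewed as little-endian
* uint16 words, limit 0x1111 and offset 0x22223333 must appear as:
*
*    x[0] = 0x1111   // limit
*    x[1] = 0x3333   // offset, low word
*    x[2] = 0x2222   // offset, high word
*
* A compiler that inserts padding between the fields breaks this
* layout, which is why the module refuses to load.
*/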
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Machine Monitor.");
MODULE_LICENSE("GPL v2");
/*
* Starting with SLE10sp2, Novell requires that IHVs sign a support agreement
* with them and mark their kernel modules as externally supported via a
* change to the module header. If this isn't done, the module will not load
* by default (i.e., neither mkinitrd nor modprobe will accept it).
*/
MODULE_INFO(supported, "external");
module_init(LinuxDriverInit);
module_exit(LinuxDriverExit);