comparison src/os/windows/vm/os_windows.cpp @ 3885:3cd0157e1d4d
7082969: NUMA interleaving
Summary: Support interleaving on NUMA systems for collectors that don't have NUMA-awareness.
Reviewed-by: iveresov, ysr
Contributed-by: Tom Deneau <tom.deneau@amd.com>
author:   iveresov
date:     Thu, 25 Aug 2011 02:57:46 -0700
parents:  279ef1916773
children: a6128a8ed624

--- src/os/windows/vm/os_windows.cpp	3871:2f27ed2a98fa
+++ src/os/windows/vm/os_windows.cpp	3885:3cd0157e1d4d
@@ -2612,10 +2612,61 @@
 #endif
 
 static HANDLE    _hProcess;
 static HANDLE    _hToken;
 
+// Container for NUMA node list info
+class NUMANodeListHolder {
+private:
+  int *_numa_used_node_list;  // allocated below
+  int _numa_used_node_count;
+
+  void free_node_list() {
+    if (_numa_used_node_list != NULL) {
+      FREE_C_HEAP_ARRAY(int, _numa_used_node_list);
+    }
+  }
+
+public:
+  NUMANodeListHolder() {
+    _numa_used_node_count = 0;
+    _numa_used_node_list = NULL;
+    // do rest of initialization in build routine (after function pointers are set up)
+  }
+
+  ~NUMANodeListHolder() {
+    free_node_list();
+  }
+
+  bool build() {
+    DWORD_PTR proc_aff_mask;
+    DWORD_PTR sys_aff_mask;
+    if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask)) return false;
+    ULONG highest_node_number;
+    if (!os::Kernel32Dll::GetNumaHighestNodeNumber(&highest_node_number)) return false;
+    free_node_list();
+    _numa_used_node_list = NEW_C_HEAP_ARRAY(int, highest_node_number);
+    for (unsigned int i = 0; i <= highest_node_number; i++) {
+      ULONGLONG proc_mask_numa_node;
+      if (!os::Kernel32Dll::GetNumaNodeProcessorMask(i, &proc_mask_numa_node)) return false;
+      if ((proc_aff_mask & proc_mask_numa_node)!=0) {
+        _numa_used_node_list[_numa_used_node_count++] = i;
+      }
+    }
+    return (_numa_used_node_count > 1);
+  }
+
+  int get_count() {return _numa_used_node_count;}
+  int get_node_list_entry(int n) {
+    // for indexes out of range, returns -1
+    return (n < _numa_used_node_count ? _numa_used_node_list[n] : -1);
+  }
+
+} numa_node_list_holder;
+
+
+
 static size_t _large_page_size = 0;
 
 static bool resolve_functions_for_large_page_init() {
   return os::Kernel32Dll::GetLargePageMinimumAvailable() &&
     os::Advapi32Dll::AdvapiAvailable();
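For context on the hunk above: build() intersects each node's processor mask with the process affinity mask, so only NUMA nodes this process can actually run on make it into the list, and interleaving stays off (build() returns false) unless more than one node survives. The same probe can be written directly against the raw Win32 calls that the patch reaches through the os::Kernel32Dll wrappers; a minimal standalone sketch, not part of the changeset, assuming a Windows release whose kernel32.dll exports the NUMA entry points:

#include <windows.h>
#include <stdio.h>

int main() {
  DWORD_PTR proc_aff_mask, sys_aff_mask;
  if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask))
    return 1;
  ULONG highest_node_number;
  if (!GetNumaHighestNodeNumber(&highest_node_number))
    return 1;
  for (ULONG i = 0; i <= highest_node_number; i++) {
    ULONGLONG node_mask;
    // A node is usable only if its processors intersect our affinity mask.
    if (GetNumaNodeProcessorMask((UCHAR)i, &node_mask) &&
        (proc_aff_mask & node_mask) != 0) {
      printf("usable NUMA node: %lu\n", i);
    }
  }
  return 0;
}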
@@ -2650,10 +2701,158 @@
   if (_hProcess) CloseHandle(_hProcess);
   _hProcess = NULL;
   if (_hToken) CloseHandle(_hToken);
   _hToken = NULL;
 }
+
+static bool numa_interleaving_init() {
+  bool success = false;
+  bool use_numa_specified = !FLAG_IS_DEFAULT(UseNUMA);
+  bool use_numa_interleaving_specified = !FLAG_IS_DEFAULT(UseNUMAInterleaving);
+
+  // print a warning if UseNUMA or UseNUMAInterleaving flag is specified on command line
+  bool warn_on_failure = use_numa_specified || use_numa_interleaving_specified;
+# define WARN(msg) if (warn_on_failure) { warning(msg); }
+
+  // NUMAInterleaveGranularity cannot be less than vm_allocation_granularity (or _large_page_size if using large pages)
+  size_t min_interleave_granularity = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+  NUMAInterleaveGranularity = align_size_up(NUMAInterleaveGranularity, min_interleave_granularity);
+
+  if (os::Kernel32Dll::NumaCallsAvailable()) {
+    if (numa_node_list_holder.build()) {
+      if (PrintMiscellaneous && Verbose) {
+        tty->print("NUMA UsedNodeCount=%d, namely ", os::numa_get_groups_num());
+        for (int i = 0; i < numa_node_list_holder.get_count(); i++) {
+          tty->print("%d ", numa_node_list_holder.get_node_list_entry(i));
+        }
+        tty->print("\n");
+      }
+      success = true;
+    } else {
+      WARN("Process does not cover multiple NUMA nodes.");
+    }
+  } else {
+    WARN("NUMA Interleaving is not supported by the operating system.");
+  }
+  if (!success) {
+    if (use_numa_specified) WARN("...Ignoring UseNUMA flag.");
+    if (use_numa_interleaving_specified) WARN("...Ignoring UseNUMAInterleaving flag.");
+  }
+  return success;
+#undef WARN
+}
+
+// this routine is used whenever we need to reserve a contiguous VA range
+// but we need to make separate VirtualAlloc calls for each piece of the range
+// Reasons for doing this:
+//  * UseLargePagesIndividualAllocation was set (normally only needed on WS2003 but possible to be set otherwise)
+//  * UseNUMAInterleaving requires a separate node for each piece
+static char* allocate_pages_individually(size_t bytes, char* addr, DWORD flags, DWORD prot,
+                                         bool should_inject_error=false) {
+  char * p_buf;
+  // note: at setup time we guaranteed that NUMAInterleaveGranularity was aligned up to a page size
+  size_t page_size = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+  size_t chunk_size = UseNUMAInterleaving ? NUMAInterleaveGranularity : page_size;
+
+  // first reserve enough address space in advance since we want to be
+  // able to break a single contiguous virtual address range into multiple
+  // large page commits but WS2003 does not allow reserving large page space
+  // so we just use 4K pages for reserve, this gives us a legal contiguous
+  // address space. then we will deallocate that reservation, and re alloc
+  // using large pages
+  const size_t size_of_reserve = bytes + chunk_size;
+  if (bytes > size_of_reserve) {
+    // Overflowed.
+    return NULL;
+  }
+  p_buf = (char *) VirtualAlloc(addr,
+                                size_of_reserve,  // size of Reserve
+                                MEM_RESERVE,
+                                PAGE_READWRITE);
+  // If reservation failed, return NULL
+  if (p_buf == NULL) return NULL;
+
+  os::release_memory(p_buf, bytes + chunk_size);
+
+  // we still need to round up to a page boundary (in case we are using large pages)
+  // but not to a chunk boundary (in case InterleavingGranularity doesn't align with page size)
+  // instead we handle this in the bytes_to_rq computation below
+  p_buf = (char *) align_size_up((size_t)p_buf, page_size);
+
+  // now go through and allocate one chunk at a time until all bytes are
+  // allocated
+  size_t bytes_remaining = bytes;
+  // An overflow of align_size_up() would have been caught above
+  // in the calculation of size_of_reserve.
+  char * next_alloc_addr = p_buf;
+  HANDLE hProc = GetCurrentProcess();
+
+#ifdef ASSERT
+  // Variable for the failure injection
+  long ran_num = os::random();
+  size_t fail_after = ran_num % bytes;
+#endif
+
+  int count=0;
+  while (bytes_remaining) {
+    // select bytes_to_rq to get to the next chunk_size boundary
+
+    size_t bytes_to_rq = MIN2(bytes_remaining, chunk_size - ((size_t)next_alloc_addr % chunk_size));
+    // Note allocate and commit
+    char * p_new;
+
+#ifdef ASSERT
+    bool inject_error_now = should_inject_error && (bytes_remaining <= fail_after);
+#else
+    const bool inject_error_now = false;
+#endif
+
+    if (inject_error_now) {
+      p_new = NULL;
+    } else {
+      if (!UseNUMAInterleaving) {
+        p_new = (char *) VirtualAlloc(next_alloc_addr,
+                                      bytes_to_rq,
+                                      flags,
+                                      prot);
+      } else {
+        // get the next node to use from the used_node_list
+        DWORD node = numa_node_list_holder.get_node_list_entry(count % os::numa_get_groups_num());
+        p_new = (char *)os::Kernel32Dll::VirtualAllocExNuma(hProc,
+                                                            next_alloc_addr,
+                                                            bytes_to_rq,
+                                                            flags,
+                                                            prot,
+                                                            node);
+      }
+    }
+
+    if (p_new == NULL) {
+      // Free any allocated pages
+      if (next_alloc_addr > p_buf) {
+        // Some memory was committed so release it.
+        size_t bytes_to_release = bytes - bytes_remaining;
+        os::release_memory(p_buf, bytes_to_release);
+      }
+#ifdef ASSERT
+      if (should_inject_error) {
+        if (TracePageSizes && Verbose) {
+          tty->print_cr("Reserving pages individually failed.");
+        }
+      }
+#endif
+      return NULL;
+    }
+    bytes_remaining -= bytes_to_rq;
+    next_alloc_addr += bytes_to_rq;
+    count++;
+  }
+  // made it this far, success
+  return p_buf;
+}
+
+
 
 void os::large_page_init() {
   if (!UseLargePages) return;
 
   // print a warning if any large page related flag is specified on command line
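The allocate_pages_individually() routine above hinges on a probe-release-reallocate dance: Windows Server 2003 cannot reserve large-page space, so the code reserves the range with ordinary pages just to discover a usable contiguous address, releases it, and then claims it back chunk by chunk, rotating the preferred node. The window between the release and the re-allocation is an inherent race (another thread could grab the range), which is one reason the routine returns NULL on any failure and lets callers warn and fall back. A condensed sketch of the pattern, with a hypothetical helper name, no alignment or partial-failure cleanup, and VirtualAllocExNuma available only on Windows Vista / Server 2008 and later:

#include <windows.h>

// Hypothetical: reserve-and-commit 'bytes', striping chunks across the
// given NUMA nodes. Returns NULL on any failure.
static char* interleaved_alloc(SIZE_T bytes, SIZE_T chunk,
                               const DWORD* nodes, int node_count) {
  // Probe with an ordinary reservation to find a contiguous range...
  char* base = (char*)VirtualAlloc(NULL, bytes, MEM_RESERVE, PAGE_READWRITE);
  if (base == NULL) return NULL;
  // ...then give it back; the re-allocation below may lose this race.
  VirtualFree(base, 0, MEM_RELEASE);

  HANDLE proc = GetCurrentProcess();
  for (SIZE_T off = 0, i = 0; off < bytes; off += chunk, i++) {
    SIZE_T len = (bytes - off < chunk) ? (bytes - off) : chunk;
    // Each chunk is its own allocation with its own preferred node.
    if (VirtualAllocExNuma(proc, base + off, len,
                           MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
                           nodes[i % node_count]) == NULL) {
      return NULL;  // real code would release the chunks claimed so far
    }
  }
  return base;
}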
@@ -2720,13 +2919,34 @@
 
 char* os::reserve_memory(size_t bytes, char* addr, size_t alignment_hint) {
   assert((size_t)addr % os::vm_allocation_granularity() == 0,
          "reserve alignment");
   assert(bytes % os::vm_allocation_granularity() == 0, "reserve block size");
-  char* res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+  char* res;
+  // note that if UseLargePages is on, all the areas that require interleaving
+  // will go thru reserve_memory_special rather than thru here.
+  bool use_individual = (UseNUMAInterleaving && !UseLargePages);
+  if (!use_individual) {
+    res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+  } else {
+    elapsedTimer reserveTimer;
+    if( Verbose && PrintMiscellaneous ) reserveTimer.start();
+    // in numa interleaving, we have to allocate pages individually
+    // (well really chunks of NUMAInterleaveGranularity size)
+    res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE);
+    if (res == NULL) {
+      warning("NUMA page allocation failed");
+    }
+    if( Verbose && PrintMiscellaneous ) {
+      reserveTimer.stop();
+      tty->print_cr("reserve_memory of %Ix bytes took %ld ms (%ld ticks)", bytes,
+                    reserveTimer.milliseconds(), reserveTimer.ticks());
+    }
+  }
   assert(res == NULL || addr == NULL || addr == res,
          "Unexpected address from reserve.");
+
   return res;
 }
 
 // Reserve memory at an arbitrary address, only if that area is
 // available (and not reserved for something else).
@@ -2752,95 +2972,30 @@
 }
 
 char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
 
   const DWORD prot = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
-
-  if (UseLargePagesIndividualAllocation) {
+  const DWORD flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+
+  // with large pages, there are two cases where we need to use Individual Allocation
+  // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003)
+  // 2) NUMA Interleaving is enabled, in which case we use a different node for each page
+  if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) {
     if (TracePageSizes && Verbose) {
       tty->print_cr("Reserving large pages individually.");
     }
-    char * p_buf;
-    // first reserve enough address space in advance since we want to be
-    // able to break a single contiguous virtual address range into multiple
-    // large page commits but WS2003 does not allow reserving large page space
-    // so we just use 4K pages for reserve, this gives us a legal contiguous
-    // address space. then we will deallocate that reservation, and re alloc
-    // using large pages
-    const size_t size_of_reserve = bytes + _large_page_size;
-    if (bytes > size_of_reserve) {
-      // Overflowed.
-      warning("Individually allocated large pages failed, "
-              "use -XX:-UseLargePagesIndividualAllocation to turn off");
+    char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError);
+    if (p_buf == NULL) {
+      // give an appropriate warning message
+      if (UseNUMAInterleaving) {
+        warning("NUMA large page allocation failed, UseLargePages flag ignored");
+      }
+      if (UseLargePagesIndividualAllocation) {
+        warning("Individually allocated large pages failed, "
+                "use -XX:-UseLargePagesIndividualAllocation to turn off");
+      }
       return NULL;
     }
-    p_buf = (char *) VirtualAlloc(addr,
-                                  size_of_reserve,  // size of Reserve
-                                  MEM_RESERVE,
-                                  PAGE_READWRITE);
-    // If reservation failed, return NULL
-    if (p_buf == NULL) return NULL;
-
-    release_memory(p_buf, bytes + _large_page_size);
-    // round up to page boundary. If the size_of_reserve did not
-    // overflow and the reservation did not fail, this align up
-    // should not overflow.
-    p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size);
-
-    // now go through and allocate one page at a time until all bytes are
-    // allocated
-    size_t bytes_remaining = align_size_up(bytes, _large_page_size);
-    // An overflow of align_size_up() would have been caught above
-    // in the calculation of size_of_reserve.
-    char * next_alloc_addr = p_buf;
-
-#ifdef ASSERT
-    // Variable for the failure injection
-    long ran_num = os::random();
-    size_t fail_after = ran_num % bytes;
-#endif
-
-    while (bytes_remaining) {
-      size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size);
-      // Note allocate and commit
-      char * p_new;
-
-#ifdef ASSERT
-      bool inject_error = LargePagesIndividualAllocationInjectError &&
-          (bytes_remaining <= fail_after);
-#else
-      const bool inject_error = false;
-#endif
-
-      if (inject_error) {
-        p_new = NULL;
-      } else {
-        p_new = (char *) VirtualAlloc(next_alloc_addr,
-                                      bytes_to_rq,
-                                      MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
-                                      prot);
-      }
-
-      if (p_new == NULL) {
-        // Free any allocated pages
-        if (next_alloc_addr > p_buf) {
-          // Some memory was committed so release it.
-          size_t bytes_to_release = bytes - bytes_remaining;
-          release_memory(p_buf, bytes_to_release);
-        }
-#ifdef ASSERT
-        if (UseLargePagesIndividualAllocation &&
-            LargePagesIndividualAllocationInjectError) {
-          if (TracePageSizes && Verbose) {
-            tty->print_cr("Reserving large pages individually failed.");
-          }
-        }
-#endif
-        return NULL;
-      }
-      bytes_remaining -= bytes_to_rq;
-      next_alloc_addr += bytes_to_rq;
-    }
+    }
 
     return p_buf;
 
   } else {
@@ -2865,18 +3020,47 @@
   }
   assert((size_t) addr % os::vm_page_size() == 0, "commit on page boundaries");
   assert(bytes % os::vm_page_size() == 0, "commit in page-sized chunks");
   // Don't attempt to print anything if the OS call fails. We're
   // probably low on resources, so the print itself may cause crashes.
-  bool result = VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) != 0;
-  if (result != NULL && exec) {
-    DWORD oldprot;
-    // Windows doc says to use VirtualProtect to get execute permissions
-    return VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot) != 0;
+
+  // unless we have NUMAInterleaving enabled, the range of a commit
+  // is always within a reserve covered by a single VirtualAlloc
+  // in that case we can just do a single commit for the requested size
+  if (!UseNUMAInterleaving) {
+    if (VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) == NULL) return false;
+    if (exec) {
+      DWORD oldprot;
+      // Windows doc says to use VirtualProtect to get execute permissions
+      if (!VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot)) return false;
+    }
+    return true;
   } else {
-    return result;
-  }
+
+    // when NUMAInterleaving is enabled, the commit might cover a range that
+    // came from multiple VirtualAlloc reserves (using allocate_pages_individually).
+    // VirtualQuery can help us determine that. The RegionSize that VirtualQuery
+    // returns represents the number of bytes that can be committed in one step.
+    size_t bytes_remaining = bytes;
+    char * next_alloc_addr = addr;
+    while (bytes_remaining > 0) {
+      MEMORY_BASIC_INFORMATION alloc_info;
+      VirtualQuery(next_alloc_addr, &alloc_info, sizeof(alloc_info));
+      size_t bytes_to_rq = MIN2(bytes_remaining, (size_t)alloc_info.RegionSize);
+      if (VirtualAlloc(next_alloc_addr, bytes_to_rq, MEM_COMMIT, PAGE_READWRITE) == NULL)
+        return false;
+      if (exec) {
+        DWORD oldprot;
+        if (!VirtualProtect(next_alloc_addr, bytes_to_rq, PAGE_EXECUTE_READWRITE, &oldprot))
+          return false;
+      }
+      bytes_remaining -= bytes_to_rq;
+      next_alloc_addr += bytes_to_rq;
+    }
+  }
+  // if we made it this far, return true
+  return true;
 }
 
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
                        bool exec) {
   return commit_memory(addr, size, exec);
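The rewritten commit path relies on a property of VirtualQuery: MEMORY_BASIC_INFORMATION.RegionSize reports how many bytes, starting at the queried address, share the same allocation attributes, which in practice ends at the boundary of each separate VirtualAlloc reserve. A standalone sketch of the same walk (same idea as the hunk above, minus the exec/VirtualProtect leg; the helper name is illustrative):

#include <windows.h>

// Commit 'bytes' starting at 'addr' even when the range spans several
// separate reservations, committing at most one region per iteration.
static bool commit_across_reserves(char* addr, SIZE_T bytes) {
  SIZE_T remaining = bytes;
  char* next = addr;
  while (remaining > 0) {
    MEMORY_BASIC_INFORMATION info;
    if (VirtualQuery(next, &info, sizeof(info)) == 0) return false;
    // RegionSize bounds how far this single commit may reach.
    SIZE_T step = (remaining < info.RegionSize) ? remaining : info.RegionSize;
    if (VirtualAlloc(next, step, MEM_COMMIT, PAGE_READWRITE) == NULL)
      return false;
    remaining -= step;
    next += step;
  }
  return true;
}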
@@ -2946,18 +3130,19 @@
 void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
 void os::free_memory(char *addr, size_t bytes) { }
 void os::numa_make_global(char *addr, size_t bytes) { }
 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { }
 bool os::numa_topology_changed() { return false; }
-size_t os::numa_get_groups_num() { return 1; }
+size_t os::numa_get_groups_num() { return numa_node_list_holder.get_count(); }
 int os::numa_get_group_id() { return 0; }
 size_t os::numa_get_leaf_groups(int *ids, size_t size) {
-  if (size > 0) {
-    ids[0] = 0;
-    return 1;
-  }
-  return 0;
+  // check for size bigger than actual groups_num
+  size = MIN2(size, numa_get_groups_num());
+  for (int i = 0; i < (int)size; i++) {
+    ids[i] = numa_node_list_holder.get_node_list_entry(i);
+  }
+  return size;
 }
 
 bool os::get_page_info(char *start, page_info* info) {
   return false;
 }
@@ -3478,11 +3663,11 @@
 
 #ifndef PRODUCT
     if(Verbose && PrintMiscellaneous)
      tty->print("[Memory Serialize Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page);
 #endif
   }
 
   os::large_page_init();
 
   // Setup Windows Exceptions
 
@@ -3581,12 +3766,14 @@
 #endif
 
   // initialize thread priority policy
   prio_init();
 
-  if (UseNUMA && !ForceNUMA) {
-    UseNUMA = false; // Currently unsupported.
+  if (UseNUMAInterleaving) {
+    // first check whether this Windows OS supports VirtualAllocExNuma, if not ignore this flag
+    bool success = numa_interleaving_init();
+    if (!success) UseNUMAInterleaving = false;
   }
 
   return JNI_OK;
 }
 
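Note on the gating above: the feature is opt-in and degrades gracefully. If numa_interleaving_init() fails (an older kernel32.dll without the NUMA entry points, or a process whose affinity covers a single node), UseNUMAInterleaving is simply cleared again, and a warning is printed only when UseNUMA or UseNUMAInterleaving was explicitly specified on the command line. On a capable machine the feature added by this changeset would be switched on along these lines (the granularity value is purely illustrative; it is rounded up to the allocation granularity, or to the large page size when large pages are in use):

  java -XX:+UseNUMAInterleaving -XX:NUMAInterleaveGranularity=4m ...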
@@ -4756,11 +4943,18 @@
 }
 
 
 // Kernel32 API
 typedef SIZE_T (WINAPI* GetLargePageMinimum_Fn)(void);
+typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+typedef BOOL (WINAPI *GetNumaHighestNodeNumber_Fn) (PULONG);
+typedef BOOL (WINAPI *GetNumaNodeProcessorMask_Fn) (UCHAR, PULONGLONG);
+
 GetLargePageMinimum_Fn os::Kernel32Dll::_GetLargePageMinimum = NULL;
+VirtualAllocExNuma_Fn os::Kernel32Dll::_VirtualAllocExNuma = NULL;
+GetNumaHighestNodeNumber_Fn os::Kernel32Dll::_GetNumaHighestNodeNumber = NULL;
+GetNumaNodeProcessorMask_Fn os::Kernel32Dll::_GetNumaNodeProcessorMask = NULL;
 BOOL os::Kernel32Dll::initialized = FALSE;
 SIZE_T os::Kernel32Dll::GetLargePageMinimum() {
   assert(initialized && _GetLargePageMinimum != NULL,
          "GetLargePageMinimumAvailable() not yet called");
   return _GetLargePageMinimum();
@@ -4771,20 +4965,57 @@
     initialize();
   }
   return _GetLargePageMinimum != NULL;
 }
 
-
-#ifndef JDK6_OR_EARLIER
-
-void os::Kernel32Dll::initialize() {
+BOOL os::Kernel32Dll::NumaCallsAvailable() {
+  if (!initialized) {
+    initialize();
+  }
+  return _VirtualAllocExNuma != NULL;
+}
+
+LPVOID os::Kernel32Dll::VirtualAllocExNuma(HANDLE hProc, LPVOID addr, SIZE_T bytes, DWORD flags, DWORD prot, DWORD node) {
+  assert(initialized && _VirtualAllocExNuma != NULL,
+         "NUMACallsAvailable() not yet called");
+
+  return _VirtualAllocExNuma(hProc, addr, bytes, flags, prot, node);
+}
+
+BOOL os::Kernel32Dll::GetNumaHighestNodeNumber(PULONG ptr_highest_node_number) {
+  assert(initialized && _GetNumaHighestNodeNumber != NULL,
+         "NUMACallsAvailable() not yet called");
+
+  return _GetNumaHighestNodeNumber(ptr_highest_node_number);
+}
+
+BOOL os::Kernel32Dll::GetNumaNodeProcessorMask(UCHAR node, PULONGLONG proc_mask) {
+  assert(initialized && _GetNumaNodeProcessorMask != NULL,
+         "NUMACallsAvailable() not yet called");
+
+  return _GetNumaNodeProcessorMask(node, proc_mask);
+}
+
+
+void os::Kernel32Dll::initializeCommon() {
   if (!initialized) {
     HMODULE handle = ::GetModuleHandle("Kernel32.dll");
     assert(handle != NULL, "Just check");
     _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
+    _VirtualAllocExNuma = (VirtualAllocExNuma_Fn)::GetProcAddress(handle, "VirtualAllocExNuma");
+    _GetNumaHighestNodeNumber = (GetNumaHighestNodeNumber_Fn)::GetProcAddress(handle, "GetNumaHighestNodeNumber");
+    _GetNumaNodeProcessorMask = (GetNumaNodeProcessorMask_Fn)::GetProcAddress(handle, "GetNumaNodeProcessorMask");
     initialized = TRUE;
   }
+}
+
+
+
+#ifndef JDK6_OR_EARLIER
+
+void os::Kernel32Dll::initialize() {
+  initializeCommon();
 }
 
 
 // Kernel32 API
 inline BOOL os::Kernel32Dll::SwitchToThread() {
@@ -4885,22 +5116,23 @@
 CreateToolhelp32Snapshot_Fn os::Kernel32Dll::_CreateToolhelp32Snapshot = NULL;
 Module32First_Fn os::Kernel32Dll::_Module32First = NULL;
 Module32Next_Fn os::Kernel32Dll::_Module32Next = NULL;
 GetNativeSystemInfo_Fn os::Kernel32Dll::_GetNativeSystemInfo = NULL;
 
+
 void os::Kernel32Dll::initialize() {
   if (!initialized) {
     HMODULE handle = ::GetModuleHandle("Kernel32.dll");
     assert(handle != NULL, "Just check");
 
     _SwitchToThread = (SwitchToThread_Fn)::GetProcAddress(handle, "SwitchToThread");
-    _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
     _CreateToolhelp32Snapshot = (CreateToolhelp32Snapshot_Fn)
       ::GetProcAddress(handle, "CreateToolhelp32Snapshot");
     _Module32First = (Module32First_Fn)::GetProcAddress(handle, "Module32First");
    _Module32Next = (Module32Next_Fn)::GetProcAddress(handle, "Module32Next");
     _GetNativeSystemInfo = (GetNativeSystemInfo_Fn)::GetProcAddress(handle, "GetNativeSystemInfo");
+    initializeCommon(); // resolve the functions that always need resolving
 
     initialized = TRUE;
   }
 }
 
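The Kernel32Dll changes in the last two hunks follow HotSpot's usual lazy-binding pattern: every entry point that may be missing on older Windows releases is looked up once with GetProcAddress, and a NULL function pointer doubles as the "not supported here" signal that NumaCallsAvailable() reports. A stripped-down sketch of the same pattern outside HotSpot (assuming a non-UNICODE build, as in the surrounding file):

#include <windows.h>

typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn)(HANDLE, LPVOID, SIZE_T,
                                               DWORD, DWORD, DWORD);

// Resolve VirtualAllocExNuma at runtime; returns NULL on pre-Vista
// systems, so the same binary still loads and can fall back gracefully.
static VirtualAllocExNuma_Fn resolve_virtual_alloc_ex_numa() {
  HMODULE k32 = ::GetModuleHandle("kernel32.dll");  // always already loaded
  if (k32 == NULL) return NULL;
  return (VirtualAllocExNuma_Fn)::GetProcAddress(k32, "VirtualAllocExNuma");
}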
@@ -4961,10 +5193,12 @@
   assert(initialized && _GetNativeSystemInfo != NULL,
          "GetNativeSystemInfoAvailable() not yet called");
 
   _GetNativeSystemInfo(lpSystemInfo);
 }
+
+
 
 // PSAPI API
 
 
 typedef BOOL (WINAPI *EnumProcessModules_Fn)(HANDLE, HMODULE *, DWORD, LPDWORD);