comparison src/os/linux/vm/os_linux.cpp @ 12355:cefad50507d8

Merge with hs25-b53
author Gilles Duboscq <duboscq@ssw.jku.at>
date Fri, 11 Oct 2013 10:38:03 +0200
parents 6b0fd0964b87 899ecf76b570
children 096c224171c4
comparison
equal deleted inserted replaced
12058:ccb4f2af2319 12355:cefad50507d8
129 bool os::Linux::_is_floating_stack = false; 129 bool os::Linux::_is_floating_stack = false;
130 bool os::Linux::_is_NPTL = false; 130 bool os::Linux::_is_NPTL = false;
131 bool os::Linux::_supports_fast_thread_cpu_time = false; 131 bool os::Linux::_supports_fast_thread_cpu_time = false;
132 const char * os::Linux::_glibc_version = NULL; 132 const char * os::Linux::_glibc_version = NULL;
133 const char * os::Linux::_libpthread_version = NULL; 133 const char * os::Linux::_libpthread_version = NULL;
134 pthread_condattr_t os::Linux::_condattr[1];
134 135
135 static jlong initial_time_count=0; 136 static jlong initial_time_count=0;
136 137
137 static int clock_tics_per_sec = 100; 138 static int clock_tics_per_sec = 100;
138 139
1397 struct timespec tp; 1398 struct timespec tp;
1398 if (clock_getres_func (CLOCK_MONOTONIC, &res) == 0 && 1399 if (clock_getres_func (CLOCK_MONOTONIC, &res) == 0 &&
1399 clock_gettime_func(CLOCK_MONOTONIC, &tp) == 0) { 1400 clock_gettime_func(CLOCK_MONOTONIC, &tp) == 0) {
1400 // yes, monotonic clock is supported 1401 // yes, monotonic clock is supported
1401 _clock_gettime = clock_gettime_func; 1402 _clock_gettime = clock_gettime_func;
1403 return;
1402 } else { 1404 } else {
1403 // close librt if there is no monotonic clock 1405 // close librt if there is no monotonic clock
1404 dlclose(handle); 1406 dlclose(handle);
1405 } 1407 }
1406 } 1408 }
1407 } 1409 }
1410 warning("No monotonic clock was available - timed services may " \
1411 "be adversely affected if the time-of-day clock changes");
1408 } 1412 }
1409 1413
1410 #ifndef SYS_clock_getres 1414 #ifndef SYS_clock_getres
1411 1415
1412 #if defined(IA32) || defined(AMD64) 1416 #if defined(IA32) || defined(AMD64)
2163 2167
2164 os::Linux::print_full_memory_info(st); 2168 os::Linux::print_full_memory_info(st);
2165 } 2169 }
2166 2170
2167 // Try to identify popular distros. 2171 // Try to identify popular distros.
2168 // Most Linux distributions have /etc/XXX-release file, which contains 2172 // Most Linux distributions have a /etc/XXX-release file, which contains
2169 // the OS version string. Some have more than one /etc/XXX-release file 2173 // the OS version string. Newer Linux distributions have a /etc/lsb-release
2170 // (e.g. Mandrake has both /etc/mandrake-release and /etc/redhat-release.), 2174 // file that also contains the OS version string. Some have more than one
2171 // so the order is important. 2175 // /etc/XXX-release file (e.g. Mandrake has both /etc/mandrake-release and
2176 // /etc/redhat-release.), so the order is important.
2177 // Any Linux that is based on Redhat (i.e. Oracle, Mandrake, Sun JDS...) have
2178 // their own specific XXX-release file as well as a redhat-release file.
2179 // Because of this the XXX-release file needs to be searched for before the
2180 // redhat-release file.
2181 // Since Red Hat has a lsb-release file that is not very descriptive the
2182 // search for redhat-release needs to be before lsb-release.
2183 // Since the lsb-release file is the new standard it needs to be searched
2184 // before the older style release files.
2185 // Searching system-release (Red Hat) and os-release (other Linuxes) is the
2186 // next to last resort. The os-release file is a new standard that contains
2187 // distribution information and the system-release file seems to be an old
2188 // standard that has been replaced by the lsb-release and os-release files.
2189 // Searching for the debian_version file is the last resort. It contains
2190 // an informative string like "6.0.6" or "wheezy/sid". Because of this
2191 // "Debian " is printed before the contents of the debian_version file.
2172 void os::Linux::print_distro_info(outputStream* st) { 2192 void os::Linux::print_distro_info(outputStream* st) {
2173 if (!_print_ascii_file("/etc/mandrake-release", st) && 2193 if (!_print_ascii_file("/etc/oracle-release", st) &&
2174 !_print_ascii_file("/etc/sun-release", st) && 2194 !_print_ascii_file("/etc/mandriva-release", st) &&
2175 !_print_ascii_file("/etc/redhat-release", st) && 2195 !_print_ascii_file("/etc/mandrake-release", st) &&
2176 !_print_ascii_file("/etc/SuSE-release", st) && 2196 !_print_ascii_file("/etc/sun-release", st) &&
2177 !_print_ascii_file("/etc/turbolinux-release", st) && 2197 !_print_ascii_file("/etc/redhat-release", st) &&
2178 !_print_ascii_file("/etc/gentoo-release", st) && 2198 !_print_ascii_file("/etc/lsb-release", st) &&
2179 !_print_ascii_file("/etc/debian_version", st) && 2199 !_print_ascii_file("/etc/SuSE-release", st) &&
2180 !_print_ascii_file("/etc/ltib-release", st) && 2200 !_print_ascii_file("/etc/turbolinux-release", st) &&
2181 !_print_ascii_file("/etc/angstrom-version", st)) { 2201 !_print_ascii_file("/etc/gentoo-release", st) &&
2182 st->print("Linux"); 2202 !_print_ascii_file("/etc/ltib-release", st) &&
2183 } 2203 !_print_ascii_file("/etc/angstrom-version", st) &&
2184 st->cr(); 2204 !_print_ascii_file("/etc/system-release", st) &&
2205 !_print_ascii_file("/etc/os-release", st)) {
2206
2207 if (file_exists("/etc/debian_version")) {
2208 st->print("Debian ");
2209 _print_ascii_file("/etc/debian_version", st);
2210 } else {
2211 st->print("Linux");
2212 }
2213 }
2214 st->cr();
2185 } 2215 }
2186 2216
2187 void os::Linux::print_libversion_info(outputStream* st) { 2217 void os::Linux::print_libversion_info(outputStream* st) {
2188 // libc, pthread 2218 // libc, pthread
2189 st->print("libc:"); 2219 st->print("libc:");
2718 #define MADV_HUGEPAGE 14 2748 #define MADV_HUGEPAGE 14
2719 #endif 2749 #endif
2720 2750
2721 int os::Linux::commit_memory_impl(char* addr, size_t size, 2751 int os::Linux::commit_memory_impl(char* addr, size_t size,
2722 size_t alignment_hint, bool exec) { 2752 size_t alignment_hint, bool exec) {
2723 int err; 2753 int err = os::Linux::commit_memory_impl(addr, size, exec);
2724 if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) {
2725 int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
2726 uintptr_t res =
2727 (uintptr_t) ::mmap(addr, size, prot,
2728 MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
2729 -1, 0);
2730 if (res != (uintptr_t) MAP_FAILED) {
2731 if (UseNUMAInterleaving) {
2732 numa_make_global(addr, size);
2733 }
2734 return 0;
2735 }
2736
2737 err = errno; // save errno from mmap() call above
2738
2739 if (!recoverable_mmap_error(err)) {
2740 // However, it is not clear that this loss of our reserved mapping
2741 // happens with large pages on Linux or that we cannot recover
2742 // from the loss. For now, we just issue a warning and we don't
2743 // call vm_exit_out_of_memory(). This issue is being tracked by
2744 // JBS-8007074.
2745 warn_fail_commit_memory(addr, size, alignment_hint, exec, err);
2746 // vm_exit_out_of_memory(size, OOM_MMAP_ERROR,
2747 // "committing reserved memory.");
2748 }
2749 // Fall through and try to use small pages
2750 }
2751
2752 err = os::Linux::commit_memory_impl(addr, size, exec);
2753 if (err == 0) { 2754 if (err == 0) {
2754 realign_memory(addr, size, alignment_hint); 2755 realign_memory(addr, size, alignment_hint);
2755 } 2756 }
2756 return err; 2757 return err;
2757 } 2758 }
2772 vm_exit_out_of_memory(size, OOM_MMAP_ERROR, mesg); 2773 vm_exit_out_of_memory(size, OOM_MMAP_ERROR, mesg);
2773 } 2774 }
2774 } 2775 }
2775 2776
2776 void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) { 2777 void os::pd_realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
2777 if (UseHugeTLBFS && alignment_hint > (size_t)vm_page_size()) { 2778 if (UseTransparentHugePages && alignment_hint > (size_t)vm_page_size()) {
2778 // We don't check the return value: madvise(MADV_HUGEPAGE) may not 2779 // We don't check the return value: madvise(MADV_HUGEPAGE) may not
2779 // be supported or the memory may already be backed by huge pages. 2780 // be supported or the memory may already be backed by huge pages.
2780 ::madvise(addr, bytes, MADV_HUGEPAGE); 2781 ::madvise(addr, bytes, MADV_HUGEPAGE);
2781 } 2782 }
2782 } 2783 }
2785 // This method works by doing an mmap over an existing mapping and effectively discarding 2786 // This method works by doing an mmap over an existing mapping and effectively discarding
2786 // the existing pages. However it won't work for SHM-based large pages that cannot be 2787 // the existing pages. However it won't work for SHM-based large pages that cannot be
2787 // uncommitted at all. We don't do anything in this case to avoid creating a segment with 2788 // uncommitted at all. We don't do anything in this case to avoid creating a segment with
2788 // small pages on top of the SHM segment. This method always works for small pages, so we 2789 // small pages on top of the SHM segment. This method always works for small pages, so we
2789 // allow that in any case. 2790 // allow that in any case.
2790 if (alignment_hint <= (size_t)os::vm_page_size() || !UseSHM) { 2791 if (alignment_hint <= (size_t)os::vm_page_size() || can_commit_large_page_memory()) {
2791 commit_memory(addr, bytes, alignment_hint, !ExecMem); 2792 commit_memory(addr, bytes, alignment_hint, !ExecMem);
2792 } 2793 }
2793 } 2794 }
2794 2795
2795 void os::numa_make_global(char *addr, size_t bytes) { 2796 void os::numa_make_global(char *addr, size_t bytes) {
2796 Linux::numa_interleave_memory(addr, bytes); 2797 Linux::numa_interleave_memory(addr, bytes);
2797 } 2798 }
2798 2799
2800 // Define for numa_set_bind_policy(int). Setting the argument to 0 will set the
2801 // bind policy to MPOL_PREFERRED for the current thread.
2802 #define USE_MPOL_PREFERRED 0
2803
2799 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { 2804 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
2805 // To make NUMA and large pages more robust when both enabled, we need to ease
2806 // the requirements on where the memory should be allocated. MPOL_BIND is the
2807 // default policy and it will force memory to be allocated on the specified
2808 // node. Changing this to MPOL_PREFERRED will prefer to allocate the memory on
2809 // the specified node, but will not force it. Using this policy will prevent
2810 // getting SIGBUS when trying to allocate large pages on NUMA nodes with no
2811 // free large pages.
2812 Linux::numa_set_bind_policy(USE_MPOL_PREFERRED);
2800 Linux::numa_tonode_memory(addr, bytes, lgrp_hint); 2813 Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
2801 } 2814 }
2802 2815
2803 bool os::numa_topology_changed() { return false; } 2816 bool os::numa_topology_changed() { return false; }
2804 2817
2896 libnuma_dlsym(handle, "numa_available"))); 2909 libnuma_dlsym(handle, "numa_available")));
2897 set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t, 2910 set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
2898 libnuma_dlsym(handle, "numa_tonode_memory"))); 2911 libnuma_dlsym(handle, "numa_tonode_memory")));
2899 set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t, 2912 set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
2900 libnuma_dlsym(handle, "numa_interleave_memory"))); 2913 libnuma_dlsym(handle, "numa_interleave_memory")));
2914 set_numa_set_bind_policy(CAST_TO_FN_PTR(numa_set_bind_policy_func_t,
2915 libnuma_dlsym(handle, "numa_set_bind_policy")));
2901 2916
2902 2917
2903 if (numa_available() != -1) { 2918 if (numa_available() != -1) {
2904 set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes")); 2919 set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
2905 // Create a cpu -> node mapping 2920 // Create a cpu -> node mapping
2962 os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus; 2977 os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
2963 os::Linux::numa_max_node_func_t os::Linux::_numa_max_node; 2978 os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
2964 os::Linux::numa_available_func_t os::Linux::_numa_available; 2979 os::Linux::numa_available_func_t os::Linux::_numa_available;
2965 os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory; 2980 os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
2966 os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory; 2981 os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
2982 os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
2967 unsigned long* os::Linux::_numa_all_nodes; 2983 unsigned long* os::Linux::_numa_all_nodes;
2968 2984
2969 bool os::pd_uncommit_memory(char* addr, size_t size) { 2985 bool os::pd_uncommit_memory(char* addr, size_t size) {
2970 uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE, 2986 uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
2971 MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE|MAP_ANONYMOUS, -1, 0); 2987 MAP_PRIVATE|MAP_FIXED|MAP_NORESERVE|MAP_ANONYMOUS, -1, 0);
2972 return res != (uintptr_t) MAP_FAILED; 2988 return res != (uintptr_t) MAP_FAILED;
2973 } 2989 }
2990
2991 static
2992 address get_stack_commited_bottom(address bottom, size_t size) {
2993 address nbot = bottom;
2994 address ntop = bottom + size;
2995
2996 size_t page_sz = os::vm_page_size();
2997 unsigned pages = size / page_sz;
2998
2999 unsigned char vec[1];
3000 unsigned imin = 1, imax = pages + 1, imid;
3001 int mincore_return_value;
3002
3003 while (imin < imax) {
3004 imid = (imax + imin) / 2;
3005 nbot = ntop - (imid * page_sz);
3006
3007 // Use a trick with mincore to check whether the page is mapped or not.
3008 // mincore sets vec to 1 if page resides in memory and to 0 if page
3009 // is swapped out, but if the page we are asking for is unmapped
3010 // it returns -1,ENOMEM
3011 mincore_return_value = mincore(nbot, page_sz, vec);
3012
3013 if (mincore_return_value == -1) {
3014 // Page is not mapped, go up
3015 // to find first mapped page
3016 if (errno != EAGAIN) {
3017 assert(errno == ENOMEM, "Unexpected mincore errno");
3018 imax = imid;
3019 }
3020 } else {
3021 // Page is mapped, go down
3022 // to find first not mapped page
3023 imin = imid + 1;
3024 }
3025 }
3026
3027 nbot = nbot + page_sz;
3028
3029 // Adjust stack bottom one page up if last checked page is not mapped
3030 if (mincore_return_value == -1) {
3031 nbot = nbot + page_sz;
3032 }
3033
3034 return nbot;
3035 }
3036
2974 3037
2975 // Linux uses a growable mapping for the stack, and if the mapping for 3038 // Linux uses a growable mapping for the stack, and if the mapping for
2976 // the stack guard pages is not removed when we detach a thread the 3039 // the stack guard pages is not removed when we detach a thread the
2977 // stack cannot grow beyond the pages where the stack guard was 3040 // stack cannot grow beyond the pages where the stack guard was
2978 // mapped. If at some point later in the process the stack expands to 3041 // mapped. If at some point later in the process the stack expands to
2984 // so if the stack mapping has already grown beyond the guard pages at 3047 // so if the stack mapping has already grown beyond the guard pages at
2985 // the time we create them, we have to truncate the stack mapping. 3048 // the time we create them, we have to truncate the stack mapping.
2986 // So, we need to know the extent of the stack mapping when 3049 // So, we need to know the extent of the stack mapping when
2987 // create_stack_guard_pages() is called. 3050 // create_stack_guard_pages() is called.
2988 3051
2989 // Find the bounds of the stack mapping. Return true for success.
2990 //
2991 // We only need this for stacks that are growable: at the time of 3052 // We only need this for stacks that are growable: at the time of
2992 // writing thread stacks don't use growable mappings (i.e. those 3053 // writing thread stacks don't use growable mappings (i.e. those
2993 // created with MAP_GROWSDOWN), and aren't marked "[stack]", so this 3054 // created with MAP_GROWSDOWN), and aren't marked "[stack]", so this
2994 // only applies to the main thread. 3055 // only applies to the main thread.
2995 3056
2996 static
2997 bool get_stack_bounds(uintptr_t *bottom, uintptr_t *top) {
2998
2999 char buf[128];
3000 int fd, sz;
3001
3002 if ((fd = ::open("/proc/self/maps", O_RDONLY)) < 0) {
3003 return false;
3004 }
3005
3006 const char kw[] = "[stack]";
3007 const int kwlen = sizeof(kw)-1;
3008
3009 // Address part of /proc/self/maps couldn't be more than 128 bytes
3010 while ((sz = os::get_line_chars(fd, buf, sizeof(buf))) > 0) {
3011 if (sz > kwlen && ::memcmp(buf+sz-kwlen, kw, kwlen) == 0) {
3012 // Extract addresses
3013 if (sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, bottom, top) == 2) {
3014 uintptr_t sp = (uintptr_t) __builtin_frame_address(0);
3015 if (sp >= *bottom && sp <= *top) {
3016 ::close(fd);
3017 return true;
3018 }
3019 }
3020 }
3021 }
3022
3023 ::close(fd);
3024 return false;
3025 }
3026
3027
3028 // If the (growable) stack mapping already extends beyond the point 3057 // If the (growable) stack mapping already extends beyond the point
3029 // where we're going to put our guard pages, truncate the mapping at 3058 // where we're going to put our guard pages, truncate the mapping at
3030 // that point by munmap()ping it. This ensures that when we later 3059 // that point by munmap()ping it. This ensures that when we later
3031 // munmap() the guard pages we don't leave a hole in the stack 3060 // munmap() the guard pages we don't leave a hole in the stack
3032 // mapping. This only affects the main/initial thread, but guard 3061 // mapping. This only affects the main/initial thread
3033 // against future OS changes 3062
3034 bool os::pd_create_stack_guard_pages(char* addr, size_t size) { 3063 bool os::pd_create_stack_guard_pages(char* addr, size_t size) {
3035 uintptr_t stack_extent, stack_base; 3064
3036 bool chk_bounds = NOT_DEBUG(os::Linux::is_initial_thread()) DEBUG_ONLY(true); 3065 if (os::Linux::is_initial_thread()) {
3037 if (chk_bounds && get_stack_bounds(&stack_extent, &stack_base)) { 3066 // As we manually grow stack up to bottom inside create_attached_thread(),
3038 assert(os::Linux::is_initial_thread(), 3067 // it's likely that os::Linux::initial_thread_stack_bottom is mapped and
3039 "growable stack in non-initial thread"); 3068 // we don't need to do anything special.
3040 if (stack_extent < (uintptr_t)addr) 3069 // Check it first, before calling heavy function.
3041 ::munmap((void*)stack_extent, (uintptr_t)addr - stack_extent); 3070 uintptr_t stack_extent = (uintptr_t) os::Linux::initial_thread_stack_bottom();
3071 unsigned char vec[1];
3072
3073 if (mincore((address)stack_extent, os::vm_page_size(), vec) == -1) {
3074 // Fallback to slow path on all errors, including EAGAIN
3075 stack_extent = (uintptr_t) get_stack_commited_bottom(
3076 os::Linux::initial_thread_stack_bottom(),
3077 (size_t)addr - stack_extent);
3078 }
3079
3080 if (stack_extent < (uintptr_t)addr) {
3081 ::munmap((void*)stack_extent, (uintptr_t)(addr - stack_extent));
3082 }
3042 } 3083 }
3043 3084
3044 return os::commit_memory(addr, size, !ExecMem); 3085 return os::commit_memory(addr, size, !ExecMem);
3045 } 3086 }
3046 3087
3047 // If this is a growable mapping, remove the guard pages entirely by 3088 // If this is a growable mapping, remove the guard pages entirely by
3048 // munmap()ping them. If not, just call uncommit_memory(). This only 3089 // munmap()ping them. If not, just call uncommit_memory(). This only
3049 // affects the main/initial thread, but guard against future OS changes 3090 // affects the main/initial thread, but guard against future OS changes
3091 // It's safe to always unmap guard pages for initial thread because we
3092 // always place it right after end of the mapped region
3093
3050 bool os::remove_stack_guard_pages(char* addr, size_t size) { 3094 bool os::remove_stack_guard_pages(char* addr, size_t size) {
3051 uintptr_t stack_extent, stack_base; 3095 uintptr_t stack_extent, stack_base;
3052 bool chk_bounds = NOT_DEBUG(os::Linux::is_initial_thread()) DEBUG_ONLY(true); 3096
3053 if (chk_bounds && get_stack_bounds(&stack_extent, &stack_base)) { 3097 if (os::Linux::is_initial_thread()) {
3054 assert(os::Linux::is_initial_thread(),
3055 "growable stack in non-initial thread");
3056
3057 return ::munmap(addr, size) == 0; 3098 return ::munmap(addr, size) == 0;
3058 } 3099 }
3059 3100
3060 return os::uncommit_memory(addr, size); 3101 return os::uncommit_memory(addr, size);
3061 } 3102 }
3155 3196
3156 bool os::unguard_memory(char* addr, size_t size) { 3197 bool os::unguard_memory(char* addr, size_t size) {
3157 return linux_mprotect(addr, size, PROT_READ|PROT_WRITE); 3198 return linux_mprotect(addr, size, PROT_READ|PROT_WRITE);
3158 } 3199 }
3159 3200
3201 bool os::Linux::transparent_huge_pages_sanity_check(bool warn, size_t page_size) {
3202 bool result = false;
3203 void *p = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
3204 MAP_ANONYMOUS|MAP_PRIVATE,
3205 -1, 0);
3206 if (p != MAP_FAILED) {
3207 void *aligned_p = align_ptr_up(p, page_size);
3208
3209 result = madvise(aligned_p, page_size, MADV_HUGEPAGE) == 0;
3210
3211 munmap(p, page_size * 2);
3212 }
3213
3214 if (warn && !result) {
3215 warning("TransparentHugePages is not supported by the operating system.");
3216 }
3217
3218 return result;
3219 }
3220
3160 bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) { 3221 bool os::Linux::hugetlbfs_sanity_check(bool warn, size_t page_size) {
3161 bool result = false; 3222 bool result = false;
3162 void *p = mmap (NULL, page_size, PROT_READ|PROT_WRITE, 3223 void *p = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
3163 MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB, 3224 MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB,
3164 -1, 0); 3225 -1, 0);
3165 3226
3166 if (p != MAP_FAILED) { 3227 if (p != MAP_FAILED) {
3167 // We don't know if this really is a huge page or not. 3228 // We don't know if this really is a huge page or not.
3168 FILE *fp = fopen("/proc/self/maps", "r"); 3229 FILE *fp = fopen("/proc/self/maps", "r");
3169 if (fp) { 3230 if (fp) {
3180 } 3241 }
3181 } 3242 }
3182 } 3243 }
3183 fclose(fp); 3244 fclose(fp);
3184 } 3245 }
3185 munmap (p, page_size); 3246 munmap(p, page_size);
3186 if (result) 3247 }
3187 return true; 3248
3188 } 3249 if (warn && !result) {
3189
3190 if (warn) {
3191 warning("HugeTLBFS is not supported by the operating system."); 3250 warning("HugeTLBFS is not supported by the operating system.");
3192 } 3251 }
3193 3252
3194 return result; 3253 return result;
3195 } 3254 }
3233 3292
3234 // Large page support 3293 // Large page support
3235 3294
3236 static size_t _large_page_size = 0; 3295 static size_t _large_page_size = 0;
3237 3296
3238 void os::large_page_init() { 3297 size_t os::Linux::find_large_page_size() {
3239 if (!UseLargePages) { 3298 size_t large_page_size = 0;
3240 UseHugeTLBFS = false; 3299
3241 UseSHM = false; 3300 // large_page_size on Linux is used to round up heap size. x86 uses either
3242 return; 3301 // 2M or 4M page, depending on whether PAE (Physical Address Extensions)
3243 } 3302 // mode is enabled. AMD64/EM64T uses 2M page in 64bit mode. IA64 can use
3244 3303 // page as large as 256M.
3245 if (FLAG_IS_DEFAULT(UseHugeTLBFS) && FLAG_IS_DEFAULT(UseSHM)) { 3304 //
3246 // If UseLargePages is specified on the command line try both methods, 3305 // Here we try to figure out page size by parsing /proc/meminfo and looking
3247 // if it's default, then try only HugeTLBFS. 3306 // for a line with the following format:
3248 if (FLAG_IS_DEFAULT(UseLargePages)) { 3307 // Hugepagesize: 2048 kB
3249 UseHugeTLBFS = true; 3308 //
3250 } else { 3309 // If we can't determine the value (e.g. /proc is not mounted, or the text
3251 UseHugeTLBFS = UseSHM = true; 3310 // format has been changed), we'll use the largest page size supported by
3252 } 3311 // the processor.
3253 }
3254
3255 if (LargePageSizeInBytes) {
3256 _large_page_size = LargePageSizeInBytes;
3257 } else {
3258 // large_page_size on Linux is used to round up heap size. x86 uses either
3259 // 2M or 4M page, depending on whether PAE (Physical Address Extensions)
3260 // mode is enabled. AMD64/EM64T uses 2M page in 64bit mode. IA64 can use
3261 // page as large as 256M.
3262 //
3263 // Here we try to figure out page size by parsing /proc/meminfo and looking
3264 // for a line with the following format:
3265 // Hugepagesize: 2048 kB
3266 //
3267 // If we can't determine the value (e.g. /proc is not mounted, or the text
3268 // format has been changed), we'll use the largest page size supported by
3269 // the processor.
3270 3312
3271 #ifndef ZERO 3313 #ifndef ZERO
3272 _large_page_size = IA32_ONLY(4 * M) AMD64_ONLY(2 * M) IA64_ONLY(256 * M) SPARC_ONLY(4 * M) 3314 large_page_size = IA32_ONLY(4 * M) AMD64_ONLY(2 * M) IA64_ONLY(256 * M) SPARC_ONLY(4 * M)
3273 ARM_ONLY(2 * M) PPC_ONLY(4 * M); 3315 ARM_ONLY(2 * M) PPC_ONLY(4 * M);
3274 #endif // ZERO 3316 #endif // ZERO
3275 3317
3276 FILE *fp = fopen("/proc/meminfo", "r"); 3318 FILE *fp = fopen("/proc/meminfo", "r");
3277 if (fp) { 3319 if (fp) {
3278 while (!feof(fp)) { 3320 while (!feof(fp)) {
3279 int x = 0; 3321 int x = 0;
3280 char buf[16]; 3322 char buf[16];
3281 if (fscanf(fp, "Hugepagesize: %d", &x) == 1) { 3323 if (fscanf(fp, "Hugepagesize: %d", &x) == 1) {
3282 if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) { 3324 if (x && fgets(buf, sizeof(buf), fp) && strcmp(buf, " kB\n") == 0) {
3283 _large_page_size = x * K; 3325 large_page_size = x * K;
3284 break; 3326 break;
3285 } 3327 }
3286 } else { 3328 } else {
3287 // skip to next line 3329 // skip to next line
3288 for (;;) { 3330 for (;;) {
3289 int ch = fgetc(fp); 3331 int ch = fgetc(fp);
3290 if (ch == EOF || ch == (int)'\n') break; 3332 if (ch == EOF || ch == (int)'\n') break;
3291 }
3292 } 3333 }
3293 } 3334 }
3294 fclose(fp); 3335 }
3295 } 3336 fclose(fp);
3296 } 3337 }
3297 3338
3298 // print a warning if any large page related flag is specified on command line 3339 if (!FLAG_IS_DEFAULT(LargePageSizeInBytes) && LargePageSizeInBytes != large_page_size) {
3299 bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS); 3340 warning("Setting LargePageSizeInBytes has no effect on this OS. Large page size is "
3300 3341 SIZE_FORMAT "%s.", byte_size_in_proper_unit(large_page_size),
3342 proper_unit_for_byte_size(large_page_size));
3343 }
3344
3345 return large_page_size;
3346 }
3347
3348 size_t os::Linux::setup_large_page_size() {
3349 _large_page_size = Linux::find_large_page_size();
3301 const size_t default_page_size = (size_t)Linux::page_size(); 3350 const size_t default_page_size = (size_t)Linux::page_size();
3302 if (_large_page_size > default_page_size) { 3351 if (_large_page_size > default_page_size) {
3303 _page_sizes[0] = _large_page_size; 3352 _page_sizes[0] = _large_page_size;
3304 _page_sizes[1] = default_page_size; 3353 _page_sizes[1] = default_page_size;
3305 _page_sizes[2] = 0; 3354 _page_sizes[2] = 0;
3306 } 3355 }
3307 UseHugeTLBFS = UseHugeTLBFS && 3356
3308 Linux::hugetlbfs_sanity_check(warn_on_failure, _large_page_size); 3357 return _large_page_size;
3309 3358 }
3310 if (UseHugeTLBFS) 3359
3360 bool os::Linux::setup_large_page_type(size_t page_size) {
3361 if (FLAG_IS_DEFAULT(UseHugeTLBFS) &&
3362 FLAG_IS_DEFAULT(UseSHM) &&
3363 FLAG_IS_DEFAULT(UseTransparentHugePages)) {
3364 // If UseLargePages is specified on the command line try all methods,
3365 // if it's default, then try only UseTransparentHugePages.
3366 if (FLAG_IS_DEFAULT(UseLargePages)) {
3367 UseTransparentHugePages = true;
3368 } else {
3369 UseHugeTLBFS = UseTransparentHugePages = UseSHM = true;
3370 }
3371 }
3372
3373 if (UseTransparentHugePages) {
3374 bool warn_on_failure = !FLAG_IS_DEFAULT(UseTransparentHugePages);
3375 if (transparent_huge_pages_sanity_check(warn_on_failure, page_size)) {
3376 UseHugeTLBFS = false;
3377 UseSHM = false;
3378 return true;
3379 }
3380 UseTransparentHugePages = false;
3381 }
3382
3383 if (UseHugeTLBFS) {
3384 bool warn_on_failure = !FLAG_IS_DEFAULT(UseHugeTLBFS);
3385 if (hugetlbfs_sanity_check(warn_on_failure, page_size)) {
3386 UseSHM = false;
3387 return true;
3388 }
3389 UseHugeTLBFS = false;
3390 }
3391
3392 return UseSHM;
3393 }
3394
3395 void os::large_page_init() {
3396 if (!UseLargePages) {
3397 UseHugeTLBFS = false;
3398 UseTransparentHugePages = false;
3311 UseSHM = false; 3399 UseSHM = false;
3312 3400 return;
3313 UseLargePages = UseHugeTLBFS || UseSHM; 3401 }
3402
3403 size_t large_page_size = Linux::setup_large_page_size();
3404 UseLargePages = Linux::setup_large_page_type(large_page_size);
3314 3405
3315 set_coredump_filter(); 3406 set_coredump_filter();
3316 } 3407 }
3317 3408
3318 #ifndef SHM_HUGETLB 3409 #ifndef SHM_HUGETLB
3319 #define SHM_HUGETLB 04000 3410 #define SHM_HUGETLB 04000
3320 #endif 3411 #endif
3321 3412
3322 char* os::reserve_memory_special(size_t bytes, char* req_addr, bool exec) { 3413 char* os::Linux::reserve_memory_special_shm(size_t bytes, size_t alignment, char* req_addr, bool exec) {
3323 // "exec" is passed in but not used. Creating the shared image for 3414 // "exec" is passed in but not used. Creating the shared image for
3324 // the code cache doesn't have an SHM_X executable permission to check. 3415 // the code cache doesn't have an SHM_X executable permission to check.
3325 assert(UseLargePages && UseSHM, "only for SHM large pages"); 3416 assert(UseLargePages && UseSHM, "only for SHM large pages");
3417 assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3418
3419 if (!is_size_aligned(bytes, os::large_page_size()) || alignment > os::large_page_size()) {
3420 return NULL; // Fallback to small pages.
3421 }
3326 3422
3327 key_t key = IPC_PRIVATE; 3423 key_t key = IPC_PRIVATE;
3328 char *addr; 3424 char *addr;
3329 3425
3330 bool warn_on_failure = UseLargePages && 3426 bool warn_on_failure = UseLargePages &&
3331 (!FLAG_IS_DEFAULT(UseLargePages) || 3427 (!FLAG_IS_DEFAULT(UseLargePages) ||
3428 !FLAG_IS_DEFAULT(UseSHM) ||
3332 !FLAG_IS_DEFAULT(LargePageSizeInBytes) 3429 !FLAG_IS_DEFAULT(LargePageSizeInBytes)
3333 ); 3430 );
3334 char msg[128]; 3431 char msg[128];
3335 3432
3336 // Create a large shared memory region to attach to based on size. 3433 // Create a large shared memory region to attach to based on size.
3374 warning(msg); 3471 warning(msg);
3375 } 3472 }
3376 return NULL; 3473 return NULL;
3377 } 3474 }
3378 3475
3379 if ((addr != NULL) && UseNUMAInterleaving) {
3380 numa_make_global(addr, bytes);
3381 }
3382
3383 // The memory is committed
3384 MemTracker::record_virtual_memory_reserve_and_commit((address)addr, bytes, mtNone, CALLER_PC);
3385
3386 return addr; 3476 return addr;
3387 } 3477 }
3388 3478
3479 static void warn_on_large_pages_failure(char* req_addr, size_t bytes, int error) {
3480 assert(error == ENOMEM, "Only expect to fail if no memory is available");
3481
3482 bool warn_on_failure = UseLargePages &&
3483 (!FLAG_IS_DEFAULT(UseLargePages) ||
3484 !FLAG_IS_DEFAULT(UseHugeTLBFS) ||
3485 !FLAG_IS_DEFAULT(LargePageSizeInBytes));
3486
3487 if (warn_on_failure) {
3488 char msg[128];
3489 jio_snprintf(msg, sizeof(msg), "Failed to reserve large pages memory req_addr: "
3490 PTR_FORMAT " bytes: " SIZE_FORMAT " (errno = %d).", req_addr, bytes, error);
3491 warning(msg);
3492 }
3493 }
3494
3495 char* os::Linux::reserve_memory_special_huge_tlbfs_only(size_t bytes, char* req_addr, bool exec) {
3496 assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3497 assert(is_size_aligned(bytes, os::large_page_size()), "Unaligned size");
3498 assert(is_ptr_aligned(req_addr, os::large_page_size()), "Unaligned address");
3499
3500 int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3501 char* addr = (char*)::mmap(req_addr, bytes, prot,
3502 MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB,
3503 -1, 0);
3504
3505 if (addr == MAP_FAILED) {
3506 warn_on_large_pages_failure(req_addr, bytes, errno);
3507 return NULL;
3508 }
3509
3510 assert(is_ptr_aligned(addr, os::large_page_size()), "Must be");
3511
3512 return addr;
3513 }
3514
3515 char* os::Linux::reserve_memory_special_huge_tlbfs_mixed(size_t bytes, size_t alignment, char* req_addr, bool exec) {
3516 size_t large_page_size = os::large_page_size();
3517
3518 assert(bytes >= large_page_size, "Shouldn't allocate large pages for small sizes");
3519
3520 // Allocate small pages.
3521
3522 char* start;
3523 if (req_addr != NULL) {
3524 assert(is_ptr_aligned(req_addr, alignment), "Must be");
3525 assert(is_size_aligned(bytes, alignment), "Must be");
3526 start = os::reserve_memory(bytes, req_addr);
3527 assert(start == NULL || start == req_addr, "Must be");
3528 } else {
3529 start = os::reserve_memory_aligned(bytes, alignment);
3530 }
3531
3532 if (start == NULL) {
3533 return NULL;
3534 }
3535
3536 assert(is_ptr_aligned(start, alignment), "Must be");
3537
3538 // os::reserve_memory_special will record this memory area.
3539 // Need to release it here to prevent overlapping reservations.
3540 MemTracker::record_virtual_memory_release((address)start, bytes);
3541
3542 char* end = start + bytes;
3543
3544 // Find the regions of the allocated chunk that can be promoted to large pages.
3545 char* lp_start = (char*)align_ptr_up(start, large_page_size);
3546 char* lp_end = (char*)align_ptr_down(end, large_page_size);
3547
3548 size_t lp_bytes = lp_end - lp_start;
3549
3550 assert(is_size_aligned(lp_bytes, large_page_size), "Must be");
3551
3552 if (lp_bytes == 0) {
3553 // The mapped region doesn't even span the start and the end of a large page.
3554 // Fall back to allocate a non-special area.
3555 ::munmap(start, end - start);
3556 return NULL;
3557 }
3558
3559 int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
3560
3561
3562 void* result;
3563
3564 if (start != lp_start) {
3565 result = ::mmap(start, lp_start - start, prot,
3566 MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3567 -1, 0);
3568 if (result == MAP_FAILED) {
3569 ::munmap(lp_start, end - lp_start);
3570 return NULL;
3571 }
3572 }
3573
3574 result = ::mmap(lp_start, lp_bytes, prot,
3575 MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|MAP_HUGETLB,
3576 -1, 0);
3577 if (result == MAP_FAILED) {
3578 warn_on_large_pages_failure(req_addr, bytes, errno);
3579 // If the mmap above fails, the large pages region will be unmapped and we
3580 // have regions before and after with small pages. Release these regions.
3581 //
3582 // | mapped | unmapped | mapped |
3583 // ^ ^ ^ ^
3584 // start lp_start lp_end end
3585 //
3586 ::munmap(start, lp_start - start);
3587 ::munmap(lp_end, end - lp_end);
3588 return NULL;
3589 }
3590
3591 if (lp_end != end) {
3592 result = ::mmap(lp_end, end - lp_end, prot,
3593 MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
3594 -1, 0);
3595 if (result == MAP_FAILED) {
3596 ::munmap(start, lp_end - start);
3597 return NULL;
3598 }
3599 }
3600
3601 return start;
3602 }
3603
3604 char* os::Linux::reserve_memory_special_huge_tlbfs(size_t bytes, size_t alignment, char* req_addr, bool exec) {
3605 assert(UseLargePages && UseHugeTLBFS, "only for Huge TLBFS large pages");
3606 assert(is_ptr_aligned(req_addr, alignment), "Must be");
3607 assert(is_power_of_2(alignment), "Must be");
3608 assert(is_power_of_2(os::large_page_size()), "Must be");
3609 assert(bytes >= os::large_page_size(), "Shouldn't allocate large pages for small sizes");
3610
3611 if (is_size_aligned(bytes, os::large_page_size()) && alignment <= os::large_page_size()) {
3612 return reserve_memory_special_huge_tlbfs_only(bytes, req_addr, exec);
3613 } else {
3614 return reserve_memory_special_huge_tlbfs_mixed(bytes, alignment, req_addr, exec);
3615 }
3616 }
3617
3618 char* os::reserve_memory_special(size_t bytes, size_t alignment, char* req_addr, bool exec) {
3619 assert(UseLargePages, "only for large pages");
3620
3621 char* addr;
3622 if (UseSHM) {
3623 addr = os::Linux::reserve_memory_special_shm(bytes, alignment, req_addr, exec);
3624 } else {
3625 assert(UseHugeTLBFS, "must be");
3626 addr = os::Linux::reserve_memory_special_huge_tlbfs(bytes, alignment, req_addr, exec);
3627 }
3628
3629 if (addr != NULL) {
3630 if (UseNUMAInterleaving) {
3631 numa_make_global(addr, bytes);
3632 }
3633
3634 // The memory is committed
3635 MemTracker::record_virtual_memory_reserve_and_commit((address)addr, bytes, mtNone, CALLER_PC);
3636 }
3637
3638 return addr;
3639 }
3640
3641 bool os::Linux::release_memory_special_shm(char* base, size_t bytes) {
3642 // detaching the SHM segment will also delete it, see reserve_memory_special_shm()
3643 return shmdt(base) == 0;
3644 }
3645
3646 bool os::Linux::release_memory_special_huge_tlbfs(char* base, size_t bytes) {
3647 return pd_release_memory(base, bytes);
3648 }
3649
3389 bool os::release_memory_special(char* base, size_t bytes) { 3650 bool os::release_memory_special(char* base, size_t bytes) {
3651 assert(UseLargePages, "only for large pages");
3652
3390 MemTracker::Tracker tkr = MemTracker::get_virtual_memory_release_tracker(); 3653 MemTracker::Tracker tkr = MemTracker::get_virtual_memory_release_tracker();
3391 // detaching the SHM segment will also delete it, see reserve_memory_special() 3654
3392 int rslt = shmdt(base); 3655 bool res;
3393 if (rslt == 0) { 3656 if (UseSHM) {
3657 res = os::Linux::release_memory_special_shm(base, bytes);
3658 } else {
3659 assert(UseHugeTLBFS, "must be");
3660 res = os::Linux::release_memory_special_huge_tlbfs(base, bytes);
3661 }
3662
3663 if (res) {
3394 tkr.record((address)base, bytes); 3664 tkr.record((address)base, bytes);
3395 return true;
3396 } else { 3665 } else {
3397 tkr.discard(); 3666 tkr.discard();
3398 return false; 3667 }
3399 } 3668
3669 return res;
3400 } 3670 }
3401 3671
3402 size_t os::large_page_size() { 3672 size_t os::large_page_size() {
3403 return _large_page_size; 3673 return _large_page_size;
3404 } 3674 }
3405 3675
// With SysV SHM the entire memory region must be allocated as shared
// memory.
// HugeTLBFS allows application to commit large page memory on demand.
// However, when committing memory with HugeTLBFS fails, the region
// that was supposed to be committed will lose the old reservation
// and allow other threads to steal that memory region. Because of this
// behavior we can't commit HugeTLBFS memory.
3409 bool os::can_commit_large_page_memory() { 3683 bool os::can_commit_large_page_memory() {
3410 return UseHugeTLBFS; 3684 return UseTransparentHugePages;
3411 } 3685 }
3412 3686
3413 bool os::can_execute_large_page_memory() { 3687 bool os::can_execute_large_page_memory() {
3414 return UseHugeTLBFS; 3688 return UseTransparentHugePages || UseHugeTLBFS;
3415 } 3689 }
3416 3690
// Reserve memory at an arbitrary address, only if that area is
// available (and not reserved for something else).
3419 3693
4463 // main_thread points to the aboriginal thread 4737 // main_thread points to the aboriginal thread
4464 Linux::_main_thread = pthread_self(); 4738 Linux::_main_thread = pthread_self();
4465 4739
4466 Linux::clock_init(); 4740 Linux::clock_init();
4467 initial_time_count = os::elapsed_counter(); 4741 initial_time_count = os::elapsed_counter();
4742
4743 // pthread_condattr initialization for monotonic clock
4744 int status;
4745 pthread_condattr_t* _condattr = os::Linux::condAttr();
4746 if ((status = pthread_condattr_init(_condattr)) != 0) {
4747 fatal(err_msg("pthread_condattr_init: %s", strerror(status)));
4748 }
4749 // Only set the clock if CLOCK_MONOTONIC is available
4750 if (Linux::supports_monotonic_clock()) {
4751 if ((status = pthread_condattr_setclock(_condattr, CLOCK_MONOTONIC)) != 0) {
4752 if (status == EINVAL) {
4753 warning("Unable to use monotonic clock with relative timed-waits" \
4754 " - changes to the time-of-day clock may have adverse affects");
4755 } else {
4756 fatal(err_msg("pthread_condattr_setclock: %s", strerror(status)));
4757 }
4758 }
4759 }
4760 // else it defaults to CLOCK_REALTIME
4761
4468 pthread_mutex_init(&dl_mutex, NULL); 4762 pthread_mutex_init(&dl_mutex, NULL);
4469 4763
4470 // If the pagesize of the VM is greater than 8K determine the appropriate 4764 // If the pagesize of the VM is greater than 8K determine the appropriate
4471 // number of initial guard pages. The user can change this with the 4765 // number of initial guard pages. The user can change this with the
4472 // command line arguments, if needed. 4766 // command line arguments, if needed.
4508 #ifndef PRODUCT 4802 #ifndef PRODUCT
4509 if(Verbose && PrintMiscellaneous) 4803 if(Verbose && PrintMiscellaneous)
4510 tty->print("[Memory Serialize Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page); 4804 tty->print("[Memory Serialize Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page);
4511 #endif 4805 #endif
4512 } 4806 }
4513
4514 os::large_page_init();
4515 4807
4516 // initialize suspend/resume support - must do this before signal_sets_init() 4808 // initialize suspend/resume support - must do this before signal_sets_init()
4517 if (SR_initialize() != 0) { 4809 if (SR_initialize() != 0) {
4518 perror("SR_initialize failed"); 4810 perror("SR_initialize failed");
4519 return JNI_ERR; 4811 return JNI_ERR;
4545 JavaThread::set_stack_size_at_create(round_to(threadStackSizeInBytes, 4837 JavaThread::set_stack_size_at_create(round_to(threadStackSizeInBytes,
4546 vm_page_size())); 4838 vm_page_size()));
4547 4839
4548 Linux::capture_initial_stack(JavaThread::stack_size_at_create()); 4840 Linux::capture_initial_stack(JavaThread::stack_size_at_create());
4549 4841
4842 #if defined(IA32)
4843 workaround_expand_exec_shield_cs_limit();
4844 #endif
4845
4550 Linux::libpthread_init(); 4846 Linux::libpthread_init();
4551 if (PrintMiscellaneous && (Verbose || WizardMode)) { 4847 if (PrintMiscellaneous && (Verbose || WizardMode)) {
4552 tty->print_cr("[HotSpot is running with %s, %s(%s)]\n", 4848 tty->print_cr("[HotSpot is running with %s, %s(%s)]\n",
4553 Linux::glibc_version(), Linux::libpthread_version(), 4849 Linux::glibc_version(), Linux::libpthread_version(),
4554 Linux::is_floating_stack() ? "floating stack" : "fixed stack"); 4850 Linux::is_floating_stack() ? "floating stack" : "fixed stack");
4561 if ((Linux::numa_max_node() < 1)) { 4857 if ((Linux::numa_max_node() < 1)) {
4562 // There's only one node(they start from 0), disable NUMA. 4858 // There's only one node(they start from 0), disable NUMA.
4563 UseNUMA = false; 4859 UseNUMA = false;
4564 } 4860 }
4565 } 4861 }
4566 // With SHM large pages we cannot uncommit a page, so there's not way 4862 // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way
4567 // we can make the adaptive lgrp chunk resizing work. If the user specified 4863 // we can make the adaptive lgrp chunk resizing work. If the user specified
4568 // both UseNUMA and UseLargePages (or UseSHM) on the command line - warn and 4864 // both UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn and
4569 // disable adaptive resizing. 4865 // disable adaptive resizing.
4570 if (UseNUMA && UseLargePages && UseSHM) { 4866 if (UseNUMA && UseLargePages && !can_commit_large_page_memory()) {
4571 if (!FLAG_IS_DEFAULT(UseNUMA)) { 4867 if (FLAG_IS_DEFAULT(UseNUMA)) {
4572 if (FLAG_IS_DEFAULT(UseLargePages) && FLAG_IS_DEFAULT(UseSHM)) { 4868 UseNUMA = false;
4869 } else {
4870 if (FLAG_IS_DEFAULT(UseLargePages) &&
4871 FLAG_IS_DEFAULT(UseSHM) &&
4872 FLAG_IS_DEFAULT(UseHugeTLBFS)) {
4573 UseLargePages = false; 4873 UseLargePages = false;
4574 } else { 4874 } else {
4575 warning("UseNUMA is not fully compatible with SHM large pages, disabling adaptive resizing"); 4875 warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, disabling adaptive resizing");
4576 UseAdaptiveSizePolicy = false; 4876 UseAdaptiveSizePolicy = false;
4577 UseAdaptiveNUMAChunkSizing = false; 4877 UseAdaptiveNUMAChunkSizing = false;
4578 } 4878 }
4579 } else {
4580 UseNUMA = false;
4581 } 4879 }
4582 } 4880 }
4583 if (!UseNUMA && ForceNUMA) { 4881 if (!UseNUMA && ForceNUMA) {
4584 UseNUMA = true; 4882 UseNUMA = true;
4585 } 4883 }
// abstime will be the absolute timeout time
// TODO: replace compute_abstime() with unpackTime()
5273 5571
5274 static struct timespec* compute_abstime(timespec* abstime, jlong millis) { 5572 static struct timespec* compute_abstime(timespec* abstime, jlong millis) {
5275 if (millis < 0) millis = 0; 5573 if (millis < 0) millis = 0;
5276 struct timeval now; 5574
5277 int status = gettimeofday(&now, NULL);
5278 assert(status == 0, "gettimeofday");
5279 jlong seconds = millis / 1000; 5575 jlong seconds = millis / 1000;
5280 millis %= 1000; 5576 millis %= 1000;
5281 if (seconds > 50000000) { // see man cond_timedwait(3T) 5577 if (seconds > 50000000) { // see man cond_timedwait(3T)
5282 seconds = 50000000; 5578 seconds = 50000000;
5283 } 5579 }
5284 abstime->tv_sec = now.tv_sec + seconds; 5580
5285 long usec = now.tv_usec + millis * 1000; 5581 if (os::Linux::supports_monotonic_clock()) {
5286 if (usec >= 1000000) { 5582 struct timespec now;
5287 abstime->tv_sec += 1; 5583 int status = os::Linux::clock_gettime(CLOCK_MONOTONIC, &now);
5288 usec -= 1000000; 5584 assert_status(status == 0, status, "clock_gettime");
5289 } 5585 abstime->tv_sec = now.tv_sec + seconds;
5290 abstime->tv_nsec = usec * 1000; 5586 long nanos = now.tv_nsec + millis * NANOSECS_PER_MILLISEC;
5587 if (nanos >= NANOSECS_PER_SEC) {
5588 abstime->tv_sec += 1;
5589 nanos -= NANOSECS_PER_SEC;
5590 }
5591 abstime->tv_nsec = nanos;
5592 } else {
5593 struct timeval now;
5594 int status = gettimeofday(&now, NULL);
5595 assert(status == 0, "gettimeofday");
5596 abstime->tv_sec = now.tv_sec + seconds;
5597 long usec = now.tv_usec + millis * 1000;
5598 if (usec >= 1000000) {
5599 abstime->tv_sec += 1;
5600 usec -= 1000000;
5601 }
5602 abstime->tv_nsec = usec * 1000;
5603 }
5291 return abstime; 5604 return abstime;
5292 } 5605 }
5293 5606
5294 5607
5295 // Test-and-clear _Event, always leaves _Event set to 0, returns immediately. 5608 // Test-and-clear _Event, always leaves _Event set to 0, returns immediately.
5377 5690
5378 while (_Event < 0) { 5691 while (_Event < 0) {
5379 status = os::Linux::safe_cond_timedwait(_cond, _mutex, &abst); 5692 status = os::Linux::safe_cond_timedwait(_cond, _mutex, &abst);
5380 if (status != 0 && WorkAroundNPTLTimedWaitHang) { 5693 if (status != 0 && WorkAroundNPTLTimedWaitHang) {
5381 pthread_cond_destroy (_cond); 5694 pthread_cond_destroy (_cond);
5382 pthread_cond_init (_cond, NULL) ; 5695 pthread_cond_init (_cond, os::Linux::condAttr()) ;
5383 } 5696 }
5384 assert_status(status == 0 || status == EINTR || 5697 assert_status(status == 0 || status == EINTR ||
5385 status == ETIME || status == ETIMEDOUT, 5698 status == ETIME || status == ETIMEDOUT,
5386 status, "cond_timedwait"); 5699 status, "cond_timedwait");
5387 if (!FilterSpuriousWakeups) break ; // previous semantics 5700 if (!FilterSpuriousWakeups) break ; // previous semantics
5478 * years from "now". 5791 * years from "now".
5479 */ 5792 */
5480 5793
5481 static void unpackTime(timespec* absTime, bool isAbsolute, jlong time) { 5794 static void unpackTime(timespec* absTime, bool isAbsolute, jlong time) {
5482 assert (time > 0, "convertTime"); 5795 assert (time > 0, "convertTime");
5483 5796 time_t max_secs = 0;
5484 struct timeval now; 5797
5485 int status = gettimeofday(&now, NULL); 5798 if (!os::Linux::supports_monotonic_clock() || isAbsolute) {
5486 assert(status == 0, "gettimeofday"); 5799 struct timeval now;
5487 5800 int status = gettimeofday(&now, NULL);
5488 time_t max_secs = now.tv_sec + MAX_SECS; 5801 assert(status == 0, "gettimeofday");
5489 5802
5490 if (isAbsolute) { 5803 max_secs = now.tv_sec + MAX_SECS;
5491 jlong secs = time / 1000; 5804
5492 if (secs > max_secs) { 5805 if (isAbsolute) {
5493 absTime->tv_sec = max_secs; 5806 jlong secs = time / 1000;
5494 } 5807 if (secs > max_secs) {
5495 else { 5808 absTime->tv_sec = max_secs;
5496 absTime->tv_sec = secs; 5809 } else {
5497 } 5810 absTime->tv_sec = secs;
5498 absTime->tv_nsec = (time % 1000) * NANOSECS_PER_MILLISEC; 5811 }
5499 } 5812 absTime->tv_nsec = (time % 1000) * NANOSECS_PER_MILLISEC;
5500 else { 5813 } else {
5814 jlong secs = time / NANOSECS_PER_SEC;
5815 if (secs >= MAX_SECS) {
5816 absTime->tv_sec = max_secs;
5817 absTime->tv_nsec = 0;
5818 } else {
5819 absTime->tv_sec = now.tv_sec + secs;
5820 absTime->tv_nsec = (time % NANOSECS_PER_SEC) + now.tv_usec*1000;
5821 if (absTime->tv_nsec >= NANOSECS_PER_SEC) {
5822 absTime->tv_nsec -= NANOSECS_PER_SEC;
5823 ++absTime->tv_sec; // note: this must be <= max_secs
5824 }
5825 }
5826 }
5827 } else {
5828 // must be relative using monotonic clock
5829 struct timespec now;
5830 int status = os::Linux::clock_gettime(CLOCK_MONOTONIC, &now);
5831 assert_status(status == 0, status, "clock_gettime");
5832 max_secs = now.tv_sec + MAX_SECS;
5501 jlong secs = time / NANOSECS_PER_SEC; 5833 jlong secs = time / NANOSECS_PER_SEC;
5502 if (secs >= MAX_SECS) { 5834 if (secs >= MAX_SECS) {
5503 absTime->tv_sec = max_secs; 5835 absTime->tv_sec = max_secs;
5504 absTime->tv_nsec = 0; 5836 absTime->tv_nsec = 0;
5505 } 5837 } else {
5506 else {
5507 absTime->tv_sec = now.tv_sec + secs; 5838 absTime->tv_sec = now.tv_sec + secs;
5508 absTime->tv_nsec = (time % NANOSECS_PER_SEC) + now.tv_usec*1000; 5839 absTime->tv_nsec = (time % NANOSECS_PER_SEC) + now.tv_nsec;
5509 if (absTime->tv_nsec >= NANOSECS_PER_SEC) { 5840 if (absTime->tv_nsec >= NANOSECS_PER_SEC) {
5510 absTime->tv_nsec -= NANOSECS_PER_SEC; 5841 absTime->tv_nsec -= NANOSECS_PER_SEC;
5511 ++absTime->tv_sec; // note: this must be <= max_secs 5842 ++absTime->tv_sec; // note: this must be <= max_secs
5512 } 5843 }
5513 } 5844 }
5583 5914
5584 OSThreadWaitState osts(thread->osthread(), false /* not Object.wait() */); 5915 OSThreadWaitState osts(thread->osthread(), false /* not Object.wait() */);
5585 jt->set_suspend_equivalent(); 5916 jt->set_suspend_equivalent();
5586 // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self() 5917 // cleared by handle_special_suspend_equivalent_condition() or java_suspend_self()
5587 5918
5919 assert(_cur_index == -1, "invariant");
5588 if (time == 0) { 5920 if (time == 0) {
5589 status = pthread_cond_wait (_cond, _mutex) ; 5921 _cur_index = REL_INDEX; // arbitrary choice when not timed
5922 status = pthread_cond_wait (&_cond[_cur_index], _mutex) ;
5590 } else { 5923 } else {
5591 status = os::Linux::safe_cond_timedwait (_cond, _mutex, &absTime) ; 5924 _cur_index = isAbsolute ? ABS_INDEX : REL_INDEX;
5925 status = os::Linux::safe_cond_timedwait (&_cond[_cur_index], _mutex, &absTime) ;
5592 if (status != 0 && WorkAroundNPTLTimedWaitHang) { 5926 if (status != 0 && WorkAroundNPTLTimedWaitHang) {
5593 pthread_cond_destroy (_cond) ; 5927 pthread_cond_destroy (&_cond[_cur_index]) ;
5594 pthread_cond_init (_cond, NULL); 5928 pthread_cond_init (&_cond[_cur_index], isAbsolute ? NULL : os::Linux::condAttr());
5595 } 5929 }
5596 } 5930 }
5931 _cur_index = -1;
5597 assert_status(status == 0 || status == EINTR || 5932 assert_status(status == 0 || status == EINTR ||
5598 status == ETIME || status == ETIMEDOUT, 5933 status == ETIME || status == ETIMEDOUT,
5599 status, "cond_timedwait"); 5934 status, "cond_timedwait");
5600 5935
5601 #ifdef ASSERT 5936 #ifdef ASSERT
5620 status = pthread_mutex_lock(_mutex); 5955 status = pthread_mutex_lock(_mutex);
5621 assert (status == 0, "invariant") ; 5956 assert (status == 0, "invariant") ;
5622 s = _counter; 5957 s = _counter;
5623 _counter = 1; 5958 _counter = 1;
5624 if (s < 1) { 5959 if (s < 1) {
5625 if (WorkAroundNPTLTimedWaitHang) { 5960 // thread might be parked
5626 status = pthread_cond_signal (_cond) ; 5961 if (_cur_index != -1) {
5627 assert (status == 0, "invariant") ; 5962 // thread is definitely parked
5963 if (WorkAroundNPTLTimedWaitHang) {
5964 status = pthread_cond_signal (&_cond[_cur_index]);
5965 assert (status == 0, "invariant");
5628 status = pthread_mutex_unlock(_mutex); 5966 status = pthread_mutex_unlock(_mutex);
5629 assert (status == 0, "invariant") ; 5967 assert (status == 0, "invariant");
5630 } else { 5968 } else {
5631 status = pthread_mutex_unlock(_mutex); 5969 status = pthread_mutex_unlock(_mutex);
5632 assert (status == 0, "invariant") ; 5970 assert (status == 0, "invariant");
5633 status = pthread_cond_signal (_cond) ; 5971 status = pthread_cond_signal (&_cond[_cur_index]);
5634 assert (status == 0, "invariant") ; 5972 assert (status == 0, "invariant");
5635 } 5973 }
5974 } else {
5975 pthread_mutex_unlock(_mutex);
5976 assert (status == 0, "invariant") ;
5977 }
5636 } else { 5978 } else {
5637 pthread_mutex_unlock(_mutex); 5979 pthread_mutex_unlock(_mutex);
5638 assert (status == 0, "invariant") ; 5980 assert (status == 0, "invariant") ;
5639 } 5981 }
5640 } 5982 }
5846 new MemNotifyThread(fd); 6188 new MemNotifyThread(fd);
5847 } 6189 }
5848 } 6190 }
5849 6191
5850 #endif // JAVASE_EMBEDDED 6192 #endif // JAVASE_EMBEDDED
6193
6194
6195 /////////////// Unit tests ///////////////
6196
6197 #ifndef PRODUCT
6198
// Logging helper for the unit tests below: prints (and flushes) only when
// -XX:+VerboseInternalVMTests is enabled.
#define test_log(...)                   \
  do {                                  \
    if (VerboseInternalVMTests) {       \
      tty->print_cr(__VA_ARGS__);       \
      tty->flush();                     \
    }                                   \
  } while (false)
6206
6207 class TestReserveMemorySpecial : AllStatic {
6208 public:
6209 static void small_page_write(void* addr, size_t size) {
6210 size_t page_size = os::vm_page_size();
6211
6212 char* end = (char*)addr + size;
6213 for (char* p = (char*)addr; p < end; p += page_size) {
6214 *p = 1;
6215 }
6216 }
6217
6218 static void test_reserve_memory_special_huge_tlbfs_only(size_t size) {
6219 if (!UseHugeTLBFS) {
6220 return;
6221 }
6222
6223 test_log("test_reserve_memory_special_huge_tlbfs_only(" SIZE_FORMAT ")", size);
6224
6225 char* addr = os::Linux::reserve_memory_special_huge_tlbfs_only(size, NULL, false);
6226
6227 if (addr != NULL) {
6228 small_page_write(addr, size);
6229
6230 os::Linux::release_memory_special_huge_tlbfs(addr, size);
6231 }
6232 }
6233
6234 static void test_reserve_memory_special_huge_tlbfs_only() {
6235 if (!UseHugeTLBFS) {
6236 return;
6237 }
6238
6239 size_t lp = os::large_page_size();
6240
6241 for (size_t size = lp; size <= lp * 10; size += lp) {
6242 test_reserve_memory_special_huge_tlbfs_only(size);
6243 }
6244 }
6245
6246 static void test_reserve_memory_special_huge_tlbfs_mixed(size_t size, size_t alignment) {
6247 if (!UseHugeTLBFS) {
6248 return;
6249 }
6250
6251 test_log("test_reserve_memory_special_huge_tlbfs_mixed(" SIZE_FORMAT ", " SIZE_FORMAT ")",
6252 size, alignment);
6253
6254 assert(size >= os::large_page_size(), "Incorrect input to test");
6255
6256 char* addr = os::Linux::reserve_memory_special_huge_tlbfs_mixed(size, alignment, NULL, false);
6257
6258 if (addr != NULL) {
6259 small_page_write(addr, size);
6260
6261 os::Linux::release_memory_special_huge_tlbfs(addr, size);
6262 }
6263 }
6264
6265 static void test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(size_t size) {
6266 size_t lp = os::large_page_size();
6267 size_t ag = os::vm_allocation_granularity();
6268
6269 for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6270 test_reserve_memory_special_huge_tlbfs_mixed(size, alignment);
6271 }
6272 }
6273
6274 static void test_reserve_memory_special_huge_tlbfs_mixed() {
6275 size_t lp = os::large_page_size();
6276 size_t ag = os::vm_allocation_granularity();
6277
6278 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp);
6279 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp + ag);
6280 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp + lp / 2);
6281 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp * 2);
6282 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp * 2 + ag);
6283 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp * 2 - ag);
6284 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp * 2 + lp / 2);
6285 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp * 10);
6286 test_reserve_memory_special_huge_tlbfs_mixed_all_alignments(lp * 10 + lp / 2);
6287 }
6288
6289 static void test_reserve_memory_special_huge_tlbfs() {
6290 if (!UseHugeTLBFS) {
6291 return;
6292 }
6293
6294 test_reserve_memory_special_huge_tlbfs_only();
6295 test_reserve_memory_special_huge_tlbfs_mixed();
6296 }
6297
6298 static void test_reserve_memory_special_shm(size_t size, size_t alignment) {
6299 if (!UseSHM) {
6300 return;
6301 }
6302
6303 test_log("test_reserve_memory_special_shm(" SIZE_FORMAT ", " SIZE_FORMAT ")", size, alignment);
6304
6305 char* addr = os::Linux::reserve_memory_special_shm(size, alignment, NULL, false);
6306
6307 if (addr != NULL) {
6308 assert(is_ptr_aligned(addr, alignment), "Check");
6309 assert(is_ptr_aligned(addr, os::large_page_size()), "Check");
6310
6311 small_page_write(addr, size);
6312
6313 os::Linux::release_memory_special_shm(addr, size);
6314 }
6315 }
6316
6317 static void test_reserve_memory_special_shm() {
6318 size_t lp = os::large_page_size();
6319 size_t ag = os::vm_allocation_granularity();
6320
6321 for (size_t size = ag; size < lp * 3; size += ag) {
6322 for (size_t alignment = ag; is_size_aligned(size, alignment); alignment *= 2) {
6323 test_reserve_memory_special_shm(size, alignment);
6324 }
6325 }
6326 }
6327
6328 static void test() {
6329 test_reserve_memory_special_huge_tlbfs();
6330 test_reserve_memory_special_shm();
6331 }
6332 };
6333
6334 void TestReserveMemorySpecial_test() {
6335 TestReserveMemorySpecial::test();
6336 }
6337
6338 #endif