# HG changeset patch
# User tschatzl
# Date 1395671430 -3600
# Node ID d7070f3717703de7e442cc33ace220748c0ba003
# Parent  191174b49bec5fcb022b131bd221e7d1091f14c0
8035815: Cache-align and pad the from card cache
Summary: The from card cache is a very frequently accessed data structure. It is essentially a 2d array of per-region values, one row of values for every GC thread. Pad and align the data structure to avoid false sharing.
Reviewed-by: stefank

diff -r 191174b49bec -r d7070f371770 src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp
--- a/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp	Mon Mar 24 15:30:14 2014 +0100
+++ b/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp	Mon Mar 24 15:30:30 2014 +0100
@@ -29,6 +29,7 @@
 #include "gc_implementation/g1/heapRegionRemSet.hpp"
 #include "gc_implementation/g1/heapRegionSeq.inline.hpp"
 #include "memory/allocation.hpp"
+#include "memory/padded.inline.hpp"
 #include "memory/space.inline.hpp"
 #include "oops/oop.inline.hpp"
 #include "utilities/bitMap.inline.hpp"
@@ -358,27 +359,29 @@
 }
 
 int**  OtherRegionsTable::_from_card_cache = NULL;
-size_t OtherRegionsTable::_from_card_cache_max_regions = 0;
+uint   OtherRegionsTable::_from_card_cache_max_regions = 0;
 size_t OtherRegionsTable::_from_card_cache_mem_size = 0;
 
-void OtherRegionsTable::init_from_card_cache(size_t max_regions) {
-  _from_card_cache_max_regions = max_regions;
+void OtherRegionsTable::init_from_card_cache(uint max_regions) {
+  guarantee(_from_card_cache == NULL, "Should not call this multiple times");
+  uint n_par_rs = HeapRegionRemSet::num_par_rem_sets();
 
-  int n_par_rs = HeapRegionRemSet::num_par_rem_sets();
-  _from_card_cache = NEW_C_HEAP_ARRAY(int*, n_par_rs, mtGC);
-  for (int i = 0; i < n_par_rs; i++) {
-    _from_card_cache[i] = NEW_C_HEAP_ARRAY(int, max_regions, mtGC);
-    for (size_t j = 0; j < max_regions; j++) {
+  _from_card_cache_max_regions = max_regions;
+  _from_card_cache = Padded2DArray<int, mtGC>::create_unfreeable(n_par_rs,
+                                                                 _from_card_cache_max_regions,
+                                                                 &_from_card_cache_mem_size);
+
+  for (uint i = 0; i < n_par_rs; i++) {
+    for (uint j = 0; j < _from_card_cache_max_regions; j++) {
       _from_card_cache[i][j] = -1;  // An invalid value.
     }
   }
-  _from_card_cache_mem_size = n_par_rs * max_regions * sizeof(int);
 }
 
-void OtherRegionsTable::shrink_from_card_cache(size_t new_n_regs) {
-  for (int i = 0; i < HeapRegionRemSet::num_par_rem_sets(); i++) {
+void OtherRegionsTable::shrink_from_card_cache(uint new_n_regs) {
+  for (uint i = 0; i < HeapRegionRemSet::num_par_rem_sets(); i++) {
     assert(new_n_regs <= _from_card_cache_max_regions, "Must be within max.");
-    for (size_t j = new_n_regs; j < _from_card_cache_max_regions; j++) {
+    for (uint j = new_n_regs; j < _from_card_cache_max_regions; j++) {
       _from_card_cache[i][j] = -1;  // An invalid value.
     }
   }
@@ -386,8 +389,8 @@
 
 #ifndef PRODUCT
 void OtherRegionsTable::print_from_card_cache() {
-  for (int i = 0; i < HeapRegionRemSet::num_par_rem_sets(); i++) {
-    for (size_t j = 0; j < _from_card_cache_max_regions; j++) {
+  for (uint i = 0; i < HeapRegionRemSet::num_par_rem_sets(); i++) {
+    for (uint j = 0; j < _from_card_cache_max_regions; j++) {
       gclog_or_tty->print_cr("_from_card_cache[%d][%d] = %d.",
                     i, j, _from_card_cache[i][j]);
     }
@@ -727,8 +730,8 @@
 }
 
 void OtherRegionsTable::clear_fcc() {
-  size_t hrs_idx = hr()->hrs_index();
-  for (int i = 0; i < HeapRegionRemSet::num_par_rem_sets(); i++) {
+  uint hrs_idx = hr()->hrs_index();
+  for (uint i = 0; i < HeapRegionRemSet::num_par_rem_sets(); i++) {
     _from_card_cache[i][hrs_idx] = -1;
   }
 }
@@ -762,8 +765,8 @@
     _coarse_map.par_at_put(hrs_ind, 0);
   }
   // Check to see if any of the fcc entries come from here.
-  size_t hr_ind = (size_t) hr()->hrs_index();
-  for (int tid = 0; tid < HeapRegionRemSet::num_par_rem_sets(); tid++) {
+  uint hr_ind = hr()->hrs_index();
+  for (uint tid = 0; tid < HeapRegionRemSet::num_par_rem_sets(); tid++) {
     int fcc_ent = _from_card_cache[tid][hr_ind];
     if (fcc_ent != -1) {
       HeapWord* card_addr = (HeapWord*)
@@ -838,8 +841,8 @@
 // Determines how many threads can add records to an rset in parallel.
 // This can be done by either mutator threads together with the
 // concurrent refinement threads or GC threads.
-int HeapRegionRemSet::num_par_rem_sets() {
-  return (int)MAX2(DirtyCardQueueSet::num_par_ids() + ConcurrentG1Refine::thread_num(), ParallelGCThreads);
+uint HeapRegionRemSet::num_par_rem_sets() {
+  return (uint)MAX2(DirtyCardQueueSet::num_par_ids() + ConcurrentG1Refine::thread_num(), ParallelGCThreads);
 }
 
 HeapRegionRemSet::HeapRegionRemSet(G1BlockOffsetSharedArray* bosa,
diff -r 191174b49bec -r d7070f371770 src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp
--- a/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp	Mon Mar 24 15:30:14 2014 +0100
+++ b/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp	Mon Mar 24 15:30:30 2014 +0100
@@ -121,7 +121,7 @@
 
   // Indexed by thread X heap region, to minimize thread contention.
   static int** _from_card_cache;
-  static size_t _from_card_cache_max_regions;
+  static uint _from_card_cache_max_regions;
   static size_t _from_card_cache_mem_size;
 
   // link/add the given fine grain remembered set into the "all" list
@@ -170,11 +170,11 @@
 
   // Declare the heap size (in # of regions) to the OtherRegionsTable.
   // (Uses it to initialize from_card_cache).
-  static void init_from_card_cache(size_t max_regions);
+  static void init_from_card_cache(uint max_regions);
 
   // Declares that only regions i s.t. 0 <= i < new_n_regs are in use.
   // Make sure any entries for higher regions are invalid.
-  static void shrink_from_card_cache(size_t new_n_regs);
+  static void shrink_from_card_cache(uint new_n_regs);
 
   static void print_from_card_cache();
 };
@@ -222,7 +222,7 @@
 public:
   HeapRegionRemSet(G1BlockOffsetSharedArray* bosa, HeapRegion* hr);
 
-  static int num_par_rem_sets();
+  static uint num_par_rem_sets();
   static void setup_remset_size();
 
   HeapRegion* hr() const {
@@ -358,12 +358,12 @@
   // (Uses it to initialize from_card_cache).
   static void init_heap(uint max_regions) {
     G1CodeRootSet::initialize();
-    OtherRegionsTable::init_from_card_cache((size_t) max_regions);
+    OtherRegionsTable::init_from_card_cache(max_regions);
   }
 
   // Declares that only regions i s.t. 0 <= i < new_n_regs are in use.
   static void shrink_heap(uint new_n_regs) {
-    OtherRegionsTable::shrink_from_card_cache((size_t) new_n_regs);
+    OtherRegionsTable::shrink_from_card_cache(new_n_regs);
   }
 
 #ifndef PRODUCT
diff -r 191174b49bec -r d7070f371770 src/share/vm/memory/padded.hpp
--- a/src/share/vm/memory/padded.hpp	Mon Mar 24 15:30:14 2014 +0100
+++ b/src/share/vm/memory/padded.hpp	Mon Mar 24 15:30:30 2014 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -90,4 +90,15 @@
   static PaddedEnd<T>* create_unfreeable(uint length);
 };
 
+// Helper class to create an array of pointers to arrays of primitive types.
+// Both the array of pointers and the data arrays are aligned to the given
+// alignment. The allocated memory is zero-filled.
+template <class T, MEMFLAGS flags, size_t alignment = DEFAULT_CACHE_LINE_SIZE>
+class Padded2DArray {
+ public:
+  // Creates an aligned padded 2D array and, if allocation_size is not NULL, stores the
+  // allocated byte count there. The memory cannot be freed: the raw chunk is not returned.
+  static T** create_unfreeable(uint rows, uint columns, size_t* allocation_size = NULL);
+};
+
 #endif // SHARE_VM_MEMORY_PADDED_HPP
diff -r 191174b49bec -r d7070f371770 src/share/vm/memory/padded.inline.hpp
--- a/src/share/vm/memory/padded.inline.hpp	Mon Mar 24 15:30:14 2014 +0100
+++ b/src/share/vm/memory/padded.inline.hpp	Mon Mar 24 15:30:30 2014 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,3 +47,32 @@
 
   return aligned_padded_array;
 }
+
+template <class T, MEMFLAGS flags, size_t alignment>
+T** Padded2DArray<T, flags, alignment>::create_unfreeable(uint rows, uint columns, size_t* allocation_size) {
+  // Calculate and align the size of the first dimension's table.
+  size_t table_size = align_size_up_(rows * sizeof(T*), alignment);
+  // The size of the separate rows.
+  size_t row_size = align_size_up_(columns * sizeof(T), alignment);
+  // Total size: the indirection table, the rows, and slack for aligning the chunk.
+  size_t total_size = table_size + rows * row_size + alignment;
+
+  // Allocate a chunk of memory large enough to allow alignment of the chunk.
+  void* chunk = AllocateHeap(total_size, flags);
+  // Clear the allocated memory.
+  memset(chunk, 0, total_size);
+  // Align the chunk of memory.
+  T** result = (T**)align_pointer_up(chunk, alignment);
+  void* data_start = (void*)((uintptr_t)result + table_size);
+
+  // Fill in the row table.
+  for (size_t i = 0; i < rows; i++) {
+    result[i] = (T*)((uintptr_t)data_start + i * row_size);
+  }
+
+  if (allocation_size != NULL) {
+    *allocation_size = total_size;
+  }
+
+  return result;
+}