libstdc++
|
00001 // -*- C++ -*- 00002 00003 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the terms 00007 // of the GNU General Public License as published by the Free Software 00008 // Foundation; either version 3, or (at your option) any later 00009 // version. 00010 00011 // This library is distributed in the hope that it will be useful, but 00012 // WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 // General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** @file parallel/random_shuffle.h 00026 * @brief Parallel implementation of std::random_shuffle(). 00027 * This file is a GNU parallel extension to the Standard C++ Library. 00028 */ 00029 00030 // Written by Johannes Singler. 00031 00032 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 00033 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1 00034 00035 #include <limits> 00036 #include <bits/stl_numeric.h> 00037 #include <parallel/parallel.h> 00038 #include <parallel/random_number.h> 00039 00040 namespace __gnu_parallel 00041 { 00042 /** @brief Type to hold the index of a bin. 00043 * 00044 * Since many variables of this type are allocated, it should be 00045 * chosen as small as possible. 00046 */ 00047 typedef unsigned short bin_index; 00048 00049 /** @brief Data known to every thread participating in 00050 __gnu_parallel::parallel_random_shuffle(). */ 00051 template<typename RandomAccessIterator> 00052 struct DRandomShufflingGlobalData 00053 { 00054 typedef std::iterator_traits<RandomAccessIterator> traits_type; 00055 typedef typename traits_type::value_type value_type; 00056 typedef typename traits_type::difference_type difference_type; 00057 00058 /** @brief Begin iterator of the source. */ 00059 RandomAccessIterator& source; 00060 00061 /** @brief Temporary arrays for each thread. */ 00062 value_type** temporaries; 00063 00064 /** @brief Two-dimensional array to hold the thread-bin distribution. 00065 * 00066 * Dimensions (num_threads + 1) x (num_bins + 1). */ 00067 difference_type** dist; 00068 00069 /** @brief Start indexes of the threads' chunks. */ 00070 difference_type* starts; 00071 00072 /** @brief Number of the thread that will further process the 00073 corresponding bin. */ 00074 thread_index_t* bin_proc; 00075 00076 /** @brief Number of bins to distribute to. */ 00077 int num_bins; 00078 00079 /** @brief Number of bits needed to address the bins. */ 00080 int num_bits; 00081 00082 /** @brief Constructor. */ 00083 DRandomShufflingGlobalData(RandomAccessIterator& _source) 00084 : source(_source) { } 00085 }; 00086 00087 /** @brief Local data for a thread participating in 00088 __gnu_parallel::parallel_random_shuffle(). 00089 */ 00090 template<typename RandomAccessIterator, typename RandomNumberGenerator> 00091 struct DRSSorterPU 00092 { 00093 /** @brief Number of threads participating in total. */ 00094 int num_threads; 00095 00096 /** @brief Begin index for bins taken care of by this thread. */ 00097 bin_index bins_begin; 00098 00099 /** @brief End index for bins taken care of by this thread. */ 00100 bin_index bins_end; 00101 00102 /** @brief Random seed for this thread. */ 00103 uint32 seed; 00104 00105 /** @brief Pointer to global data. */ 00106 DRandomShufflingGlobalData<RandomAccessIterator>* sd; 00107 }; 00108 00109 /** @brief Generate a random number in @c [0,2^logp). 00110 * @param logp Logarithm (basis 2) of the upper range bound. 00111 * @param rng Random number generator to use. 00112 */ 00113 template<typename RandomNumberGenerator> 00114 inline int 00115 random_number_pow2(int logp, RandomNumberGenerator& rng) 00116 { return rng.genrand_bits(logp); } 00117 00118 /** @brief Random shuffle code executed by each thread. 00119 * @param pus Array of thread-local data records. */ 00120 template<typename RandomAccessIterator, typename RandomNumberGenerator> 00121 void 00122 parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, 00123 RandomNumberGenerator>* pus) 00124 { 00125 typedef std::iterator_traits<RandomAccessIterator> traits_type; 00126 typedef typename traits_type::value_type value_type; 00127 typedef typename traits_type::difference_type difference_type; 00128 00129 thread_index_t iam = omp_get_thread_num(); 00130 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam]; 00131 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd; 00132 00133 // Indexing: dist[bin][processor] 00134 difference_type length = sd->starts[iam + 1] - sd->starts[iam]; 00135 bin_index* oracles = new bin_index[length]; 00136 difference_type* dist = new difference_type[sd->num_bins + 1]; 00137 bin_index* bin_proc = new bin_index[sd->num_bins]; 00138 value_type** temporaries = new value_type*[d->num_threads]; 00139 00140 // Compute oracles and count appearances. 00141 for (bin_index b = 0; b < sd->num_bins + 1; ++b) 00142 dist[b] = 0; 00143 int num_bits = sd->num_bits; 00144 00145 random_number rng(d->seed); 00146 00147 // First main loop. 00148 for (difference_type i = 0; i < length; ++i) 00149 { 00150 bin_index oracle = random_number_pow2(num_bits, rng); 00151 oracles[i] = oracle; 00152 00153 // To allow prefix (partial) sum. 00154 ++(dist[oracle + 1]); 00155 } 00156 00157 for (bin_index b = 0; b < sd->num_bins + 1; ++b) 00158 sd->dist[b][iam + 1] = dist[b]; 00159 00160 # pragma omp barrier 00161 00162 # pragma omp single 00163 { 00164 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the 00165 // total number of items in bin s 00166 for (bin_index s = 0; s < sd->num_bins; ++s) 00167 __gnu_sequential::partial_sum(sd->dist[s + 1], 00168 sd->dist[s + 1] + d->num_threads + 1, 00169 sd->dist[s + 1]); 00170 } 00171 00172 # pragma omp barrier 00173 00174 sequence_index_t offset = 0, global_offset = 0; 00175 for (bin_index s = 0; s < d->bins_begin; ++s) 00176 global_offset += sd->dist[s + 1][d->num_threads]; 00177 00178 # pragma omp barrier 00179 00180 for (bin_index s = d->bins_begin; s < d->bins_end; ++s) 00181 { 00182 for (int t = 0; t < d->num_threads + 1; ++t) 00183 sd->dist[s + 1][t] += offset; 00184 offset = sd->dist[s + 1][d->num_threads]; 00185 } 00186 00187 sd->temporaries[iam] = static_cast<value_type*>( 00188 ::operator new(sizeof(value_type) * offset)); 00189 00190 # pragma omp barrier 00191 00192 // Draw local copies to avoid false sharing. 00193 for (bin_index b = 0; b < sd->num_bins + 1; ++b) 00194 dist[b] = sd->dist[b][iam]; 00195 for (bin_index b = 0; b < sd->num_bins; ++b) 00196 bin_proc[b] = sd->bin_proc[b]; 00197 for (thread_index_t t = 0; t < d->num_threads; ++t) 00198 temporaries[t] = sd->temporaries[t]; 00199 00200 RandomAccessIterator source = sd->source; 00201 difference_type start = sd->starts[iam]; 00202 00203 // Distribute according to oracles, second main loop. 00204 for (difference_type i = 0; i < length; ++i) 00205 { 00206 bin_index target_bin = oracles[i]; 00207 thread_index_t target_p = bin_proc[target_bin]; 00208 00209 // Last column [d->num_threads] stays unchanged. 00210 ::new(&(temporaries[target_p][dist[target_bin + 1]++])) 00211 value_type(*(source + i + start)); 00212 } 00213 00214 delete[] oracles; 00215 delete[] dist; 00216 delete[] bin_proc; 00217 delete[] temporaries; 00218 00219 # pragma omp barrier 00220 00221 // Shuffle bins internally. 00222 for (bin_index b = d->bins_begin; b < d->bins_end; ++b) 00223 { 00224 value_type* begin = 00225 sd->temporaries[iam] + 00226 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]), 00227 * end = 00228 sd->temporaries[iam] + sd->dist[b + 1][d->num_threads]; 00229 sequential_random_shuffle(begin, end, rng); 00230 std::copy(begin, end, sd->source + global_offset + 00231 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads])); 00232 } 00233 00234 ::operator delete(sd->temporaries[iam]); 00235 } 00236 00237 /** @brief Round up to the next greater power of 2. 00238 * @param x Integer to round up */ 00239 template<typename T> 00240 T 00241 round_up_to_pow2(T x) 00242 { 00243 if (x <= 1) 00244 return 1; 00245 else 00246 return (T)1 << (__log2(x - 1) + 1); 00247 } 00248 00249 /** @brief Main parallel random shuffle step. 00250 * @param begin Begin iterator of sequence. 00251 * @param end End iterator of sequence. 00252 * @param n Length of sequence. 00253 * @param num_threads Number of threads to use. 00254 * @param rng Random number generator to use. 00255 */ 00256 template<typename RandomAccessIterator, typename RandomNumberGenerator> 00257 void 00258 parallel_random_shuffle_drs(RandomAccessIterator begin, 00259 RandomAccessIterator end, 00260 typename std::iterator_traits 00261 <RandomAccessIterator>::difference_type n, 00262 thread_index_t num_threads, 00263 RandomNumberGenerator& rng) 00264 { 00265 typedef std::iterator_traits<RandomAccessIterator> traits_type; 00266 typedef typename traits_type::value_type value_type; 00267 typedef typename traits_type::difference_type difference_type; 00268 00269 _GLIBCXX_CALL(n) 00270 00271 const _Settings& __s = _Settings::get(); 00272 00273 if (num_threads > n) 00274 num_threads = static_cast<thread_index_t>(n); 00275 00276 bin_index num_bins, num_bins_cache; 00277 00278 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 00279 // Try the L1 cache first. 00280 00281 // Must fit into L1. 00282 num_bins_cache = std::max<difference_type>( 00283 1, n / (__s.L1_cache_size_lb / sizeof(value_type))); 00284 num_bins_cache = round_up_to_pow2(num_bins_cache); 00285 00286 // No more buckets than TLB entries, power of 2 00287 // Power of 2 and at least one element per bin, at most the TLB size. 00288 num_bins = std::min<difference_type>(n, num_bins_cache); 00289 00290 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 00291 // 2 TLB entries needed per bin. 00292 num_bins = std::min<difference_type>(__s.TLB_size / 2, num_bins); 00293 #endif 00294 num_bins = round_up_to_pow2(num_bins); 00295 00296 if (num_bins < num_bins_cache) 00297 { 00298 #endif 00299 // Now try the L2 cache 00300 // Must fit into L2 00301 num_bins_cache = static_cast<bin_index>(std::max<difference_type>( 00302 1, n / (__s.L2_cache_size / sizeof(value_type)))); 00303 num_bins_cache = round_up_to_pow2(num_bins_cache); 00304 00305 // No more buckets than TLB entries, power of 2. 00306 num_bins = static_cast<bin_index>( 00307 std::min(n, static_cast<difference_type>(num_bins_cache))); 00308 // Power of 2 and at least one element per bin, at most the TLB size. 00309 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 00310 // 2 TLB entries needed per bin. 00311 num_bins = std::min( 00312 static_cast<difference_type>(__s.TLB_size / 2), num_bins); 00313 #endif 00314 num_bins = round_up_to_pow2(num_bins); 00315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 00316 } 00317 #endif 00318 00319 num_threads = std::min<bin_index>(num_threads, num_bins); 00320 00321 if (num_threads <= 1) 00322 return sequential_random_shuffle(begin, end, rng); 00323 00324 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin); 00325 DRSSorterPU<RandomAccessIterator, random_number >* pus; 00326 difference_type* starts; 00327 00328 # pragma omp parallel num_threads(num_threads) 00329 { 00330 thread_index_t num_threads = omp_get_num_threads(); 00331 # pragma omp single 00332 { 00333 pus = new DRSSorterPU<RandomAccessIterator, random_number> 00334 [num_threads]; 00335 00336 sd.temporaries = new value_type*[num_threads]; 00337 sd.dist = new difference_type*[num_bins + 1]; 00338 sd.bin_proc = new thread_index_t[num_bins]; 00339 for (bin_index b = 0; b < num_bins + 1; ++b) 00340 sd.dist[b] = new difference_type[num_threads + 1]; 00341 for (bin_index b = 0; b < (num_bins + 1); ++b) 00342 { 00343 sd.dist[0][0] = 0; 00344 sd.dist[b][0] = 0; 00345 } 00346 starts = sd.starts = new difference_type[num_threads + 1]; 00347 int bin_cursor = 0; 00348 sd.num_bins = num_bins; 00349 sd.num_bits = __log2(num_bins); 00350 00351 difference_type chunk_length = n / num_threads, 00352 split = n % num_threads, start = 0; 00353 difference_type bin_chunk_length = num_bins / num_threads, 00354 bin_split = num_bins % num_threads; 00355 for (thread_index_t i = 0; i < num_threads; ++i) 00356 { 00357 starts[i] = start; 00358 start += (i < split) ? (chunk_length + 1) : chunk_length; 00359 int j = pus[i].bins_begin = bin_cursor; 00360 00361 // Range of bins for this processor. 00362 bin_cursor += (i < bin_split) ? 00363 (bin_chunk_length + 1) : bin_chunk_length; 00364 pus[i].bins_end = bin_cursor; 00365 for (; j < bin_cursor; ++j) 00366 sd.bin_proc[j] = i; 00367 pus[i].num_threads = num_threads; 00368 pus[i].seed = rng(std::numeric_limits<uint32>::max()); 00369 pus[i].sd = &sd; 00370 } 00371 starts[num_threads] = start; 00372 } //single 00373 // Now shuffle in parallel. 00374 parallel_random_shuffle_drs_pu(pus); 00375 } // parallel 00376 00377 delete[] starts; 00378 delete[] sd.bin_proc; 00379 for (int s = 0; s < (num_bins + 1); ++s) 00380 delete[] sd.dist[s]; 00381 delete[] sd.dist; 00382 delete[] sd.temporaries; 00383 00384 delete[] pus; 00385 } 00386 00387 /** @brief Sequential cache-efficient random shuffle. 00388 * @param begin Begin iterator of sequence. 00389 * @param end End iterator of sequence. 00390 * @param rng Random number generator to use. 00391 */ 00392 template<typename RandomAccessIterator, typename RandomNumberGenerator> 00393 void 00394 sequential_random_shuffle(RandomAccessIterator begin, 00395 RandomAccessIterator end, 00396 RandomNumberGenerator& rng) 00397 { 00398 typedef std::iterator_traits<RandomAccessIterator> traits_type; 00399 typedef typename traits_type::value_type value_type; 00400 typedef typename traits_type::difference_type difference_type; 00401 00402 difference_type n = end - begin; 00403 const _Settings& __s = _Settings::get(); 00404 00405 bin_index num_bins, num_bins_cache; 00406 00407 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 00408 // Try the L1 cache first, must fit into L1. 00409 num_bins_cache = 00410 std::max<difference_type> 00411 (1, n / (__s.L1_cache_size_lb / sizeof(value_type))); 00412 num_bins_cache = round_up_to_pow2(num_bins_cache); 00413 00414 // No more buckets than TLB entries, power of 2 00415 // Power of 2 and at least one element per bin, at most the TLB size 00416 num_bins = std::min(n, (difference_type)num_bins_cache); 00417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 00418 // 2 TLB entries needed per bin 00419 num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins); 00420 #endif 00421 num_bins = round_up_to_pow2(num_bins); 00422 00423 if (num_bins < num_bins_cache) 00424 { 00425 #endif 00426 // Now try the L2 cache, must fit into L2. 00427 num_bins_cache = 00428 static_cast<bin_index>(std::max<difference_type>( 00429 1, n / (__s.L2_cache_size / sizeof(value_type)))); 00430 num_bins_cache = round_up_to_pow2(num_bins_cache); 00431 00432 // No more buckets than TLB entries, power of 2 00433 // Power of 2 and at least one element per bin, at most the TLB size. 00434 num_bins = static_cast<bin_index> 00435 (std::min(n, static_cast<difference_type>(num_bins_cache))); 00436 00437 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 00438 // 2 TLB entries needed per bin 00439 num_bins = 00440 std::min<difference_type>(__s.TLB_size / 2, num_bins); 00441 #endif 00442 num_bins = round_up_to_pow2(num_bins); 00443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 00444 } 00445 #endif 00446 00447 int num_bits = __log2(num_bins); 00448 00449 if (num_bins > 1) 00450 { 00451 value_type* target = static_cast<value_type*>( 00452 ::operator new(sizeof(value_type) * n)); 00453 bin_index* oracles = new bin_index[n]; 00454 difference_type* dist0 = new difference_type[num_bins + 1], 00455 * dist1 = new difference_type[num_bins + 1]; 00456 00457 for (int b = 0; b < num_bins + 1; ++b) 00458 dist0[b] = 0; 00459 00460 random_number bitrng(rng(0xFFFFFFFF)); 00461 00462 for (difference_type i = 0; i < n; ++i) 00463 { 00464 bin_index oracle = random_number_pow2(num_bits, bitrng); 00465 oracles[i] = oracle; 00466 00467 // To allow prefix (partial) sum. 00468 ++(dist0[oracle + 1]); 00469 } 00470 00471 // Sum up bins. 00472 __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0); 00473 00474 for (int b = 0; b < num_bins + 1; ++b) 00475 dist1[b] = dist0[b]; 00476 00477 // Distribute according to oracles. 00478 for (difference_type i = 0; i < n; ++i) 00479 ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i)); 00480 00481 for (int b = 0; b < num_bins; ++b) 00482 { 00483 sequential_random_shuffle(target + dist1[b], 00484 target + dist1[b + 1], 00485 rng); 00486 } 00487 00488 // Copy elements back. 00489 std::copy(target, target + n, begin); 00490 00491 delete[] dist0; 00492 delete[] dist1; 00493 delete[] oracles; 00494 ::operator delete(target); 00495 } 00496 else 00497 __gnu_sequential::random_shuffle(begin, end, rng); 00498 } 00499 00500 /** @brief Parallel random public call. 00501 * @param begin Begin iterator of sequence. 00502 * @param end End iterator of sequence. 00503 * @param rng Random number generator to use. 00504 */ 00505 template<typename RandomAccessIterator, typename RandomNumberGenerator> 00506 inline void 00507 parallel_random_shuffle(RandomAccessIterator begin, 00508 RandomAccessIterator end, 00509 RandomNumberGenerator rng = random_number()) 00510 { 00511 typedef std::iterator_traits<RandomAccessIterator> traits_type; 00512 typedef typename traits_type::difference_type difference_type; 00513 difference_type n = end - begin; 00514 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ; 00515 } 00516 00517 } 00518 00519 #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */