Artur Laksberg

Download Report

Transcript Artur Laksberg

Renderscript
OpenMP
CUDA
C++ AMP
PPL
TBB
MPI
OpenACC
OpenCL
Cilk Plus GCD
void quicksort(int *v, int start, int end) {
if (start < end) {
int pivot = partition(v, start, end);
quicksort(v, start, pivot - 1);
quicksort(v, pivot + 1, end);
}
}
void quicksort(int *v, int start, int end) {
if (start < end) {
Problem 1:
expensive
int pivot = partition(v, start, end);
std::thread t1([&] {
quicksort(v, start, pivot - 1);
});
std::thread t2([&] {
quicksort(v, pivot + 1, end);
});
t1.join();
t2.join();
}
}
Problem 3:
Exceptions??
Problem 2:
Fork-join not enforced
void quicksort(int *v, int start, int end) {
if (start < end) {
int pivot = partition(v, start, end);
parallel region
task
quicksort(v, start, pivot - 1);
quicksort(v, pivot + 1, end);
}
}
task
void quicksort(int *v, int start, int end) {
if (start < end) {
parallel region
task_region([&] (auto& r) {
int pivot = partition(v, start, end);
r.run([&] {
quicksort(v, start, pivot - 1);
});
r.run([&] {
quicksort(v, pivot + 1, end);
});
});
}
}
task
task
proc 1
proc 2
proc 3
proc 4
proc 1
proc 2
proc 3
proc 4
Old items
New items
proc 1
proc 2
proc 3
proc 4
Old items
New items
proc 1
proc 2
proc 3
proc 4
Old items
New items
“Thief”
e();
Q1: What
thread runs f?
e()
Q2: What
thread runs g?
task_region([] (auto& r) {
r.run(f);
g();
});
h();
g()
f()
Q3: What
thread runs h?
h()
task_region([] (auto& r) {
for(int i=0; i<n; ++i)
r.run(f);
});
Alex Stepanov: Start With The Algorithms
std::sort(begin, end,
[](int a, int b) { return a < b; });
int comparisons = 0;
std::sort(begin, end,
[&](int a, int b) { comparisons++; return a < b; });
template<typename Iterator, typename Compare>
void chaos_sort( Iterator first, Iterator last, Compare comp ) {
auto n = last-first;
std::vector<char> c(n);
for(;;) {
bool flag = false;
for( size_t i=1; i<n; ++i ) {
c[i] = comp(first[i],first[i-1]);
flag |= c[i];
}
if( !flag ) break;
for( size_t i=1; i<n; ++i )
if( c[i] )
std::swap( first[i-1], first[i] );
}
}
extern const sequential_execution_policy seq;
extern const parallel_execution_policy par;
extern const vector_execution_policy vec;
class execution_policy
{
public:
// ...
const type_info& target_type() const;
template<class T> T *target();
template<class T> const T *target() const;
};
std::vector<int> vec = ...
// standard sequential sort
std::sort(vec.begin(), vec.end());
using namespace std::experimental::parallel;
// explicitly sequential sort
sort(seq, vec.begin(), vec.end());
// permitting parallel execution
sort(par, vec.begin(), vec.end());
// permitting vectorization as well
sort(vec, vec.begin(), vec.end());
size_t threshold = ...
execution_policy exec = seq;
if(vec.size() > threshold)
{
exec = par;
}
sort(exec, vec.begin(), vec.end());
try
{
r = std::inner_product(std::par, a.begin(), a.end(), b.begin(),
func1, func2, 0);
}
catch(const exception_list& list)
{
for(auto& exptr : list)
{
// process exception pointer exptr
}
}
int a[n] = ...;
int b[n] = ...;
for(int i=0; i<n; ++i)
{
a[i] = b[i] + c;
}
movdqu
movdqu
paddd
paddd
movdqu
xmm1, XMMWORD PTR _b$[esp+eax+132]
xmm0, XMMWORD PTR _a$[esp+eax+132]
xmm1, xmm2
xmm1, xmm0
XMMWORD PTR _a$[esp+eax+132], xmm1
a[i:i+3] = b[i:i+3] + c;
void f(int* a, int*b)
{
for(int i=0; i<n; ++i)
{
a[i] = b[i] + c;
func();
Aliasing?
mov
add
add
call
}
}
Side effects?
Dependence?
Exceptions?
ecx, DWORD PTR _b$[esp+esi+140]
ecx, edi
DWORD PTR _a$[esp+esi+140], ecx
func
for(int i=0; i<n; ++i)
{
lock.enter();
a[i] = b[i] + c;
lock.release();
}
?
for(int i=0; i<n; i+=4)
{
for(int j=0; j<4; ++j)
lock.enter();
a[i:i+3] = b[i:i+3] + c;
for(int j=0; j<4; ++j)
lock.release();
}
void f(int* a, int*b)
{
for(int i=0; i<n; ++i)
{
a[i] = b[i] + c;
func();
}
}
for(int i=0; i<n; i+=4)
{
a[i:i+3] = b[i:i+3] + c;
for(int j=0; j<4; ++j)
func();
}
void f(int* a, int*b)
{
integer_iterator begin {0};
integer_iterator end {n};
std::for_each( std::vec, begin, end, [&](int i)
{
a[i] = b[i] + c;
func();
}
}