Can OpenMP 4 run target regions in parallel?

Can OpenMP 4 run target regions in parallel? - gpu

Reading some tutorials from OpenMP 4, I found that target regions can participate in the same dependency graph of CPU tasks, using the depend clause.
When programming OpenMP tasks, we know they can be run concurrently. But is this possible on GPUs? Can a GPU run multiple target regions simultaneously?
I tried with this code:
#include <omp.h>
#include <stdio.h>
int main() {
int i;
#pragma omp parallel
#pragma omp single
{
#pragma omp task private(i)
#pragma omp target
{
for (i = 0; i < 100; i++)
printf("1 %d\n", i);
}
#pragma omp task private(i)
#pragma omp target
{
for (i = 0; i < 100; i++)
printf("2 %d\n", i);
}
#pragma omp task private(i)
#pragma omp target
{
for (i = 0; i < 100; i++)
printf("3 %d\n", i);
}
}
#pragma omp taskwait
}
Although the tasks are executed in arbitrary order, the target regions are executed atomically, one region at a time.

Related

OpenMP offloading on GPU, 'simd' specificities

I was wondering how to interpret the following OpenMP constructs:
// Variant 1: combined offload construct without simd.
// N is not declared in this fragment; it stands for the trip count.
#pragma omp target teams distribute parallel for
for(int i = 0; i < N; ++i) {
// compute
}
// Variant 2: the same combined construct with simd appended; the loop itself is unchanged.
#pragma omp target teams distribute parallel for simd
for(int i = 0; i < N; ++i) {
// compute
}
Note the simd clause added on the second loop. According to the OpenMP 5.1 specification, this clause declares that "multiple iterations of the loop can be executed concurrently by using SIMD instructions".
I believe I can very well conceive how simd is implemented and behaves on CPU but on GPU, more precisely, AMD GPUs, there is no such thing as exposed SIMD instruction in the sense that a HIP thread is in fact a SIMD instruction lane.
According to the OpenMP specification, if there is a loop-carried dependency, or if the compiler cannot prove there is none, then when OpenMP maps the teams to thread blocks/workgroups and the threads to SIMD lanes, it is forced to use thread blocks of only one thread.
How do you interpret the target teams distribute parallel for simd:
Does it mean that in this context simd can't be translated for a GPU?
Or maybe - each thread is handled as if it had a single SIMD lane?
There is at least one similar but old and unanswered question:
How is omp simd for loop executed on GPUs?

According to the test case below, the assembly generated for AMD MI250 (gfx90a) is the same with or without simd. Though, if you look at the CPU code, you shall see a significant change with the simd clause which in this case, allows for a similar optimization to the ones observed with an explicit usage of the restrict keyword.
TLDR: Currently, the simd clause is irrelevant and only leads to this warning, even for extremely trivial cases:
loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning].
#include <cstddef>
#include <cstdint>
#define RESTRICT __restrict
using Float = double;
// Reference case without restrict: stores b[0] squared into a[0], then b[0] into a[1].
void test0_0(Float* a, const Float* b) {
a[0] = b[0] * b[0];
// Forced store/reload (b[0] could be a[0]): nothing tells the compiler that
// a and b do not alias, so b[0] has to be read again after the store above.
a[1] = b[0];
}
// Same statements as the non-restrict reference case, but both pointers are
// RESTRICT-qualified (RESTRICT expands to __restrict).
void test0_1(Float* RESTRICT a, const Float* RESTRICT b) {
a[0] = b[0] * b[0];
// No forced store/reload: the restrict qualifiers promise that a and b do not
// alias, so the value of b[0] read above may be reused.
a[1] = b[0];
}
// Host loop (omp parallel for, no simd, no restrict): walks the arrays in steps
// of two, storing b[i] squared into a[i] and b[i] into a[i + 1].
// NOTE(review): for an odd length the last iteration writes a[length];
// presumably callers pass an even length - confirm.
void test1_0(Float* a, Float* b, std::size_t length) {
#pragma omp parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// Forced store/reload: a and b may alias, so b[i] is read again after the store.
a[i + 1] = b[i + 0];
}
}
// Host loop with simd added (omp parallel for simd, no restrict): same body as
// the non-simd host loop, used to show the effect of simd on CPU code generation.
void test1_1(Float* a, Float* b, std::size_t length) {
#pragma omp parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// simd -> no loop carried dependencies:
// No forced store/reload -> easier vectorization, less generated code.
a[i + 1] = b[i + 0];
}
}
// Offloaded loop (target teams distribute parallel for), without restrict and
// without simd: the baseline for the device-side comparison.
void test2_0(Float* a, Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload, as expected (a and b may alias).
a[i + 1] = b[i + 0];
}
}
// Offloaded loop with RESTRICT-qualified pointers and no simd: checks whether
// the restrict promise removes the reload in device code.
void test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload even though a/b are restrict-qualified: BAD!
a[i + 1] = b[i + 0];
}
}
// Offloaded loop with simd but WITHOUT restrict (a and b are plain pointers here):
// checks whether simd alone removes the reload in device code.
void test3_0(Float* a, const Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload even though simd is requested: BAD!
// (a/b are not restrict-qualified in this variant.)
a[i + 1] = b[i + 0];
}
}
// Offloaded loop with both simd and RESTRICT-qualified pointers.
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload even with simd and restrict-qualified a/b: BAD!
a[i + 1] = b[i + 0];
}
}
test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
// Copy variant (no multiplication) of the offloaded simd loop WITHOUT restrict:
// a[i] and a[i + 1] both receive b[i].
// NOTE(review): an earlier test3_0 with the same signature appears above in this
// listing; the two definitions cannot share one translation unit.
void test3_0(Float* a, const Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
// ASM shows forced store/reload even though simd is requested: BAD!
// (a/b are not restrict-qualified in this variant.)
a[i + 1] = b[i + 0];
}
}
// Copy variant (no multiplication) of the offloaded loop with both simd and
// RESTRICT-qualified pointers: a[i] and a[i + 1] both receive b[i].
// NOTE(review): an earlier test3_1 with the same signature appears above in this
// listing; the two definitions cannot share one translation unit.
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
// ASM shows forced store/reload even with simd and restrict-qualified a/b: BAD!
a[i + 1] = b[i + 0];
}
}
Code available at: https://godbolt.org/z/sMY48s8jz

stringstream segmentation fault

Using linux and g++.
This works:
// Working variant: one stringstream is constructed before the loop; the loop body is empty.
stringstream ss;
for (int k = 1; k < 1000; k++){
}
This should also work, but it results in a "segmentation fault":
// Failing variant: a fresh stringstream is constructed and destroyed on each of
// the 999 iterations (k = 1..999).
for (int k = 1; k <1000; k++){
stringstream ss;
}
Why?

Thank you Antonio Perez for your reply.
Actually my code was exactly this:
// NOTE(review): pack(1) is set before any header is included, so it is in effect
// while <sstream> and <iostream> are processed; this presumably changes the layout
// of library types relative to the prebuilt standard library - confirm.
#pragma pack(1)
#include <sstream>
#include <iostream>
// Constructs and destroys a std::stringstream twice; this ordering is the one
// reported to end in a segmentation fault.
int main(){
for (int i = 0; i < 2; i++){
std::stringstream ss;
}
}
Amazingly if I displace the #pragma pack(1) like this:
// Same program with pack(1) moved below <sstream>: only <iostream> and the code
// that follows are processed with packing enabled. This ordering is the one
// reported to run without error.
#include <sstream>
#pragma pack(1)
#include <iostream>
// Constructs and destroys a std::stringstream twice.
int main(){
for (int i = 0; i < 2; i++){
std::stringstream ss;
}
}
...then no error occurs!
Is there a possible (non-bug) reason for why sstream does not permit packing of its structure?

OpenACC; copy_in not working?

I have this sample code:
#include <stdio.h>
#include <stdlib.h>
#ifdef _OPENACC
#include <openacc.h>
#endif
#define N 1000
/*
 * Fills a 100-element array with 1.0, adds the index to every element inside an
 * OpenACC data region, and prints a[10] afterwards.
 */
int main() {
/* The runtime calls and the banner are only compiled when _OPENACC is defined. */
#ifdef _OPENACC
acc_init(acc_device_not_host);
printf(" Compiling with OpenACC support \n");
#endif
double * a;
/* The array length used below is n (100); the N macro defined above is not used here. */
int n = 100;
/* NOTE(review): the malloc result is not checked and a is never freed before return. */
a = (double *) malloc(n * sizeof(double));
/* 1.0f is a float literal; it is converted to double on assignment. */
for (int i = 0; i < n; i++)
a[i] = 1.0f;
/*
 * NOTE(review): the OpenACC clause is spelled "copyin" (no underscore), as the
 * answer to this question points out; "copy_in" as written here is presumably
 * not recognised by the compiler - confirm with the compiler diagnostics.
 */
#pragma acc data copy_in(a[0:n])
{
/* Adds i to a[i]; if this update is visible on the host, a[10] becomes 10 + 1.0 = 11.0. */
#pragma acc kernels loop
for (int i = 0; i < n; i++)
a[i] = (double) i + a[i];
}
#ifdef _OPENACC
acc_shutdown(acc_device_not_host);
#endif
/* Prints the host copy of a[10]. */
printf("Value of a[10]: %lf\n", a[10]);
return 0;
}
My teacher told me that the output should be 1.0 because I use copy_in: a is copied to the accelerator, but when the region ends, a still contains 1.0 in every position on the host. However, when I run this code I get 11.0. Why?

There's a couple of things going on here. First, the correct clause is copyin (no underscore). Second, since you're only copying the input values into the region, any changes made within the data region will not come back to the CPU, so unless you're running this on a shared memory system, for example running on a multicore CPU, then the value of a at your printf statement will be like that loop never ran. In order to get the results back from the data region, you'll actually want a copy clause instead. That informs the compiler to copy in the input values to the region and copy out the output values from the region.
Since you're getting 11, clearly the loop is getting run somewhere. What compiler are you using and what flags? Either you're not actually building with OpenACC enabled or you're running on a shared memory target and your teacher isn't.

Two processes substracting a number using pipe

I'm having difficulty making two processes communicate through a pipe and take turns subtracting from a number.
Output should be like:
process1: 9
process2: 8
process1: 7...
What I've done so far:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
/*
 * Question's attempt: parent and child exchange a counter over two pipes.
 * Each side reads once, prints, decrements and writes once, so exactly two
 * lines are printed. That one-shot behaviour is what the question asks about
 * and is deliberately kept; only the syntax error in the parent's write call
 * ("&n;" instead of "&n,") is repaired so that the program compiles.
 */
int main() {
    int p2c[2]; /* parent writes p2c[1], child reads p2c[0] */
    int c2p[2]; /* child writes c2p[1], parent reads c2p[0] */
    int n = 9;

    pipe(p2c);
    pipe(c2p);

    /* Queue the initial value before forking so the child's read has data. */
    write(p2c[1], &n, sizeof(int));

    if (fork() == 0) {
        /* Child: one read / decrement / write round, then exit. */
        read(p2c[0], &n, sizeof(int));
        printf("Got from parent: %d", n);
        n--;
        write(c2p[1], &n, sizeof(int));
        close(p2c[0]);
        close(p2c[1]);
        close(c2p[0]);
        close(c2p[1]);
        exit(0);
    }
    else {
        /* Parent: one read / decrement / write round. */
        read(c2p[0], &n, sizeof(int));
        printf("Got from child: %d", n);
        n--;
        write(p2c[1], &n, sizeof(int));
        close(p2c[0]);
        close(p2c[1]);
        close(c2p[0]);
        close(c2p[1]);
    }
    return 0;
}
With the output:
Got from parent:9
Got from child:8
What's the proper way to get these two processes substract the number till 0?

It makes sense that you're only getting "Got from parent:9 Got from child:8" as a result: each process reads, decrements, and writes exactly once. You need a while or for loop in both the child and the parent process to get what you're expecting. The stop conditions for those loops are n < 0 after decrementing n, or the write end of the pipe being closed:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
/*
 * Parent and child take turns decrementing a counter over two pipes until it
 * drops below zero.
 *
 * Each process closes the pipe ends it does not use right after fork().
 * Without that, a process keeps its own copy of the write end it is reading
 * from, so read() can never report end-of-file and the process that is still
 * waiting hangs after its peer has finished.
 *
 * Returns 0 on success, 1 if pipe(), fork() or the initial write fails.
 */
int main() {
    int p2c[2]; /* parent writes p2c[1], child reads p2c[0] */
    int c2p[2]; /* child writes c2p[1], parent reads c2p[0] */
    int n = 9;
    pid_t pid;

    if (pipe(p2c) == -1 || pipe(c2p) == -1) {
        perror("pipe");
        return 1;
    }

    /*
     * Prime the parent->child pipe before forking: one side has to write
     * first, otherwise both processes block in read() waiting for each other.
     */
    if (write(p2c[1], &n, sizeof(int)) != (ssize_t)sizeof(int)) {
        perror("write");
        return 1;
    }

    pid = fork();
    if (pid == -1) {
        perror("fork");
        return 1;
    }

    if (pid == 0) {
        /* Child: only reads from p2c and writes to c2p. */
        close(p2c[1]);
        close(c2p[0]);

        /* A short read covers both end-of-file (0) and errors (-1). */
        while (read(p2c[0], &n, sizeof(int)) == (ssize_t)sizeof(int)) {
            printf("Got from parent: %d\n", n);
            n--;
            /* Counter exhausted: stop without sending anything back. */
            if (n < 0) break;
            if (write(c2p[1], &n, sizeof(int)) != (ssize_t)sizeof(int)) break;
        }

        close(p2c[0]);
        close(c2p[1]);
        exit(0);
    }

    /* Parent: only reads from c2p and writes to p2c. */
    close(p2c[0]);
    close(c2p[1]);

    while (read(c2p[0], &n, sizeof(int)) == (ssize_t)sizeof(int)) {
        printf("Got from child: %d\n", n);
        n--;
        if (n < 0) break;
        if (write(p2c[1], &n, sizeof(int)) != (ssize_t)sizeof(int)) break;
    }

    /* Closing the write end delivers end-of-file to a child still in read(). */
    close(c2p[0]);
    close(p2c[1]);

    /* Reap the child so it does not linger as a zombie. */
    wait(NULL);
    return 0;
}

Implementing a header in Objective C

I'm completely new to Objective-C and I'm trying to use it to wrap a C library. I have main.m, wrap.m and wrap.h files. From what I gather, the header file should contain the @interface and the source file should contain the @implementation. However, I don't really understand what to include in each of them. Right now my main file is:
/*
 * Copies the data blocks of the current entry from the read archive `ar` to
 * the write archive `aw`.
 *
 * Returns ARCHIVE_OK once the reader reports ARCHIVE_EOF, or the first status
 * below ARCHIVE_OK returned by either the read or the write call. Previously
 * those statuses were ignored, so a failing read never left the loop.
 */
int copy_data(struct archive *ar, struct archive *aw) {
    for (;;) {
        const void *buff;
        size_t size;
        off_t offset;
        int r = archive_read_data_block(ar, &buff, &size, &offset);
        if (r == ARCHIVE_EOF)
            return (ARCHIVE_OK);
        /* Stop on read problems instead of writing an undefined block. */
        if (r < ARCHIVE_OK)
            return (r);
        r = archive_write_data_block(aw, buff, size, offset);
        if (r < ARCHIVE_OK)
            return (r);
    }
}
/*
 * Extracts the archive named by argv[1] to disk, restoring timestamps,
 * permissions, ACLs and file flags.
 *
 * Returns 0 when extraction completes, 1 when the path argument is missing,
 * the archive cannot be opened, or libarchive reports a failure.
 *
 * The "@" sigils (@autoreleasepool, @"...") are restored; they had been
 * garbled into "#", which does not compile.
 */
int main(int argc, const char * argv[])
{
    int status = 0;

    @autoreleasepool {
        struct archive *a;
        struct archive *ext;
        struct archive_entry *entry;
        int flags;
        int r;

        /* argv[1] is only valid when a path was actually passed. */
        if (argc < 2) {
            NSLog(@"Missing archive path argument");
            return 1;
        }

        /* Select which attributes we want to restore. */
        flags = ARCHIVE_EXTRACT_TIME;
        flags |= ARCHIVE_EXTRACT_PERM;
        flags |= ARCHIVE_EXTRACT_ACL;
        flags |= ARCHIVE_EXTRACT_FFLAGS;

        a = archive_read_new();
        archive_read_support_format_all(a);
        archive_read_support_compression_all(a);
        ext = archive_write_disk_new();
        archive_write_disk_set_options(ext, flags);
        archive_write_disk_set_standard_lookup(ext);

        r = archive_read_open_filename(a, argv[1], 10240);
        if (r != ARCHIVE_OK) {
            NSLog(@"%s", archive_error_string(a));
            status = 1;
        }

        /* One iteration per archive entry; skipped entirely if the open failed. */
        while (status == 0) {
            r = archive_read_next_header(a, &entry);
            if (r == ARCHIVE_EOF)
                break;
            /* Anything worse than a warning cannot be recovered from here. */
            if (r < ARCHIVE_WARN) {
                NSLog(@"%s", archive_error_string(a));
                status = 1;
                break;
            }
            r = archive_write_header(ext, entry);
            if (r < ARCHIVE_OK) {
                /* The entry could not be created on disk: skip its data. */
                NSLog(@"%s", archive_error_string(ext));
            } else if (archive_entry_size(entry) > 0) {
                /* The loop condition ends extraction after this entry is finished. */
                if (copy_data(a, ext) < ARCHIVE_WARN)
                    status = 1;
            }
            archive_write_finish_entry(ext);
        }

        archive_read_close(a);
        archive_read_free(a);
        archive_write_close(ext);
        archive_write_free(ext);

        if (status == 0)
            NSLog(@"No Issues");
    }
    return status;
}
So far what I'm getting in my wrap.h file is:
// Asker's first attempt at wrap.h. The "@" sigils are restored (they had been
// garbled into "#") and the interface is closed with @end so that it parses.
// NOTE(review): the answers advise against declaring an interface for "main",
// because main is a plain C function rather than an Objective-C class.
typedef struct{
    int *a;
    int *ext;
}archive;

@interface main : NSObject
@property int flags;
@property int r;
@end
I don't know if that is close to what I need to do, and I'm getting errors on my ARCHIVE_EXTRACT saying they are undeclared identifiers which I assume also have to go into my wrap.h file but I'm not sure how to do that. Any help at all would be appreciated!

If you start your project in Xcode using the CommandLineTool template, you can select your language to be "C" or "C++", so you wouldn't have to mess with Objective-C at all.
As for the .h file that you currently have, don't use "@property" or "@interface" for "main". "main" is a C-style function and not an Objective-C thing.

If you are actually interested in an objectivec solution, follow Michael Dautermann's instructions to start a new Command Line project but instead of Type C use the Foundation option. This will give you a working main (just a regular c function). Then select new->objective c class to create your wrap.h/wrap.m. In the wrap.h you will pretty much exclusively be declaring your own objectivec public wrapper methods. In the wrap.m, you'll be importing what you want to wrap, and defining your wrapper functions.
//
// main.m
//
#import <Foundation/Foundation.h>
#import "wrap.h"
// Program entry point: calls the wrapper's class method inside an autorelease
// pool. The "@" of @autoreleasepool is restored; it had been garbled into "#".
int main(int argc, const char * argv[])
{
    @autoreleasepool {
        [wrap wrappedStuff];
    }
    return 0;
}
//
// wrap.h
//
----------
#import <Foundation/Foundation.h>
// Public interface of the wrapper class. The "@" sigils of @interface and @end
// are restored; they had been garbled into "#".
@interface wrap : NSObject
// Class method that runs the wrapped C code; no instance is needed.
+ (void)wrappedStuff;
@end
//
// wrap.m
//
#import "wrap.h"
#include "WhatImWrapping.h"
// Implementation of the wrapper class. The "@" sigils (@implementation, @end,
// @"...") are restored; they had been garbled into "#".
@implementation wrap

/*
 * Copies the data blocks of the current entry from the read archive `ar` to
 * the write archive `aw`.
 *
 * Returns ARCHIVE_OK once the reader reports ARCHIVE_EOF, or the first status
 * below ARCHIVE_OK returned by either the read or the write call, so that a
 * failing read cannot keep the loop spinning.
 */
int copy_data(struct archive *ar, struct archive *aw) {
    for (;;) {
        const void *buff;
        size_t size;
        off_t offset;
        int r = archive_read_data_block(ar, &buff, &size, &offset);
        if (r == ARCHIVE_EOF)
            return (ARCHIVE_OK);
        if (r < ARCHIVE_OK)
            return (r);
        r = archive_write_data_block(aw, buff, size, offset);
        if (r < ARCHIVE_OK)
            return (r);
    }
}

/*
 * Extracts the archive named by the first command-line argument to disk,
 * restoring timestamps, permissions, ACLs and file flags.
 *
 * main's argv is not in scope inside a class method, so the path is taken
 * from NSProcessInfo instead. Failures are logged; "No Issues" is logged only
 * when the whole archive was processed.
 */
+ (void)wrappedStuff
{
    NSArray *args = [[NSProcessInfo processInfo] arguments];
    if ([args count] < 2) {
        NSLog(@"Missing archive path argument");
        return;
    }
    const char *path = [[args objectAtIndex:1] fileSystemRepresentation];

    struct archive *a;
    struct archive *ext;
    struct archive_entry *entry;
    int flags;
    int r;
    BOOL ok = YES;

    /* Select which attributes we want to restore. */
    flags = ARCHIVE_EXTRACT_TIME;
    flags |= ARCHIVE_EXTRACT_PERM;
    flags |= ARCHIVE_EXTRACT_ACL;
    flags |= ARCHIVE_EXTRACT_FFLAGS;

    a = archive_read_new();
    archive_read_support_format_all(a);
    archive_read_support_compression_all(a);
    ext = archive_write_disk_new();
    archive_write_disk_set_options(ext, flags);
    archive_write_disk_set_standard_lookup(ext);

    r = archive_read_open_filename(a, path, 10240);
    if (r != ARCHIVE_OK) {
        NSLog(@"%s", archive_error_string(a));
        ok = NO;
    }

    /* One iteration per archive entry; skipped entirely if the open failed. */
    while (ok) {
        r = archive_read_next_header(a, &entry);
        if (r == ARCHIVE_EOF)
            break;
        /* Anything worse than a warning cannot be recovered from here. */
        if (r < ARCHIVE_WARN) {
            NSLog(@"%s", archive_error_string(a));
            ok = NO;
            break;
        }
        r = archive_write_header(ext, entry);
        if (r < ARCHIVE_OK) {
            /* The entry could not be created on disk: skip its data. */
            NSLog(@"%s", archive_error_string(ext));
        } else if (archive_entry_size(entry) > 0) {
            /* The loop condition ends extraction after this entry is finished. */
            if (copy_data(a, ext) < ARCHIVE_WARN)
                ok = NO;
        }
        archive_write_finish_entry(ext);
    }

    archive_read_close(a);
    archive_read_free(a);
    archive_write_close(ext);
    archive_write_free(ext);

    if (ok)
        NSLog(@"No Issues");
}

@end

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Can OpenMP 4 run target regions in parallel? - gpu

Related

OpenMP offloading on GPU, 'simd' specificities

stringstream segmentation fault

OpenACC; copy_in not working?

Two processes substracting a number using pipe

Implementing a header in Objective C

Categories

Resources