CUDA में ऐसा बैरियर कैसे बनाया जाए जिस पर कर्नेल के सभी थ्रेड्स तब तक प्रतीक्षा करें, जब तक CPU उस बैरियर को यह संकेत न भेज दे कि आगे बढ़ना सुरक्षित/उपयोगी है?

मैं एक CUDA कर्नेल लॉन्च करने के ऊपरी हिस्से से बचना चाहता हूं। बचने के लिए दो प्रकार के ओवरहेड हैं: (1) एक्स ब्लॉक और वाई थ्रेड्स पर कर्नेल को लॉन्च करने की लागत, और (2) मुझे अपनी साझा मेमोरी को फिर से शुरू करने में लगने वाला समय, जिसमें मोटे तौर पर इनवोकेशन के बीच समान सामग्री होगी .

हम CPU वर्कलोड में हर समय थ्रेड्स को रीसायकल/री-यूज करते हैं। और CUDA event सिंक्रोनाइज़ेशन प्रिमिटिव भी प्रदान करता है। शायद अधिक पारंपरिक सिग्नलिंग ऑब्जेक्ट प्रदान करने के लिए यह न्यूनतम हार्डवेयर लागत होगी।

यहां कुछ कोड दिया गया है जो मेरी वांछित अवधारणा के लिए एक ढांचा (स्केलेटन) प्रस्तुत करता है। पाठक शायद QUESTION IS HERE खोजना चाहेगा। इसे Nsight में बनाने के लिए डिवाइस लिंकर मोड को अलग संकलन में सेट करने की आवश्यकता है (कम से कम, मुझे यह आवश्यक लगा)।

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>

// Forward declaration of the error-reporting helper defined at the bottom of the file.
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
// Wraps a CUDA runtime call: on failure, reports the stringified expression,
// the error text, and the call site, then exits.
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

// Iterations of busy work performed by count_down() per "go" signal.
const int COUNT_DOWN_ITERATIONS = 1000;
const int KERNEL_MAXIMUM_LOOPS = 5; // IRL, we'd set this large enough to prevent hitting this value, unless the kernel is externally terminated
// Number of "go" signals the CPU sends to the resident kernel.
const int SIGNALS_TO_SEND_COUNT = 3;
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 2;

// Simulated unit of work: folds the arithmetic series 0 + 1 + ... +
// (COUNT_DOWN_ITERATIONS - 1) into the integer at the given location,
// guaranteeing an observable side effect so the loop is not optimized away.
__device__ void count_down(int * shared_location_to_ensure_side_effect) {
    int accumulator = *shared_location_to_ensure_side_effect;
    int term = 0;
    while (term < COUNT_DOWN_ITERATIONS) {
        accumulator += term;
        ++term;
    }
    *shared_location_to_ensure_side_effect = accumulator;
}

/**
 * CUDA kernel waits for events and then counts down upon receiving them.
 *
 * NOTE(review): the device-side cudaStreamWaitEvent()/cudaEventRecord()
 * calls below are CUDA dynamic-parallelism device-runtime calls (hence the
 * separate-compilation requirement mentioned in the question). Per the
 * device runtime documentation, cudaStreamWaitEvent() only inserts a
 * stream-ordering dependency for subsequently launched work; it does NOT
 * suspend the calling thread — consistent with the output below, where the
 * loop free-runs without waiting for the CPU.
 *
 * NOTE(review): cudaEventBlockingSync is a cudaEventCreateWithFlags() flag;
 * the flags parameter of cudaStreamWaitEvent() is documented as "must be 0".
 *
 * NOTE(review): every thread of the block stores a per-thread value
 * (indexed by threadIdx.x) into the single __shared__ int below, so the
 * threads race and one arbitrary winner's value is used for the whole block.
 */
__global__ void kernel(cudaStream_t stream, cudaEvent_t go_event, cudaEvent_t done_event, int ** cuda_malloc_managed_int_address) {
    __shared__ int local_copy_of_cuda_malloc_managed_int_address; // NOTE(review): __shared__ storage is NOT zero-initialized; "we always start at 0" is not guaranteed

    printf("Block %i, Thread %i: entered kernel\n", blockIdx.x, threadIdx.x);
    for (int i = 0; i < KERNEL_MAXIMUM_LOOPS; ++i) {
        printf("Block %i, Thread %i: entered loop; waitin 4 go_event\n", blockIdx.x, threadIdx.x);

        // QUESTION IS HERE: I want this to block on receiving a signal from the
        // CPU, indicating that work is ready to be done
        // (see header note: this call orders the stream; it does not block this thread)
        cudaStreamWaitEvent(stream, go_event, cudaEventBlockingSync);

        printf("Block %i, Thread %i:      in loop; received go_event\n", blockIdx.x, threadIdx.x);
        if (i == 0) { // we have received the signal and data is ready to be interpreted
            // Race: all threads of the block write distinct values to the same shared scalar.
            local_copy_of_cuda_malloc_managed_int_address = cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x];
        }
        count_down(&local_copy_of_cuda_malloc_managed_int_address);
        printf("Block %i, Thread %i:      finished counting\n", blockIdx.x, threadIdx.x);
        cudaEventRecord(done_event, stream); // stream-ordered record; by itself this does not notify the CPU
        printf("Block %i, Thread %i:      recorded event; may loop back\n", blockIdx.x, threadIdx.x);
    }
    printf("Block %i, Thread %i: copying result %i back to managed memory\n", blockIdx.x, threadIdx.x, local_copy_of_cuda_malloc_managed_int_address);
    cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x] = local_copy_of_cuda_malloc_managed_int_address;
    printf("Block %i, Thread %i: exiting kernel\n", blockIdx.x, threadIdx.x);
}


/**
 * Host driver: allocates managed per-thread slots, launches the resident
 * kernel once, then periodically writes new input and records go_event.
 *
 * Fixes over the original:
 *  - every CUDA runtime call is checked via CUDA_CHECK_RETURN, and
 *    cudaGetLastError() is checked after the launch;
 *  - the host cudaStreamWaitEvent() call now passes 0 for flags. The API
 *    documents the flags parameter as "must be 0"; the original passed
 *    cudaEventBlockingSync (an event-CREATION flag), which makes the call
 *    fail with cudaErrorInvalidValue. Note the call is asynchronous by
 *    design — to block the CPU, cudaEventSynchronize(done_event) would be
 *    the right primitive.
 */
int main(void)
{
    // One managed int per (block, thread), reachable from both CPU and GPU.
    int ** data;
    CUDA_CHECK_RETURN(cudaMallocManaged(&data, BLOCK_COUNT * sizeof(int *)));
    for (int b = 0; b < BLOCK_COUNT; ++b)
        CUDA_CHECK_RETURN(cudaMallocManaged(&(data[b]), THREADS_PER_BLOCK * sizeof(int)));

    // cudaEventBlockingSync here (event creation) means host threads that
    // synchronize on the event yield rather than spin.
    cudaEvent_t go_event;
    CUDA_CHECK_RETURN(cudaEventCreateWithFlags(&go_event, cudaEventBlockingSync));

    cudaEvent_t done_event;
    CUDA_CHECK_RETURN(cudaEventCreateWithFlags(&done_event, cudaEventBlockingSync));

    cudaStream_t stream;
    CUDA_CHECK_RETURN(cudaStreamCreate(&stream));

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());  // probably unnecessary

    printf("CPU: spawning kernel\n");
    // Third launch argument is dynamic shared memory; the kernel only uses a
    // statically declared __shared__ int, so this byte count is effectively unused.
    kernel<<<BLOCK_COUNT, THREADS_PER_BLOCK, sizeof(int), stream>>>(stream, go_event, done_event, data);
    CUDA_CHECK_RETURN(cudaGetLastError()); // surface launch-configuration errors

    for (int i = 0; i < SIGNALS_TO_SEND_COUNT; ++i) {
        usleep(4 * 1000 * 1000); // accepts time in microseconds

        // Simulate the sending of the "next" piece of work.
        // NOTE(review): touching managed memory while the kernel is running
        // requires a device with concurrentManagedAccess; on Windows or
        // pre-Pascal hardware this access faults.
        data[0][0] = i;      // unrolled, because it's easier to read
        data[0][1] = i + 1;  // unrolled, because it's easier to read

        printf("CPU: sending go_event\n");
        CUDA_CHECK_RETURN(cudaEventRecord(go_event, stream));
        // Orders *the stream* after done_event; it never blocks the calling
        // host thread (flags must be 0 per the cudaStreamWaitEvent docs).
        CUDA_CHECK_RETURN(cudaStreamWaitEvent(stream, done_event, 0));
    }

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
    for (int b = 0; b < BLOCK_COUNT; ++b) {
        for (int t = 0; t < THREADS_PER_BLOCK; ++t) {
            printf("Result for Block %i and Thread %i: %i\n", b, t, data[b][t]);
        }
    }

    for (int b = 0; b < BLOCK_COUNT; ++b)
        CUDA_CHECK_RETURN(cudaFree(data[b]));
    CUDA_CHECK_RETURN(cudaFree(data));

    CUDA_CHECK_RETURN(cudaEventDestroy(done_event));
    CUDA_CHECK_RETURN(cudaEventDestroy(go_event));
    CUDA_CHECK_RETURN(cudaStreamDestroy(stream));

    printf("CPU: exiting program");

    return 0;
}

/**
 * Check the return value of a CUDA runtime API call; if the call failed,
 * print the statement text, the error string, the numeric code, and the
 * call site, then terminate the process.
 */
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
    if (err != cudaSuccess) {
        std::cerr << statement << " returned " << cudaGetErrorString(err)
                  << "(" << err << ") at " << file << ":" << line << std::endl;
        exit (1);
    }
}

और यहाँ इसे चलाने से आउटपुट है। ध्यान दें कि आउटपुट "गलत" हैं, केवल इसलिए कि वे लूप द्वारा ओवर-राइट किए गए थे जिसका सिग्नल GPU थ्रेड्स के लिए ब्लॉकिंग मैकेनिज्म माना जाता है।

CPU: spawning kernel
Block 0, Thread 0: entered kernel
Block 0, Thread 1: entered kernel
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: copying result 2497500 back to managed memory
Block 0, Thread 1: copying result 2497500 back to managed memory
Block 0, Thread 0: exiting kernel
Block 0, Thread 1: exiting kernel
CPU: sending go_event
CPU: sending go_event
CPU: sending go_event
Result for Block 0 and Thread 0: 2
Result for Block 0 and Thread 1: 3
CPU: exiting program
2
interestedparty333 20 अप्रैल 2018, 22:57

2 जवाब

सबसे बढ़िया उत्तर

यह उत्तर पढ़ें। मैं आम सहमति पर पहुंचने के बाद पहले वाले को हटाने की योजना बना रहा हूं, क्योंकि मुझे उम्मीद है कि इसका एकमात्र मूल्य ऐतिहासिक होगा।

एक संभावित कार्यान्वयन डिवाइस मेमोरी में झंडे या पूर्णांकों का एक सेट रखना है। CUDA थ्रेड तब तक ब्लॉक रहेगा (उदाहरण के लिए, clock64() को कॉल करके) जब तक कि ध्वज/पूर्णांक एक निश्चित मान तक नहीं पहुंच जाता, जो दर्शाता है कि CUDA थ्रेड के पास संसाधित करने के लिए और काम है। यह संभवतः CUDA द्वारा प्रदत्त प्रथम-श्रेणी सिंक्रोनाइज़ेशन प्रिमिटिव का उपयोग करने से धीमा होगा, लेकिन प्रत्येक कर्नेल इनवोकेशन पर मेरी साझा मेमोरी को पुन: प्रारंभ करने से तेज़ होगा। इसमें किसी प्रकार का व्यस्त-प्रतीक्षा/नींद तंत्र भी शामिल है, जिसके बारे में मैं रोमांचित नहीं हूं।

यहां एक कार्यान्वयन है जो काम करता प्रतीत होता है - हालांकि, मुझे चिंता है कि मैं प्रबंधित स्मृति के कुछ अपरिभाषित व्यवहार पर भरोसा कर रहा हूं जो प्रोग्राम के निष्पादन के लाभ के लिए होता है। यहाँ कोड है:

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>

#include <chrono>
#include <thread>

// Forward declaration of the error-reporting helper defined at the bottom of the file.
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
// Wraps a CUDA runtime call: aborts with file/line context if it fails.
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

// Iterations of busy work performed by count_down() per "go" signal.
const int COUNT_DOWN_ITERATIONS = 1000;
const int KERNEL_MAXIMUM_LOOPS = 1000; // IRL, we'd set this large enough to prevent hitting this value, unless the kernel is externally terminated
// Number of "go" signals the CPU sends; equal to KERNEL_MAXIMUM_LOOPS so the kernel's loop can run to completion.
const int SIGNALS_TO_SEND_COUNT = 1000;
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 2;

// Simulated unit of work: adds the series 0 + 1 + ... +
// (COUNT_DOWN_ITERATIONS - 1) to the value at the given location, providing
// an observable side effect the compiler cannot eliminate.
__device__ void count_down(int * shared_location_to_ensure_side_effect) {
    int running = *shared_location_to_ensure_side_effect;
    int term = 0;
    do {
        running += term;
    } while (++term < COUNT_DOWN_ITERATIONS);
    *shared_location_to_ensure_side_effect = running;
}


// Busy-wait for roughly `clock_count` device clock cycles (not wall time),
// so the polling loop does not hammer the memory system continuously.
__device__ void clock_block(clock_t clock_count)
{
    const clock_t begin = clock64();
    for (;;) {
        if (clock64() - begin >= clock_count)
            break;
    }
}

/**
 * CUDA kernel waits for the go flag to increment and then counts down.
 *
 * Both flags live in managed memory and are accessed through `volatile`
 * pointers so the compiler re-reads them on every loop iteration instead of
 * caching the first load in a register.
 *
 * Fix over the original: the per-thread working value was a single
 * __shared__ int that every thread of the block overwrote with its own
 * (different) initial value — a data race in which one arbitrary thread's
 * write won for the whole block (visible in the output, where both threads
 * reported identical results). It is now a __shared__ array with one slot
 * per thread, preserving the "persistent shared memory across signals" idea
 * race-free. Assumes blockDim.x == THREADS_PER_BLOCK, as launched by main().
 */
__global__ void spawn_worker_threads(int ** cuda_malloc_managed_int_address, int * cuda_malloc_managed_go_flag, int * cuda_malloc_managed_done_flag) {
    __shared__ int local_copy_of_cuda_malloc_managed_int_address[THREADS_PER_BLOCK]; // one slot per thread; __shared__ is not zero-initialized

    volatile int * my_go_flag = cuda_malloc_managed_go_flag;
    volatile int * volatile_done_flag = cuda_malloc_managed_done_flag;

    printf("Block %i, Thread %i: entered kernel\n", blockIdx.x, threadIdx.x);
    for (int i = 0; i < KERNEL_MAXIMUM_LOOPS; ++i) {
        // Spin (with a cycle-based backoff) until the CPU bumps the go flag
        // past this iteration's index.
        while (*my_go_flag <= i) {
            clock_block(10000); // in cycles, not seconds!
        }

        if (i == 0) { // we have received the signal and data is ready to be interpreted
            local_copy_of_cuda_malloc_managed_int_address[threadIdx.x] = cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x];
        }
        count_down(&local_copy_of_cuda_malloc_managed_int_address[threadIdx.x]);

        // Wait for all worker threads to finish and then signal readiness for new work
        __syncthreads(); // TODO: sync with other blocks too

        if (blockIdx.x == 0 && threadIdx.x == 0)
            *volatile_done_flag  = *volatile_done_flag + 1;
        //__threadfence_system(); // based on the documentation, it's not clear that this should actually help
    }
    printf("Block %i, Thread %i: copying result %i back to managed memory\n", blockIdx.x, threadIdx.x, local_copy_of_cuda_malloc_managed_int_address[threadIdx.x]);
    cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x] = local_copy_of_cuda_malloc_managed_int_address[threadIdx.x];
    printf("Block %i, Thread %i: exiting kernel\n", blockIdx.x, threadIdx.x);
}


/**
 * Host driver for the flag-based handshake: launches the resident kernel
 * once, then repeatedly writes new input, bumps the go flag, and waits on
 * the done flag.
 *
 * Fixes over the original:
 *  - go_flag and done_flag are explicitly zeroed before the kernel starts
 *    polling them (cudaMallocManaged does not promise zero-initialized
 *    memory, so the loop-exit arithmetic was relying on luck);
 *  - every CUDA runtime call is checked, including cudaGetLastError()
 *    after the launch.
 */
int main(void)
{
    // One managed int per (block, thread), reachable from both CPU and GPU.
    int ** data;
    CUDA_CHECK_RETURN(cudaMallocManaged(&data, BLOCK_COUNT * sizeof(int *)));
    for (int b = 0; b < BLOCK_COUNT; ++b)
        CUDA_CHECK_RETURN(cudaMallocManaged(&(data[b]), THREADS_PER_BLOCK * sizeof(int)));

    int * go_flag;
    int * done_flag;
    CUDA_CHECK_RETURN(cudaMallocManaged(&go_flag, sizeof(int)));
    CUDA_CHECK_RETURN(cudaMallocManaged(&done_flag, sizeof(int)));

    // Give both counters a defined starting value before the kernel begins
    // polling them; managed allocations are not guaranteed to be zeroed.
    *go_flag = 0;
    *done_flag = 0;

    // Read the done flag through volatile so the spin loop below re-reads it.
    volatile int * my_volatile_done_flag = done_flag;

    printf("CPU: spawning kernel\n");
    spawn_worker_threads<<<BLOCK_COUNT, THREADS_PER_BLOCK>>>(data, go_flag, done_flag);
    CUDA_CHECK_RETURN(cudaGetLastError()); // surface launch-configuration errors

    // The cudaMemAdvise calls seem to be unnecessary, but they make it ~13% faster
    CUDA_CHECK_RETURN(cudaMemAdvise(go_flag, sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    CUDA_CHECK_RETURN(cudaMemAdvise(done_flag, sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));

    // NOTE(review): CPU access to managed memory while the kernel is live
    // requires a device with concurrentManagedAccess (e.g. Linux + Pascal+).
    for (int i = 0; i < SIGNALS_TO_SEND_COUNT; ++i) {
        if (i % 50 == 0) printf("============== CPU: On iteration %i ============\n", i);

        // Simulate the writing of the "next" piece of work
        data[0][0] = i;      // unrolled, because it's easier to read this way
        data[0][1] = i + 1;  // unrolled, because it's easier to read

        *go_flag = *go_flag + 1; // since it's monotonically increasing, and only written to by the CPU code, this is fine

        // NOTE(review): this waits for iteration i-1, not i, to complete
        // (done reaches i+1 only when the GPU finishes iteration i). That is
        // harmless here because `data` is only read on the kernel's first
        // iteration, but a real workload should wait for `*flag < i + 1`
        // before overwriting its inputs.
        while (*my_volatile_done_flag < i)
            std::this_thread::sleep_for(std::chrono::microseconds(50));
    }
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    for (int b = 0; b < BLOCK_COUNT; ++b)
        for (int t = 0; t < THREADS_PER_BLOCK; ++t)
            printf("Result for Block %i and Thread %i: %i\n", b, t, data[b][t]);

    for (int b = 0; b < BLOCK_COUNT; ++b)
        CUDA_CHECK_RETURN(cudaFree(data[b]));
    CUDA_CHECK_RETURN(cudaFree(data));
    CUDA_CHECK_RETURN(cudaFree(go_flag));
    CUDA_CHECK_RETURN(cudaFree(done_flag));

    printf("CPU: exiting program");

    return 0;
}

/**
 * Verify a CUDA runtime API result. A failure prints the originating
 * statement, the human-readable error string, the numeric code, and the
 * file:line call site, then exits the application.
 */
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
{
    if (cudaSuccess == err)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err)
              << "(" << err << ") at " << file << ":" << line
              << std::endl;
    exit (1);
}

और यहाँ आउटपुट है, जो उत्पन्न करने के लिए लगभग 50ms की बात करता है। यह लगभग 50 माइक्रोसेकंड प्रति "रीसाइक्लिंग" है जो मेरे वास्तविक एप्लिकेशन की सहनशीलता के भीतर अच्छी तरह से है।

Starting timer for Synchronization timer
CPU: spawning kernel
============== CPU: On iteration 0 ============
============== CPU: On iteration 50 ============
============== CPU: On iteration 100 ============
============== CPU: On iteration 150 ============
============== CPU: On iteration 200 ============
============== CPU: On iteration 250 ============
============== CPU: On iteration 300 ============
============== CPU: On iteration 350 ============
============== CPU: On iteration 400 ============
============== CPU: On iteration 450 ============
============== CPU: On iteration 500 ============
============== CPU: On iteration 550 ============
============== CPU: On iteration 600 ============
============== CPU: On iteration 650 ============
============== CPU: On iteration 700 ============
============== CPU: On iteration 750 ============
============== CPU: On iteration 800 ============
============== CPU: On iteration 850 ============
============== CPU: On iteration 900 ============
============== CPU: On iteration 950 ============
Block 0, Thread 0: entered kernel
Block 0, Thread 1: entered kernel
Block 0, Thread 0: copying result 499500001 back to managed memory
Block 0, Thread 1: copying result 499500001 back to managed memory
Block 0, Thread 0: exiting kernel
Block 0, Thread 1: exiting kernel
Result for Block 0 and Thread 0: 499500001
Result for Block 0 and Thread 1: 499500001
CPU: exiting program

volatile के उपयोग का सुझाव देने के लिए @einpoklum और @robertcrovella को धन्यवाद। ऐसा लगता है कि यह काम कर रहा है, लेकिन मुझे volatile का अनुभव नहीं है। मैंने जो पढ़ा है उसके आधार पर, यह एक वैध और सही उपयोग है जिसके परिणामस्वरूप परिभाषित व्यवहार होना चाहिए। क्या आप कृपया इस निष्कर्ष की पुष्टि या सुधार करने का कष्ट करेंगे?

0
ragerdl 23 अप्रैल 2018, 05:18

दूसरे उत्तर को पहले पढ़ें। यह उत्तर अभी भी ऐतिहासिक संदर्भ के लिए यहां है। मैं या तो इसे डाउनवोट कर दूंगा या इसे जल्द ही हटा दूंगा।

एक संभावित कार्यान्वयन डिवाइस मेमोरी में झंडे या पूर्णांक का एक सेट होना है। CUDA थ्रेड ब्लॉक (शायद clock64() को कॉल करके) तब तक ब्लॉक करेंगे जब तक कि ध्वज/पूर्णांक एक निश्चित मान तक नहीं पहुंच जाता, यह दर्शाता है कि CUDA थ्रेड को संसाधित करने के लिए और अधिक काम है। यह संभवत: प्रथम श्रेणी के CUDA द्वारा प्रदत्त सिंक्रनाइज़ेशन आदिम का उपयोग करने से धीमा होगा लेकिन प्रत्येक कर्नेल आमंत्रण के साथ मेरी __shared__ मेमोरी को पुन: प्रारंभ करने से तेज़ होगा। इसमें किसी प्रकार का व्यस्त प्रतीक्षा/नींद तंत्र भी शामिल है, जिसके बारे में मैं रोमांचित नहीं हूं।

फॉलो-अप: ऐसा लगता है कि यह काम कर रहा है - कभी-कभी (printf कॉल मदद करते हैं)। मुझे लगता है कि प्रबंधित स्मृति में कुछ अपरिभाषित व्यवहार है जो मुझे लाभान्वित कर रहा है। यहाँ कोड है:

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

#include <cuda_runtime_api.h>
#include <cuda.h>

// Forward declaration of the error-reporting helper defined at the bottom of the file.
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
// Wraps a CUDA runtime call: aborts with file/line context if it fails.
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

// Iterations of busy work performed by count_down() per "go" signal.
const int COUNT_DOWN_ITERATIONS = 1000;
const int KERNEL_MAXIMUM_LOOPS = 5; // IRL, we'd set this large enough to prevent hitting this value, unless the kernel is externally terminated
// Number of "go" signals the CPU sends (fewer than KERNEL_MAXIMUM_LOOPS here).
const int SIGNALS_TO_SEND_COUNT = 3;
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 2;

// Simulated unit of work with an observable side effect: accumulates the
// series 0..COUNT_DOWN_ITERATIONS-1 into the integer at the given location.
__device__ void count_down(int * shared_location_to_ensure_side_effect) {
    int value = *shared_location_to_ensure_side_effect;
    for (int increment = 0; increment != COUNT_DOWN_ITERATIONS; ++increment)
        value += increment;
    *shared_location_to_ensure_side_effect = value;
}


// Spin until roughly `clock_count` device clock ticks have elapsed.
// Used as a backoff so the flag-polling loop does not spin flat-out.
__device__ void clock_block(clock_t clock_count)
{
    const clock_t entry = clock64();
    while (clock64() - entry < clock_count) {
        // busy-wait; clock_count is measured in cycles, not seconds
    }
}

/**
 * CUDA kernel waits for flag to increment and then counts down.
 *
 * NOTE(review): cuda_malloc_managed_synchronization_flag is read through a
 * plain (non-volatile) pointer, so the compiler is free to hoist the load
 * out of the while loop and spin on a stale register value forever. This is
 * the likely reason the answer text reports it working only "sometimes"
 * (the printf calls act as accidental optimization barriers). Reading the
 * flag through a volatile pointer — as the newer answer does — addresses it.
 *
 * NOTE(review): as in the question's kernel, device-side
 * cudaStreamWaitEvent() is a stream-ordering call (dynamic-parallelism
 * device runtime), not a thread barrier, and its flags parameter is
 * documented as "must be 0"; cudaEventBlockingSync is an event-creation flag.
 *
 * NOTE(review): all threads of the block write the single __shared__ int
 * below with per-thread values (indexed by threadIdx.x) — a data race.
 */
__global__ void kernel_block_via_flag(cudaStream_t stream, cudaEvent_t go_event, cudaEvent_t done_event, int ** cuda_malloc_managed_int_address, int * cuda_malloc_managed_synchronization_flag) {
    __shared__ int local_copy_of_cuda_malloc_managed_int_address; // NOTE(review): __shared__ storage is not zero-initialized; "we always start at 0" is not guaranteed

    printf("Block %i, Thread %i: entered kernel\n", blockIdx.x, threadIdx.x);
    for (int i = 0; i < KERNEL_MAXIMUM_LOOPS; ++i) {
        printf("Block %i, Thread %i: entered loop; waitin 4 go_event\n", blockIdx.x, threadIdx.x);
        // Spin with a cycle-based backoff until the CPU bumps the flag past i.
        while (*cuda_malloc_managed_synchronization_flag <= i)

            //printf("%lu\n", *cuda_malloc_managed_synchronization_flag);
            clock_block(1000000000); // in cycles, not seconds!

        cudaStreamWaitEvent(stream, go_event, cudaEventBlockingSync);
        printf("Block %i, Thread %i:      in loop; received go_event\n", blockIdx.x, threadIdx.x);

        if (i == 0) { // we have received the signal and data is ready to be interpreted
            local_copy_of_cuda_malloc_managed_int_address = cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x];
        }
        count_down(&local_copy_of_cuda_malloc_managed_int_address);
        printf("Block %i, Thread %i:      finished counting\n", blockIdx.x, threadIdx.x);
        cudaEventRecord(done_event, stream);
        printf("Block %i, Thread %i:      recorded event; may loop back\n", blockIdx.x, threadIdx.x);
    }
    printf("Block %i, Thread %i: copying result %i back to managed memory\n", blockIdx.x, threadIdx.x, local_copy_of_cuda_malloc_managed_int_address);
    cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x] = local_copy_of_cuda_malloc_managed_int_address;
    printf("Block %i, Thread %i: exiting kernel\n", blockIdx.x, threadIdx.x);
}


/**
 * Host driver for the hybrid event+flag attempt: launches the resident
 * kernel once, then repeatedly writes new input and increments the managed
 * synchronization flag the kernel polls.
 *
 * Fixes over the original:
 *  - the synchronization flag is explicitly zeroed before the kernel starts
 *    polling it (cudaMallocManaged does not promise zeroed memory);
 *  - every CUDA runtime call is checked, plus cudaGetLastError() after the
 *    launch.
 */
int main(void)
{
    // One managed int per (block, thread), reachable from both CPU and GPU.
    int ** data;
    CUDA_CHECK_RETURN(cudaMallocManaged(&data, BLOCK_COUNT * sizeof(int *)));
    for (int b = 0; b < BLOCK_COUNT; ++b)
        CUDA_CHECK_RETURN(cudaMallocManaged(&(data[b]), THREADS_PER_BLOCK * sizeof(int)));

    cudaEvent_t go_event;
    CUDA_CHECK_RETURN(cudaEventCreateWithFlags(&go_event, cudaEventBlockingSync));

    cudaEvent_t done_event;
    CUDA_CHECK_RETURN(cudaEventCreateWithFlags(&done_event, cudaEventBlockingSync));

    cudaStream_t stream;
    CUDA_CHECK_RETURN(cudaStreamCreate(&stream));

    int * synchronization_flag;
    CUDA_CHECK_RETURN(cudaMallocManaged(&synchronization_flag, sizeof(int)));
    // The kernel polls this value; give it a defined starting point — managed
    // allocations are not guaranteed to be zero-initialized.
    *synchronization_flag = 0;

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());  // probably unnecessary

    printf("CPU: spawning kernel\n");
    kernel_block_via_flag<<<BLOCK_COUNT, THREADS_PER_BLOCK, sizeof(int), stream>>>(stream, go_event, done_event, data, synchronization_flag);
    CUDA_CHECK_RETURN(cudaGetLastError()); // surface launch-configuration errors
    CUDA_CHECK_RETURN(cudaMemAdvise(synchronization_flag, sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));

    // NOTE(review): CPU writes to managed memory while the kernel is running
    // require a device with concurrentManagedAccess (e.g. Linux + Pascal+).
    for (int i = 0; i < SIGNALS_TO_SEND_COUNT; ++i) {
        usleep(4 * 1000 * 1000); // accepts time in microseconds

        // Simulate the sending of the "next" piece of work
        data[0][0] = i;      // unrolled, because it's easier to read
        data[0][1] = i + 1;  // unrolled, because it's easier to read

        printf("CPU: sending go_event\n");
        *synchronization_flag = *synchronization_flag + 1; // since it's monotonically increasing, and only written to by the CPU code, this is fine
    }

    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
    for (int b = 0; b < BLOCK_COUNT; ++b) {
        for (int t = 0; t < THREADS_PER_BLOCK; ++t) {
            printf("Result for Block %i and Thread %i: %i\n", b, t, data[b][t]);
        }
    }

    for (int b = 0; b < BLOCK_COUNT; ++b)
        CUDA_CHECK_RETURN(cudaFree(data[b]));
    CUDA_CHECK_RETURN(cudaFree(data));
    CUDA_CHECK_RETURN(cudaFree(synchronization_flag));

    CUDA_CHECK_RETURN(cudaEventDestroy(done_event));
    CUDA_CHECK_RETURN(cudaEventDestroy(go_event));
    CUDA_CHECK_RETURN(cudaStreamDestroy(stream));

    printf("CPU: exiting program");

    return 0;
}

/** Abort the program with a diagnostic if a CUDA runtime call failed. */
static void CheckCudaErrorAux (const char *source_file, unsigned source_line, const char *call_text, cudaError_t status)
{
    if (cudaSuccess == status)
        return;
    std::cerr << call_text << " returned " << cudaGetErrorString(status)
              << "(" << status << ") at " << source_file << ":" << source_line
              << std::endl;
    exit (1);
}




/**
 * NOTE(review): this is a second, byte-identical copy of the
 * kernel_block_via_flag definition that already appears earlier in this
 * answer's listing; compiling both in one translation unit would be a
 * redefinition error. It is repeated here for illustration only. All the
 * caveats from the first copy apply: the non-volatile flag load may be
 * hoisted out of the spin loop, device-side cudaStreamWaitEvent() is a
 * stream-ordering call (not a thread barrier) whose flags must be 0, and
 * all threads of the block race on the single __shared__ int.
 */
__global__ void kernel_block_via_flag(cudaStream_t stream, cudaEvent_t go_event, cudaEvent_t done_event, int ** cuda_malloc_managed_int_address, int * cuda_malloc_managed_synchronization_flag) {
    __shared__ int local_copy_of_cuda_malloc_managed_int_address; // NOTE(review): __shared__ storage is not zero-initialized

    printf("Block %i, Thread %i: entered kernel\n", blockIdx.x, threadIdx.x);
    for (int i = 0; i < KERNEL_MAXIMUM_LOOPS; ++i) {
        printf("Block %i, Thread %i: entered loop; waitin 4 go_event\n", blockIdx.x, threadIdx.x);
        // Spin with a cycle-based backoff until the CPU bumps the flag past i.
        while (*cuda_malloc_managed_synchronization_flag <= i)
            //printf("%i\n", *cuda_malloc_managed_synchronization_flag);
            clock_block(1000000000);

        cudaStreamWaitEvent(stream, go_event, cudaEventBlockingSync);
        printf("Block %i, Thread %i:      in loop; received go_event\n", blockIdx.x, threadIdx.x);

        if (i == 0) { // we have received the signal and data is ready to be interpreted
            local_copy_of_cuda_malloc_managed_int_address = cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x];
        }
        count_down(&local_copy_of_cuda_malloc_managed_int_address);
        printf("Block %i, Thread %i:      finished counting\n", blockIdx.x, threadIdx.x);
        cudaEventRecord(done_event, stream);
        printf("Block %i, Thread %i:      recorded event; may loop back\n", blockIdx.x, threadIdx.x);
    }
    printf("Block %i, Thread %i: copying result %i back to managed memory\n", blockIdx.x, threadIdx.x, local_copy_of_cuda_malloc_managed_int_address);
    cuda_malloc_managed_int_address[blockIdx.x][threadIdx.x] = local_copy_of_cuda_malloc_managed_int_address;
    printf("Block %i, Thread %i: exiting kernel\n", blockIdx.x, threadIdx.x);
}

और आउटपुट:

CPU: spawning kernel
Block 0, Thread 0: entered kernel
Block 0, Thread 1: entered kernel
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
CPU: sending go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
CPU: sending go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event
CPU: sending go_event
Block 0, Thread 0:      in loop; received go_event
Block 0, Thread 1:      in loop; received go_event
Block 0, Thread 0:      finished counting
Block 0, Thread 1:      finished counting
Block 0, Thread 0:      recorded event; may loop back
Block 0, Thread 1:      recorded event; may loop back
Block 0, Thread 0: entered loop; waitin 4 go_event
Block 0, Thread 1: entered loop; waitin 4 go_event

यह अभी भी एक बुरा समाधान है। मुझे किसी और के जवाब को स्वीकार करने की उम्मीद है।

0
ragerdl 23 अप्रैल 2018, 05:18