[FFmpeg-devel] [PATCH]opencl: automatically select the fastest opencl device

Michael Niedermayer michaelni at gmx.at
Tue Dec 3 06:03:40 CET 2013


On Mon, Dec 02, 2013 at 10:39:20PM -0600, Lenny Wang wrote:
> On Mon, Dec 2, 2013 at 10:12 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Mon, Dec 02, 2013 at 04:48:41PM -0600, Lenny Wang wrote:
> >> On Mon, Dec 2, 2013 at 4:40 PM, Stefano Sabatini <stefasab at gmail.com> wrote:
> >> > On date Monday 2013-12-02 11:46:09 -0600, Lenny Wang encoded:
> >> >> On Mon, Dec 2, 2013 at 5:46 AM, Stefano Sabatini <stefasab at gmail.com> wrote:
> >> > [...]
> >> >> From: Lenny Wang <lwanghpc at gmail.com>
> >> >> Date: Mon, 2 Dec 2013 11:40:00 -0600
> >> >> Subject: [PATCH] cmdutils & opencl: add -opencl_bench option to test and show available OpenCL devices
> >> >>
> >> >> ---
> >> >>  cmdutils.c                     | 166 +++++++++++++++++++++++++++++++++++++++++
> >> >>  cmdutils.h                     |  15 ++++
> >> >>  cmdutils_common_opts.h         |   1 +
> >> >>  cmdutils_opencl_bench_kernel.h |  86 +++++++++++++++++++++
> >> >>  doc/APIchanges                 |   2 +
> >> >>  doc/fftools-common-opts.texi   |   4 +
> >> >>  doc/utils.texi                 |   4 +-
> >> >>  libavutil/opencl.c             |  42 +++++++++++
> >> >>  libavutil/opencl.h             |  16 ++++
> >> >>  libavutil/version.h            |   2 +-
> >> >>  10 files changed, 335 insertions(+), 3 deletions(-)
> >> >>
> >> >> diff --git a/cmdutils.c b/cmdutils.c
> >> >> index 46ade3f..4f61a80 100644
> >> >> --- a/cmdutils.c
> >> >> +++ b/cmdutils.c
> >> >> @@ -60,6 +60,8 @@
> >> >>  #endif
> >> >>  #if CONFIG_OPENCL
> >> >>  #include "libavutil/opencl.h"
> >> >> +#include "libavutil/time.h"
> >> >> +#include "cmdutils_opencl_bench_kernel.h"
> >> >>  #endif
> >> >>
> >> >>
> >> >> @@ -986,6 +988,170 @@ int opt_timelimit(void *optctx, const char *opt, const char *arg)
> >> >>  }
> >> >>
> >> >>  #if CONFIG_OPENCL
> >> >> +#define OCLCHECK(method, ... )                                                 \
> >> >> +do {                                                                           \
> >> >> +    status = method(__VA_ARGS__);                                              \
> >> >> +    if (status != CL_SUCCESS) {                                                \
> >> >> +        av_log(NULL, AV_LOG_ERROR, # method " error '%s'\n",                   \
> >> >> +               av_opencl_errstr(status));                                      \
> >> >> +        ret = AVERROR_EXTERNAL;                                                \
> >> >> +        goto end;                                                              \
> >> >> +    }                                                                          \
> >> >> +} while (0)
> >> >> +
> >> >> +#define CREATEBUF(out, flags, size)                                            \
> >> >> +do {                                                                           \
> >> >> +    out = clCreateBuffer(ext_opencl_env->context, flags, size, NULL, &status); \
> >> >> +    if (status != CL_SUCCESS) {                                                \
> >> >> +        av_log(NULL, AV_LOG_ERROR, "Could not create OpenCL buffer\n");        \
> >> >> +        ret = AVERROR_EXTERNAL;                                                \
> >> >> +        goto end;                                                              \
> >> >> +    }                                                                          \
> >> >> +} while (0)
> >> >> +
> >> >> +static void fill_rand_int(int *data, int n)
> >> >> +{
> >> >> +    int i;
> >> >> +    srand(av_gettime());
> >> >> +    for (i = 0; i < n; i++)
> >> >> +        data[i] = rand();
> >> >> +}
> >> >> +
> >> >> +#define OPENCL_NB_ITER 5
> >> >> +static int64_t run_opencl_bench(AVOpenCLExternalEnv *ext_opencl_env)
> >> >> +{
> >> >> +    int i, arg = 0, width = 1920, height = 1088;
> >> >> +    int64_t start, ret = 0;
> >> >> +    cl_int status;
> >> >> +    size_t kernel_len;
> >> >> +    char *inbuf;
> >> >> +    int *mask;
> >> >> +    int buf_size = width * height * sizeof(char);
> >> >> +    int mask_size = sizeof(uint32_t) * 128;
> >> >> +
> >> >> +    cl_mem cl_mask, cl_inbuf, cl_outbuf;
> >> >> +    cl_kernel kernel = NULL;
> >> >> +    cl_program program = NULL;
> >> >> +    size_t local_work_size_2d[2] = {16, 16};
> >> >> +    size_t global_work_size_2d[2] = {(size_t)width, (size_t)height};
> >> >> +
> >> >> +    if (!(inbuf = av_malloc(buf_size)) || !(mask = av_malloc(mask_size))) {
> >> >> +        av_log(NULL, AV_LOG_ERROR, "Out of memory\n");
> >> >> +        ret = AVERROR(ENOMEM);
> >> >> +        goto end;
> >> >> +    }
> >> >> +    fill_rand_int((int*)inbuf, buf_size/4);
> >> >> +    fill_rand_int(mask, mask_size/4);
> >> >> +
> >> >> +    CREATEBUF(cl_mask, CL_MEM_READ_ONLY, mask_size);
> >> >> +    CREATEBUF(cl_inbuf, CL_MEM_READ_ONLY, buf_size);
> >> >> +    CREATEBUF(cl_outbuf, CL_MEM_READ_WRITE, buf_size);
> >> >> +
> >> >> +    kernel_len = strlen(ocl_bench_source);
> >> >> +    program = clCreateProgramWithSource(ext_opencl_env->context, 1, &ocl_bench_source,
> >> >> +                                        &kernel_len, &status);
> >> >> +    if (status != CL_SUCCESS || !program) {
> >> >> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create benchmark program\n");
> >> >> +        ret = AVERROR_EXTERNAL;
> >> >> +        goto end;
> >> >> +    }
> >> >> +    status = clBuildProgram(program, 1, &(ext_opencl_env->device_id), NULL, NULL, NULL);
> >> >> +    if (status != CL_SUCCESS) {
> >> >> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to build benchmark program\n");
> >> >> +        ret = AVERROR_EXTERNAL;
> >> >> +        goto end;
> >> >> +    }
> >> >> +    kernel = clCreateKernel(program, "unsharp_bench", &status);
> >> >> +    if (status != CL_SUCCESS) {
> >> >> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create benchmark kernel\n");
> >> >> +        ret = AVERROR_EXTERNAL;
> >> >> +        goto end;
> >> >> +    }
> >> >> +
> >> >> +    OCLCHECK(clEnqueueWriteBuffer, ext_opencl_env->command_queue, cl_inbuf, CL_TRUE, 0,
> >> >> +             buf_size, inbuf, 0, NULL, NULL);
> >> >> +    OCLCHECK(clEnqueueWriteBuffer, ext_opencl_env->command_queue, cl_mask, CL_TRUE, 0,
> >> >> +             mask_size, mask, 0, NULL, NULL);
> >> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_inbuf);
> >> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_outbuf);
> >> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_mask);
> >> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_int), &width);
> >> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_int), &height);
> >> >> +
> >> >> +    start = av_gettime();
> >> >
> >> >> +    for (i = 0; i < OPENCL_NB_ITER; i++)
> >> >> +         OCLCHECK(clEnqueueNDRangeKernel, ext_opencl_env->command_queue, kernel, 2, NULL,
> >> >> +                 global_work_size_2d, local_work_size_2d, 0, NULL, NULL);
> >> >> +     clFinish(ext_opencl_env->command_queue);
> >> >
> >> > nit: still weird indent, it should be:
> >> >
> >> >     for (i = 0; i < OPENCL_NB_ITER; i++)
> >> >         OCLCHECK(clEnqueueNDRangeKernel, ext_opencl_env->command_queue, kernel, 2, NULL,
> >> >                  global_work_size_2d, local_work_size_2d, 0, NULL, NULL);
> >> >     clFinish(ext_opencl_env->command_queue);
> >> >
> >> > LGTM, but Wei should approve the patch, thanks.
> >>
> >> Indent fixed.  Pending Wei's approval.
> >
> >>  cmdutils.c                     |  166 +++++++++++++++++++++++++++++++++++++++++
> >>  cmdutils.h                     |   15 +++
> >>  cmdutils_common_opts.h         |    1
> >>  cmdutils_opencl_bench_kernel.h |   86 +++++++++++++++++++++
> >>  doc/APIchanges                 |    2
> >>  doc/fftools-common-opts.texi   |    4
> >>  doc/utils.texi                 |    4
> >>  libavutil/opencl.c             |   42 ++++++++++
> >>  libavutil/opencl.h             |   16 +++
> >>  libavutil/version.h            |    2
> >>  10 files changed, 335 insertions(+), 3 deletions(-)
> >> 6fccaf0e1c18908b2ce57bb460749dfe14824831  add-opencl-bench-option.patch
> >> From: Lenny Wang <lwanghpc at gmail.com>
> >> Date: Mon, 2 Dec 2013 11:40:00 -0600
> >> Subject: [PATCH] cmdutils & opencl: add -opencl_bench option to test and show available OpenCL devices
> >>
> >> ---
> >>  cmdutils.c                     | 166 +++++++++++++++++++++++++++++++++++++++++
> >>  cmdutils.h                     |  15 ++++
> >>  cmdutils_common_opts.h         |   1 +
> >>  cmdutils_opencl_bench_kernel.h |  86 +++++++++++++++++++++
> >>  doc/APIchanges                 |   2 +
> >>  doc/fftools-common-opts.texi   |   4 +
> >>  doc/utils.texi                 |   4 +-
> >>  libavutil/opencl.c             |  42 +++++++++++
> >>  libavutil/opencl.h             |  16 ++++
> >>  libavutil/version.h            |   2 +-
> >>  10 files changed, 335 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/cmdutils.c b/cmdutils.c
> >> index 46ade3f..4f61a80 100644
> >> --- a/cmdutils.c
> >> +++ b/cmdutils.c
> >> @@ -60,6 +60,8 @@
> >>  #endif
> >>  #if CONFIG_OPENCL
> >>  #include "libavutil/opencl.h"
> >> +#include "libavutil/time.h"
> >> +#include "cmdutils_opencl_bench_kernel.h"
> >>  #endif
> >>
> >>
> >> @@ -986,6 +988,170 @@ int opt_timelimit(void *optctx, const char *opt, const char *arg)
> >>  }
> >>
> >>  #if CONFIG_OPENCL
> >> +#define OCLCHECK(method, ... )                                                 \
> >> +do {                                                                           \
> >> +    status = method(__VA_ARGS__);                                              \
> >> +    if (status != CL_SUCCESS) {                                                \
> >> +        av_log(NULL, AV_LOG_ERROR, # method " error '%s'\n",                   \
> >> +               av_opencl_errstr(status));                                      \
> >> +        ret = AVERROR_EXTERNAL;                                                \
> >> +        goto end;                                                              \
> >> +    }                                                                          \
> >> +} while (0)
> >> +
> >> +#define CREATEBUF(out, flags, size)                                            \
> >> +do {                                                                           \
> >> +    out = clCreateBuffer(ext_opencl_env->context, flags, size, NULL, &status); \
> >> +    if (status != CL_SUCCESS) {                                                \
> >> +        av_log(NULL, AV_LOG_ERROR, "Could not create OpenCL buffer\n");        \
> >> +        ret = AVERROR_EXTERNAL;                                                \
> >> +        goto end;                                                              \
> >> +    }                                                                          \
> >> +} while (0)
> >> +
> >> +static void fill_rand_int(int *data, int n)
> >> +{
> >> +    int i;
> >> +    srand(av_gettime());
> >> +    for (i = 0; i < n; i++)
> >> +        data[i] = rand();
> >> +}
> >> +
> >> +#define OPENCL_NB_ITER 5
> >> +static int64_t run_opencl_bench(AVOpenCLExternalEnv *ext_opencl_env)
> >> +{
> >> +    int i, arg = 0, width = 1920, height = 1088;
> >> +    int64_t start, ret = 0;
> >> +    cl_int status;
> >> +    size_t kernel_len;
> >> +    char *inbuf;
> >> +    int *mask;
> >> +    int buf_size = width * height * sizeof(char);
> >> +    int mask_size = sizeof(uint32_t) * 128;
> >> +
> >> +    cl_mem cl_mask, cl_inbuf, cl_outbuf;
> >> +    cl_kernel kernel = NULL;
> >> +    cl_program program = NULL;
> >> +    size_t local_work_size_2d[2] = {16, 16};
> >> +    size_t global_work_size_2d[2] = {(size_t)width, (size_t)height};
> >> +
> >> +    if (!(inbuf = av_malloc(buf_size)) || !(mask = av_malloc(mask_size))) {
> >> +        av_log(NULL, AV_LOG_ERROR, "Out of memory\n");
> >> +        ret = AVERROR(ENOMEM);
> >> +        goto end;
> >> +    }
> >> +    fill_rand_int((int*)inbuf, buf_size/4);
> >> +    fill_rand_int(mask, mask_size/4);
> >> +
> >> +    CREATEBUF(cl_mask, CL_MEM_READ_ONLY, mask_size);
> >> +    CREATEBUF(cl_inbuf, CL_MEM_READ_ONLY, buf_size);
> >> +    CREATEBUF(cl_outbuf, CL_MEM_READ_WRITE, buf_size);
> >> +
> >> +    kernel_len = strlen(ocl_bench_source);
> >> +    program = clCreateProgramWithSource(ext_opencl_env->context, 1, &ocl_bench_source,
> >> +                                        &kernel_len, &status);
> >> +    if (status != CL_SUCCESS || !program) {
> >> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create benchmark program\n");
> >> +        ret = AVERROR_EXTERNAL;
> >> +        goto end;
> >> +    }
> >> +    status = clBuildProgram(program, 1, &(ext_opencl_env->device_id), NULL, NULL, NULL);
> >> +    if (status != CL_SUCCESS) {
> >> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to build benchmark program\n");
> >> +        ret = AVERROR_EXTERNAL;
> >> +        goto end;
> >> +    }
> >> +    kernel = clCreateKernel(program, "unsharp_bench", &status);
> >> +    if (status != CL_SUCCESS) {
> >> +        av_log(NULL, AV_LOG_ERROR, "OpenCL unable to create benchmark kernel\n");
> >> +        ret = AVERROR_EXTERNAL;
> >> +        goto end;
> >> +    }
> >> +
> >> +    OCLCHECK(clEnqueueWriteBuffer, ext_opencl_env->command_queue, cl_inbuf, CL_TRUE, 0,
> >> +             buf_size, inbuf, 0, NULL, NULL);
> >> +    OCLCHECK(clEnqueueWriteBuffer, ext_opencl_env->command_queue, cl_mask, CL_TRUE, 0,
> >> +             mask_size, mask, 0, NULL, NULL);
> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_inbuf);
> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_outbuf);
> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_mem), &cl_mask);
> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_int), &width);
> >> +    OCLCHECK(clSetKernelArg, kernel, arg++, sizeof(cl_int), &height);
> >> +
> >> +    start = av_gettime();
> >> +    for (i = 0; i < OPENCL_NB_ITER; i++)
> >> +         OCLCHECK(clEnqueueNDRangeKernel, ext_opencl_env->command_queue, kernel, 2, NULL,
> >> +                 global_work_size_2d, local_work_size_2d, 0, NULL, NULL);
> >> +      clFinish(ext_opencl_env->command_queue);
> >> +    ret = (av_gettime() - start)/OPENCL_NB_ITER;
> >> +end:
> >> +    if (kernel)
> >> +        clReleaseKernel(kernel);
> >> +    if (program)
> >> +        clReleaseProgram(program);
> >> +    if (cl_inbuf)
> >> +        clReleaseMemObject(cl_inbuf);
> >> +    if (cl_outbuf)
> >> +        clReleaseMemObject(cl_outbuf);
> >> +    if (cl_mask)
> >> +        clReleaseMemObject(cl_mask);
> >> +    av_free(inbuf);
> >> +    av_free(mask);
> >> +    return ret;
> >> +}
> >> +
> >> +static int compare_ocl_device_desc(const void *a, const void *b)
> >> +{
> >> +    return ((OpenCLDeviceBenchmark*)a)->runtime - ((OpenCLDeviceBenchmark*)b)->runtime;
> >> +}
> >> +
> >> +int opt_opencl_bench(void *optctx, const char *opt, const char *arg)
> >> +{
> >> +    int i, j, nb_devices = 0, count = 0;
> >> +    int64_t score = 0;
> >> +    AVOpenCLDeviceList *device_list;
> >> +    AVOpenCLDeviceNode *device_node = NULL;
> >> +    OpenCLDeviceBenchmark *devices = NULL;
> >> +    cl_platform_id platform;
> >> +
> >> +    av_opencl_get_device_list(&device_list);
> >> +    for (i = 0; i < device_list->platform_num; i++)
> >> +        nb_devices += device_list->platform_node[i]->device_num;
> >> +    if (!nb_devices) {
> >> +        av_log(NULL, AV_LOG_ERROR, "No OpenCL device detected!\n");
> >> +        return AVERROR(EINVAL);
> >> +    }
> >> +    if (!(devices = av_malloc(sizeof(OpenCLDeviceBenchmark) * nb_devices))) {
> >> +        av_log(NULL, AV_LOG_ERROR, "Could not allocate buffer\n");
> >> +        return AVERROR(ENOMEM);
> >> +    }
> >> +
> >> +    for (i = 0; i < device_list->platform_num; i++) {
> >> +        for (j = 0; j < device_list->platform_node[i]->device_num; j++) {
> >> +            device_node = device_list->platform_node[i]->device_node[j];
> >> +            platform = device_list->platform_node[i]->platform_id;
> >> +            score = av_opencl_benchmark(device_node, platform, run_opencl_bench);
> >> +            if (score > 0) {
> >> +                devices[count].platform_idx = i;
> >> +                devices[count].device_idx = j;
> >> +                devices[count].runtime = score;
> >> +                strcpy(devices[count].device_name, device_node->device_name);
> >> +                count++;
> >> +            }
> >> +        }
> >> +    }
> >> +    qsort(devices, count, sizeof(OpenCLDeviceBenchmark), compare_ocl_device_desc);
> >> +    fprintf(stderr, "platform_idx\tdevice_idx\tdevice_name\truntime\n");
> >> +    for (i = 0; i < count; i++)
> >> +        fprintf(stdout, "%d\t%d\t%s\t%"PRId64"\n",
> >> +                devices[i].platform_idx, devices[i].device_idx,
> >> +                devices[i].device_name, devices[i].runtime);
> >> +
> >> +    av_opencl_free_device_list(&device_list);
> >> +    av_free(devices);
> >> +    return 0;
> >> +}
> >> +
> >>  int opt_opencl(void *optctx, const char *opt, const char *arg)
> >>  {
> >>      char *key, *value;
> >> diff --git a/cmdutils.h b/cmdutils.h
> >> index b814961..bb60a26 100644
> >> --- a/cmdutils.h
> >> +++ b/cmdutils.h
> >> @@ -98,8 +98,23 @@ int opt_max_alloc(void *optctx, const char *opt, const char *arg);
> >>
> >>  int opt_codec_debug(void *optctx, const char *opt, const char *arg);
> >>
> >> +#if CONFIG_OPENCL
> >>  int opt_opencl(void *optctx, const char *opt, const char *arg);
> >>
> >> +typedef struct {
> >> +    int platform_idx;
> >> +    int device_idx;
> >> +    char device_name[64];
> >> +    int64_t runtime;
> >> +} OpenCLDeviceBenchmark;
> >> +
> >> +/**
> >> + * Perform benchmark on all OpenCL devices and print results.
> >> + * This option processing function does not utilize the arguments.
> >> + */
> >> +int opt_opencl_bench(void *optctx, const char *opt, const char *arg);
> >> +#endif
> >> +
> >>  /**
> >>   * Limit the execution time.
> >>   */
> >
> > why is this in cmdutils.c/h ?
> >
> 
> It's designed and used as a cmd utility option similar to
> -codes,-filters etc, it lists available opencl devices sorted based on
> benchmark scores, so users can pick the fastest device using
> '-opencl_option'.

-codecs, filters, ... list libavcodec and filter codecs and filters
that are things registered with these libs.
the code here lists hardware devices and benchmarks them.
we dont put dshow device extraction and benchmarking code in cmdutils
either. nor alsa device listing, and benchmarking code.

cmdutils.c would turn in a total mess if everthing from alsa over
oss, dshow, caca, v4l to vfw would have listing code under #ifs in
it

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Asymptotically faster algorithms should always be preferred if you have
asymptotical amounts of data
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20131203/9a111224/attachment.asc>


More information about the ffmpeg-devel mailing list