Cycles: some tweaks for apple opencl with ATI cards, to get it working up to
author Brecht Van Lommel <brechtvanlommel@pandora.be>
Tue, 20 Dec 2011 17:36:56 +0000 (17:36 +0000)
committer Brecht Van Lommel <brechtvanlommel@pandora.be>
Tue, 20 Dec 2011 17:36:56 +0000 (17:36 +0000)
the level of ambient occlusion render, shaders still fail. Fixes found with
much help from Jens and Dalai.

intern/cycles/device/device.cpp
intern/cycles/device/device.h
intern/cycles/device/device_opencl.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel_compat_opencl.h

index f43ccffe4614afb420fbda639b99c090d530f269..6ebc359fdb3ae2be4bd33298667fdd44699884e3 100644 (file)
@@ -24,6 +24,7 @@
 
 #include "util_cuda.h"
 #include "util_debug.h"
+#include "util_foreach.h"
 #include "util_math.h"
 #include "util_opencl.h"
 #include "util_opengl.h"
@@ -41,7 +42,31 @@ DeviceTask::DeviceTask(Type type_)
 {
 }
 
-void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
+void DeviceTask::split_max_size(list<DeviceTask>& tasks, int max_size)
+{
+       int num;
+
+       if(type == DISPLACE) {
+               num = (displace_w + max_size - 1)/max_size;
+       }
+       else {
+               max_size = max(1, max_size/w);
+               num = (h + max_size - 1)/max_size;
+       }
+
+       split(tasks, num);
+}
+
+/* Convenience overload: split into num parts via the list based split()
+ * and push every resulting part onto queue, in order. */
+void DeviceTask::split(ThreadQueue<DeviceTask>& queue, int num)
+{
+       list<DeviceTask> parts;
+       split(parts, num);
+
+       for(list<DeviceTask>::iterator it = parts.begin(); it != parts.end(); ++it)
+               queue.push(*it);
+}
+
+void DeviceTask::split(list<DeviceTask>& tasks, int num)
 {
        if(type == DISPLACE) {
                num = min(displace_w, num);
@@ -55,7 +80,7 @@ void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
                        task.displace_x = tx;
                        task.displace_w = tw;
 
-                       tasks.push(task);
+                       tasks.push_back(task);
                }
        }
        else {
@@ -70,7 +95,7 @@ void DeviceTask::split(ThreadQueue<DeviceTask>& tasks, int num)
                        task.y = ty;
                        task.h = th;
 
-                       tasks.push(task);
+                       tasks.push_back(task);
                }
        }
 }
index be6a3f144edd6ca75c9625133d44fdae3391d020..a6a81e7b3268d10359f4e1d8e321f41873c5036c 100644 (file)
@@ -23,6 +23,7 @@
 
 #include "device_memory.h"
 
+#include "util_list.h"
 #include "util_string.h"
 #include "util_thread.h"
 #include "util_types.h"
@@ -67,7 +68,10 @@ public:
        int displace_x, displace_w;
 
        DeviceTask(Type type = PATH_TRACE);
+
+       void split(list<DeviceTask>& tasks, int num);
        void split(ThreadQueue<DeviceTask>& tasks, int num);
+       void split_max_size(list<DeviceTask>& tasks, int max_size);
 };
 
 /* Device */
index 3a1d3032d6e13bccfc319636d3ebf7db9eb62bf7..6014dd0fdb7bf9d98a5592490d20969cd1f913c8 100644 (file)
@@ -25,6 +25,7 @@
 #include "device.h"
 #include "device_intern.h"
 
+#include "util_foreach.h"
 #include "util_map.h"
 #include "util_math.h"
 #include "util_md5.h"
@@ -52,6 +53,7 @@ public:
        map<string, device_memory*> mem_map;
        device_ptr null_mem;
        bool device_initialized;
+       string platform_name;
 
        const char *opencl_error_string(cl_int err)
        {
@@ -175,6 +177,10 @@ public:
                if(opencl_error(ciErr))
                        return;
 
+               char name[256];
+               clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
+               platform_name = name;
+
                cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
                if(opencl_error(ciErr))
                        return;
@@ -191,7 +197,7 @@ public:
        {
                char version[256];
 
-               int major, minor, req_major = 1, req_minor = 0;
+               int major, minor, req_major = 1, req_minor = 1;
 
                clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
 
@@ -277,14 +283,11 @@ public:
        {
                string build_options = " -cl-fast-relaxed-math ";
                
-               /* Full Shading only on NVIDIA cards at the moment */
-               char vendor[256];
-
-               clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(vendor), &vendor, NULL);
-               string name = vendor;
-               
-               if(name == "NVIDIA CUDA")
-                       build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ ";
+               /* full shading only on NVIDIA cards at the moment */
+               if(platform_name == "NVIDIA CUDA")
+                       build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
+               if(platform_name == "Apple")
+                       build_options += " -D__CL_NO_FLOAT3__ ";
 
                return build_options;
        }
@@ -657,12 +660,24 @@ public:
                opencl_assert(clFinish(cqCommandQueue));
        }
 
-       void task_add(DeviceTask& task)
+       void task_add(DeviceTask& maintask)
        {
-               if(task.type == DeviceTask::TONEMAP)
-                       tonemap(task);
-               else if(task.type == DeviceTask::PATH_TRACE)
-                       path_trace(task);
+               list<DeviceTask> tasks;
+
+               /* arbitrary limit to work around apple ATI opencl issue */
+               if(platform_name == "Apple")
+                       maintask.split_max_size(tasks, 76800);
+               else
+                       tasks.push_back(maintask);
+
+               DeviceTask task;
+
+               foreach(DeviceTask& task, tasks) {
+                       if(task.type == DeviceTask::TONEMAP)
+                               tonemap(task);
+                       else if(task.type == DeviceTask::PATH_TRACE)
+                               path_trace(task);
+               }
        }
 
        void task_wait()
index e17544bf7afbcd1b2b2adc80df9aa09de0fcb573..939a74660a1c807df9a5872b57fc19f5ea16d39d 100644 (file)
@@ -143,7 +143,7 @@ endif()
 #set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
 #add_custom_command(
 #      OUTPUT ${KERNEL_PREPROCESSED}
-#      COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DWITH_OPENCL -o ${KERNEL_PREPROCESSED}
+#      COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
 #      DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
index 5515966807b399059a7a7f9b13efb54ac7d6479f..9fbd8566ecda2ae82c8282d11c06c56e8d76937c 100644 (file)
 /* no namespaces in opencl */
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
-#define WITH_OPENCL
+
+#ifdef __CL_NO_FLOAT3__
+#define float3 float4
+#endif
+
+#ifdef __CL_NOINLINE__
+#define __noinline __attribute__((noinline))
+#else
+#define __noinline
+#endif
 
 /* in opencl all functions are device functions, so leave this empty */
 #define __device
-#define __device_inline
-#define __device_noinline
+#define __device_inline __device
+#define __device_noinline  __device __noinline
 
 /* no assert in opencl */
 #define kernel_assert(cond)
@@ -68,7 +77,11 @@ __device float kernel_tex_interp_(__global float *data, int width, float x)
 #endif
 
 #define make_float2(x, y) ((float2)(x, y))
+#ifdef __CL_NO_FLOAT3__
+/* float3 is emulated with float4 here; use a single precision literal for the
+ * padding component so no double constant ends up in the kernel */
+#define make_float3(x, y, z) ((float4)(x, y, z, 0.0f))
+#else
 #define make_float3(x, y, z) ((float3)(x, y, z))
+#endif
 #define make_float4(x, y, z, w) ((float4)(x, y, z, w))
 #define make_int2(x, y) ((int2)(x, y))
 #define make_int3(x, y, z) ((int3)(x, y, z))