cycles: Add an nvrtc based cubin cli compiler.
authorRay Molenkamp <github@lazydodo.com>
Sat, 3 Feb 2018 17:59:09 +0000 (10:59 -0700)
committerRay Molenkamp <github@lazydodo.com>
Sat, 3 Feb 2018 17:59:09 +0000 (10:59 -0700)
nvcc is very picky regarding compiler versions, severely limiting the compiler we can use, this commit adds a nvrtc based compiler that'll allow us to build the cubins even if the host compiler is unsupported. for details see D2913.

Differential Revision: http://developer.blender.org/D2913

CMakeLists.txt
extern/cuew/include/cuew.h
extern/cuew/src/cuew.c
intern/cycles/CMakeLists.txt
intern/cycles/app/CMakeLists.txt
intern/cycles/app/cycles_cubin_cc.cpp [new file with mode: 0644]
intern/cycles/device/device_cuda.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/kernel_compat_cuda.h

index 2152f6e51ae22cb929cd8f735dd2f70c12769f4f..213b5e96f06af842c6574439d0f8f145026b30e8 100644 (file)
@@ -409,12 +409,14 @@ option(WITH_CYCLES_STANDALONE_GUI "Build Cycles standalone with GUI" OFF)
 option(WITH_CYCLES_OSL                         "Build Cycles with OSL support" ${_init_CYCLES_OSL})
 option(WITH_CYCLES_OPENSUBDIV          "Build Cycles with OpenSubdiv support" ${_init_CYCLES_OPENSUBDIV})
 option(WITH_CYCLES_CUDA_BINARIES       "Build Cycles CUDA binaries" OFF)
+option(WITH_CYCLES_CUBIN_COMPILER      "Build cubins with nvrtc based compiler instead of nvcc" OFF)
 set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 CACHE STRING "CUDA architectures to build binaries for")
 mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
 unset(PLATFORM_DEFAULT)
 option(WITH_CYCLES_LOGGING     "Build Cycles with logging support" ON)
 option(WITH_CYCLES_DEBUG       "Build Cycles with extra debug capabilities" OFF)
 option(WITH_CYCLES_NATIVE_ONLY "Build Cycles with native kernel only (which fits current CPU, use for development only)" OFF)
+mark_as_advanced(WITH_CYCLES_CUBIN_COMPILER)
 mark_as_advanced(WITH_CYCLES_LOGGING)
 mark_as_advanced(WITH_CYCLES_DEBUG)
 mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
index 0eace96bc3ff7d3bb184ddf777610e07bb68472c..f5009d4f2c7b7e08e4d8093c895e2908b53b8a1f 100644 (file)
@@ -1323,6 +1323,7 @@ int cuewInit(void);
 const char *cuewErrorString(CUresult result);
 const char *cuewCompilerPath(void);
 int cuewCompilerVersion(void);
+int cuewNvrtcVersion(void);
 
 #ifdef __cplusplus
 }
index 962059bfcce98bc505545afdf9a2b3ab854a9847..b68dc59704994018b139c2f25c90bb364cb2f4e5 100644 (file)
@@ -329,7 +329,7 @@ int cuewInit(void) {
 #ifdef _WIN32
   /* Expected in c:/windows/system or similar, no path needed. */
   const char *cuda_paths[] = {"nvcuda.dll", NULL};
-  const char *nvrtc_paths[] = {"nvrtc.dll", NULL};
+  const char *nvrtc_paths[] = {"nvrtc64_80.dll", "nvrtc64_90.dll", "nvrtc64_91.dll", NULL};
 #elif defined(__APPLE__)
   /* Default installation path. */
   const char *cuda_paths[] = {"/usr/local/cuda/lib/libcuda.dylib", NULL};
@@ -766,6 +766,15 @@ const char *cuewCompilerPath(void) {
   return NULL;
 }
 
+int cuewNvrtcVersion(void) {
+  int major, minor;
+  if (nvrtcVersion) {
+    nvrtcVersion(&major, &minor);
+    return 10 * major + minor;
+  }
+  return 0;
+}
+
 int cuewCompilerVersion(void) {
   const char *path = cuewCompilerPath();
   const char *marker = "Cuda compilation tools, release ";
index 543fb5b2cf8e81dd1db6a4a1550234baaf6c57d7..3da1170ec77fb3095f712c62407e9ddc826b8479 100644 (file)
@@ -242,6 +242,24 @@ if(CMAKE_COMPILER_IS_GNUCXX)
        unset(_has_no_error_unused_macros)
 endif()
 
+if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
+       if(MSVC)
+               set(MAX_MSVC 1800)
+               if(${CUDA_VERSION} EQUAL "8.0")
+                       set(MAX_MSVC 1900)
+               elseif(${CUDA_VERSION} EQUAL "9.0")
+                       set(MAX_MSVC 1910)
+               elseif(${CUDA_VERSION} EQUAL "9.1")
+                       set(MAX_MSVC 1911)
+               endif()
+               if (NOT MSVC_VERSION LESS ${MAX_MSVC})
+                       message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.")
+                       set(WITH_CYCLES_CUBIN_COMPILER ON)
+               endif()
+               unset(MAX_MSVC)
+       endif()
+endif()
+
 
 # Subdirectories
 
@@ -254,7 +272,7 @@ if(WITH_CYCLES_NETWORK)
        add_definitions(-DWITH_NETWORK)
 endif()
 
-if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK)
+if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
        add_subdirectory(app)
 endif()
 
index 08a3931ef46b0b22da7498f695489c338b048f0a..9ebeceb1659eb11598252bbda0f7e0db03bc8506 100644 (file)
@@ -120,3 +120,27 @@ if(WITH_CYCLES_NETWORK)
        endif()
        unset(SRC)
 endif()
+
+if(WITH_CYCLES_CUBIN_COMPILER)
+       # 32 bit windows is special, nvrtc is not supported on x86, so even
+       # though we are building 32 bit blender a 64 bit cubin_cc will have
+       # to be build to compile the cubins.
+       if(MSVC AND NOT CMAKE_CL_64)
+               Message("cycles_cubin_cc not supported on x86")
+       else()
+               set(SRC cycles_cubin_cc.cpp)
+               set(INC ../../../extern/cuew/include)
+               add_executable(cycles_cubin_cc ${SRC})
+               include_directories(${INC})
+               target_link_libraries(cycles_cubin_cc
+                       extern_cuew
+                       ${OPENIMAGEIO_LIBRARIES}
+                       ${PLATFORM_LINKLIBS}
+               )
+               if(NOT CYCLES_STANDALONE_REPOSITORY)
+                       target_link_libraries(cycles_cubin_cc bf_intern_guardedalloc)
+               endif()
+               unset(SRC)
+               unset(INC)
+       endif()
+endif()
diff --git a/intern/cycles/app/cycles_cubin_cc.cpp b/intern/cycles/app/cycles_cubin_cc.cpp
new file mode 100644 (file)
index 0000000..c1f3974
--- /dev/null
@@ -0,0 +1,284 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include <OpenImageIO/argparse.h>
+#include <OpenImageIO/filesystem.h>
+
+#include "cuew.h"
+
+#ifdef _MSC_VER
+# include <Windows.h>
+#endif
+
+using std::string;
+using std::vector;
+
+class CompilationSettings
+{
+public:
+       CompilationSettings()
+       : target_arch(0),
+         bits(64),
+         verbose(false),
+         fast_math(false)
+       {}
+
+       string cuda_toolkit_dir;
+       string input_file;
+       string output_file;
+       string ptx_file;
+       vector<string> defines;
+       vector<string> includes;
+       int target_arch;
+       int bits;
+       bool verbose;
+       bool fast_math;
+};
+
+bool compile_cuda(CompilationSettings &settings)
+{
+       const char* headers[] = {"stdlib.h" , "float.h", "math.h", "stdio.h"};
+       const char* header_content[] = {"\n", "\n", "\n", "\n"};
+
+       printf("Building %s\n", settings.input_file.c_str());
+
+       string code;
+       if(!OIIO::Filesystem::read_text_file(settings.input_file, code)) {
+               fprintf(stderr, "Error: unable to read %s\n", settings.input_file.c_str());
+               return false;
+       }
+
+       vector<string> options;
+       for(size_t i = 0; i < settings.includes.size(); i++) {
+               options.push_back("-I" + settings.includes[i]);
+       }
+
+       for(size_t i = 0; i < settings.defines.size(); i++) {
+               options.push_back("-D" + settings.defines[i]);
+       }
+
+       options.push_back("-arch=compute_" + std::to_string(settings.target_arch));
+       options.push_back("--device-as-default-execution-space");
+       if(settings.fast_math)
+               options.push_back("--use_fast_math");
+
+       nvrtcProgram prog;
+       nvrtcResult result = nvrtcCreateProgram(&prog,
+               code.c_str(),                    // buffer
+               NULL,                            // name
+               sizeof(headers) / sizeof(void*), // numHeaders
+               header_content,                  // headers
+               headers);                        // includeNames
+
+       if(result != NVRTC_SUCCESS) {
+               fprintf(stderr, "Error: nvrtcCreateProgram failed (%x)\n\n", result);
+               return false;
+       }
+
+       /* Tranfer options to a classic C array. */
+       vector<const char*> opts(options.size());
+       for(size_t i = 0; i < options.size(); i++) {
+               opts[i] = options[i].c_str();
+       }
+
+       result = nvrtcCompileProgram(prog, options.size(), &opts[0]);
+
+       if(result != NVRTC_SUCCESS) {
+               fprintf(stderr, "Error: nvrtcCompileProgram failed (%x)\n\n", result);
+
+               size_t log_size;
+               nvrtcGetProgramLogSize(prog, &log_size);
+
+               vector<char> log(log_size);
+               nvrtcGetProgramLog(prog, &log[0]);
+               fprintf(stderr, "%s\n", &log[0]);
+
+               return false;
+       }
+
+       /* Retrieve the ptx code. */
+       size_t ptx_size;
+       result = nvrtcGetPTXSize(prog, &ptx_size);
+       if(result != NVRTC_SUCCESS) {
+               fprintf(stderr, "Error: nvrtcGetPTXSize failed (%x)\n\n", result);
+               return false;
+       }
+
+       vector<char> ptx_code(ptx_size);
+       result = nvrtcGetPTX(prog, &ptx_code[0]);
+       if(result != NVRTC_SUCCESS) {
+               fprintf(stderr, "Error: nvrtcGetPTX failed (%x)\n\n", result);
+               return false;
+       }
+
+       /* Write a file in the temp folder with the ptx code. */
+       settings.ptx_file = OIIO::Filesystem::temp_directory_path() + "/" + OIIO::Filesystem::unique_path();
+       FILE * f= fopen(settings.ptx_file.c_str(), "wb");
+       fwrite(&ptx_code[0], 1, ptx_size, f);
+       fclose(f);
+
+       return true;
+}
+
+bool link_ptxas(CompilationSettings &settings)
+{
+       string cudapath = "";
+       if(settings.cuda_toolkit_dir.size())
+               cudapath = settings.cuda_toolkit_dir + "/bin/";
+
+       string ptx = "\"" +cudapath + "ptxas\" " + settings.ptx_file +
+                                       " -o " + settings.output_file +
+                                       " --gpu-name sm_" + std::to_string(settings.target_arch) +
+                                       " -m" + std::to_string(settings.bits);
+
+       if(settings.verbose)
+               ptx += " --verbose";
+
+       int pxresult = system(ptx.c_str());
+       if(pxresult) {
+               fprintf(stderr, "Error: ptxas failed (%x)\n\n", pxresult);
+               return false;
+       }
+
+       if(!OIIO::Filesystem::remove(settings.ptx_file)) {
+               fprintf(stderr, "Error: removing %s\n\n", settings.ptx_file.c_str());
+       }
+
+       return true;
+}
+
+bool init(CompilationSettings &settings)
+{
+#ifdef _MSC_VER
+       if(settings.cuda_toolkit_dir.size()) {
+               SetDllDirectory((settings.cuda_toolkit_dir + "/bin").c_str());
+       }
+#endif
+
+       int cuewresult = cuewInit();
+       if(cuewresult != CUEW_SUCCESS) {
+               fprintf(stderr, "Error: cuew init fialed (0x%x)\n\n", cuewresult);
+               return false;
+       }
+
+       if(cuewNvrtcVersion() < 80) {
+               fprintf(stderr, "Error: only cuda 8 and higher is supported, %d\n\n", cuewCompilerVersion());
+               return false;
+       }
+
+       if(!nvrtcCreateProgram) {
+               fprintf(stderr, "Error: nvrtcCreateProgram not resolved\n");
+               return false;
+       }
+
+       if(!nvrtcCompileProgram) {
+               fprintf(stderr, "Error: nvrtcCompileProgram not resolved\n");
+               return false;
+       }
+
+       if(!nvrtcGetProgramLogSize) {
+               fprintf(stderr, "Error: nvrtcGetProgramLogSize not resolved\n");
+               return false;
+       }
+
+       if(!nvrtcGetProgramLog) {
+               fprintf(stderr, "Error: nvrtcGetProgramLog not resolved\n");
+               return false;
+       }
+
+       if(!nvrtcGetPTXSize) {
+               fprintf(stderr, "Error: nvrtcGetPTXSize not resolved\n");
+               return false;
+       }
+
+       if(!nvrtcGetPTX) {
+               fprintf(stderr, "Error: nvrtcGetPTX not resolved\n");
+               return false;
+       }
+
+       return true;
+}
+
+bool parse_parameters(int argc, const char **argv, CompilationSettings &settings)
+{
+       OIIO::ArgParse ap;
+       ap.options("Usage: cycles_cubin_cc [options]",
+               "-target %d", &settings.target_arch, "target shader model",
+               "-m %d", &settings.bits, "Cuda architecture bits",
+               "-i %s", &settings.input_file, "Input source filename",
+               "-o %s", &settings.output_file, "Output cubin filename",
+               "-I %L", &settings.includes, "Add additional includepath",
+               "-D %L", &settings.defines, "Add additional defines",
+               "-v", &settings.verbose, "Use verbose logging",
+               "--use_fast_math", &settings.fast_math, "Use fast math",
+               "-cuda-toolkit-dir %s", &settings.cuda_toolkit_dir, "path to the cuda toolkit binary directory",
+               NULL);
+
+       if(ap.parse(argc, argv) < 0) {
+               fprintf(stderr, "%s\n", ap.geterror().c_str());
+               ap.usage();
+               return false;
+       }
+
+       if(!settings.output_file.size()) {
+               fprintf(stderr, "Error: Output file not set(-o), required\n\n");
+               return false;
+       }
+
+       if(!settings.input_file.size()) {
+               fprintf(stderr, "Error: Input file not set(-i, required\n\n");
+               return false;
+       }
+
+       if(!settings.target_arch) {
+               fprintf(stderr, "Error: target shader model not set (-target), required\n\n");
+               return false;
+       }
+
+       return true;
+}
+
+int main(int argc, const char **argv)
+{
+       CompilationSettings settings;
+
+       if(!parse_parameters(argc, argv, settings)) {
+               fprintf(stderr, "Error: invalid parameters, exiting\n");
+               exit(EXIT_FAILURE);
+       }
+
+       if(!init(settings)) {
+               fprintf(stderr, "Error: initialization error, exiting\n");
+               exit(EXIT_FAILURE);
+       }
+
+       if(!compile_cuda(settings)) {
+               fprintf(stderr, "Error: compilation error, exiting\n");
+               exit(EXIT_FAILURE);
+       }
+
+       if(!link_ptxas(settings)) {
+               exit(EXIT_FAILURE);
+       }
+
+       return 0;
+}
index f3548640679482578ae9d8c8c84f0aa9e5eff1c3..18a83672a6d13111ce844926001dbec3cc2acb16 100644 (file)
@@ -348,7 +348,6 @@ public:
                const DeviceRequestedFeatures& requested_features,
                bool filter=false, bool split=false)
        {
-               const int cuda_version = cuewCompilerVersion();
                const int machine = system_cpu_bits();
                const string source_path = path_get("source");
                const string include_path = source_path;
@@ -356,10 +355,8 @@ public:
                                              "--ptxas-options=\"-v\" "
                                              "--use_fast_math "
                                              "-DNVCC "
-                                             "-D__KERNEL_CUDA_VERSION__=%d "
                                               "-I\"%s\"",
                                              machine,
-                                             cuda_version,
                                              include_path.c_str());
                if(!filter && use_adaptive_compilation()) {
                        cflags += " " + requested_features.get_build_options();
index 8f7bc7996a4a7dbb0992c34bfa513d4056e99739..3b76b3403e70d12bfdeff9e8c7bf634ab1758bfa 100644 (file)
@@ -356,55 +356,67 @@ if(WITH_CYCLES_CUDA_BINARIES)
        set(cuda_cubins)
 
        macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
+               set(cuda_cubin ${name}_${arch}.cubin)
+               set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+
+               set(cuda_flags
+                       -D CCL_NAMESPACE_BEGIN=
+                       -D CCL_NAMESPACE_END=
+                       -D NVCC
+                       -m ${CUDA_BITS}
+                       -I ${CMAKE_CURRENT_SOURCE_DIR}/..
+                       -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
+                       --use_fast_math
+                       -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin})
+
                if(${experimental})
-                       set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+                       set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
                        set(name ${name}_experimental)
                endif()
 
-               set(cuda_cubin ${name}_${arch}.cubin)
-
                if(WITH_CYCLES_DEBUG)
-                       set(cuda_debug_flags "-D__KERNEL_DEBUG__")
-               else()
-                       set(cuda_debug_flags "")
+                       set(cuda_flags ${cuda_flags} -D __KERNEL_DEBUG__)
                endif()
 
-               set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE})
-               set(cuda_nvcc_version ${CUDA_VERSION})
-
-               set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
-               set(cuda_math_flags "--use_fast_math")
-
-               set(cuda_kernel_src "/kernels/cuda/${name}.cu")
-
-               add_custom_command(
-                       OUTPUT ${cuda_cubin}
-                       COMMAND ${cuda_nvcc_command}
-                                       -arch=${arch}
-                                       ${CUDA_NVCC_FLAGS}
-                                       -m${CUDA_BITS}
-                                       --cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
-                                       -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
-                                       --ptxas-options="-v"
-                                       ${cuda_arch_flags}
-                                       ${cuda_version_flags}
-                                       ${cuda_math_flags}
-                                       ${flags}
-                                       ${cuda_debug_flags}
-                                       -I${CMAKE_CURRENT_SOURCE_DIR}/..
-                                       -DCCL_NAMESPACE_BEGIN=
-                                       -DCCL_NAMESPACE_END=
-                                       -DNVCC
-                       DEPENDS ${sources})
+               if(WITH_CYCLES_CUBIN_COMPILER)
+                       string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)
+
+                       # Needed to find libnvrtc-builtins.so. Can't do it from inside
+                       # cycles_cubin_cc since the env variable is read before main()
+                       if(APPLE)
+                               set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+                                       -E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib")
+                       elseif(UNIX)
+                               set(CUBIN_CC_ENV ${CMAKE_COMMAND}
+                                       -E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+                       endif()
 
+                       add_custom_command(
+                               OUTPUT ${cuda_cubin}
+                               COMMAND ${CUBIN_CC_ENV}
+                                               "$<TARGET_FILE:cycles_cubin_cc>"
+                                               -target ${CUDA_ARCH}
+                                               -i ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
+                                               ${cuda_flags}
+                                               -v
+                                               -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
+                               DEPENDS ${sources} cycles_cubin_cc)
+               else()
+                       add_custom_command(
+                               OUTPUT ${cuda_cubin}
+                               COMMAND ${CUDA_NVCC_EXECUTABLE}
+                                               -arch=${arch}
+                                               ${CUDA_NVCC_FLAGS}
+                                               --cubin
+                                               ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
+                                               --ptxas-options="-v"
+                                               ${cuda_flags}
+                               DEPENDS ${sources})
+               endif()
                delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
                list(APPEND cuda_cubins ${cuda_cubin})
 
-               unset(cuda_extra_flags)
                unset(cuda_debug_flags)
-
-               unset(cuda_nvcc_command)
-               unset(cuda_nvcc_version)
        endmacro()
 
        foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
@@ -412,12 +424,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
                        message(STATUS "CUDA binaries for ${arch} disabled, not supported by CUDA 9.")
                else()
                        # Compile regular kernel
-                       CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
                        CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
+                       CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
 
                        if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
                                # Compile split kernel
-                               CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
+                               CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D __SPLIT__" ${cuda_sources} FALSE)
                        endif()
                endif()
        endforeach()
index fa512f80e41e462ae71583792ddee01df0d27621..7b66bdc169e3d2ea282d66213875d9c73bf09387 100644 (file)
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <float.h>
-#include <stdint.h>
+/* Manual definitions so we can compile without CUDA toolkit. */
+
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef unsigned short half;
+typedef unsigned long long CUtexObject;
+
+#define FLT_MAX 1.175494350822287507969e-38f
+#define FLT_MIN 340282346638528859811704183484516925440.0f
+
+__device__ half __float2half(const float f)
+{
+       half val;
+       asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+       return val;
+}
 
 /* Qualifier wrappers for different names on different devices */