From f126cf5972dd7369d1497d1949b9515432721e41 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Fri, 27 Sep 2024 10:11:46 +0200 Subject: [PATCH] [SYCL][NFC] Optimize verifyUsedKernelBundle usage (#15496) This PR intended to reduce amount of template instantiations performed during both host & device compilation passes, as well as reduce amount of LLVM IR emitted during host compilation pass. `handler::verifyUsedKernelBundle` is a one-line helper which was inlined to avoid one instantiation per-kernel and emission of associated LLVM IR. Its uses (now inlined) were also moved under `#ifndef __SYCL_DEVICE_ONLY__` to reduce amount of code device compiler has to parse. --- sycl/include/sycl/handler.hpp | 42 ++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 105f33ee68df..a61c1dfff8e2 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -774,10 +774,6 @@ class __SYCL_EXPORT handler { /// /// \param KernelName is the name of the SYCL kernel to check that the used /// kernel bundle contains. - template void verifyUsedKernelBundle() { - verifyUsedKernelBundleInternal( - detail::string_view{detail::getKernelName()}); - } void verifyUsedKernelBundleInternal(detail::string_view KernelName); /// Stores lambda to the template-free object @@ -1233,7 +1229,6 @@ class __SYCL_EXPORT handler { // conflicts they should be included in the name. using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); // Range rounding can be disabled by the user. // Range rounding is not done on the host device. @@ -1254,6 +1249,8 @@ class __SYCL_EXPORT handler { kernel_parallel_for_wrapper(Wrapper); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); // We are executing over the rounded range, but there are still // items/ids that are are constructed in ther range rounded // kernel use items/ids in the user range, which means that @@ -1279,6 +1276,8 @@ class __SYCL_EXPORT handler { kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); processProperties(), PropertiesT>(Props); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); @@ -1315,7 +1314,6 @@ class __SYCL_EXPORT handler { // conflicts they should be included in the name. using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; static_assert( @@ -1329,6 +1327,8 @@ class __SYCL_EXPORT handler { kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); detail::checkValueRange(ExecutionRange); setNDRangeDescriptor(std::move(ExecutionRange)); processProperties(), PropertiesT>(Props); @@ -1405,7 +1405,6 @@ class __SYCL_EXPORT handler { // conflicts they should be included in the name. using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; (void)NumWorkGroups; @@ -1413,6 +1412,8 @@ class __SYCL_EXPORT handler { kernel_parallel_for_work_group_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); processProperties(), PropertiesT>(Props); detail::checkValueRange(NumWorkGroups); setNDRangeDescriptor(NumWorkGroups, /*SetNumWorkGroups=*/true); @@ -1446,7 +1447,6 @@ class __SYCL_EXPORT handler { // conflicts they should be included in the name. using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; (void)NumWorkGroups; @@ -1455,6 +1455,8 @@ class __SYCL_EXPORT handler { kernel_parallel_for_work_group_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); processProperties(), PropertiesT>(Props); nd_range ExecRange = nd_range(NumWorkGroups * WorkGroupSize, WorkGroupSize); @@ -1728,9 +1730,10 @@ class __SYCL_EXPORT handler { using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); kernel_single_task_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); // No need to check if range is out of INT_MAX limits as it's compile-time // known constant. setNDRangeDescriptor(range<1>{1}); @@ -2024,7 +2027,6 @@ class __SYCL_EXPORT handler { throwIfActionIsCreated(); using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, @@ -2033,6 +2035,8 @@ class __SYCL_EXPORT handler { (void)WorkItemOffset; kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); detail::checkValueRange(NumWorkItems, WorkItemOffset); setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset)); StoreLambda( @@ -2165,10 +2169,11 @@ class __SYCL_EXPORT handler { setHandlerKernelBundle(Kernel); using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); (void)Kernel; kernel_single_task(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); // No need to check if range is out of INT_MAX limits as it's compile-time // known constant setNDRangeDescriptor(range<1>{1}); @@ -2200,12 +2205,13 @@ class __SYCL_EXPORT handler { setHandlerKernelBundle(Kernel); using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; (void)Kernel; (void)NumWorkItems; kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); @@ -2239,13 +2245,14 @@ class __SYCL_EXPORT handler { setHandlerKernelBundle(Kernel); using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; (void)Kernel; (void)NumWorkItems; (void)WorkItemOffset; kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); detail::checkValueRange(NumWorkItems, WorkItemOffset); setNDRangeDescriptor(std::move(NumWorkItems), std::move(WorkItemOffset)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); @@ -2278,13 +2285,14 @@ class __SYCL_EXPORT handler { setHandlerKernelBundle(Kernel); using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; (void)Kernel; (void)NDRange; kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); MKernel = detail::getSyclObjImpl(std::move(Kernel)); @@ -2321,13 +2329,14 @@ class __SYCL_EXPORT handler { setHandlerKernelBundle(Kernel); using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; (void)Kernel; (void)NumWorkGroups; kernel_parallel_for_work_group_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); detail::checkValueRange(NumWorkGroups); setNDRangeDescriptor(NumWorkGroups, /*SetNumWorkGroups=*/true); MKernel = detail::getSyclObjImpl(std::move(Kernel)); @@ -2361,7 +2370,6 @@ class __SYCL_EXPORT handler { setHandlerKernelBundle(Kernel); using NameT = typename detail::get_kernel_name_t::name; - verifyUsedKernelBundle(); using LambdaArgType = sycl::detail::lambda_arg_type>; (void)Kernel; @@ -2369,6 +2377,8 @@ class __SYCL_EXPORT handler { (void)WorkGroupSize; kernel_parallel_for_work_group_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ + verifyUsedKernelBundleInternal( + detail::string_view{detail::getKernelName()}); nd_range ExecRange = nd_range(NumWorkGroups * WorkGroupSize, WorkGroupSize); detail::checkValueRange(ExecRange);