I implemented a simple kernel that converts data between different data types. This kernel demonstrates reasonably good bandwidth utilization on an NVIDIA RTX TITAN GPU (250–550 GB/s depending on data types), which is similar to what clpeak reports.
However, when I run it on an MI200, bandwidth utilization is very poor (~45 GB/s), although clpeak shows ~1100 GB/s.
I used the latest commit: 599b1b9. OS: Linux.
Kernel(clickable)#include <chrono> #include <cstdint> #include <iostream> #include <CL/sycl.hpp> #include <sycl/ext/oneapi/experimental/bfloat16.hpp> using sycl::ext::oneapi::experimental::bfloat16; using namespace std::chrono; using namespace std; using namespace sycl; enum class dt_t : int { bf16, f16, f32, s32, s8, u8 }; template <dt_t dt> struct dt_traits; #define DECLARE_TRAIT(dt, t, n) \ template <> \ struct dt_traits<dt_t::dt> { \ using type = t; \ static constexpr size_t size = sizeof(t); \ static constexpr char *name = (decltype(name))n; \ }; DECLARE_TRAIT(bf16, bfloat16, "bf16") DECLARE_TRAIT(f16, half, "f16") DECLARE_TRAIT(f32, float, "f32") DECLARE_TRAIT(s32, int32_t, "s32") DECLARE_TRAIT(s8, int8_t, "s8") DECLARE_TRAIT(u8, uint8_t, "u8") #undef DECLARE_TRAIT template <dt_t sdt, dt_t ddt> struct transform_kernel_t { using src_type = typename dt_traits<sdt>::type; using dst_type = typename dt_traits<ddt>::type; transform_kernel_t(const void *src, void *dst, int nelems) : src(src), dst(dst), nelems(nelems) {} void operator()(nd_item<1> id) const { const auto local_id = id.get_global_id(0); if (local_id < nelems) { const auto *src_typed = reinterpret_cast<const src_type *>(src); auto *dst_typed = reinterpret_cast<dst_type *>(dst); dst_typed[local_id] = static_cast<dst_type>(src_typed[local_id]); } } const void *src; void *dst; int nelems; }; void *allocate_buffer(dt_t dt, int nelems, queue q) { #define CASE(dt) \ case dt_t::dt: \ return malloc_shared(nelems * sizeof(dt_traits<dt_t::dt>::type), q); switch (dt) { CASE(f32) CASE(f16) CASE(bf16) CASE(s32) CASE(s8) CASE(u8) default: throw std::runtime_error("unexpected dt"); } #undef CASE } void init_buffer(void *ptr, dt_t dt, int nelems) { #define CASE(dt) \ case dt_t::dt: \ reinterpret_cast<dt_traits<dt_t::dt>::type *>(ptr)[i] \ = (i % 2 == 0) ? 
4 : 9; \ break; for (int i = 0; i < nelems; i++) { switch (dt) { CASE(f32) CASE(f16) CASE(bf16) CASE(s32) CASE(s8) CASE(u8) default: throw std::runtime_error("unexpected dt"); } } #undef CASE } void check_buffer(void *ptr, dt_t dt, int nelems) { #define CASE(dt) \ case dt_t::dt: { \ auto *p = reinterpret_cast<dt_traits<dt_t::dt>::type *>(ptr); \ const auto exp = (i % 2 == 0) ? 4 : 9; \ if (p[i] != exp) throw std::runtime_error("result mismatch"); \ break; \ } for (int i = 0; i < nelems; i++) { switch (dt) { CASE(f32) CASE(f16) CASE(bf16) CASE(s32) CASE(s8) CASE(u8) default: throw std::runtime_error("unexpected dt"); } } printf("Test passed\n"); #undef CASE } sycl::nd_range<1> get_nd_range(const device &dev, int nelems) { const size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>(); const size_t max_work_item = dev.get_info<sycl::info::device::max_work_item_sizes>()[0]; const size_t optimal_ls = std::min(max_wg_size, max_work_item); const size_t ls = std::min((size_t)nelems, optimal_ls); const size_t gs = nelems % ls ? 
(nelems / ls + 1) * ls : nelems; printf("ls:%lu, gs:%lu\n", ls, gs); return {{gs}, {ls}}; } int main() { constexpr int64_t nelems = 1024 * 1024 * 256; constexpr int64_t niters = 10; auto dev = device(gpu_selector_v); auto q = queue(dev); #define SUBMIT_CASE(sdt, ddt) \ { \ printf("%s -> %s\n", dt_traits<dt_t::sdt>::name, \ dt_traits<dt_t::ddt>::name); \ void *src = allocate_buffer(dt_t::sdt, nelems, q); \ void *dst = allocate_buffer(dt_t::ddt, nelems, q); \ init_buffer(src, dt_t::sdt, nelems); \ const auto nd_range = get_nd_range(dev, nelems); \ /* Warm-up run */ \ auto e = q.submit([&](handler &cgh) { \ transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \ cgh.parallel_for(nd_range, tk); \ }); \ q.wait_and_throw(); \ \ auto start = high_resolution_clock::now(); \ for (int i = 0; i < niters; i++) { \ e = q.submit([&](handler &cgh) { \ cgh.depends_on({e}); \ transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \ cgh.parallel_for(nd_range, tk); \ }); \ } \ q.wait_and_throw(); \ auto end = high_resolution_clock::now(); \ \ try { \ check_buffer(dst, dt_t::ddt, nelems); \ } catch (std::exception & e) { \ std::cout << e.what() << std::endl; \ return 1; \ } \ /* Time in seconds */ \ double time = (duration_cast<microseconds>(end - start)).count() \ / niters / 1e6f; \ /* Size in GB */ \ double size = ((dt_traits<dt_t::sdt>::size * nelems) \ + (dt_traits<dt_t::ddt>::size * nelems)) \ / 1e9; \ printf("size(gb):%.2f, time(sec):%f, BW:%f\n", size, time, \ size / time); \ free(src, q); \ free(dst, q); \ } //----------------------- SUBMIT_CASE(f16, f16) SUBMIT_CASE(f16, f32) SUBMIT_CASE(f16, s32) SUBMIT_CASE(f16, s8) SUBMIT_CASE(f16, u8) //----------------------- SUBMIT_CASE(f32, f32) SUBMIT_CASE(f32, f16) SUBMIT_CASE(f32, s32) SUBMIT_CASE(f32, s8) SUBMIT_CASE(f32, u8) //----------------------- SUBMIT_CASE(s32, s32) SUBMIT_CASE(s32, f32) SUBMIT_CASE(s32, f16) SUBMIT_CASE(s32, s8) SUBMIT_CASE(s32, u8) //----------------------- SUBMIT_CASE(s8, s8) 
SUBMIT_CASE(s8, f32) SUBMIT_CASE(s8, f16) SUBMIT_CASE(s8, s32) SUBMIT_CASE(s8, u8) //----------------------- SUBMIT_CASE(u8, u8) SUBMIT_CASE(u8, f32) SUBMIT_CASE(u8, f16) SUBMIT_CASE(u8, s32) SUBMIT_CASE(u8, s8) //----------------------- #undef SUBMIT_CASE return 0; }
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4