Skip to content

Commit

Permalink
Merge pull request #1311 from yenong-amd/release/rocm-rel-5.6
Browse files Browse the repository at this point in the history
Hotfix for rocblas initialization time issues
  • Loading branch information
amcamd authored May 30, 2023
2 parents 089c30b + 4f24b81 commit 19f14cd
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 34 deletions.
9 changes: 5 additions & 4 deletions clients/benchmarks/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -941,7 +941,7 @@ int run_bench_test(bool init,
{
if(init)
{
static int runOnce = (rocblas_client_initialize(), 0); // Initialize rocBLAS
static int runOnce = (rocblas_parallel_initialize(1), 0); // Initialize rocBLAS
}

rocblas_cout << std::setiosflags(std::ios::fixed)
Expand Down Expand Up @@ -1177,8 +1177,6 @@ void gpu_thread_init_device(int id,
{
CHECK_HIP_ERROR(hipSetDevice(id));

rocblas_client_initialize();

Arguments a(arg);
std::string name_filter = "";
a.cold_iters = 1;
Expand Down Expand Up @@ -1207,6 +1205,9 @@ int run_bench_gpu_test(int parallel_devices,
return 1;

// initialization
rocblas_parallel_initialize(parallel_devices);

// run cold call on each device
auto thread_init = std::make_unique<std::thread[]>(parallel_devices);

for(int id = 0; id < parallel_devices; ++id)
Expand All @@ -1215,7 +1216,7 @@ int run_bench_gpu_test(int parallel_devices,
for(int id = 0; id < parallel_devices; ++id)
thread_init[id].join();

// synchronzied launch of cold & hot calls
// synchronized launch of cold & hot calls
auto thread = std::make_unique<std::thread[]>(parallel_devices);

for(int id = 0; id < parallel_devices; ++id)
Expand Down
84 changes: 64 additions & 20 deletions clients/common/utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,42 +352,86 @@ void rocblas_local_handle::rocblas_stream_end_capture()
#endif
}

void rocblas_parallel_initialize_thread(int id, size_t& memory_used)
{
size_t before_init, after_init, total_memory;
CHECK_HIP_ERROR(hipSetDevice(id));
CHECK_HIP_ERROR(hipMemGetInfo(&before_init, &total_memory));
rocblas_initialize();
CHECK_HIP_ERROR(hipMemGetInfo(&after_init, &total_memory));
memory_used = before_init - after_init;
}

/*!
* Initialize rocBLAS for the current HIP device and report
* the time taken to complete the initialization. This is to
* avoid costly startup time at the first call on that device.
* Internal use for benchmark & testing.
* Initialize rocBLAS for the requested number of HIP devices
* and report the time taken to complete the initialization.
* This is to avoid costly startup time at the first call on
* that device. Internal use for benchmark & testing.
* Initializes devices indexed from 0 to parallel_devices-1.
* If parallel_devices is 1, hipSetDevice should be called
* before calling this function.
*/
void rocblas_client_initialize()
void rocblas_parallel_initialize(int parallel_devices)
{
// when executed on a CPU under normal load( Disk I/O, memory etc.),
// this routine completes execution under max limit of 12 seconds.
// The minimum time it takes to complete varies based on
// the architecture & build options used while building the library.
// Setting a max duration of 5 seconds for rocblas library initialization to complete.
constexpr static int max_duration = 5;
auto thread = std::make_unique<std::thread[]>(parallel_devices);
std::vector<size_t> init_memory(parallel_devices);

// Store the start timepoint of rocblas initialize
auto start_time = std::chrono::steady_clock::now();

rocblas_initialize();
if(parallel_devices == 1)
{
size_t before_init, after_init, total_memory;
CHECK_HIP_ERROR(hipMemGetInfo(&before_init, &total_memory));
rocblas_initialize();
CHECK_HIP_ERROR(hipMemGetInfo(&after_init, &total_memory));
init_memory[0] = before_init - after_init;
}
else
{

for(int id = 0; id < parallel_devices; ++id)
thread[id]
= std::thread(rocblas_parallel_initialize_thread, id, std::ref(init_memory[id]));
for(int id = 0; id < parallel_devices; ++id)
thread[id].join();
}

// Store the end timepoint of rocblas initialize
auto end_time = std::chrono::steady_clock::now();

// Compute the time taken to load the Tensile kernels (in seconds).
auto total_library_initialize_time
= std::chrono::duration_cast<std::chrono::seconds>(end_time - start_time).count();

// Compute the time taken to load the Tensile kernels (in milliseconds).
auto init_time_in_ms
= std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();

rocblas_cout << "\nrocBLAS info: Time taken to complete rocBLAS library initialization is "
<< init_time_in_ms << " milliseconds." << std::endl;

// If initialization time exceeds the max duration, display the following info message.
if(total_library_initialize_time > max_duration)
rocblas_cerr << "\nrocBLAS info: rocBLAS initialization exceeded the max duration of "
<< max_duration << " seconds. Check CPU's load metrics." << std::endl;
// Calculate average initialization time per GPU
auto avg_init_time_in_ms = init_time_in_ms / parallel_devices;
if(parallel_devices > 1)
{
rocblas_cout
<< "\nrocBLAS info: Average time taken to complete rocBLAS library initialization "
"per device is "
<< avg_init_time_in_ms << " milliseconds." << std::endl;
}

// If average initialization time exceeds the max duration, display the following info message.
constexpr static int max_duration = 5000;
if(avg_init_time_in_ms > max_duration)
rocblas_cerr << "\nrocBLAS info: average time to initialize each device exceeded the max "
"duration of "
<< max_duration << " milliseconds. Check CPU's load metrics." << std::endl;

constexpr static float max_memory = 1.0;
auto max_library_size
= *std::max_element(std::begin(init_memory), std::end(init_memory)) * 1.0e-9;

rocblas_cout << "\nrocBLAS info: maximum library size per device is " << max_library_size
<< " GB." << std::endl;
if(max_library_size > max_memory)
rocblas_cerr << "\nrocBLAS info: max kernel library size " << max_library_size
<< " GB exceeds the max recommended memory " << max_memory
<< " GB. Check library logic file sizes." << std::endl;
}
8 changes: 4 additions & 4 deletions clients/gtest/multiheaded_gtest.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* ************************************************************************
* Copyright (C) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (C) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -66,9 +66,6 @@ namespace
{
CHECK_HIP_ERROR(hipSetDevice(id));

//Initialize rocblas
rocblas_client_initialize();

rocblas_operation transa = rocblas_operation_none, transb = rocblas_operation_transpose;
float alpha = 1.1, beta = 0.9;
rocblas_int m = 1023, n = 1024, k = 1025;
Expand Down Expand Up @@ -191,6 +188,9 @@ namespace
<< std::endl;
return;
}

rocblas_parallel_initialize(count);

auto thread = std::make_unique<std::thread[]>(count);

for(int id = 0; id < count; ++id)
Expand Down
13 changes: 8 additions & 5 deletions clients/include/utility.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,15 @@
#define NOOP (void)0

/*!
* Initialize rocBLAS for the current HIP device and report
* the time taken to complete the initialization. This is used to
* avoid costly startup time at the first call on that device.
* Internal use for benchmark & testing.
* Initialize rocBLAS for the requested number of HIP devices
* and report the time taken to complete the initialization.
* This is to avoid costly startup time at the first call on
* that device. Internal use for benchmark & testing.
* Initializes devices indexed from 0 to parallel_devices-1.
* If parallel_devices is 1, hipSetDevice should be called
* before calling this function.
*/
void rocblas_client_initialize();
void rocblas_parallel_initialize(int parallel_devices);

/* ============================================================================================ */
/*! \brief local handle which is automatically created and destroyed */
Expand Down
2 changes: 1 addition & 1 deletion clients/samples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ foreach( exe ${sample_list_fortran} )
endforeach( )

foreach( exe ${sample_list_all} )
target_link_libraries( ${exe} PRIVATE roc::rocblas )
target_link_libraries( ${exe} PRIVATE roc::rocblas Threads::Threads )

set_target_properties( ${exe} PROPERTIES
CXX_STANDARD 14
Expand Down
5 changes: 5 additions & 0 deletions library/src/tensile_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,9 @@ namespace
if(!skip_xnack.empty()
&& codeObjectFile.find(skip_xnack) != std::string::npos)
continue;
// Skip experimental libraries
if(codeObjectFile.find("Experimental") != std::string::npos)
continue;
adapter.loadCodeObjectFile(codeObjectFile.c_str());
} while(FindNextFileA(hfine, &finddata));
}
Expand All @@ -703,6 +706,8 @@ namespace
std::string cofile = glob_result.gl_pathv[i];
if(!skip_xnack.empty() && cofile.find(skip_xnack) != std::string::npos)
continue;
if(cofile.find("Experimental") != std::string::npos)
continue;
adapter.loadCodeObjectFile(cofile);
}
}
Expand Down

0 comments on commit 19f14cd

Please sign in to comment.