Skip to content

Commit

Permalink
[ORO-0] bitcode/cubin linking APIs (#40)
Browse files Browse the repository at this point in the history
* [ORO-0] Link apis.

* [ORO-0] Forgot to add.

* [ORO-0] Linking test.

* [ORO-0] Add orortcGetBitcode/orortcGetBitcodeSize

* [ORO-0] Update link unit tests with comments

* [ORO-0] Change test for CUBIN instead of PTX

* [ORO-0] Fix loadfile to use binary mode, remove printf in kernel

* [ORO-0] Adding hiprtc to workaround the regression in 22.7.1 driver released at 7/26/2022.

* [ORO-0] Created win64 subdir.

* [ORO-0] Load amdhip first, then hiprtc.

* [ORO-0] Remove assert from hiprtc library checks

* [ORO-0] Add gfx1030 bitcode for navi21

* [MNN-0] Fix premake and add more link testcases

* [ORO-0] Update a link_null_name testcase

* [ORO-0] Make unit tests more stable on CUDA

* [ORO-0] Update bitcode for gfx1030

* [ORO-0] Add bitcodes for navi1,2, vega

* [ORO-0] Add hiprtc.dll and comgr dll

* [ORO-0] Add gfx906 bitcodes

* [ORO-0] Support unit tests on both HIP and CUDA

* [ORO-0] Update dlls and bitcodes

* [ORO-0] Update bitcodes and generation script

* [ORO-0] Minor fixes in bundled bitcode unit tests

* [ORO-0] Fix typo in options

* [ORO-0] Fix getCUBIN/PTX signatures

* [ORO-0] Fix unit tests and generate fatbin for CUDA

* [ORO-0] Regenerate fatbin and fix script

* [ORO-0] Cleanup

* [ORO-0] Update bundled bitcodes to only contain navi21 for now

* [ORO-0] Updated bundled bitcode

* [ORO-0] add ORO_LAUNCH_PARAMS_*

* [ORO-0] Add unit test for orortcLinkAddFile

* [ORO-0] Add unittest scripts for TC

* [ORO-0] Set separate LAUNCH_PARAM_END for HIP/CUDA

* [ORO-0] Add bitcode+bundled bitcode link test

* [ORO-0] Cleanup

* [ORO-0] Fix typo in script

* [ORO-0] Update linux TC script

Co-authored-by: takahiroharada <[email protected]>
  • Loading branch information
jammm and takahiroharada authored Aug 19, 2022
1 parent d78fb81 commit 6ba85ab
Show file tree
Hide file tree
Showing 57 changed files with 814 additions and 159 deletions.
58 changes: 58 additions & 0 deletions Orochi/Orochi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ orortcResult nvrtc2oro( nvrtcResult a )
return (orortcResult)a;
}

// Reinterprets a CUDA driver result code as an orortcResult; the numeric value is passed through unchanged.
inline orortcResult cu2orortc( CUresult a )
{
	return static_cast<orortcResult>( a );
}


// Dispatch helpers used inside API wrapper functions.
// - If the CUDA driver bit is set in the API mask, return the result of cu<cuname> converted by cu2oro.
// - If the API is exactly ORO_API_HIP, return the result of hip<hipname> converted by hip2oro.
// - If neither test matches, nothing is returned and control continues after the macro.
// __ORO_FUNC1 tests s_api; __ORO_FUNC1X tests the caller-supplied API expression instead.
// NOTE(review): each macro expands to two bare `if` statements, so it should only be used as a
// full statement and never directly under an unbraced if/else.
#define __ORO_FUNC1( cuname, hipname ) if( s_api & ORO_API_CUDADRIVER ) return cu2oro( cu##cuname ); if( s_api == ORO_API_HIP ) return hip2oro( hip##hipname );
#define __ORO_FUNC1X( API, cuname, hipname ) if( API & ORO_API_CUDADRIVER ) return cu2oro( cu##cuname ); if( API == ORO_API_HIP ) return hip2oro( hip##hipname );
//#define __ORO_FUNC2( cudaname, hipname ) if( s_api == ORO_API_CUDA ) return cuda2oro( cuda##cudaname ); if( s_api == ORO_API_HIP ) return hip2oro( hip##hipname );
Expand Down Expand Up @@ -727,6 +730,18 @@ orortcResult OROAPI orortcGetProgramLogSize(orortcProgram prog, size_t* logSizeR
GetProgramLogSize( (hiprtcProgram)prog, logSizeRet ) );
return ORORTC_ERROR_INTERNAL_ERROR;
}
// Retrieves the compiled binary of `prog` into the caller-provided `bitcode` buffer.
// The request is routed through __ORORTC_FUNC1 with GetCUBIN for the NVRTC side and GetBitcode
// for the HIPRTC side.
// NOTE(review): __ORORTC_FUNC1 is defined outside this view; presumably it prefixes the names
// with nvrtc/hiprtc and returns the converted result for the active API — confirm.
// NOTE(review): `bitcode` presumably has to be at least as large as the size reported by
// orortcGetBitcodeSize — confirm against callers.
orortcResult OROAPI orortcGetBitcode(orortcProgram prog, char* bitcode)
{
__ORORTC_FUNC1( GetCUBIN( (nvrtcProgram)prog, bitcode ),
GetBitcode( (hiprtcProgram)prog, bitcode ) );
// Fallback result, presumably reached only when the macro above matched no API.
return ORORTC_ERROR_INTERNAL_ERROR;
}
// Queries the size of the compiled binary of `prog` and writes it to `bitcodeSizeRet`.
// The request is routed through __ORORTC_FUNC1 with GetCUBINSize for the NVRTC side and
// GetBitcodeSize for the HIPRTC side.
// NOTE(review): __ORORTC_FUNC1 is defined outside this view; presumably it prefixes the names
// with nvrtc/hiprtc and returns the converted result for the active API — confirm.
orortcResult OROAPI orortcGetBitcodeSize(orortcProgram prog, size_t* bitcodeSizeRet)
{
__ORORTC_FUNC1( GetCUBINSize( (nvrtcProgram)prog, bitcodeSizeRet ),
GetBitcodeSize( (hiprtcProgram)prog, bitcodeSizeRet ) );
// Fallback result, presumably reached only when the macro above matched no API.
return ORORTC_ERROR_INTERNAL_ERROR;
}
orortcResult OROAPI orortcGetCode(orortcProgram prog, char* code)
{
__ORORTC_FUNC1( GetPTX( (nvrtcProgram)prog, code ),
Expand All @@ -740,6 +755,49 @@ orortcResult OROAPI orortcGetCodeSize(orortcProgram prog, size_t* codeSizeRet)
return ORORTC_ERROR_INTERNAL_ERROR;
}

// Creates a linker state and stores its handle in *link_state_ptr.
// Forwards to cuLinkCreate when the CUDA driver bit is set in s_api, and to hiprtcLinkCreate for
// every other API value. Option enums and the state handle are cast to the backend's own types.
orortcResult OROAPI orortcLinkCreate( unsigned int num_options, orortcJIT_option* option_ptr, void** option_vals_pptr, orortcLinkState* link_state_ptr )
{
	const bool useCudaDriver = ( s_api & ORO_API_CUDADRIVER ) != 0;
	if( !useCudaDriver )
	{
		hiprtcJIT_option* hipOptions = (hiprtcJIT_option*)option_ptr;
		hiprtcLinkState* hipState = (hiprtcLinkState*)link_state_ptr;
		return hiprtc2oro( hiprtcLinkCreate( num_options, hipOptions, option_vals_pptr, hipState ) );
	}

	CUjit_option* cuOptions = (CUjit_option*)option_ptr;
	CUlinkState* cuState = (CUlinkState*)link_state_ptr;
	return cu2orortc( cuLinkCreate( num_options, cuOptions, option_vals_pptr, cuState ) );
}
// Adds the file at `file_path` as a link input of kind `input_type` to an existing linker state.
// Forwards to cuLinkAddFile when the CUDA driver bit is set in s_api, and to hiprtcLinkAddFile
// for every other API value.
orortcResult OROAPI orortcLinkAddFile( orortcLinkState link_state_ptr, orortcJITInputType input_type, const char* file_path, unsigned int num_options, orortcJIT_option* options_ptr, void** option_values )
{
	const bool useCudaDriver = ( s_api & ORO_API_CUDADRIVER ) != 0;
	if( !useCudaDriver )
	{
		hiprtcLinkState hipState = (hiprtcLinkState)link_state_ptr;
		hiprtcJITInputType hipType = (hiprtcJITInputType)input_type;
		hiprtcJIT_option* hipOptions = (hiprtcJIT_option*)options_ptr;
		return hiprtc2oro( hiprtcLinkAddFile( hipState, hipType, file_path, num_options, hipOptions, option_values ) );
	}

	CUlinkState cuState = (CUlinkState)link_state_ptr;
	CUjitInputType cuType = (CUjitInputType)input_type;
	CUjit_option* cuOptions = (CUjit_option*)options_ptr;
	return cu2orortc( cuLinkAddFile( cuState, cuType, file_path, num_options, cuOptions, option_values ) );
}
// Adds an in-memory image (`image`, `image_size` bytes, labelled `name`) as a link input of kind
// `input_type` to an existing linker state.
// Forwards to cuLinkAddData when the CUDA driver bit is set in s_api, and to hiprtcLinkAddData
// for every other API value.
orortcResult OROAPI orortcLinkAddData( orortcLinkState link_state_ptr, orortcJITInputType input_type, void* image, size_t image_size, const char* name, unsigned int num_options, orortcJIT_option* options_ptr, void** option_values )
{
	const bool useCudaDriver = ( s_api & ORO_API_CUDADRIVER ) != 0;
	if( !useCudaDriver )
	{
		hiprtcLinkState hipState = (hiprtcLinkState)link_state_ptr;
		hiprtcJITInputType hipType = (hiprtcJITInputType)input_type;
		hiprtcJIT_option* hipOptions = (hiprtcJIT_option*)options_ptr;
		return hiprtc2oro( hiprtcLinkAddData( hipState, hipType, image, image_size, name, num_options, hipOptions, option_values ) );
	}

	CUlinkState cuState = (CUlinkState)link_state_ptr;
	CUjitInputType cuType = (CUjitInputType)input_type;
	CUjit_option* cuOptions = (CUjit_option*)options_ptr;
	return cu2orortc( cuLinkAddData( cuState, cuType, image, image_size, name, num_options, cuOptions, option_values ) );
}
// Finishes the link and reports the produced binary through `bin_out` / `size_out`.
// Forwards to cuLinkComplete when the CUDA driver bit is set in s_api, and to hiprtcLinkComplete
// for every other API value.
orortcResult OROAPI orortcLinkComplete( orortcLinkState link_state_ptr, void** bin_out, size_t* size_out )
{
	const bool useCudaDriver = ( s_api & ORO_API_CUDADRIVER ) != 0;
	if( !useCudaDriver )
	{
		hiprtcLinkState hipState = (hiprtcLinkState)link_state_ptr;
		return hiprtc2oro( hiprtcLinkComplete( hipState, bin_out, size_out ) );
	}

	CUlinkState cuState = (CUlinkState)link_state_ptr;
	return cu2orortc( cuLinkComplete( cuState, bin_out, size_out ) );
}
// Destroys a linker state previously obtained from orortcLinkCreate.
// Forwards to cuLinkDestroy when the CUDA driver bit is set in s_api, and to hiprtcLinkDestroy
// for every other API value.
orortcResult OROAPI orortcLinkDestroy( orortcLinkState link_state_ptr )
{
	const bool useCudaDriver = ( s_api & ORO_API_CUDADRIVER ) != 0;
	if( !useCudaDriver )
	{
		hiprtcLinkState hipState = (hiprtcLinkState)link_state_ptr;
		return hiprtc2oro( hiprtcLinkDestroy( hipState ) );
	}

	CUlinkState cuState = (CUlinkState)link_state_ptr;
	return cu2orortc( cuLinkDestroy( cuState ) );
}

// Implementation of oroPointerGetAttributes is hacky due to differences between CUDA and HIP
oroError OROAPI oroPointerGetAttributes(oroPointerAttribute* attr, oroDeviceptr dptr)
{
Expand Down
213 changes: 61 additions & 152 deletions Orochi/Orochi.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,18 @@ typedef struct ioroStream_t* oroStream;
typedef struct ioroPointerAttribute_t* oroPointerAttribute;
typedef unsigned long long oroTextureObject;
typedef void* oroExternalMemory_t;

// Pointer-typed handles for the runtime-compilation API. The pointed-to structs are only
// forward-declared here; the wrappers in Orochi.cpp cast these handles to the backend's own
// handle types (CUlinkState / hiprtcLinkState, nvrtcProgram / hiprtcProgram).
typedef struct iorortcLinkState* orortcLinkState;
typedef struct _orortcProgram* orortcProgram;

// Bit flags named after host-memory registration modes.
// NOTE(review): presumably these mirror the backend's hostRegister flag values — confirm.
#define oroHostRegisterPortable 0x01
#define oroHostRegisterMapped 0x02
#define oroHostRegisterIoMemory 0x04

// Marker values, typed as void*, for describing a packed launch-parameter buffer.
// The end marker differs per backend (0x03 for HIP, 0x00 for CUDA), so callers have to pick the
// one matching the active API.
// NOTE(review): presumably these go into the `extra` array of a kernel launch — confirm.
#define ORO_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
#define ORO_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
#define ORO_HIP_LAUNCH_PARAM_END ((void*)0x03)
#define ORO_CUDA_LAUNCH_PARAM_END ((void*)0x00)

enum orortcResult
{
ORORTC_SUCCESS = 0,
Expand Down Expand Up @@ -482,157 +487,52 @@ typedef enum oroJitOption {
oroJitOptionFastCompile,
oroJitOptionNumOptions,
} oroJitOption;
/*
typedef enum HIPjit_target_enum {
ORO_TARGET_COMPUTE_20 = 20,
ORO_TARGET_COMPUTE_21 = 21,
ORO_TARGET_COMPUTE_30 = 30,
ORO_TARGET_COMPUTE_32 = 32,
ORO_TARGET_COMPUTE_35 = 35,
ORO_TARGET_COMPUTE_37 = 37,
ORO_TARGET_COMPUTE_50 = 50,
ORO_TARGET_COMPUTE_52 = 52,
ORO_TARGET_COMPUTE_53 = 53,
ORO_TARGET_COMPUTE_60 = 60,
ORO_TARGET_COMPUTE_61 = 61,
ORO_TARGET_COMPUTE_62 = 62,
ORO_TARGET_COMPUTE_70 = 70,
ORO_TARGET_COMPUTE_73 = 73,
ORO_TARGET_COMPUTE_75 = 75,
} HIPjit_target;
typedef enum HIPjit_fallback_enum {
ORO_PREFER_PTX = 0,
ORO_PREFER_BINARY,
} HIPjit_fallback;
typedef enum HIPjit_cacheMode_enum {
ORO_JIT_CACHE_OPTION_NONE = 0,
ORO_JIT_CACHE_OPTION_CG,
ORO_JIT_CACHE_OPTION_CA,
} HIPjit_cacheMode;
typedef enum HIPjitInputType_enum {
ORO_JIT_INPUT_HIPBIN = 0,
ORO_JIT_INPUT_PTX,
ORO_JIT_INPUT_FATBINARY,
ORO_JIT_INPUT_OBJECT,
ORO_JIT_INPUT_LIBRARY,
ORO_JIT_NUM_INPUT_TYPES,
} HIPjitInputType;
typedef struct HIPlinkState_st* HIPlinkState;
typedef enum hipGLDeviceList {
hipGLDeviceListAll = 1, ///< All hip devices used by current OpenGL context.
hipGLDeviceListCurrentFrame = 2, ///< Hip devices used by current OpenGL context in current
///< frame
hipGLDeviceListNextFrame = 3 ///< Hip devices used by current OpenGL context in next
///< frame.
} hipGLDeviceList;
typedef enum hipGraphicsRegisterFlags {
hipGraphicsRegisterFlagsNone = 0,
hipGraphicsRegisterFlagsReadOnly = 1, ///< HIP will not write to this registered resource
hipGraphicsRegisterFlagsWriteDiscard =
2, ///< HIP will only write and will not read from this registered resource
hipGraphicsRegisterFlagsSurfaceLoadStore = 4, ///< HIP will bind this resource to a surface
hipGraphicsRegisterFlagsTextureGather =
8 ///< HIP will perform texture gather operations on this registered resource
} hipGraphicsRegisterFlags;
typedef enum HIPgraphicsRegisterFlags_enum {
ORO_GRAPHICS_REGISTER_FLAGS_NONE = 0x00,
ORO_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01,
ORO_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
ORO_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04,
ORO_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08,
} HIPgraphicsRegisterFlags;
typedef enum HIPgraphicsMapResourceFlags_enum {
ORO_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
ORO_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
ORO_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
} HIPgraphicsMapResourceFlags;
typedef enum HIParray_cubemap_face_enum {
ORO_HIPBEMAP_FACE_POSITIVE_X = 0x00,
ORO_HIPBEMAP_FACE_NEGATIVE_X = 0x01,
ORO_HIPBEMAP_FACE_POSITIVE_Y = 0x02,
ORO_HIPBEMAP_FACE_NEGATIVE_Y = 0x03,
ORO_HIPBEMAP_FACE_POSITIVE_Z = 0x04,
ORO_HIPBEMAP_FACE_NEGATIVE_Z = 0x05,
} HIParray_cubemap_face;
typedef enum hipLimit_t {
ORO_LIMIT_STACK_SIZE = 0x00,
ORO_LIMIT_PRINTF_FIFO_SIZE = 0x01,
hipLimitMallocHeapSize = 0x02,
ORO_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03,
ORO_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04,
ORO_LIMIT_MAX,
} hipLimit_t;
typedef enum hipResourceType {
hipResourceTypeArray = 0x00,
hipResourceTypeMipmaoroedArray = 0x01,
hipResourceTypeLinear = 0x02,
hipResourceTypePitch2D = 0x03,
} hipResourceType;
typedef enum hipError_t {
hipSuccess = 0,
hipErrorInvalidValue = 1,
hipErrorOutOfMemory = 2,
hipErrorNotInitialized = 3,
hipErrorDeinitialized = 4,
hipErrorProfilerDisabled = 5,
hipErrorProfilerNotInitialized = 6,
hipErrorProfilerAlreadyStarted = 7,
hipErrorProfilerAlreadyStooroed = 8,
hipErrorNoDevice = 100,
hipErrorInvalidDevice = 101,
hipErrorInvalidImage = 200,
hipErrorInvalidContext = 201,
hipErrorContextAlreadyCurrent = 202,
hipErrorMapFailed = 205,
hipErrorUnmapFailed = 206,
hipErrorArrayIsMaoroed = 207,
hipErrorAlreadyMaoroed = 208,
hipErrorNoBinaryForGpu = 209,
hipErrorAlreadyAcquired = 210,
hipErrorNotMaoroed = 211,
hipErrorNotMaoroedAsArray = 212,
hipErrorNotMaoroedAsPointer = 213,
hipErrorECCNotCorrectable = 214,
hipErrorUnsuoroortedLimit = 215,
hipErrorContextAlreadyInUse = 216,
hipErrorPeerAccessUnsuoroorted = 217,
hipErrorInvalidKernelFile = 218,
hipErrorInvalidGraphicsContext = 219,
hipErrorInvalidSource = 300,
hipErrorFileNotFound = 301,
hipErrorSharedObjectSymbolNotFound = 302,
hipErrorSharedObjectInitFailed = 303,
hipErrorOperatingSystem = 304,
hipErrorInvalidHandle = 400,
hipErrorNotFound = 500,
hipErrorNotReady = 600,
hipErrorIllegalAddress = 700,
hipErrorLaunchOutOfResources = 701,
hipErrorLaunchTimeOut = 702,
hipErrorPeerAccessAlreadyEnabled = 704,
hipErrorPeerAccessNotEnabled = 705,
hipErrorSetOnActiveProcess = 708,
hipErrorAssert = 710,
hipErrorHostMemoryAlreadyRegistered = 712,
hipErrorHostMemoryNotRegistered = 713,
hipErrorLaunchFailure = 719,
hipErrorCooperativeLaunchTooLarge = 720,
hipErrorNotSuoroorted = 801,
hipErrorUnknown = 999,
} hipError_t;
*/

// JIT/link option identifiers accepted by the orortcLink* functions.
// The link wrappers cast these values directly to CUjit_option / hiprtcJIT_option, so the numeric
// values are spelled out explicitly to make accidental renumbering visible.
typedef enum orortcJIT_option
{
	ORORTC_JIT_MAX_REGISTERS = 0,
	ORORTC_JIT_THREADS_PER_BLOCK = 1,
	ORORTC_JIT_WALL_TIME = 2,
	ORORTC_JIT_INFO_LOG_BUFFER = 3,
	ORORTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
	ORORTC_JIT_ERROR_LOG_BUFFER = 5,
	ORORTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
	ORORTC_JIT_OPTIMIZATION_LEVEL = 7,
	ORORTC_JIT_TARGET_FROM_HIPCONTEXT = 8,
	ORORTC_JIT_TARGET = 9,
	ORORTC_JIT_FALLBACK_STRATEGY = 10,
	ORORTC_JIT_GENERATE_DEBUG_INFO = 11,
	ORORTC_JIT_LOG_VERBOSE = 12,
	ORORTC_JIT_GENERATE_LINE_INFO = 13,
	ORORTC_JIT_CACHE_MODE = 14,
	ORORTC_JIT_NEW_SM3X_OPT = 15,
	ORORTC_JIT_FAST_COMPILE = 16,
	ORORTC_JIT_GLOBAL_SYMBOL_NAMES = 17,
	ORORTC_JIT_GLOBAL_SYMBOL_ADDRESS = 18,
	ORORTC_JIT_GLOBAL_SYMBOL_COUNT = 19,
	ORORTC_JIT_LTO = 20,
	ORORTC_JIT_FTZ = 21,
	ORORTC_JIT_PREC_DIV = 22,
	ORORTC_JIT_PREC_SQRT = 23,
	ORORTC_JIT_FMA = 24,
	// Count of the options listed above; not an option itself.
	ORORTC_JIT_NUM_OPTIONS = 25,
} orortcJIT_option;

// Kinds of input accepted by orortcLinkAddFile / orortcLinkAddData.
// The link wrappers cast these values directly to CUjitInputType / hiprtcJITInputType.
typedef enum orortcJITInputType
{
	// Legacy input kinds, numbered consecutively from 0.
	ORORTC_JIT_INPUT_CUBIN = 0,
	ORORTC_JIT_INPUT_PTX = 1,
	ORORTC_JIT_INPUT_FATBINARY = 2,
	ORORTC_JIT_INPUT_OBJECT = 3,
	ORORTC_JIT_INPUT_LIBRARY = 4,
	ORORTC_JIT_INPUT_NVVM = 5,
	ORORTC_JIT_NUM_LEGACY_INPUT_TYPES = 6,

	// LLVM bitcode input kinds, numbered from 100.
	ORORTC_JIT_INPUT_LLVM_BITCODE = 100,
	ORORTC_JIT_INPUT_LLVM_BUNDLED_BITCODE = 101,
	ORORTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE = 102,

	// Legacy kinds plus the three LLVM bitcode kinds.
	ORORTC_JIT_NUM_INPUT_TYPES = ( ORORTC_JIT_NUM_LEGACY_INPUT_TYPES + 3 )
} orortcJITInputType;


typedef enum oroExternalMemoryHandleType_enum {
oroExternalMemoryHandleTypeOpaqueFd = 1,
Expand Down Expand Up @@ -811,9 +711,18 @@ orortcResult OROAPI orortcDestroyProgram(orortcProgram* prog);
orortcResult OROAPI orortcGetLoweredName(orortcProgram prog, const char* name_expression, const char** lowered_name);
orortcResult OROAPI orortcGetProgramLog(orortcProgram prog, char* log);
orortcResult OROAPI orortcGetProgramLogSize(orortcProgram prog, size_t* logSizeRet);
// Binary retrieval for a compiled program. The implementation routes these to
// GetCUBIN / GetCUBINSize on the NVRTC side and GetBitcode / GetBitcodeSize on the HIPRTC side.
// NOTE(review): callers presumably query the size first and then pass a buffer of at least that
// many bytes — confirm.
orortcResult OROAPI orortcGetBitcode( orortcProgram prog, char* bitcode );
orortcResult OROAPI orortcGetBitcodeSize( orortcProgram prog, size_t* bitcodeSizeRet );
orortcResult OROAPI orortcGetCode(orortcProgram prog, char* code);
orortcResult OROAPI orortcGetCodeSize(orortcProgram prog, size_t* codeSizeRet);

// Linker API. Each function forwards to the matching cuLink* call when the CUDA driver API is
// active and to the matching hiprtcLink* call otherwise; options, input types and the link state
// handle are cast to the backend's own types.
// Creates a link state and writes its handle to *link_state_ptr.
orortcResult OROAPI orortcLinkCreate( unsigned int num_options, orortcJIT_option* option_ptr, void** option_vals_pptr, orortcLinkState* link_state_ptr );
// Adds the file at file_path as a link input of kind input_type.
orortcResult OROAPI orortcLinkAddFile( orortcLinkState link_state_ptr, orortcJITInputType input_type, const char* file_path, unsigned int num_options, orortcJIT_option* options_ptr, void** option_values );
// Adds an in-memory image of image_size bytes, labelled name, as a link input of kind input_type.
orortcResult OROAPI orortcLinkAddData( orortcLinkState link_state_ptr, orortcJITInputType input_type, void* image, size_t image_size, const char* name, unsigned int num_options, orortcJIT_option* options_ptr, void** option_values );
// Completes the link; the backend reports the output binary through bin_out and size_out.
orortcResult OROAPI orortcLinkComplete( orortcLinkState link_state_ptr, void** bin_out, size_t* size_out );
// Destroys a link state.
orortcResult OROAPI orortcLinkDestroy( orortcLinkState link_state_ptr );



enum {
ORO_SUCCESS = 0,
Expand Down
57 changes: 57 additions & 0 deletions Orochi/OrochiUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,11 @@ OrochiUtils::~OrochiUtils()
{
}

// Public entry point for reading a source file; simply delegates to
// OrochiUtilsImpl::readSourceCode with the same arguments and returns its result.
bool OrochiUtils::readSourceCode( const std::string& path, std::string& sourceCode, std::vector<std::string>* includes )
{
	const bool succeeded = OrochiUtilsImpl::readSourceCode( path, sourceCode, includes );
	return succeeded;
}

oroFunction OrochiUtils::getFunctionFromFile( oroDevice device, const char* path, const char* funcName, std::vector<const char*>* optsIn )
{
const std::string cacheName = OrochiUtilsImpl::getCacheName( path, funcName );
Expand Down Expand Up @@ -491,6 +496,58 @@ oroFunction OrochiUtils::getFunction( oroDevice device, const char* code, const
return function;
}

// Compiles `code` with the runtime compiler and stores the binary returned by orortcGetBitcode
// in `dst`.
//
// device - device whose gcnArchName is passed as --gpu-architecture when the current API is HIP.
// code   - source text handed to orortcCreateProgram.
// path   - program name handed to orortcCreateProgram.
// optsIn - optional extra compiler options, appended after the built-in ones; may be null.
// dst    - receives the compiled binary; left empty when any step fails.
//
// On a compile failure the program log (if any) is printed to std::cout. Later steps are skipped
// after a failure, so no size or data is read from a program that did not build.
void OrochiUtils::getData( oroDevice device, const char* code, const char* path, std::vector<const char*>* optsIn, std::vector<char>& dst )
{
	// Start from an empty result so a failure never leaves stale or partially written data.
	dst.clear();

	std::vector<const char*> opts;
	opts.push_back( "-std=c++17" );

	// Declared at function scope because `opts` stores a pointer into this string.
	std::string archOption = "--gpu-architecture=";
	if( oroGetCurAPI( 0 ) == ORO_API_HIP )
	{
		oroDeviceProp props;
		oroGetDeviceProperties( &props, device );
		archOption += props.gcnArchName;
		opts.push_back( archOption.c_str() );
	}

	if( optsIn )
	{
		for( const char* opt : *optsIn )
			opts.push_back( opt );
	}
	// if( oroGetCurAPI(0) == ORO_API_CUDA )
	// opts.push_back( "-G" );

	orortcProgram prog = nullptr;
	if( orortcCreateProgram( &prog, code, path, 0, 0, 0 ) != ORORTC_SUCCESS )
	{
		// No program was created, so there is nothing to compile or destroy.
		return;
	}

	bool ok = orortcCompileProgram( prog, static_cast<int>( opts.size() ), opts.data() ) == ORORTC_SUCCESS;
	if( !ok )
	{
		size_t logSize = 0;
		orortcGetProgramLogSize( prog, &logSize );
		if( logSize )
		{
			std::string log( logSize, '\0' );
			orortcGetProgramLog( prog, &log[0] );
			std::cout << log << '\n';
		}
	}

	// Initialised so a failed size query can never feed an indeterminate value into resize().
	size_t codeSize = 0;
	if( ok )
	{
		ok = orortcGetBitcodeSize( prog, &codeSize ) == ORORTC_SUCCESS;
	}

	if( ok && codeSize > 0 )
	{
		dst.resize( codeSize );
		ok = orortcGetBitcode( prog, dst.data() ) == ORORTC_SUCCESS;
	}

	if( !ok )
	{
		dst.clear();
	}

	// The program is released on both the success and the failure path.
	orortcDestroyProgram( &prog );
}

void OrochiUtils::launch1D( oroFunction func, int nx, const void** args, int wgSize, unsigned int sharedMemBytes )
{
int4 tpb = { wgSize, 1, 0 };
Expand Down
Loading

0 comments on commit 6ba85ab

Please sign in to comment.