diff --git a/CMakeLists.txt b/CMakeLists.txt
index 050da5434..619b16de8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,19 +18,19 @@ include(cmake/msg_color.cmake)
 include(cmake/utils.cmake)
 include(cmake/statistic.cmake)
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS YES)
+
 # ----------------------------------------------------------------------------
 # section: global anakin version and lib name
 # ----------------------------------------------------------------------------
 cmake_minimum_required(VERSION ${MIN_CMAKE_V} FATAL_ERROR)
 
-# global anakin version 1.0.0
+# global anakin version 1.1.0
 set(VERSION_MAJOR "1")
-set(VERSION_MINOR "0")
+set(VERSION_MINOR "1")
 set(VERSION_PATCH "0")
 set(VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}")
 
-
-
 # anakin lib name and global directories
 set(anakin_lib_so "anakin")
 set(anakin_lib_static "anakin_static")
@@ -48,6 +48,7 @@ set(ANAKIN_SABER ${ANAKIN_ROOT}/saber)
 set(ANAKIN_LITE_SABER ${ANAKIN_SABER}/lite)
 set(ANAKIN_UNIT_TEST ${ANAKIN_ROOT}/test)
 set(ANAKIN_EXAMPLES ${ANAKIN_ROOT}/examples)
+set(ANAKIN_SGX ${ANAKIN_ROOT}/sgx)
 
 
 # ----------------------------------------------------------------------------
@@ -59,27 +60,39 @@ anakin_option(ANAKIN_TYPE_FP32 "define the FP32 for data precision." YES)
 anakin_option(ANAKIN_TYPE_FP16 "define the FP16 for data precision." NO)
 anakin_option(ANAKIN_TYPE_INT8 "define the INT8 for data precision." NO)
 
-#select the plantform to build
+#select the platform to build
 anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." YES)
 anakin_option(USE_X86_PLACE "Select the build mode for X86 place." YES)
 anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
 anakin_option(USE_BM_PLACE "Select the build mode for BM place." NO)
 
-# plantfrom details
+anakin_option(USE_SGX "Enable Anakin to run in Intel SGX secure enclave." NO)
+anakin_option(USE_MLU_PLACE "Select the build mode for MLU place." NO)
+
+if(USE_SGX)
+    if(NOT USE_X86_PLACE OR USE_GPU_PLACE)
+        set(USE_SGX NO)
+    endif()
+endif()
+
+# platform details
 anakin_option(NVIDIA_GPU "Use NVIDIA GPU place." YES if USE_GPU_PLACE)
 anakin_option(AMD_GPU "Use AMD GPU place." NO if USE_GPU_PLACE AND NOT NVIDIA_GPU)
 anakin_option(TARGET_ANDROID "build for android" YES if USE_ARM_PLACE)
 anakin_option(TARGET_IOS "not supported now" YES if USE_ARM_PLACE AND NOT TARGET_ANDROID)
 
+# compile options for Cambricon MLU place
+anakin_option(USE_MLU "Use MLU libs." YES if USE_MLU_PLACE)
+anakin_option(USE_BANG "Use Bang." NO)
+
 # compile options for NVIDIA_GPU place
 anakin_option(USE_CUDA "Use Cuda libs." YES if NVIDIA_GPU)
 anakin_option(USE_CUBLAS "Use Cublas libs." YES if USE_CUDA)
 anakin_option(USE_CURAND "Use Curand libs." YES if USE_CUDA)
 anakin_option(USE_CUFFT "Use CuFFT libs." YES if USE_CUDA)
 anakin_option(USE_CUDNN "Use Cudnn libs." YES if USE_CUDA)
-anakin_option(USE_TENSORRT "Use tensorrt for inference." NO)
-anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform." YES if USE_CUDA)
-anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform" NO if BUILD_CROSS_PLANTFORM)
+anakin_option(BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device platform." YES if USE_CUDA)
+anakin_option(BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device platforms" YES if BUILD_CROSS_PLANTFORM)
 
 if (NOT DEFINED AK_OUTPUT_PATH)
     set(AK_OUTPUT_PATH "output")
@@ -91,7 +104,9 @@ if((NOT BUILD_FAT_BIN) AND (NOT BUILD_CROSS_PLANTFORM) AND USE_CUDA)
 endif()
 
 if(USE_X86_PLACE)
-    if(NOT DEFINED BUILD_X86_TARGET)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        set(BUILD_X86_ARCH "clang_native")
+    elseif(NOT DEFINED BUILD_X86_TARGET)
         set(BUILD_X86_ARCH "native")
         anakin_get_cpu_arch(BUILD_X86_ARCH)
     else()
@@ -105,27 +120,31 @@ anakin_option(BUILD_CUBIN "BUILD with the -cubin option in Device mode" NO if US
 anakin_option(COMPILE_PTX "Returns a list of PTX files generated from src." NO if USE_CUDA)
 
 # common build options
-anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." YES)
+anakin_option(ENABLE_DEBUG "Enable DEBUG(default) mode." NO)
+anakin_option(RECORD_TENSOR_IN_NET "Enable tensor recording in DEBUG mode." NO)
 anakin_option(ENABLE_VERBOSE_MSG "Enable verbose=1 : compile msg during make." NO)
 anakin_option(DISABLE_ALL_WARNINGS "Disable all the warning msg during compile." YES)
 anakin_option(ENABLE_NOISY_WARNINGS "Enable noisy warning msg during compile." NO if DISABLE_ALL_WARNINGS)
 anakin_option(ENABLE_MIN_DEPENDENCY "Enable minimum dependency of third party library" NO)
 
-# using 3rd party libs 
+# SGX options
+anakin_option(SGX_SIM_MODE "Build Anakin to run in software-emulated SGX mode." YES if ENABLE_DEBUG)
+
+# using 3rd party libs
 anakin_option(USE_LOGGER "Build native logger components." YES)
 anakin_option(USE_GLOG "Build Glog components." NO if NOT USE_LOGGER)
-anakin_option(USE_PROTOBUF "Build Google protobuf components." YES)
+anakin_option(USE_NANOPB "Use nanopb, a light-weight C implementation of protobuf" YES if USE_SGX)
+anakin_option(USE_PROTOBUF "Build Google protobuf components." YES if NOT USE_NANOPB)
 anakin_option(USE_OPENCV "Use static opencv libs." NO)
 anakin_option(USE_BOOST "Use static BOOST libs." NO)
-anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID)
+anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID OR (USE_X86_PLACE AND NOT USE_SGX))
 anakin_option(USE_GTEST "Use googletest libs." NO if BUILD_WITH_UNIT_TEST)
 anakin_option(USE_PYTHON "Generate py wrappers." NO)
 anakin_option(USE_OPENCL "Use OpenCL ." YES if AMD_GPU)
 anakin_option(USE_GFLAGS "Build Google gflags components." NO)
-anakin_option(USE_MKL "Use mkl libs." NO if USE_X86_PLACE)
-anakin_option(USE_MKLML "Use MKLML libs." YES if USE_X86_PLACE)
+anakin_option(USE_MKL "Use mkl libs." YES if USE_SGX)
+anakin_option(USE_MKLML "Use MKLML libs." YES if USE_X86_PLACE AND NOT USE_SGX)
 anakin_option(USE_XBYAK "Use XBYAK libs." YES if USE_X86_PLACE)
-anakin_option(USE_OPENMP "Use Openmp when in android environment." YES if TARGET_ANDROID)
 
 # build components
 anakin_option(BUILD_WITH_UNIT_TEST "Build anakin unit test components." YES)
@@ -139,12 +158,12 @@ anakin_option(BUILD_LITE "Build anakin lite components." NO if BUILD_WITH_FRAMEW
 anakin_option(BUILD_EXAMPLES "build detection and classification examples" NO)
 
 # build target
-anakin_option(BUILD_SHARED "Build anakin shared lib." YES)
+anakin_option(BUILD_SHARED "Build anakin shared lib." YES if NOT (USE_SGX OR BUILD_WITH_STATIC))
 anakin_option(BUILD_STATIC "Build anakin static lib." YES if NOT BUILD_SHARED)
 
 
-anakin_option(ENABLE_OP_TIMER "Enable op timer mode." NO) 
-if(ENABLE_MIN_DEPENDENCY)
+anakin_option(ENABLE_OP_TIMER "Enable op timer mode." NO)
+if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND ENABLE_MIN_DEPENDENCY)
     set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--version-script,${ANAKIN_ROOT}/cmake/ak_link.lds")
 endif()
 
@@ -157,7 +176,7 @@ else()
     set(CMAKE_BUILD_TYPE Release FORCE)
 endif()
 
-if(USE_LOGGER) 
+if(USE_LOGGER)
    anakin_option(ENABLE_STACKTRACES "If enable local logger with stacktrace." YES if NOT USE_ARM_PLACE)
    anakin_option(SUPPORT_PTHREADS "If enable local logger with supporting pthreads. " YES)
 endif()
@@ -189,6 +208,11 @@ if(USE_CUDA)
     include(cmake/external/sass.cmake)
 endif()
 
+if(USE_MLU)
+    include(cmake/mlu.cmake)
+    include(cmake/external/cnrtml.cmake)
+endif()
+
 if(USE_X86_PLACE)
     if(USE_MKLML)
         include(cmake/external/mklml.cmake)
@@ -196,7 +220,9 @@ if(USE_X86_PLACE)
     if(USE_XBYAK)
         include(cmake/external/xbyak.cmake)
     endif()
-    #include(cmake/external/mkldnn.cmake)
+    if(NOT USE_SGX)
+        include(cmake/external/mkldnn.cmake)
+    endif()
 endif()
 
 if(AMD_GPU)
@@ -208,19 +234,18 @@ include(cmake/gather.cmake)
 
 
 # ----------------------------------------------------------------------------
-# section: build and install anakin 
+# section: build and install anakin
 # ----------------------------------------------------------------------------
 # add source sub_directory whick holds the cmake build module
 # fetch files of model_parser
 add_subdirectory(${ANAKIN_SABER})
 
 if(BUILD_WITH_FRAMEWORK)
-    add_subdirectory(${ANAKIN_MODEL_PARSER})
     add_subdirectory(${ANAKIN_FRAMEWORK})
-    if(BUILD_RPC) 
-        add_subdirectory(${ANAKIN_SERVICE}) 
+    if(BUILD_RPC)
+        add_subdirectory(${ANAKIN_SERVICE})
     endif()
-    if(BUILD_LITE) 
+    if(BUILD_LITE)
         add_subdirectory(${ANAKIN_LITE_FRAMEWORK})
     endif()
 endif()
@@ -235,6 +260,9 @@ if (BUILD_EXAMPLES)
     endif()
 endif()
 
+if (USE_SGX)
+    add_subdirectory(${ANAKIN_SGX})
+endif()
 
 anakin_print_statistic()
 
diff --git a/benchmark/README_GPU.md b/benchmark/README_GPU.md
index 96016c7cb..04326535a 100644
--- a/benchmark/README_GPU.md
+++ b/benchmark/README_GPU.md
@@ -9,11 +9,11 @@
 
 ## Counterpart of anakin :
 
-The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 5`** , The models which TensorRT 5 doesn't support we use the custom plugins to support.
+The counterpart of **`Anakin`** is the widely recognized high-performance inference engine **`NVIDIA TensorRT 3`**. For models that TensorRT 3 does not support, we add support through custom plugins.
 
 ## Benchmark Model
 
-The following convolutional neural networks are tested with both `Anakin` and `TenorRT5`.
+The following convolutional neural networks are tested with both `Anakin` and `TensorRT 3`.
 You can use pretrained caffe model or the model trained by youself.
 > Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md)
 
@@ -35,19 +35,21 @@ We tested them on single-GPU with single-thread.
 
 BatchSize | TensorRT | Anakin
  :---: | :---: | :---: |
- 1 | 8.53945 | 8.18737
- 2 | 14.2269 | 13.8976
- 4 | 24.2803 | 21.7976
- 8 | 45.6003 | 40.319
+ 1 | 8.85176 | 8.15362
+ 2 | 15.6517 | 13.8716
+ 4 | 26.5303 | 21.8478
+ 8 | 48.2286 | 40.496
+ 32 | 183.994 | 163.035
 
 - GPU Memory Used (`MB`)
 
 BatchSize | TensorRT | Anakin
  :---: | :---: | :---: |
- 1 | 1053.88 | 762.73
- 2 | 1055.71 | 762.41
- 4 | 1003.22 | 832.75
- 8 | 1108.77 | 926.9
+ 1 | 887 | 648
+ 2 | 965 | 733
+ 4 | 991 | 810
+ 8 | 1067 | 911
+ 32 | 1715 | 1325
 
 ### Yolo
 
@@ -56,40 +58,44 @@ We tested them on single-GPU with single-thread.
BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 8.41606| 7.07977 - 2 | 16.6588| 15.2216 - 4 | 31.9955| 30.5102 - 8 | 66.1107 | 64.3658 + 1 | 16.4623| 15.3214 + 2 | 26.7082| 25.0305 + 4 | 43.2129| 43.4758 + 8 | 80.0053 | 80.7645 + 32 | 283.352| 311.152 - GPU Memory Used (`MB`) BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 1054.71 | 299.8 - 2 | 951.51 | 347.47 - 4 | 846.9 | 438.47 - 8 | 1042.31 | 515.15 + :---: | :---: | :---: | + 1 | 1226 | 1192 + 2 | 1326 | 1269 + 4 | 1435 | 1356 + 8 | 1563 | 1434 + 32 | 2150 | 1633 ### Resnet50 - Latency (`ms`) of different batch BatchSize | TensorRT | Anakin - :---: | :---: | :---: | - 1 | 4.10063 | 3.33845 - 2 | 6.10941 | 5.54814 - 4 | 9.90233 | 10.2763 - 8 | 17.3287 | 20.0783 + :---: | :---: | :---: | + 1 | 4.26834 | 3.25853 + 2 | 6.2811 | 6.12156 + 4 | 10.1183 | 10.9219 + 8 | 18.1395 | 20.323 + 32 | 66.4728 | 83.9934 - GPU Memory Used (`MB`) BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 1059.15 | 299.86 - 2 | 1077.8 | 340.78 - 4 | 903.04 | 395 - 8 | 832.53 | 508.86 + 1 | 932 | 272 + 2 | 936 | 318 + 4 | 720 | 376 + 8 | 697 | 480 + 32 | 842 | 835 ### Resnet101 @@ -97,19 +103,21 @@ We tested them on single-GPU with single-thread. BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 7.29828 | 5.672 - 2 | 11.2037 | 9.42352 - 4 | 17.9306 | 18.0936 - 8 | 31.4804 | 35.7439 + 1 | 7.58234 | 5.66457 + 2 | 11.6014 | 10.9213 + 4 | 18.3298 | 19.3987 + 8 | 32.6523 | 37.5575 + 32 | 123.114 | 149.089 - GPU Memory Used (`MB)` BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 1161.94 | 429.22 - 2 | 1190.92 | 531.92 - 4 | 994.11 | 549.7 - 8 | 945.47 | 653.06 + 1 | 1020 | 420 + 2 | 961 | 467 + 4 | 943 | 503 + 8 | 885 | 606 + 32 | 1048 | 1077 ### MobileNet V1 @@ -117,19 +125,21 @@ We tested them on single-GPU with single-thread. BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 1.52692 | 1.39282 - 2 | 1.98091 | 2.05788 - 4 | 3.2705 | 4.03476 - 8 | 5.15652 | 7.06651 + 1 | 45.2189 | 1.39566 + 2 | 46.4538 | 2.50698 + 4 | 47.8918 | 4.38727 + 8 | 52.3636 | 8.21416 + 32 | 83.0503 | 31.33 - GPU Memory Used (`MB`) BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 1144.35 | 99.6 - 2 | 1160.03 | 199.75 - 4 | 1098 | 184.33 - 8 | 990.71 | 232.11 + 1 | 516 | 176 + 2 | 524 | 166 + 4 | 497 | 165 + 8 | 508 | 239 + 32 | 628 | 388 ### MobileNet V2 @@ -137,19 +147,21 @@ We tested them on single-GPU with single-thread. BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 1.95961 | 1.78249 - 2 | 2.8709 | 3.01144 - 4 | 4.46131 | 5.43946 - 8 | 7.161 | 10.2081 + 1 | 65.4277 | 1.80542 + 2 | 66.2048 | 3.85568 + 4 | 68.8045 | 6.80921 + 8 | 75.64 | 12.6038 + 32 | 124.09 | 47.6079 - GPU Memory Used (`MB`) BatchSize | TensorRT | Anakin :---: | :---: | :---: | - 1 | 1154.69 | 195.25 - 2 | 1187.25 | 227.6 - 4 | 1053 | 241.75 - 8 | 1062.48 | 352.18 + 1 | 341 | 293 + 2 | 353 | 301 + 4 | 385 | 319 + 8 | 421 | 351 + 32 | 637 | 551 ## How to run those Benchmark models? diff --git a/benchmark/RNN/prepare.sh b/benchmark/RNN/prepare.sh index 7762fff96..6fc9032e5 100755 --- a/benchmark/RNN/prepare.sh +++ b/benchmark/RNN/prepare.sh @@ -1,14 +1,14 @@ #!/bin/bash sdir=$(cd `dirname $0`; pwd) -if [ ! -e $sdir/data/ptb.valid.txt ]; then -echo "can not find language_data download now" -wget -P $sdir/data/ http://ojf1xbmzo.bkt.clouddn.com/ptb.valid.txt -fi +#if [ ! -e $sdir/data/ptb.valid.txt ]; then +#echo "can not find language_data download now" +#wget -P $sdir/data/ http://ojf1xbmzo.bkt.clouddn.com/ptb.valid.txt +#fi if [ ! 
-e $sdir/data/ner_data.txt ]; then echo "can not find language_data download now" -wget -P $sdir/data/ https://raw.githubusercontent.com/PaddlePaddle/models/develop/fluid/chinese_ner/data/test_files/test_part_1 +wget -P $sdir/data/ https://raw.githubusercontent.com/PaddlePaddle/models/v0.15.0-rc0/fluid/chinese_ner/data/test_files/test_part_1 for n in $(seq 30); do cat $sdir/data/test_part_1 >> $sdir/data/ner_data.txt; done rm $sdir/data/test_part_1 fi diff --git a/benchmark/RNN/tensorflow_c_benchmark/example_model.cc b/benchmark/RNN/tensorflow_c_benchmark/example_model.cc index 291f89e33..deac2f127 100644 --- a/benchmark/RNN/tensorflow_c_benchmark/example_model.cc +++ b/benchmark/RNN/tensorflow_c_benchmark/example_model.cc @@ -56,9 +56,9 @@ void SplitString(const std::string& s, int split_word_from_file( std::vector >& word_idx, - const std::string input_file_path, - const std::string split_token, - const std::string inner_split_token, + const std::string& input_file_path, + const std::string& split_token, + const std::string& inner_split_token, const int col_select) { std::ifstream infile(input_file_path.c_str()); diff --git a/cmake/compiler_options.cmake b/cmake/compiler_options.cmake index 1b41b047e..f6fab8781 100644 --- a/cmake/compiler_options.cmake +++ b/cmake/compiler_options.cmake @@ -13,22 +13,34 @@ # limitations under the License. # ---------------------------------------------------------------------------- -# section: set the compiler and linker options +# section: set the compiler and linker options # ---------------------------------------------------------------------------- set(ANAKIN_EXTRA_CXX_FLAGS "") set(ANAKIN_NVCC_FLAG "") - +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) anakin_add_compile_option(-std=c++11) anakin_add_compile_option(-fPIC) -anakin_add_compile_option(-ldl) -if(USE_ARM_PLACE ) -elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") -else() - anakin_add_compile_option(-lrt) + +if(NOT USE_SGX) + anakin_add_compile_option(-ldl) + anakin_add_compile_option(-pthread) + if(USE_ARM_PLACE) + elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + else() + anakin_add_compile_option(-lrt) + endif() endif() + +if(USE_X86_PLACE) + if (BUILD_X86_TARGET MATCHES "knl" OR ${BUILD_X86_ARCH} MATCHES "knl") + anakin_add_compile_option(-mavx512bw) + anakin_add_compile_option(-mavx512f) + endif () +endif() + anakin_add_compile_option(-W) anakin_add_compile_option(-Wall) -anakin_add_compile_option(-pthread) anakin_add_compile_option(-Werror=return-type) anakin_add_compile_option(-Werror=address) anakin_add_compile_option(-Werror=sequence-point) @@ -41,6 +53,8 @@ anakin_add_compile_option(-Wshadow) anakin_add_compile_option(-fpermissive) anakin_add_compile_option(-Wsign-promo) anakin_add_compile_option(-fdiagnostics-show-option) +anakin_add_compile_option(-Wno-missing-field-initializers) +anakin_add_compile_option(-Wno-extra) if(ENABLE_NOISY_WARNINGS) anakin_add_compile_option(-Wcast-align) @@ -54,8 +68,8 @@ else() anakin_add_compile_option(-Wno-delete-non-virtual-dtor) anakin_add_compile_option(-Wno-comment) anakin_add_compile_option(-Wno-sign-compare) - anakin_add_compile_option(-Wno-write-strings) - anakin_add_compile_option(-Wno-ignored-qualifiers) + anakin_add_compile_option(-Wno-write-strings) + anakin_add_compile_option(-Wno-ignored-qualifiers) anakin_add_compile_option(-Wno-enum-compare) anakin_add_compile_option(-Wno-missing-field-initializers) endif() @@ -63,26 +77,41 @@ endif() if(CMAKE_BUILD_TYPE MATCHES Debug) anakin_add_compile_option(-O0) 
anakin_add_compile_option(-g) - anakin_add_compile_option(-gdwarf-2) # for old version gcc and gdb. see: http://stackoverflow.com/a/15051109/673852 + anakin_add_compile_option(-gdwarf-2) # for old version gcc and gdb. see: http://stackoverflow.com/a/15051109/673852 else() - anakin_add_compile_option(-O3) - #anakin_add_compile_option(-g) - anakin_add_compile_option(-DNDEBUG) + if(USE_SGX) + anakin_add_compile_option(-Os) + else() + anakin_add_compile_option(-Ofast) + endif() + + if(USE_ARM_PLACE) + add_compile_options(-Ofast) + add_compile_options(-ffast-math) + add_compile_options(-Os) + endif() + + anakin_add_compile_option(-DNDEBUG) endif() if(TARGET_ANDROID) anakin_add_compile_option(-pie) - anakin_add_compile_option(-mfloat-abi=softfp) - anakin_add_compile_option(-mfpu=neon) - anakin_add_compile_option(-ffast-math) + add_compile_options(-ldl) anakin_add_compile_option(-lc) - set(ANAKIN_EXTRA_CXX_FLAGS "${ANAKIN_EXTRA_CXX_FLAGS} ${ANDROID_CXX_FLAGS}") + set(ANAKIN_EXTRA_CXX_FLAGS "${ANAKIN_EXTRA_CXX_FLAGS} ${ANDROID_CXX_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections") + set(MAKE_STATIC_LINKER_FLAGS "${MAKE_STATIC_LINKER_FLAGS} -Wl,--gc-sections") endif() if(TARGET_IOS) # none temp endif() +if(BUILD_STATIC OR X86_COMPILE_482) + anakin_add_compile_option(-static-libstdc++) +endif() + + if(USE_X86_PLACE) if(X86_COMPILE_482) set(CMAKE_SYSROOT /opt/compiler/gcc-4.8.2/) @@ -92,14 +121,19 @@ if(X86_COMPILE_482) set(CMAKE_EXE_LINKER_FLAGS "-Wl,-dynamic-linker,/opt/compiler/gcc-4.8.2/lib64/ld-linux-x86-64.so.2") set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - anakin_add_compile_option(-static-libstdc++) + anakin_add_compile_option(-D_GLIBCXX_USE_CXX11_ABI=0) #use std namespace for string and list rather than std::__CXX11:: +# anakin_add_compile_option(-static-libstdc++) # anakin_add_compile_option(-static-libgcc) endif() +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") anakin_add_compile_option(-fabi-version=6) - anakin_add_compile_option(-march=${BUILD_X86_ARCH}) - anakin_add_compile_option(-Ofast) - anakin_add_compile_option(-ffast-math) + anakin_add_compile_option(-fabi-compat-version=2) #add compat + anakin_add_compile_option(-march=${BUILD_X86_ARCH}) +endif() +if(USE_OPENMP) + anakin_add_compile_option(-fopenmp) +endif() anakin_add_compile_option(-Wall) anakin_add_compile_option(-Wno-comment) anakin_add_compile_option(-Wno-unused-local-typedefs) @@ -110,9 +144,9 @@ if(X86_64) anakin_add_compile_option(-Wno-long-long) endif() -set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${ANAKIN_EXTRA_CXX_FLAGS}) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ANAKIN_EXTRA_CXX_FLAGS}") -#if(WIN32) +#if(WIN32) # if(MSVC) # message(STATUS "Using msvc compiler") # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_SCL_SECURE_NO_WARNINGS") @@ -134,6 +168,4 @@ if(USE_CUDA) anakin_add_compile_option("--default-stream per-thread" NVCC) anakin_add_compile_option(-Wno-deprecated-gpu-targets NVCC) endif() - # set default nvidia gpu arch - set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1") endif() diff --git a/cmake/config/anakin_config.h.in b/cmake/config/anakin_config.h.in index d96e231bf..ccbfca14e 100644 --- a/cmake/config/anakin_config.h.in +++ b/cmake/config/anakin_config.h.in @@ -39,8 +39,6 @@ #cmakedefine USE_CUDNN -#cmakedefine USE_TENSORRT - #cmakedefine USE_PYTHON #cmakedefine USE_OPENCL @@ -56,6 +54,7 @@ #cmakedefine USE_GFLAGS + // plantform to use #cmakedefine USE_GPU_PLACE @@ -65,6 +64,10 @@ #cmakedefine USE_ARM_PLACE #cmakedefine USE_BM_PLACE +#cmakedefine 
USE_MLU_PLACE +#cmakedefine USE_MLU + +#cmakedefine USE_SGX #cmakedefine TARGET_ANDROID @@ -80,11 +83,13 @@ #cmakedefine SUPPORT_PTHREADS +#cmakedefine USE_NANOPB + // build arm lite #cmakedefine BUILD_LITE +#cmakedefine LINUX_ARM_OS - -#if defined(ANDROID) || defined(__ANDROID__) +#if defined(ANDROID) || defined(__ANDROID__) || defined(LINUX_ARM_OS) #define PLATFORM_ANDROID #define IS_MOBILE_PLATFORM #elif defined(__APPLE__) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 60b10e298..0e96e7e68 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -77,19 +77,32 @@ endmacro() # section: Find cudnn. # ---------------------------------------------------------------------------- macro(anakin_find_cudnn) + set(CUDNN_ROOT "" CACHE PATH "CUDNN root dir.") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${ANAKIN_ROOT}/third-party/cudnn/include NO_DEFAULT_PATH) if(BUILD_SHARED) - find_library(CUDNN_LIBRARY NAMES libcudnn.so + if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + find_library(CUDNN_LIBRARY NAMES libcudnn.dylib + PATHS ${CUDNN_INCLUDE_DIR}/../lib/ ${CUDNN_INCLUDE_DIR}/ + DOC "library path for cudnn.") + else() + find_library(CUDNN_LIBRARY NAMES libcudnn.so PATHS ${CUDNN_INCLUDE_DIR}/../lib64/ ${CUDNN_INCLUDE_DIR}/ - DOC "library path for cudnn.") - else() - find_library(CUDNN_LIBRARY NAMES libcudnn_static.a - PATHS ${CUDNN_INCLUDE_DIR}/../lib64/ DOC "library path for cudnn.") + endif() + else() + if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + find_library(CUDNN_LIBRARY NAMES libcudnn_static.a + PATHS ${CUDNN_INCLUDE_DIR}/../lib/ + DOC "library path for cudnn.") + else() + find_library(CUDNN_LIBRARY NAMES libcudnn_static.a + PATHS ${CUDNN_INCLUDE_DIR}/../lib64/ + DOC "library path for cudnn.") + endif() endif() if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY) @@ -177,6 +190,17 @@ macro(anakin_find_cuda) # build cuda part for local machine. if(BUILD_CROSS_PLANTFORM) + #set nvida gpu arch + set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1") + if("${CUDA_VERSION}" GREATER 9.0 OR "${CUDA_VERSION}" EQUAL 9.0) + message("${CUDA_VERSION}") + set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1;7.0")#>=9.0 + endif() + if("${CUDA_VERSION}" GREATER 10.0 OR "${CUDA_VERSION}" EQUAL 10.0) + set(ANAKIN_ARCH_LIST "3.5;5.0;6.0;6.1;7.0;7.5")#>=10.0 + message("${CUDA_VERSION}") + endif() + if(BUILD_FAT_BIN) message(STATUS "Building fat-bin for cuda code !") anakin_set_nvcc_archs_info(ANAKIN_ARCH_LIST) diff --git a/cmake/external/cnrtml.cmake b/cmake/external/cnrtml.cmake new file mode 100644 index 000000000..8f92f04c0 --- /dev/null +++ b/cmake/external/cnrtml.cmake @@ -0,0 +1,54 @@ +#=============================================================================== +# Copyright 2016-2018 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +anakin_find_mlulib() +if (${MLU_FOUND}) + return() +endif() + +include(ExternalProject) + +set(MLU_PROJECT "extern_mlu") +set(MLU_SOURCE_DIR "${ANAKIN_TEMP_THIRD_PARTY_PATH}/mlu") +set(REL_MLU_LIB "${MLU_SOURCE_DIR}/src/${MLU_PROJECT}/mlu") +set(MLU_INC "${ANAKIN_THIRD_PARTY_PATH}/mlu/include") +set(MLU_LIB "${ANAKIN_THIRD_PARTY_PATH}/mlu/lib") +set(MLU_INSTALL_ROOT ${ANAKIN_THIRD_PARTY_PATH}/mlu) + + +file(WRITE ${MLU_SOURCE_DIR}/src/install.sh + "mkdir -p ${MLU_INSTALL_ROOT}/include \n" + "mkdir -p ${MLU_INSTALL_ROOT}/lib \n" + "cp ${REL_MLU_LIB}/include/*.h ${MLU_INSTALL_ROOT}/include/ \n" + "cp ${REL_MLU_LIB}/lib/*.so ${MLU_INSTALL_ROOT}/lib \n") + + + +ExternalProject_Add( + ${MLU_PROJECT} + GIT_REPOSITORY "xxx" + GIT_TAG master + PREFIX ${MLU_SOURCE_DIR} + INSTALL_COMMAND sh ${MLU_SOURCE_DIR}/src/install.sh +) + +include_directories(${MLU_INC}) +add_library(mlu_lib SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mlu_lib PROPERTY IMPORTED_LOCATION ${MLU_LIB}/libcnrt.so ${MLU_LIB}/libcnml.so) +add_dependencies(mlu_lib ${MLU_PROJECT}) +message("mlu lib: ${MLU_LIB}") +list(APPEND ANAKIN_SABER_DEPENDENCIES mlu_lib) +list(APPEND ANAKIN_LINKER_LIBS ${MLU_LIB}/libcnrt.so ${MLU_LIB}/libcnml.so) diff --git a/cmake/external/miopen.cmake b/cmake/external/miopen.cmake index e76acb428..f8dc418ef 100644 --- a/cmake/external/miopen.cmake +++ b/cmake/external/miopen.cmake @@ -28,8 +28,8 @@ message(STATUS "Scanning external modules ${Green}MIOPEN${ColourReset} ...") ExternalProject_Add( ${MIOPEN_PROJECT}_customize - GIT_REPOSITORY "ssh://git@icode.baidu.com:8235/baidu/third-party/miopen" - GIT_TAG "cbd4e7dbad0599c7327cb43888476ab8d966f285" + GIT_REPOSITORY "xxx" + GIT_TAG "xxx" PREFIX ${ANAKIN_TEMP_THIRD_PARTY_PATH}/miopen/customize_miopen_file SOURCE_DIR ${ANAKIN_THIRD_PARTY_PATH}/miopen/customize_miopen_file CONFIGURE_COMMAND "" @@ -40,8 +40,8 @@ ExternalProject_Add( ExternalProject_Add( ${MIOPEN_PROJECT} DEPENDS ${MIOPEN_PROJECT}_customize - GIT_REPOSITORY "ssh://git@icode.baidu.com:8235/baidu/third-party/miopen" - GIT_TAG 1.4.2 + GIT_REPOSITORY "xxx" + GIT_TAG xxx PREFIX ${MIOPEN_PREFIX_DIR} CMAKE_ARGS -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX=${MIOPEN_INSTALL_ROOT} -DCMAKE_INSTALL_LIBDIR=lib -DBOOST_ROOT=${BOOST_ROOT} #LOG_DOWNLOAD 1 diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 05befe0f0..4bb7ac174 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,8 +20,8 @@ set(MKLDNN_PROJECT "extern_mkldnn") set(MKLDNN_SOURCES_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/mkldnn) set(MKLDNN_INSTALL_DIR ${ANAKIN_THIRD_PARTY_PATH}/mkldnn) set(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) -set(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") +set(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib64/libmkldnn.so" CACHE FILEPATH "mkldnn library." 
FORCE) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib64") include_directories(${MKLDNN_INC_DIR}) @@ -29,38 +29,33 @@ set(MKLDNN_DEPENDS ${MKLML_PROJECT}) message(STATUS "Scanning external modules ${Green}MKLDNNN${ColourReset}...") - -if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS "5.4") - set(MKLDNN_CFLAG) +if(X86_COMPILE_482) + set(MKLDNN_SYS_ROOT "/opt/compiler/gcc-4.8.2/") + message(STATUS ${MKLDNN_SYS_ROOT}) else() - set(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow \ - -Wno-unused-but-set-variable -Wno-unused-variable -Wno-format-truncation") + set(MKLDNN_SYS_ROOT "") endif() - -if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS "5.4") - set(MKLDNN_CXXFLAG) -else() - set(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow \ - -Wno-unused-but-set-variable -Wno-unused-variable -Wno-format-truncation") -endif() - set(MKLDNN_C_COMPILER ${CMAKE_C_COMPILER}) set(MKLDNN_CXX_COMPILER ${CMAKE_CXX_COMPILER}) ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} - GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f" + GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git" +# GIT_TAG "v0.17.1" ##v0.17.1 + GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc" ##v0.17.1 PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} - CMAKE_ARGS -DCMAKE_C_COMPILER=${MKLDNN_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${MKLDNN_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} - CMAKE_ARGS -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF + CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=lib64 +# CMAKE_ARGS -DCMAKE_C_COMPILER=${MKLDNN_C_COMPILER} +# CMAKE_ARGS -DCMAKE_CXX_COMPILER=${MKLDNN_CXX_COMPILER} +# CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} +# CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} +# CMAKE_ARGS -DCMAKE_SYSROOT=${MKLDNN_SYS_ROOT} + #CMAKE_ARGS -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF +# CMAKE_ARGS -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" ) add_library(mkldnn SHARED IMPORTED GLOBAL) @@ -71,4 +66,8 @@ list(APPEND ANAKIN_SABER_DEPENDENCIES mkldnn) list(APPEND ANAKIN_LINKER_LIBS ${MKLDNN_LIB}) +install(FILES ${MKLDNN_INSTALL_DIR}/lib64/libmkldnn.so.0 ${MKLDNN_INSTALL_DIR}/lib64/libmkldnn.so.0.18.0.0 ${MKLDNN_LIB} DESTINATION ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/) +install(DIRECTORY ${MKLDNN_INC_DIR} + DESTINATION ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/mkldnn_include) +message(STATUS ${MKLML_INSTALL_ROOT}/include) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 8e4b3df32..50f1fc2d8 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -22,10 +22,10 @@ endif() # download mklml package is only for iomp so far include(ExternalProject) -set(MKLML_PROJECT "extern_mklml") -set(MKLML_VER "mklml_lnx_2019.0.20180710") -#set(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.13/${MKLML_VER}.tgz") // original site -set(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz") # use paddle mirror site instead +set(MKLML_PROJECT "extern_mklml")# +set(MKLML_VER "mklml_lnx_2019.0.3.20190220")# for vnni mklml_lnx_2019.0.3.20190125 +set(MKLML_URL "https://github.com/intel/mkl-dnn/releases/download/v0.18/${MKLML_VER}.tgz") # original site +#set(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz") # use paddle mirror site instead set(MKLML_SOURCE_DIR 
"${ANAKIN_TEMP_THIRD_PARTY_PATH}/mklml") set(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") set(MKLML_DST_DIR ".") @@ -56,6 +56,7 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} ) + add_library(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_IOMP_LIB}) add_dependencies(mklml ${MKLML_PROJECT}) diff --git a/cmake/external/sass.cmake b/cmake/external/sass.cmake index d10200cbb..c970a6a26 100644 --- a/cmake/external/sass.cmake +++ b/cmake/external/sass.cmake @@ -16,8 +16,8 @@ if (EXISTS ${ANAKIN_THIRD_PARTY_PATH}/sass/lib/) include_directories(${ANAKIN_THIRD_PARTY_PATH}/sass/include) - return() -endif() + return() +endif() include(ExternalProject) @@ -30,26 +30,16 @@ set(SASS_INSTALL_ROOT ${ANAKIN_THIRD_PARTY_PATH}/sass) include_directories(${SASS_INC}) -file(WRITE ${SASS_SOURCE_DIR}/src/build.sh - "cmake ../${SASS_PROJECT} -DSELECT_ARCH=61,50;make -j$(nproc) \n") +file(WRITE ${SASS_SOURCE_DIR}/src/build.sh + "cmake ../${SASS_PROJECT} -DSELECT_ARCH=61,50;make -j$(nproc) \n") file(WRITE ${SASS_SOURCE_DIR}/src/install.sh - "mkdir -p ${SASS_INSTALL_ROOT}/include \n" - "mkdir -p ${SASS_INSTALL_ROOT}/lib \n" - "cp ${REAL_SASS_SRC}/nv/*.h ${SASS_INSTALL_ROOT}/include/ \n" - "cp *.a ${SASS_INSTALL_ROOT}/lib \n") + "mkdir -p ${SASS_INSTALL_ROOT}/include \n" + "mkdir -p ${SASS_INSTALL_ROOT}/lib \n" + "cp ${REAL_SASS_SRC}/nv/*.h ${SASS_INSTALL_ROOT}/include/ \n" + "cp *.a ${SASS_INSTALL_ROOT}/lib \n") - -ExternalProject_Add( - ${SASS_PROJECT} - GIT_REPOSITORY "ssh://git@icode.baidu.com:8235/baidu/sys-hic-gpu/anakin_saber_lib" - GIT_TAG batch_gemm - PREFIX ${SASS_SOURCE_DIR} - BUILD_COMMAND sh ${SASS_SOURCE_DIR}/src/build.sh - INSTALL_COMMAND sh ${SASS_SOURCE_DIR}/src/install.sh -) - add_library(sass_lib SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET sass_lib PROPERTY IMPORTED_LOCATION ${SASS_LIB}) add_dependencies(sass_lib ${SASS_PROJECT}) diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 04d1cd953..408aedb29 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -18,8 +18,8 @@ include(ExternalProject) set(XBYAK_PROJECT extern_xbyak) set(XBYAK_PREFIX_DIR ${ANAKIN_TEMP_THIRD_PARTY_PATH}/xbyak) -set(XBYAK_CLONE_DIR ${XBYAK_PREFIX_DIR}/src/${XBYAK_PROJECT}) -set(XBYAK_INSTALL_ROOT ${ANAKIN_THIRD_PARTY_PATH}/xbyak) +set(XBYAK_CLONE_DIR ${XBYAK_PREFIX_DIR}/src/${XBYAK_PROJECT}) +set(XBYAK_INSTALL_ROOT ${ANAKIN_TEMP_THIRD_PARTY_PATH}/xbyak) set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) message(STATUS "Scanning external modules ${Green}xbyak${ColourReset} ...") @@ -27,23 +27,28 @@ message(STATUS "Scanning external modules ${Green}xbyak${ColourReset} ...") include_directories(${XBYAK_INC_DIR}) -file(WRITE ${XBYAK_CLONE_DIR}/CMakeLists.txt - "PROJECT(MKLML)\n" - "cmake_minimum_required(VERSION 2.8)\n" - "install(DIRECTORY ${XBYAK_CLONE_DIR}/include \n" - " DESTINATION ${XBYAK_INSTALL_ROOT})\n") +if(USE_SGX) + set(SGX_PATCH_CMD "cd ${ANAKIN_TEMP_THIRD_PARTY_PATH} && patch -p0 <${ANAKIN_THIRD_PARTY_PATH}/xbyak.patch") +else() + # use a whitespace as nop so that sh won't complain about missing argument + set(SGX_PATCH_CMD " ") +endif() ExternalProject_Add( ${XBYAK_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS "" GIT_REPOSITORY "https://github.com/herumi/xbyak.git" - GIT_TAG "fe083912c8ac7b7e2b0081cbd6213997bc8b56e6" # mar 6, 2018 + GIT_TAG "v5.661" # Jul 26th PREFIX ${XBYAK_PREFIX_DIR}/src UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} + INSTALL_COMMAND 
make install + COMMAND sh -c "${SGX_PATCH_CMD}" + VERBATIM ) add_library(xbyak SHARED IMPORTED GLOBAL) add_dependencies(xbyak ${XBYAK_PROJECT}) + list(APPEND ANAKIN_SABER_DEPENDENCIES xbyak) diff --git a/cmake/find_modules.cmake b/cmake/find_modules.cmake index 2f0790b5b..5200f1e88 100644 --- a/cmake/find_modules.cmake +++ b/cmake/find_modules.cmake @@ -38,25 +38,25 @@ if(UNIX) endif() # whole archive for static lib -if(NOT MSVC AND NOT APPLE) - set(WHOLE_ARCHIVE_START -Wl,--whole-archive) - set(WHOLE_ARCHIVE_END -Wl,--no-whole-archive) -elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - # using regular Clang or AppleClang - set(WHOLE_ARCHIVE_START -Wl,-force_load) - set(WHOLE_ARCHIVE_END) +if(NOT MSVC AND NOT APPLE) + set(WHOLE_ARCHIVE_START -Wl,--whole-archive) + set(WHOLE_ARCHIVE_END -Wl,--no-whole-archive) +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + # using regular Clang or AppleClang + set(WHOLE_ARCHIVE_START -Wl,-force_load) + set(WHOLE_ARCHIVE_END) endif() #find opencv version >= 2.4.3 macro(anakin_find_opencv) - if(USE_ARM_PLACE AND TARGET_ANDROID) - include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) - LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) - + #include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/) + #LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/) + include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/include/) + LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/lib/armeabi-v7a/) + message(ERROR "opencv=${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/lib/armeabi-v7a/") else() - - if(BUILD_SHARED) # temporary not support static link opencv. + if(BUILD_SHARED AND NOT ENABLE_MIN_DEPENDENCY) # temporary not support static link opencv. 
find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) if(NOT OpenCV_FOUND) find_package(OpenCV QUIET COMPONENTS core highgui imgproc) @@ -70,21 +70,30 @@ macro(anakin_find_opencv) message(SEND_ERROR "Could not found opencv !") endif() else() # BUILD_STATIC - set(OPENCV_LIB_PATH "" CACHE "Path to oopen cv library") - list(APPEND OPENCV_STATIC_LIBS ${OPENCV_LIB_PATH}/libopencv_core.a - ${OPENCV_LIB_PATH}libopencv_highgui.a - ${OPENCV_LIB_PATH}libopencv_imgproc.a - ${OPENCV_LIB_PATH}libopencv_contrib.a) - foreach(CV_LIB ${OPENCV_STATIC_LIBS}) - list(APPEND ANAKIN_LINKER_LIBS ${CV_LIB}) - endforeach() - unset(__CV_LIB_FULL_PATH) + find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) + if(NOT OpenCV_FOUND) + find_package(OpenCV QUIET COMPONENTS core highgui imgproc) + endif() + if(OpenCV_FOUND) + message(STATUS "Found opencv: ${OpenCV_INCLUDE_DIRS}") + include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) + list(APPEND OPENCV_STATIC_LIBS ${OPENCV_LIB_PATH}/libopencv_core.a + ${OPENCV_LIB_PATH}libopencv_highgui.a + ${OPENCV_LIB_PATH}libopencv_imgproc.a + ${OPENCV_LIB_PATH}libopencv_contrib.a) + foreach(CV_LIB ${OPENCV_STATIC_LIBS}) + list(APPEND ANAKIN_LINKER_LIBS ${CV_LIB}) + endforeach() + unset(__CV_LIB_FULL_PATH) + else() + message(SEND_ERROR "Could not found opencv !") + endif() endif() endif() endmacro() -#find opencl +#find opencl macro(anakin_find_opencl) set(OCL_ROOT "" CACHE PATH "openCL root dir.") @@ -114,14 +123,14 @@ macro(anakin_find_boost) find_package(Boost 1.59.0 QUIET COMPONENTS thread variant) if(Boost_FOUND) include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) - list(APPEND ANAKIN_LINKER_LIBS ${Boost_LIBRARIES}) - endif() + list(APPEND ANAKIN_LINKER_LIBS ${Boost_LIBRARIES}) + endif() endmacro() #find intel mkl lib. macro(anakin_find_mkl) set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs.") - set(MKL_ROOT "" CACHE PATH "Folder contains intel(R) mkl libs.") + set(MKL_ROOT "" CACHE PATH "Folder contains intel(R) mkl libs.") # options for mkl set(MKL_USE_SINGLE_DYNAMIC_LIBRARY YES) set(MKL_USE_STATIC_LIBS NO) @@ -144,7 +153,7 @@ macro(anakin_find_mkl) set(__mkl_libs "") if(MKL_USE_SINGLE_DYNAMIC_LIBRARY) list(APPEND __mkl_libs rt) - else() + else() if(CMAKE_SIZEOF_VOID_P EQUAL 4) if(WIN32) list(APPEND __mkl_libs intel_c) @@ -153,7 +162,7 @@ macro(anakin_find_mkl) endif() else() list(APPEND __mkl_libs intel_lp64 gf_lp64) - endif() + endif() if(MKL_MULTI_THREADED) list(APPEND __mkl_libs intel_thread) @@ -180,7 +189,7 @@ macro(anakin_find_mkl) set(__trigger_mkllib TRUE) endif() endforeach() - + if(NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY) if (MKL_USE_STATIC_LIBS) set(__iomp5_libs iomp5 libiomp5mt.lib) @@ -206,7 +215,7 @@ macro(anakin_find_mkl) else() message(FATAL_ERROR "Could not found mkl !") endif() - + endmacro() # find glog and config it @@ -247,12 +256,12 @@ endmacro() macro(anakin_find_gflags) set(GFLAGS_ROOT "~/.jumbo/" CACHE PATH "google flags root dir." 
) - find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h - PATHS ${GFLAGS_ROOT}/include + find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h + PATHS ${GFLAGS_ROOT}/include $ENV{GFLAGS_ROOT}/include) find_library(GFLAGS_LIBRARY NAMES libgflags.so PATHS ${GFLAGS_ROOT}/lib - $ENV{GFLAGS_ROOT}/lib + $ENV{GFLAGS_ROOT}/lib DOC "library path for gflags.") if(GFLAGS_INCLUDE_DIR AND GFLAGS_LIBRARY) set(GFLAGS_FOUND TRUE) @@ -301,13 +310,27 @@ endmacro() macro(anakin_find_protobuf) if(USE_ARM_PLACE) + set(PROTOBUF_PROTOC_EXECUTABLE "/usr/local/bin/protoc") set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf") - include_directories(${ARM_RPOTO_ROOT}/include) - set(PROTOBUF_LIBRARIES "") + message(STATUS "ANDROID_ABI=${ANDROID_ABI}") + if(${ANDROID_ABI} STREQUAL "arm64-v8a") + #set(PROTOBUF_PROTOC_EXECUTABLE "${ARM_RPOTO_ROOT}/arm64-v8a/bin/protoc") + include_directories(${ARM_RPOTO_ROOT}/arm64-v8a/include) + set(PROTOBUF_LIBRARIES "") + list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/arm64-v8a/lib/libprotobuf.a) + else() + #set(PROTOBUF_PROTOC_EXECUTABLE "${ARM_RPOTO_ROOT}/armeabi-v7a/bin/protoc") + include_directories(${ARM_RPOTO_ROOT}/armeabi-v7a/include) + set(PROTOBUF_LIBRARIES "") + list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/armeabi-v7a/lib/libprotobuf.a) + endif() + #include_directories(${ARM_RPOTO_ROOT}/include) + #set(PROTOBUF_LIBRARIES "") + #list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/lib/libprotobuf.a) #if(BUILD_SHARED) # list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/lib/libprotobuf.so) #else() - list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/lib/libprotobuf.a) + # list(APPEND ANAKIN_LINKER_LIBS ${ARM_RPOTO_ROOT}/lib/libprotobuf.a) #endif() find_library( # Sets the name of the path variable. log-lib @@ -316,8 +339,9 @@ macro(anakin_find_protobuf) # you want CMake to locate. 
log ) list(APPEND ANAKIN_LINKER_LIBS ${log-lib}) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc) else() - if(NOT ENABLE_MIN_DEPENDENCY) + if(NOT ENABLE_MIN_DEPENDENCY) find_program(PROTOBUF_PROTOC_EXECUTABLE protoc) if(PROTOBUF_PROTOC_EXECUTABLE) find_package(Protobuf REQUIRED) @@ -343,16 +367,16 @@ macro(anakin_find_protobuf) endif() endif() else() - set(PROTOBUF_ROOT "/usr/local" CACHE PATH "Folder contains protobuf") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/stubs/common.h PATHS + set(PROTOBUF_ROOT "/usr/local" CACHE PATH "Folder contains protobuf") + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/stubs/common.h PATHS ${PROTOBUF_ROOT}/include $ENV{PROTOBUF_ROOT}/include NO_DEFAULT_PATH) - find_library(PROTOBUF_LIBRARY libprotobuf.a PATHS ${PROTOBUF_ROOT}/lib + find_library(PROTOBUF_LIBRARY libprotobuf.a PATHS ${PROTOBUF_ROOT}/lib $ENV{PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) - find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin $ENV{PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) - if(PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY) + if(PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY) list(APPEND ANAKIN_LINKER_LIBS ${PROTOBUF_LIBRARY}) include_directories(${PROTOBUF_INCLUDE_DIR}) else() @@ -362,6 +386,13 @@ macro(anakin_find_protobuf) endif() endmacro() +macro(anakin_find_nanopb) + set(NANOPB_VERSION "0.3.9.1") + set(NANOPB_DOWNLOAD_URL "https://jpa.kapsi.fi/nanopb/download/nanopb-${NANOPB_VERSION}-linux-x86.tar.gz") + set(NANOPB_DIR ${ANAKIN_THIRD_PARTY_PATH}/nanopb) + set(PROTOBUF_PROTOC_EXECUTABLE ${NANOPB_DIR}/generator-bin/protoc) +endmacro() + macro(anakin_find_baidu_rpc) if(NOT ENABLE_MIN_DEPENDENCY) set(BAIDU_RPC_ROOT "/opt/brpc" CACHE PATH "baidu rpc root dir") @@ -422,69 +453,48 @@ macro(anakin_find_openmp) endmacro() macro(anakin_find_bmlib) - find_path(BM_ROOT include/bmdnn/bmdnn_api.h ${CMAKE_SOURCE_DIR}/third-party/bm_lib/ $ENV{BM_ROOT}/) - find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn) - find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime) - find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib) - if(BM_ROOT_INCLUDE_DNN AND BM_ROOT_INCLUDE_RT AND BM_ROOT_INCLUDE_LIB) - set(BM_FOUND TRUE) - endif() - if(BM_FOUND) + find_path(BM_ROOT include/bmdnn/bmdnn_api.h ${CMAKE_SOURCE_DIR}/third-party/bm_lib/ $ENV{BM_ROOT}/) + find_path(BM_ROOT_INCLUDE_DNN bmdnn_api.h ${BM_ROOT}/include/bmdnn) + find_path(BM_ROOT_INCLUDE_RT bmruntime.h ${BM_ROOT}/include/bmruntime) + find_path(BM_ROOT_INCLUDE_LIB bmlib_runtime.h ${BM_ROOT}/include/bmlib) + if(BM_ROOT_INCLUDE_DNN AND BM_ROOT_INCLUDE_RT AND BM_ROOT_INCLUDE_LIB) + set(BM_FOUND TRUE) + endif() + if(BM_FOUND) message(STATUS " Found bm_lib in ${BM_ROOT} ${BM_ROOT_INCLUDE_DNN} ${BM_ROOT_INCLUDE_RT} ${BM_ROOT_INCLUDE_LIB}") include_directories(${BM_ROOT_INCLUDE_DNN}) - include_directories(${BM_ROOT_INCLUDE_RT}) - include_directories(${BM_ROOT_INCLUDE_LIB}) - set(BM_LIBRARIES "") - list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmdnn_device.so) + include_directories(${BM_ROOT_INCLUDE_RT}) + include_directories(${BM_ROOT_INCLUDE_LIB}) + set(BM_LIBRARIES "") + list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmdnn_device.so) list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmlib_device.so) - list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so) - list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES}) - else() - message(FATAL_ERROR "Could not found bm_lib") + list(APPEND BM_LIBRARIES ${BM_ROOT}/lib/app/libbmrt.so) + 
list(APPEND ANAKIN_LINKER_LIBS ${BM_LIBRARIES}) + else() + message(FATAL_ERROR "Could not found bm_lib") endif() endmacro() - -macro(anakin_find_nvinfer) - find_path(NVINFER_INCLUDE_DIR NvInfer.h PATHS ${ANAKIN_ROOT}/third-party/tensorrt5/include - $ENV{NVINFER_ROOT}) - if (BUILD_SHARED) - find_library(NVINFER_LIBRARY NAMES libnvinfer.so - PATHS ${NVINFER_INCLUDE_DIR}/../lib64/ - PATHS ${NVINFER_INCLUDE_DIR}/../lib/ - DOC "library path for tensorrt.") - find_library(NVINFER_PLUGIN_LIBRARY NAMES libnvinfer_plugin.so - PATHS ${NVINFER_INCLUDE_DIR}/../lib64/ - PATHS ${NVINFER_INCLUDE_DIR}/../lib/ - DOC "library path for tensorrt.") - find_library(NVPARSERS_LIBRARY NAMES libnvparsers.so - PATHS ${NVINFER_INCLUDE_DIR}/../lib64/ - PATHS ${NVINFER_INCLUDE_DIR}/../lib/ - DOC "library path for tensorrt.") - else() - find_library(NVINFER_LIBRARY NAMES libnvinfer.a - PATHS ${NVINFER_INCLUDE_DIR}/../lib64/ - DOC "library path for tensorrt.") - find_library(NVINFER_PLUGIN_LIBRARY NAMES libnvinfer_plugin.a - PATHS ${NVINFER_INCLUDE_DIR}/../lib64/ - DOC "library path for tensorrt.") - find_library(NVPARSERS_LIBRARY NAMES libnvparsers.a - PATHS ${NVINFER_INCLUDE_DIR}/../lib64/ - DOC "library path for tensorrt.") - endif() - if(NVINFER_INCLUDE_DIR AND NVINFER_LIBRARY AND NVINFER_PLUGIN_LIBRARY AND NVPARSERS_LIBRARY) - set(NVINFER_FOUND TRUE) - endif() - if(NVINFER_FOUND) - message(STATUS "Found NvInfer in ${NVINFER_INCLUDE_DIR}") - include_directories(SYSTEM ${NVINFER_INCLUDE_DIR}) - #include_directories(${NVINFER_INCLUDE_DIR}) - list(APPEND ANAKIN_LINKER_LIBS ${NVINFER_LIBRARY}) - list(APPEND ANAKIN_LINKER_LIBS ${NVINFER_PLUGIN_LIBRARY}) - list(APPEND ANAKIN_LINKER_LIBS ${NVPARSERS_LIBRARY}) - message(STATUS "${ANAKIN_LINKER_LIBS}") - else() - message(FATAL_ERROR "Couldn't found NvInfer ! 
in path: ${NVINFER_INCLUDE_DIR}") - endif() +macro(anakin_find_sgx) + set(SGX_SDK $ENV{SGX_SDK}) + if(SGX_SDK) + add_library(anakin_sgx_config INTERFACE) + set(SGX_CONFIG_INTERFACE anakin_sgx_config) + target_compile_options(${SGX_CONFIG_INTERFACE} INTERFACE + -fPIC -fno-builtin -nostdlib -nostdinc $<$:-nostdinc++>) + set(PROBE_CMD "echo \"#include \" | ${CMAKE_C_COMPILER} -E -xc - | grep immintrin.h | sed 's:^.*\"\\(.*\\)\".*$:\\1:g' | head -1") + execute_process(COMMAND sh -c "${PROBE_CMD}" OUTPUT_VARIABLE IMMINTRIN_H) + get_filename_component(IMMINTRIN_PATH ${IMMINTRIN_H} DIRECTORY) + target_include_directories(${SGX_CONFIG_INTERFACE} BEFORE INTERFACE + "${ANAKIN_ROOT}/sgx/enclave/include" + "${SGX_SDK}/include" + "${SGX_SDK}/include/tlibc" + "${SGX_SDK}/include/libcxx" + ) + target_include_directories(${SGX_CONFIG_INTERFACE} INTERFACE ${IMMINTRIN_PATH}) + list(APPEND ANAKIN_LINKER_LIBS "sgx_tstdc" "sgx_tcxx") + message(STATUS "Found SGX SDK in ${SGX_SDK}") + else() + message(FATAL_ERROR "SGX SDK not found or not properly configured!") + endif() endmacro() - diff --git a/cmake/gather.cmake b/cmake/gather.cmake index 32b03c05f..bdcdce97f 100644 --- a/cmake/gather.cmake +++ b/cmake/gather.cmake @@ -28,6 +28,11 @@ if(USE_BM_PLACE) anakin_find_bmlib() endif() +# find cnml and cnrt +#if(USE_MLU) +## anakin_find_mlulib() +#endif() + # set amd opencl path if(AMD_GPU) amd_build_cl_file("${CMAKE_SOURCE_DIR}/saber/funcs/impl/amd/cl" "${CMAKE_BINARY_DIR}/cl/amd") @@ -59,7 +64,10 @@ endif() if(USE_PROTOBUF) anakin_find_protobuf() - anakin_protos_processing() +endif() + +if(USE_NANOPB) + anakin_find_nanopb() endif() if(BUILD_RPC) @@ -88,9 +96,11 @@ endif() if(DISABLE_ALL_WARNINGS) anakin_disable_warnings(CMAKE_CXX_FLAGS) endif() -if(USE_OPENMP) + +if(USE_OPENMP AND NOT APPLE) anakin_find_openmp() endif() + if(USE_ARM_PLACE) if(TARGET_ANDROID) if(USE_OPENMP) @@ -102,6 +112,7 @@ if(USE_ARM_PLACE) message(FATAL_ERROR " ARM TARGET unknown !") endif() endif() -if(USE_TENSORRT) - anakin_find_nvinfer() + +if(USE_SGX) + anakin_find_sgx() endif() diff --git a/cmake/ios/ios.toolchain.cmake b/cmake/ios/ios.toolchain.cmake old mode 100755 new mode 100644 index e6b56c7a5..ec1c98ecb --- a/cmake/ios/ios.toolchain.cmake +++ b/cmake/ios/ios.toolchain.cmake @@ -1,202 +1,492 @@ -# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake -# files which are included with CMake 2.8.4 -# It has been altered for iOS development - -# Options: +# This file is part of the ios-cmake project. It was retrieved from +# https://github.com/cristeab/ios-cmake.git, which is a fork of +# https://code.google.com/p/ios-cmake/. Which in turn is based off of +# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which +# are included with CMake 2.8.4 # -# IOS_PLATFORM = iPhoneOS (default) or iPhoneSimulator -# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders -# iPhoneOS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. -# iPhoneSimulator - used to build for the Simulator platforms, which have an x86 arch. +# The ios-cmake project is licensed under the new BSD license. # -# CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. 
-# If set manually, it will override the default location and force the user of a particular Developer Platform +# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, +# Kitware, Inc., Insight Software Consortium. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. # -# CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. -# In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. -# If set manually, this will force the use of a specific SDK version - -# Macros: +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# This file is based off of the Platform/Darwin.cmake and +# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 +# It has been altered for iOS development. +# +# Updated by Alex Stewart (alexs.mac@gmail.com) +# +# ***************************************************************************** +# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) +# under the BSD-3-Clause license +# https://github.com/leetal/ios-cmake +# ***************************************************************************** +# +# INFORMATION / HELP +# +# The following variables control the behaviour of this toolchain: +# +# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS or WATCHOS or SIMULATOR_WATCHOS +# OS = Build for iPhoneOS. +# OS64 = Build for arm64 arm64e iPhoneOS. +# SIMULATOR = Build for x86 i386 iPhone Simulator. +# SIMULATOR64 = Build for x86_64 iPhone Simulator. +# TVOS = Build for AppleTVOS. +# SIMULATOR_TVOS = Build for x86_64 AppleTV Simulator. +# WATCHOS = Build for armv7k arm64_32 for WatchOS. +# SIMULATOR_WATCHOS = Build for x86_64 for Watch Simulator. +# CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. By default this is +# automatically determined from IOS_PLATFORM and xcodebuild, but +# can also be manually specified (although this should not be required). 
+# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform +# being compiled for. By default this is automatically determined from +# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should +# not be required). +# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true) +# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default) +# ENABLE_VISIBILITY: (1|0) Enables or disables symbol visibility support. Default 0 (false, visibility hidden by default) +# IOS_ARCH: (armv7 armv7s armv7k arm64 arm64e arm64_32 i386 x86_64) If specified, will override the default architectures for the given IOS_PLATFORM +# OS = armv7 armv7s arm64 arm64e (if applicable) +# OS64 = arm64 arm64e (if applicable) +# SIMULATOR = i386 +# SIMULATOR64 = x86_64 +# TVOS = arm64 +# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) +# WATCHOS = armv7k arm64_32 (if applicable) +# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) +# +# This toolchain defines the following variables for use externally: +# +# XCODE_VERSION: Version number (not including Build version) of Xcode detected. +# IOS_SDK_VERSION: Version of iOS SDK being used. +# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from +# IOS_PLATFORM). # -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) -# A convenience macro for setting xcode specific properties on targets -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") +# This toolchain defines the following macros for use externally: +# +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) +# A convenience macro for setting xcode specific properties on targets. +# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel +# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). # # find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the iOS environment. -# Thanks to the android-cmake project for providing the command - -# Standard settings -set (CMAKE_SYSTEM_NAME Darwin) -set (CMAKE_SYSTEM_VERSION 1) -set (UNIX True) -set (APPLE True) -set (IOS True) - -# Required as of cmake 2.8.10 -set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) - -# Determine the cmake host system version so we know where to find the iOS SDKs -find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin) -if (CMAKE_UNAME) - exec_program(uname ARGS -r OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION) - string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}") -endif (CMAKE_UNAME) - -# Force the compilers to gcc for iOS -include (CMakeForceCompiler) -set(CMAKE_C_COMPILER /usr/bin/clang) -set(CMAKE_CXX_COMPILER /usr/bin/clang++) -#CMAKE_FORCE_C_COMPILER (/usr/bin/clang Apple) -#CMAKE_FORCE_CXX_COMPILER (/usr/bin/clang++ Apple) +# A macro used to find executable programs on the host system, not within the +# iOS environment. Thanks to the android-cmake project for providing the +# command. + +# Fix for PThread library not in path +set(CMAKE_THREAD_LIBS_INIT "-lpthread") +set(CMAKE_HAVE_THREADS_LIBRARY 1) +set(CMAKE_USE_WIN32_THREADS_INIT 0) +set(CMAKE_USE_PTHREADS_INIT 1) + +# Cache what generator is used +set(USED_CMAKE_GENERATOR "${CMAKE_GENERATOR}" CACHE STRING "Expose CMAKE_GENERATOR" FORCE) + +# Get the Xcode version being used. 
+execute_process(COMMAND xcodebuild -version + OUTPUT_VARIABLE XCODE_VERSION + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") +string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") +message(STATUS "Building with Xcode version: ${XCODE_VERSION}") +# Default to building for iPhoneOS if not specified otherwise, and we cannot +# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. The use +# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly +# determine the value of IOS_PLATFORM from the root project, as +# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. +if (NOT DEFINED IOS_PLATFORM) + if (CMAKE_OSX_ARCHITECTURES) + if (CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") + set(IOS_PLATFORM "OS") + elseif (CMAKE_OSX_ARCHITECTURES MATCHES "i386") + set(IOS_PLATFORM "SIMULATOR") + elseif (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") + set(IOS_PLATFORM "SIMULATOR64") + elseif (CMAKE_OSX_ARCHITECTURES MATCHES "armv7k") + set(IOS_PLATFORM "WATCHOS") + endif() + endif() + if (NOT IOS_PLATFORM) + set(IOS_PLATFORM "OS") + endif() +endif() +set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING + "Type of iOS platform for which to build.") +# Determine the platform name and architectures for use in xcodebuild commands +# from the specified IOS_PLATFORM name. +if (IOS_PLATFORM STREQUAL "OS") + set(XCODE_IOS_PLATFORM iphoneos) + if(NOT IOS_ARCH) + if (XCODE_VERSION VERSION_GREATER 10.0) + set(IOS_ARCH armv7 armv7s arm64 arm64e) + else() + set(IOS_ARCH armv7 armv7s arm64) + endif() + endif() + elseif (IOS_PLATFORM STREQUAL "OS64") + set(XCODE_IOS_PLATFORM iphoneos) + if(NOT IOS_ARCH) + if (XCODE_VERSION VERSION_GREATER 10.0) + set(IOS_ARCH arm64 arm64e) + else() + set(IOS_ARCH arm64) + endif() + endif() +elseif (IOS_PLATFORM STREQUAL "SIMULATOR") + set(XCODE_IOS_PLATFORM iphonesimulator) + if(NOT IOS_ARCH) + set(IOS_ARCH i386) + endif() + message(WARNING "SIMULATOR IS DEPRECATED. Consider using SIMULATOR64 instead.") +elseif(IOS_PLATFORM STREQUAL "SIMULATOR64") + set(XCODE_IOS_PLATFORM iphonesimulator) + if(NOT IOS_ARCH) + set(IOS_ARCH x86_64) + endif() +elseif (IOS_PLATFORM STREQUAL "TVOS") + set(XCODE_IOS_PLATFORM appletvos) + if(NOT IOS_ARCH) + set(IOS_ARCH arm64) + endif() +elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") + set(XCODE_IOS_PLATFORM appletvsimulator) + if(NOT IOS_ARCH) + set(IOS_ARCH x86_64) + endif() +elseif (IOS_PLATFORM STREQUAL "WATCHOS") + set(XCODE_IOS_PLATFORM watchos) + if(NOT IOS_ARCH) + if (XCODE_VERSION VERSION_GREATER 10.0) + set(IOS_ARCH armv7k arm64_32) + else() + set(IOS_ARCH armv7k) + endif() + endif() +elseif (IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") + set(XCODE_IOS_PLATFORM watchsimulator) + if(NOT IOS_ARCH) + set(IOS_ARCH x86_64) + endif() +else() + message(FATAL_ERROR "Invalid IOS_PLATFORM: ${IOS_PLATFORM}") +endif() +message(STATUS "Configuring iOS build for platform: ${IOS_PLATFORM}, " + "architecture(s): ${IOS_ARCH}") +# If user did not specify the SDK root to use, then query xcodebuild for it. +execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT + OUTPUT_QUIET ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +# If user did not specify the SDK root to use, then query xcodebuild for it. 
+if (NOT DEFINED CMAKE_OSX_SYSROOT OR (NOT CMAKE_OSX_SYSROOT STREQUAL CMAKE_OSX_SYSROOT_INT)) + execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() +if (NOT EXISTS ${CMAKE_OSX_SYSROOT}) + message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" + "is pointing to the correct path. Please run:" + "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" + " and see if that fixes the problem for you.") + message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " + "does not exist.") +elseif(DEFINED CMAKE_OSX_SYSROOT) + message(STATUS "Using manually set SDK path: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") +else() + message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") +endif() +# Specify minimum version of deployment target. +if (NOT DEFINED IOS_DEPLOYMENT_TARGET) + if (IOS_PLATFORM STREQUAL "WATCHOS" OR IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") + # Unless specified, SDK version 2.0 is used by default as minimum target version (watchOS). + set(IOS_DEPLOYMENT_TARGET "2.0" + CACHE STRING "Minimum iOS version to build for." ) + else() + # Unless specified, SDK version 8.0 is used by default as minimum target version (iOS, tvOS). + set(IOS_DEPLOYMENT_TARGET "8.0" + CACHE STRING "Minimum iOS version to build for." ) + endif() + message(STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!") +endif() +# Use bitcode or not +if (NOT DEFINED ENABLE_BITCODE AND NOT IOS_ARCH MATCHES "((^|, )(i386|x86_64))+") + # Unless specified, enable bitcode support by default + set(ENABLE_BITCODE TRUE CACHE BOOL "Whether or not to enable bitcode") + message(STATUS "Enabling bitcode support by default. ENABLE_BITCODE not provided!") +endif() +if (NOT DEFINED ENABLE_BITCODE) + message(STATUS "Disabling bitcode support by default on simulators. ENABLE_BITCODE not provided for override!") +endif() +# Use ARC or not +if (NOT DEFINED ENABLE_ARC) + # Unless specified, enable ARC support by default + set(ENABLE_ARC TRUE CACHE BOOL "Whether or not to enable ARC") + message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!") +endif() +# Use hidden visibility or not +if (NOT DEFINED ENABLE_VISIBILITY) + # Unless specified, disable symbols visibility by default + set(ENABLE_VISIBILITY FALSE CACHE BOOL "Whether or not to hide symbols (-fvisibility=hidden)") + message(STATUS "Hiding symbols visibility by default. ENABLE_VISIBILITY not provided!") +endif() +# Get the SDK version information. +execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion + OUTPUT_VARIABLE IOS_SDK_VERSION + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +# Find the Developer root for the specific iOS platform being compiled for +# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in +# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain +# this information from xcrun or xcodebuild. +if (NOT CMAKE_IOS_DEVELOPER_ROOT) + get_filename_component(IOS_PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH) + get_filename_component(CMAKE_IOS_DEVELOPER_ROOT ${IOS_PLATFORM_SDK_DIR} PATH) +endif() +if (NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT}) + message(FATAL_ERROR "Invalid CMAKE_IOS_DEVELOPER_ROOT: " + "${CMAKE_IOS_DEVELOPER_ROOT} does not exist.") +endif() +# Find the C & C++ compilers for the specified SDK. 
+if (NOT CMAKE_C_COMPILER)
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
+    OUTPUT_VARIABLE CMAKE_C_COMPILER
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}")
+endif()
+if (NOT CMAKE_CXX_COMPILER)
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
+    OUTPUT_VARIABLE CMAKE_CXX_COMPILER
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}")
+endif()
+# Find (Apple's) libtool.
+execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool
+  OUTPUT_VARIABLE IOS_LIBTOOL
+  ERROR_QUIET
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+message(STATUS "Using libtool: ${IOS_LIBTOOL}")
+# Configure libtool to be used instead of ar + ranlib to build static libraries.
+# This is required on Xcode 7+, but should also work on previous versions of
+# Xcode.
+set(CMAKE_C_CREATE_STATIC_LIBRARY
+  "${IOS_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ")
+set(CMAKE_CXX_CREATE_STATIC_LIBRARY
+  "${IOS_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ")
+# Get the version of Darwin (OS X) of the host.
+execute_process(COMMAND uname -r
+  OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
+  ERROR_QUIET
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+# Standard settings.
+set(CMAKE_SYSTEM_NAME Darwin CACHE INTERNAL "")
+set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION} CACHE INTERNAL "")
+set(UNIX TRUE CACHE BOOL "")
+set(APPLE TRUE CACHE BOOL "")
+set(IOS TRUE CACHE BOOL "")
 set(CMAKE_AR ar CACHE FILEPATH "" FORCE)
-
-# Skip the platform compiler checks for cross compiling
-set (CMAKE_CXX_COMPILER_WORKS TRUE)
-set (CMAKE_C_COMPILER_WORKS TRUE)
-
-# All iOS/Darwin specific settings - some may be redundant
-set (CMAKE_SHARED_LIBRARY_PREFIX "lib")
-set (CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
-set (CMAKE_SHARED_MODULE_PREFIX "lib")
-set (CMAKE_SHARED_MODULE_SUFFIX ".so")
-set (CMAKE_MODULE_EXISTS 1)
-set (CMAKE_DL_LIBS "")
-
-set (CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
-set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
-set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
-set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
-
-# Hidden visibilty is required for cxx on iOS
-set (CMAKE_C_FLAGS_INIT "-isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
-set (CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -fvisibility=hidden -fvisibility-inlines-hidden -isysroot ${CMAKE_OSX_SYSROOT} -miphoneos-version-min=6.0")
-
-set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
-set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
-
-set (CMAKE_PLATFORM_HAS_INSTALLNAME 1)
-set (CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
-set (CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
-set (CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
-set (CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
-set (CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
-
-# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
-# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
-# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
-# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
+set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE)
+# Force unset of OS X-specific deployment target (otherwise autopopulated),
+# required as of cmake
2.8.10. +set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING + "Must be empty for iOS builds." FORCE) +# Set the architectures for which to build. +set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS") +# Change the type of target generated for try_compile() so it'll work when cross-compiling +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +# All iOS/Darwin specific settings - some may be redundant. +set(CMAKE_SHARED_LIBRARY_PREFIX "lib") +set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") +set(CMAKE_SHARED_MODULE_PREFIX "lib") +set(CMAKE_SHARED_MODULE_SUFFIX ".so") +set(CMAKE_C_COMPILER_ABI ELF) +set(CMAKE_CXX_COMPILER_ABI ELF) +set(CMAKE_C_HAS_ISYSROOT 1) +set(CMAKE_CXX_HAS_ISYSROOT 1) +set(CMAKE_MODULE_EXISTS 1) +set(CMAKE_DL_LIBS "") +set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") +set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") + +if(IOS_ARCH MATCHES "((^|, )(arm64|arm64e|x86_64))+") + set(CMAKE_C_SIZEOF_DATA_PTR 8) + set(CMAKE_CXX_SIZEOF_DATA_PTR 8) + message(STATUS "Using a data_ptr size of 8") +else() + set(CMAKE_C_SIZEOF_DATA_PTR 4) + set(CMAKE_CXX_SIZEOF_DATA_PTR 4) + message(STATUS "Using a data_ptr size of 4") +endif() + +message(STATUS "Building for minimum iOS version: ${IOS_DEPLOYMENT_TARGET}" + " (SDK version: ${IOS_SDK_VERSION})") +# Note that only Xcode 7+ supports the newer more specific: +# -m${XCODE_IOS_PLATFORM}-version-min flags, older versions of Xcode use: +# -m(ios/ios-simulator)-version-min instead. +if (IOS_PLATFORM STREQUAL "OS" OR IOS_PLATFORM STREQUAL "OS64") + if (XCODE_VERSION VERSION_LESS 7.0) + set(XCODE_IOS_PLATFORM_VERSION_FLAGS + "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") + else() + # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. + set(XCODE_IOS_PLATFORM_VERSION_FLAGS + "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") + endif() +elseif (IOS_PLATFORM STREQUAL "TVOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS + "-mtvos-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS + "-mtvos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif (IOS_PLATFORM STREQUAL "WATCHOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS + "-mwatchos-version-min=${IOS_DEPLOYMENT_TARGET}") +elseif (IOS_PLATFORM STREQUAL "SIMULATOR_WATCHOS") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS + "-mwatchos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") +else() + # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. 
+  set(XCODE_IOS_PLATFORM_VERSION_FLAGS
+    "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
+endif()
+message(STATUS "Version flags set to: ${XCODE_IOS_PLATFORM_VERSION_FLAGS}")
+
+if (ENABLE_BITCODE)
+  set(BITCODE "-fembed-bitcode")
+  set(HEADER_PAD "")
+  message(STATUS "Enabling bitcode support.")
+else()
+  set(BITCODE "")
+  set(HEADER_PAD "-headerpad_max_install_names")
+  message(STATUS "Disabling bitcode support.")
+endif()
+
+if (ENABLE_ARC)
+  set(FOBJC_ARC "-fobjc-arc")
+  message(STATUS "Enabling ARC support.")
+else()
+  set(FOBJC_ARC "-fno-objc-arc")
+  message(STATUS "Disabling ARC support.")
+endif()
+
+if (NOT ENABLE_VISIBILITY)
+  set(VISIBILITY "-fvisibility=hidden")
+  message(STATUS "Hiding symbols (-fvisibility=hidden).")
+else()
+  set(VISIBILITY "")
+endif()
+
+set(CMAKE_C_FLAGS
+"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_C_FLAGS}")
+# Hidden visibility is required for C++ on iOS.
+set(CMAKE_CXX_FLAGS
+"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g ${BITCODE} ${CMAKE_CXX_FLAGS_DEBUG}")
+set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_MINSIZEREL}")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -ffast-math ${BITCODE} ${CMAKE_CXX_FLAGS_RELEASE}")
+set(CMAKE_C_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
+set(CMAKE_CXX_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
+
+# In order to ensure that the updated compiler flags are used in try_compile()
+# tests, we have to forcibly set them in the CMake cache, not merely set them
+# in the local scope.
+list(APPEND VARS_TO_FORCE_IN_CACHE
+  CMAKE_C_FLAGS
+  CMAKE_CXX_FLAGS
+  CMAKE_CXX_FLAGS_DEBUG
+  CMAKE_CXX_FLAGS_RELWITHDEBINFO
+  CMAKE_CXX_FLAGS_MINSIZEREL
+  CMAKE_CXX_FLAGS_RELEASE
+  CMAKE_C_LINK_FLAGS
+  CMAKE_CXX_LINK_FLAGS)
+foreach(VAR_TO_FORCE ${VARS_TO_FORCE_IN_CACHE})
+  set(${VAR_TO_FORCE} "${${VAR_TO_FORCE}}" CACHE STRING "")
+endforeach()
+
+set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
+set (CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks")
+set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib ${HEADER_PAD}")
+set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle ${HEADER_PAD}")
+set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
+
+# Hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old
+# build tree (where install_name_tool was hardcoded) and where
+# CMAKE_INSTALL_NAME_TOOL isn't in the cache and still cmake didn't fail in
+# CMakeFindBinUtils.cmake (because it isn't rerun) hardcode
+# CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did
+# before, Alex.
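For reference, with the defaults picked earlier in this file (an OS/OS64 build on Xcode 7 or newer, deployment target 8.0, bitcode and ARC enabled, symbol visibility hidden), the flag variables composed above expand to roughly the following. This is a hand-derived illustration of the composition logic, not output captured from an actual configure run; any pre-existing contents of these variables are appended at the end:

    # approximate expansions (illustrative only)
    CMAKE_C_FLAGS        : -miphoneos-version-min=8.0 -fembed-bitcode -fobjc-abi-version=2 -fobjc-arc
    CMAKE_CXX_FLAGS      : -miphoneos-version-min=8.0 -fembed-bitcode -fvisibility=hidden
                           -fvisibility-inlines-hidden -fobjc-abi-version=2 -fobjc-arc
    CMAKE_CXX_LINK_FLAGS : -miphoneos-version-min=8.0 -Wl,-search_paths_first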
if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) + find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) -# Setup iOS platform unless specified manually with IOS_PLATFORM -if (NOT DEFINED IOS_PLATFORM) - set (IOS_PLATFORM "iPhoneOS") -endif (NOT DEFINED IOS_PLATFORM) -set (IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") - -# Add Bitcode -if (${IOS_PLATFORM} STREQUAL "iPhoneOS") - set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode") - set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}") -endif (${IOS_PLATFORM} STREQUAL "iPhoneOS") - -# Check the platform selection and setup for developer root -if (${IOS_PLATFORM} STREQUAL "iPhoneOS") - set (IOS_PLATFORM_LOCATION "iPhoneOS.platform") - - # This causes the installers to properly locate the output libraries - set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") -elseif (${IOS_PLATFORM} STREQUAL "iPhoneSimulator") - set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") - - # This causes the installers to properly locate the output libraries - set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") -else (${IOS_PLATFORM} STREQUAL "iPhoneOS") - message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose iPhoneOS or iPhoneSimulator") -endif (${IOS_PLATFORM} STREQUAL "iPhoneOS") - -# Setup iOS developer location unless specified manually with CMAKE_IOS_DEVELOPER_ROOT -# Note Xcode 4.3 changed the installation location, choose the most recent one available -set (XCODE_POST_43_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") -set (XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") -if (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT) - if (EXISTS ${XCODE_POST_43_ROOT}) - set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT}) - elseif(EXISTS ${XCODE_PRE_43_ROOT}) - set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT}) - endif (EXISTS ${XCODE_POST_43_ROOT}) -endif (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT) -set (CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") - -# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT -if (NOT DEFINED CMAKE_IOS_SDK_ROOT) - file (GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*") - if (_CMAKE_IOS_SDKS) - list (SORT _CMAKE_IOS_SDKS) - list (REVERSE _CMAKE_IOS_SDKS) - list (GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT) - else (_CMAKE_IOS_SDKS) - message (FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. 
Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.") - endif (_CMAKE_IOS_SDKS) - message (STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}") -endif (NOT DEFINED CMAKE_IOS_SDK_ROOT) -set (CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") - -# Set the sysroot default to the most recent SDK -set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") - -# set the architecture for iOS -# NOTE: Currently both ARCHS_STANDARD_32_BIT and ARCHS_UNIVERSAL_IPHONE_OS set armv7 only, so set both manually -if (${IOS_PLATFORM} STREQUAL "iPhoneOS") - set (IOS_ARCH armv7) -else (${IOS_PLATFORM} STREQUAL "iPhoneOS") - set (IOS_ARCH i386) -endif (${IOS_PLATFORM} STREQUAL "iPhoneOS") - -set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") - -# Set the find root to the iOS developer roots and to user defined paths -set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root") - -# default to searching for frameworks first -set (CMAKE_FIND_FRAMEWORK FIRST) - -# set up the default search directories for frameworks -set (CMAKE_SYSTEM_FRAMEWORK_PATH - ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks - ${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks - ${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks -) - -# only search the iOS sdks, not the remainder of the host filesystem -set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) -set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - - -# This little macro lets you set any XCode specific property -macro (set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE) - set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}) -endmacro (set_xcode_property) - - -# This macro lets you find executable programs on the host system -macro (find_host_package) - set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set (IOS FALSE) - - find_package(${ARGN}) - - set (IOS TRUE) - set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) - set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) - set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endmacro (find_host_package) - +# Set the find root to the iOS developer roots and to user defined paths. +set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT} + ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root" FORCE) +# Default to searching for frameworks first. +set(CMAKE_FIND_FRAMEWORK FIRST) +# Set up the default search directories for frameworks. +set(CMAKE_SYSTEM_FRAMEWORK_PATH + ${CMAKE_OSX_SYSROOT}/System/Library/Frameworks + ${CMAKE_OSX_SYSROOT}/System/Library/PrivateFrameworks + ${CMAKE_OSX_SYSROOT}/Developer/Library/Frameworks) +# Only search the specified iOS SDK, not the remainder of the host filesystem. +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +# This little macro lets you set any XCode specific property. 
+macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION)
+  set(XCODE_RELVERSION_I "${XCODE_RELVERSION}")
+  if (XCODE_RELVERSION_I STREQUAL "All")
+    set_property(TARGET ${TARGET} PROPERTY
+      XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}")
+  else()
+    set_property(TARGET ${TARGET} PROPERTY
+      XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}")
+  endif()
+endmacro(set_xcode_property)
+# This macro lets you find executable programs on the host system.
+macro(find_host_package)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+  set(IOS FALSE)
+  find_package(${ARGN})
+  set(IOS TRUE)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+endmacro(find_host_package)
diff --git a/cmake/linux_arm/linux_arm.toolchain.cmake b/cmake/linux_arm/linux_arm.toolchain.cmake
new file mode 100644
index 000000000..92e894e17
--- /dev/null
+++ b/cmake/linux_arm/linux_arm.toolchain.cmake
@@ -0,0 +1,25 @@
+# this one is important
+SET(CMAKE_SYSTEM_NAME Linux)
+SET(CMAKE_SYSTEM_PROCESSOR arm)
+#this one not so much
+#SET(CMAKE_SYSTEM_VERSION 1)
+
+# specify the cross compiler
+SET(CMAKE_C_COMPILER ${LINUX_ARM_TOOL_ROOT}/bin/arm-linux-gnueabihf-gcc)
+SET(CMAKE_CXX_COMPILER ${LINUX_ARM_TOOL_ROOT}/bin/arm-linux-gnueabihf-g++)
+#SET(CMAKE_LINKER /home/xuhailong/dev-tool/arm-linux/64hf/bin/arm-linux-gnueabihf-g++)
+#SET(CMAKE_AR /home/xuhailong/dev-tool/arm-linux/64hf/bin/arm-linux-gnueabihf-g++)
+
+# where is the target environment
+SET(CMAKE_FIND_ROOT_PATH ${LINUX_ARM_TOOL_ROOT})
+
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# float-abi: hard, softfp
+add_compile_options(-mfloat-abi=softfp)
+add_compile_options(-mfpu=neon)
+add_compile_options(-march=armv7-a)
diff --git a/cmake/linux_arm/linux_arm_hf.toolchain.cmake b/cmake/linux_arm/linux_arm_hf.toolchain.cmake
new file mode 100644
index 000000000..941906b78
--- /dev/null
+++ b/cmake/linux_arm/linux_arm_hf.toolchain.cmake
@@ -0,0 +1,25 @@
+# this one is important
+SET(CMAKE_SYSTEM_NAME Linux)
+SET(CMAKE_SYSTEM_PROCESSOR arm)
+#this one not so much
+#SET(CMAKE_SYSTEM_VERSION 1)
+
+# specify the cross compiler
+SET(CMAKE_C_COMPILER ${LINUX_ARM_TOOL_ROOT}/bin/arm-linux-gnueabihf-gcc)
+SET(CMAKE_CXX_COMPILER ${LINUX_ARM_TOOL_ROOT}/bin/arm-linux-gnueabihf-g++)
+#SET(CMAKE_LINKER /home/xuhailong/dev-tool/arm-linux/64hf/bin/arm-linux-gnueabihf-g++)
+#SET(CMAKE_AR /home/xuhailong/dev-tool/arm-linux/64hf/bin/arm-linux-gnueabihf-g++)
+
+# where is the target environment
+SET(CMAKE_FIND_ROOT_PATH ${LINUX_ARM_TOOL_ROOT})
+
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# float-abi: hard, softfp
+add_compile_options(-mfloat-abi=hard)
+add_compile_options(-mfpu=neon)
+add_compile_options(-march=armv7-a)
diff --git a/cmake/mlu.cmake b/cmake/mlu.cmake
new file mode 100644
index 000000000..80669e1ab
--- /dev/null
+++ b/cmake/mlu.cmake
@@ -0,0 +1,42 @@
+# ----------------------------------------------------------------------------
+# section: Find mlu and config compile
options. +# ---------------------------------------------------------------------------- +macro(anakin_find_mlulib) + SET(CNRTML_ROOT ${ANAKIN_THIRD_PARTY_PATH}/mlu) + SET(CNML_INCLUDE_SEARCH_PATHS ${CNRTML_ROOT}/include) + SET(CNML_LIB_SEARCH_PATHS ${CNRTML_ROOT}/lib) + + SET(CNRT_INCLUDE_SEARCH_PATHS ${CNRTML_ROOT}/include) + SET(CNRT_LIB_SEARCH_PATHS ${CNRTML_ROOT}/lib) + + find_path(CNML_INCLUDE_DIR cnml.h PATHS ${CNML_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) + + find_path(CNRT_INCLUDE_DIR cnrt.h PATHS ${CNRT_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) + + find_library(CNML_LIBRARY NAMES libcnml.so + PATHS ${CNML_LIB_SEARCH_PATHS} + DOC "library path for cnml.") + + find_library(CNRT_LIBRARY NAMES libcnrt.so + PATHS ${CNRT_LIB_SEARCH_PATHS} + DOC "library path for cnrt.") + + if(CNML_INCLUDE_DIR AND CNML_LIBRARY AND CNRT_INCLUDE_DIR AND CNRT_LIBRARY) + set(MLU_FOUND YES) + endif() + if(MLU_FOUND) + include_directories(SYSTEM ${CNML_INCLUDE_DIR}) + list(APPEND ANAKIN_LINKER_LIBS ${CNML_LIBRARY}) + message(STATUS "Found CNML (include: ${CNML_INCLUDE_DIR}, library: ${CNML_LIBRARY})") + + include_directories(SYSTEM ${CNRT_INCLUDE_DIR}) + list(APPEND ANAKIN_LINKER_LIBS ${CNRT_LIBRARY}) + message(STATUS "Found CNRT (include: ${CNRT_INCLUDE_DIR}, library: ${CNRT_LIBRARY})") + + else() +# message(SEND_ERROR "Could not find cnml library in: ${CNML_ROOT}") +# message(SEND_ERROR "Could not find cnrt library in: ${CNRT_ROOT}") + message(STATUS "Could not find cnml library in: ${CNML_ROOT}") + message(STATUS "Could not find cnrt library in: ${CNRT_ROOT}") + endif() +endmacro() diff --git a/cmake/statistic.cmake b/cmake/statistic.cmake index d316968dc..34b23c1ce 100644 --- a/cmake/statistic.cmake +++ b/cmake/statistic.cmake @@ -27,6 +27,7 @@ function(anakin_print_statistic) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") message(STATUS " Link flags : ${CMAKE_EXE_LINKER_FLAGS}") message(STATUS " Shared Link flags : ${CMAKE_SHARED_LINKER_FLAGS}") + message(STATUS " Anakin Link Libs : ${ANAKIN_LINKER_LIBS}") message(STATUS " Build type : ${BoldWhite}${CMAKE_BUILD_TYPE}${ColourReset}") message(STATUS " Build cross plantform : ${BUILD_CROSS_PLANTFORM}") if(ANAKIN_TYPE_FP64) @@ -61,7 +62,9 @@ function(anakin_print_statistic) if(USE_PROTOBUF) message(STATUS " Use google protobuf : ${USE_PROTOBUF}") endif() - + if(USE_NANOPB) + message(STATUS " USE nanopb : ${USE_NANOPB}") + endif() if(USE_GTEST) message(STATUS " USE_GTEST : ${USE_GTEST}") else() @@ -92,7 +95,13 @@ function(anakin_print_statistic) message(STATUS " USE_X86 : ${USE_X86_PLACE}") message(STATUS " X86 Target Arch : ${BUILD_X86_ARCH}") endif() - + + if(USE_MLU) + message(STATUS "") + message(STATUS "${Green}Mlu:${ColourReset}") + message(STATUS " USE_MLU : ${USE_MLU}") + endif() + if(USE_CUDA) message(STATUS "") message(STATUS "${Green}Cuda:${ColourReset}") @@ -116,13 +125,14 @@ function(anakin_print_statistic) message(STATUS " `--OpenCL version : ${OpenCL_VERSION}") endif() endif() - message(STATUS "") if(USE_GPU_PLACE) message(STATUS " SELECT_GPU_PLACE : ${USE_GPU_PLACE}") + elseif(USE_MLU_PLACE) + message(STATUS " SELECT_MLU_PLACE : ${USE_MLU_PLACE}") elseif(USE_X86_PLACE) - message(STATUS " SELECT_X86_PLACE : ${USE_X86_PLACE}") + message(STATUS " SELECT_X86_PLACE : ${USE_X86_PLACE}") elseif(USE_ARM_PLACE) message(STATUS " USE_ARM_PLACE : ${USE_ARM_PLACE}") if(TARGET_ANDROID) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index ee4c54170..8b46f91bf 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,7 +15,7 @@ # 
---------------------------------------------------------------------------- # section: help to search src and include files # ---------------------------------------------------------------------------- -# fetch files(.cc .cpp .cu .c or .h .hpp etc.) in dir(search_dir) +# fetch files(.cc .cpp .cu .c or .h .hpp etc.) in dir(search_dir) # and save to parent scope var outputs function(anakin_fetch_files_with_suffix search_dir suffix outputs) exec_program(ls ${search_dir} @@ -39,11 +39,11 @@ endfunction() # recursively fetch files function(anakin_fetch_files_with_suffix_recursively search_dir suffix outputs) - file(GLOB_RECURSE ${outputs} ${search_dir} "*.${suffix}") + file(GLOB_RECURSE ${outputs} ${search_dir} "*.${suffix}") set(${outputs} ${${outputs}} PARENT_SCOPE) endfunction() -# recursively fetch include dir +# recursively fetch include dir function(anakin_fetch_include_recursively root_dir) if (IS_DIRECTORY ${root_dir}) #message(STATUS "include dir: " ${Magenta}${root_dir}${ColourReset}) @@ -52,7 +52,7 @@ function(anakin_fetch_include_recursively root_dir) file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*) foreach(sub ${ALL_SUB}) - if (IS_DIRECTORY ${root_dir}/${sub}) + if (IS_DIRECTORY ${root_dir}/${sub}) anakin_fetch_include_recursively(${root_dir}/${sub}) endif() endforeach() @@ -95,11 +95,11 @@ macro(anakin_check_compiler_flag LANG FLAG RESULT) set(_fname "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cu") if("${CMAKE_CXX_FLAGS} ${FLAG} " MATCHES "-Werror " OR "${CMAKE_CXX_FLAGS} ${FLAG} " MATCHES "-Werror=unknown-pragmas ") FILE(WRITE "${_fname}" "" - "extern \"C\" __global__ void test() {}\n" + "extern \"C\" __global__ void test() {}\n" "int main() { return 0; }\n") else() FILE(WRITE "${_fname}" "#pragma\n" - "extern \"C\" __global__ void test() {}\n" + "extern \"C\" __global__ void test() {}\n" "int main() { return 0; }\n") endif() else() @@ -132,8 +132,8 @@ macro(anakin_check_compiler_flag LANG FLAG RESULT) MESSAGE(STATUS "Testing ${RESULT}") EXEC_PROGRAM(nvcc ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/ - ARGS "${FLAG}" "${_fname}" - OUTPUT_VARIABLE OUTPUT + ARGS "${FLAG}" "${_fname}" + OUTPUT_VARIABLE OUTPUT RETURN_VALUE RET_VALUE) if(NOT ${RET_VALUE}) SET(${RESULT} 1 CACHE INTERNAL "Test ${RESULT}") @@ -163,7 +163,7 @@ macro(anakin_check_flag_support lang flag varname) else() set(_lang ${lang}) endif() - + string(TOUPPER "${flag}" ${varname}) string(REGEX REPLACE "^(/|-)" "HAVE_${_lang}_" ${varname} "${${varname}}") string(REGEX REPLACE " --|-|=| |\\." "_" ${varname} "${${varname}}") @@ -207,7 +207,7 @@ macro(anakin_option variable description value) if(__condition STREQUAL "") set(__condition 2 GREATER 1) endif() - + if(${__condition}) if(__value MATCHES ";") if(${__value}) @@ -240,74 +240,13 @@ function(anakin_generate_kernel anakin_root_dir) ARGS " ${anakin_root_dir}" OUTPUT_VARIABLE OUTPUT RETURN_VALUE VALUE) - if(NOT VALUE) + if(NOT VALUE) message(STATUS "generate kernel files ${Green}${OUTPUT}${ColourReset} successfully.") else() message(FATAL_ERROR "anakin_generate_kernel\npath: ${kerel_generate_script_path}\nscript: generate.sh ") endif() endfunction() - -# ---------------------------------------------------------------------------- -# section: generate the protobuf .h and .cpp files. 
-# ---------------------------------------------------------------------------- -function(anakin_gen_pb proto_src_path) - set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/) - foreach(__proto_file ${ARGN}) - exec_program(${PROTOBUF_PROTOC_EXECUTABLE} ${__working_dir} ARGS " -I=${proto_src_path} --cpp_out=. ${__proto_file}" - OUTPUT_VARIABLE OUTPUT RETURN_VALUE VALUE) - if(NOT VALUE) - anakin_fetch_files_with_suffix(${__working_dir} "h" PROTO_GENERATE_H) - # get *.cpp or *.cc - anakin_fetch_files_with_suffix(${__working_dir} "c*" PROTO_GENERATE_C) - foreach(__include_file ${PROTO_GENERATE_H}) - exec_program(mv ARGS ${__include_file} ${proto_src_path} - OUTPUT_VARIABLE __out RETURN_VALUE __value) - endforeach() - foreach(__src_file ${PROTO_GENERATE_C}) - if(POLICY CMP0007) - cmake_policy(PUSH) - cmake_policy(SET CMP0007 NEW) - endif() - string(REPLACE "." ";" SRC_LIST ${__src_file}) - list(GET SRC_LIST -1 __src_file_name_suffix) - list(GET SRC_LIST -3 __src_file_name) - - string(REPLACE "/" ";" SRC_LIST_PATH ${__src_file_name}) - list(GET SRC_LIST_PATH -1 __pure_src_file_name) - - if(__src_file_name_suffix EQUAL "cpp") - set(__full_src_filename "${__pure_src_file_name}.pb.cpp") - else() - set(__full_src_filename "${__pure_src_file_name}.pb.cc") - endif() - exec_program(mv ARGS " ${__working_dir}${__full_src_filename} ${proto_src_path}/${__pure_src_file_name}.pb.cpp" - OUTPUT_VARIABLE __out - RETURN_VALUE __value) - if(POLICY CMP0007) - cmake_policy(POP) - endif() - endforeach() - else() - message(FATAL_ERROR "anakin_gen_bp: ${__file} \n error msg: ${OUTPUT}") - endif() - endforeach() -endfunction() - -function(anakin_protos_processing) - set(PROTO_SRC_PATH ${ANAKIN_MODEL_PARSER}/proto) - set(SERVICE_API_SRC_PATH ${ANAKIN_SERVICE}/api) - - set(__working_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/PROTO_TEMP/) - - anakin_fetch_files_with_suffix(${PROTO_SRC_PATH} "proto" PROTO_SRC_FILES) - anakin_fetch_files_with_suffix(${SERVICE_API_SRC_PATH} "proto" SERVICE_API_PROTO_SRC_FILES) - anakin_gen_pb(${PROTO_SRC_PATH} ${PROTO_SRC_FILES}) - if(BUILD_RPC) - anakin_gen_pb(${SERVICE_API_SRC_PATH} ${SERVICE_API_PROTO_SRC_FILES}) - endif() -endfunction() - # ---------------------------------------------------------------------------- # section: Provides macro for an anakin warning diasable # ---------------------------------------------------------------------------- @@ -326,7 +265,7 @@ macro(anakin_disable_warnings) if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) foreach(var ${__flag_vars}) - string(REPLACE " " ";" __list_flag ${${var}}) + string(REPLACE " " ";" __list_flag ${${var}}) foreach(warning ${__list_flag}) if(NOT warning MATCHES "^-Wno-") if((warning MATCHES "^-W") AND (NOT warning STREQUAL "-W")) @@ -349,7 +288,7 @@ endmacro() # ---------------------------------------------------------------------------- macro(anakin_get_file_name path file_name) string(REPLACE "/" ";" split_code_list ${${path}}) - list(GET split_code_list -1 real_code_with_suffix) + list(GET split_code_list -1 real_code_with_suffix) string(REPLACE "." 
";" split_code_list ${real_code_with_suffix}) list(GET split_code_list 0 real_code_name) set(${file_name} ${real_code_name}) diff --git a/examples/anakin/anakin_helper.h b/examples/anakin/anakin_helper.h index 2dd791956..a60289e12 100644 --- a/examples/anakin/anakin_helper.h +++ b/examples/anakin/anakin_helper.h @@ -27,10 +27,10 @@ class AKAutoChoose { AKAutoChoose(std::string ak_so_dir,std::string ak_so_path): _ak_so_dir(ak_so_dir),_ak_so_path(ak_so_path) { } - AnakinRunerInterface* get_ak_instance_static(std::string device_type, int device_num){ + AnakinRunerInterface* get_ak_instance_static(const std::string& device_type, int device_num){ return get_anakinrun_instance(device_type.c_str(),device_num); } - AnakinRunerInterface* get_ak_instance(std::string device_type, int device_num) { + AnakinRunerInterface* get_ak_instance(const std::string& device_type, int device_num) { if (device_type == "X86") { std::string this_cpu_arch = _cpu_helper.get_cpu_arch(); //FIXME:choose real path diff --git a/examples/anakin/build.sh b/examples/anakin/build.sh index 41763133c..ee7812ce1 100644 --- a/examples/anakin/build.sh +++ b/examples/anakin/build.sh @@ -2,6 +2,6 @@ DEBUG_FLAG="-std=c++11 -g -I../../framework/c_api/ -I./ -I../../build/ -ldl -Wno-narrowing " ORI_FAST_FLAG="-std=c++11 -Ofast -ffast-math -I../../framework/c_api/ -I./ -ldl -Wno-narrowing " STATIC_FAST_FLAG="-std=c++11 -Ofast -ffast-math -I../../output -I./ -ldl -Wno-narrowing -I../../output/framework/c_api/" -FAST_FLAG="-std=c++11 -g -static-libstdc++ --sysroot=/opt/compiler/gcc-4.8.2/ -Wl,-rpath,/opt/compiler/gcc-4.8.2/lib64/ -Wl,-dynamic-linker,/opt/compiler/gcc-4.8.2/lib64/ld-linux-x86-64.so.2 -Ofast -ffast-math -I../../output/framework/c_api/ -I./ -ldl -Wno-narrowing" +FAST_FLAG="-std=c++11 -g -static-libstdc++ --sysroot=/opt/compiler/gcc-4.8.2/ -Wl,-rpath,/opt/compiler/gcc-4.8.2/lib64/ -Wl,-dynamic-linker,/opt/compiler/gcc-4.8.2/lib64/ld-linux-x86-64.so.2 -Ofast -ffast-math -I../../output/framework/c_api/ -I./ -I../../framework/c_api/ -ldl -Wno-narrowing " g++ example.cpp -o example $FAST_FLAG -g++ map_rnn.cpp -o map_rnn $FAST_FLAG \ No newline at end of file +g++ map_rnn.cpp -o map_rnn ${FAST_FLAG} \ No newline at end of file diff --git a/examples/anakin/map_rnn.cpp b/examples/anakin/map_rnn.cpp index cd1e46c36..1a269170e 100644 --- a/examples/anakin/map_rnn.cpp +++ b/examples/anakin/map_rnn.cpp @@ -1,5 +1,6 @@ #include "anakin_helper.h" #include +bool g_print_data=false; class Data { public: Data(std::string file_name, int batch_size) : @@ -197,19 +198,21 @@ class AKRNNExampleX86 { input_fea->set_dev_lod_offset(lod); _anakin_obj->prediction(); -#ifdef PRINT_RESULT - AnakinRunerTensorInterface* output_0 = _anakin_obj->get_output_tensor(0); - for (int seq_id = 0; seq_id < seq_offset.size() - 1; seq_id++) { - int seq_len = seq_offset[seq_id + 1] - seq_offset[seq_id]; - int seq_start = seq_offset[seq_id]; - for (int i = 0; i < seq_len - 1; i++) { - printf("%f|", static_cast(output_0->get_host_data())[seq_start + i]); - } + if(g_print_data){ + AnakinRunerTensorInterface* output_0 = _anakin_obj->get_output_tensor(0); + for (int seq_id = 0; seq_id < seq_offset.size() - 1; seq_id++) { + int seq_len = seq_offset[seq_id + 1] - seq_offset[seq_id]; + int seq_start = seq_offset[seq_id]; + + for (int i = 0; i < seq_len - 1; i++) { + printf("%f|", static_cast(output_0->get_host_data())[seq_start + i]); + } - printf("%f\n", static_cast(output_0->get_host_data())[seq_start + seq_len - 1]); + printf("%f\n", 
static_cast(output_0->get_host_data())[seq_start + seq_len - 1]); + } } -#endif + // output_0->copy_data_dev_2_host(); // float* out_ptr = static_cast(output_0->get_host_data()); @@ -249,9 +252,23 @@ int main(int argc, const char** argv) { } if (argc > 5) { - so_path = argv[5]; + g_print_data=atoi(argv[5]); + } + + if (argc > 6) { + so_dir=argv[6]; + } + + if(argc > 7){ + so_path = argv[7]; + } + + if(argc<=7){ + AKRNNExampleX86 ak_run(so_dir, model_path, max_batch); + ak_run.run(data_path,batch_size); + }else { + AKRNNExampleX86 ak_run(so_dir, so_path, model_path, max_batch); + ak_run.run(data_path,batch_size); } - AKRNNExampleX86 ak_run(so_dir, so_path,model_path,max_batch); - ak_run.run(data_path,batch_size); } \ No newline at end of file diff --git a/examples/arm/classification.cpp b/examples/arm/classification.cpp deleted file mode 100644 index 27c3ce45d..000000000 --- a/examples/arm/classification.cpp +++ /dev/null @@ -1,234 +0,0 @@ -#include "graph_base.h" -#include "graph.h" -#include "scheduler.h" -#include "net.h" -#include "worker.h" -#include "tensor_op.h" -#include "timer.h" - -using namespace anakin::saber; -using namespace anakin::graph; -using namespace anakin; -typedef Tensor Tensor4hf; - - -void load_labels(std::string path, std::vector& labels) { - - FILE* fp = fopen(path.c_str(), "r"); - if (fp == nullptr) { - LOG(FATAL) << "load label file failed"; - } - while (!feof(fp)) { - char str[1024]; - fgets(str, 1024, fp); - std::string str_s(str); - - if (str_s.length() > 0) { - for (int i = 0; i < str_s.length(); i++) { - if (str_s[i] == ' ') { - std::string strr = str_s.substr(i, str_s.length() - i - 1); - labels.push_back(strr); - i = str_s.length(); - } - } - } - } - fclose(fp); -} - -void print_topk(const float* scores, const int size, const int topk, \ - const std::vector& labels) { - - std::vector< std::pair > vec; - vec.resize(size); - for (int i = 0; i < size; i++) { - vec[i] = std::make_pair(scores[i], i); - } - - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater< std::pair >()); - - // print topk and score - for (int i = 0; i < topk; i++) { - float score = vec[i].first; - int index = vec[i].second; - LOG(INFO) << i <<": " << index << " " << labels[index] << " " << score; - } -} - -#ifdef USE_OPENCV -#include "opencv2/opencv.hpp" - -using namespace cv; - -void fill_tensor_with_cvmat(const Mat& img_in, Tensor4hf& tout, const int num, \ - const int width, const int height, const float* mean, const float* scale) { - cv::Mat im; - cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); - float* ptr_data_in = tout.mutable_data(); - int stride = width * height; - for (int i = 0; i < num; i++) { - float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); - for (int r = 0; r < height; r++) { - for (int c = 0; c < width; c++) { - ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; - ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; - ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; - } - } - } -} -#endif - -void test_net(const std::string model_file_name, const std::string image_file_name, \ - const std::vector& labels, const int topk, const int threads, \ - const int test_iter) { - - int batch_size = 1; - - //! create runtime context - LOG(INFO) << "create runtime context"; - std::shared_ptr> ctx1 = std::make_shared>(); - ctx1->set_run_mode(SABER_POWER_HIGH, threads); - LOG(INFO) << omp_get_num_threads() << " threads is activated"; - - //! 
load model - LOG(WARNING) << "load anakin model file from " << model_file_name << " ..."; - Graph graph; - auto status = graph.load(model_file_name); - if (!status) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - - //! set batch size - graph.ResetBatchSize("input_0", batch_size); - - //! optimize the graph - LOG(INFO) << "optimize the graph"; - graph.Optimize(); - - //! get output name - std::vector& vout_name = graph.get_outs(); - LOG(INFO) << "output size: " << vout_name.size(); - - //! constructs the executer net - LOG(INFO) << "create net to execute"; - Net net_executer(graph, ctx1, true); - - //! get in - LOG(INFO) << "get input"; - auto d_tensor_in_p = net_executer.get_in("input_0"); - auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i = 0; i < valid_shape_in.size(); i++) { - LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; - } - Tensor4hf thin(valid_shape_in); - - //! feed input image to input tensor -#ifdef USE_OPENCV - LOG(INFO) << "loading image " << image_file_name << " ..."; - Mat img = imread(image_file_name, CV_LOAD_IMAGE_COLOR); - if (img.empty()) { - LOG(FATAL) << "opencv read image " << image_file_name << " failed"; - } - //! set your mean value and scale value here - float mean_mb[3] = {103.94f, 116.78f, 123.68f}; - float scale_mb[3] = {0.017f, 0.017f, 0.017f}; - fill_tensor_with_cvmat(img, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb); - -#else - fill_tensor_host_const(thin, 1.f); -#endif - - //! do inference - Context ctx(0, 0, 0); - anakin::saber::SaberTimer my_time; - LOG(INFO) << "run prediction "; - - double to = 0; - double tmin = 1000000; - double tmax = 0; - my_time.start(ctx); - saber::SaberTimer t1; - for (int i = 0; i < test_iter; i++) { - d_tensor_in_p->copy_from(thin); - t1.clear(); - t1.start(ctx); - net_executer.prediction(); - t1.end(ctx); - double tdiff = t1.get_average_ms(); - if (tdiff > tmax) { - tmax = tdiff; - } - if (tdiff < tmin) { - tmin = tdiff; - } - to += tdiff; - } - my_time.end(ctx); - - - LOG(INFO) << model_file_name << " batch_size " << batch_size << \ - " average time " << to / test_iter << \ - ", min time: " << tmin << "ms, max time: " << tmax << " ms"; - - //! get output - //! 
fixme get output - //std::vector vout = net_executer.get_out_list(); - std::vector vout; - for (auto& it : vout_name) { - vout.push_back(net_executer.get_out(it)); - } - Tensor4hf* tensor_out = vout[0]; - LOG(INFO) << "output size: " << vout.size(); - -#if 0 //print output tensor data - LOG(INFO) << "extract data: size: " << tensor_out->valid_size() << \ - ", width=" << tensor_out->width() << ", height=" << tensor_out->height(); - const float* ptr_out = tensor_out->data(); - for (int i = 0; i < tensor_out->valid_size(); i++) { - printf("%0.4f ", ptr_out[i]); - if ((i + 1) % 7 == 0) { - printf("\n"); - } - } - printf("\n"); -#endif - print_topk(tensor_out->data(), tensor_out->valid_size(), topk, labels); -} - -int main(int argc, char** argv){ - - LOG(INFO) << "initialized the device"; - Env::env_init(); - - if (argc < 4) { - LOG(ERROR) << "usage: " << argv[0] << ": model_file label_file image_name [topk] [test_iter] [threads]"; - return -1; - } - char* model_file = argv[1]; - char* label_file = argv[2]; - char* image_path = argv[3]; - - std::vector labels; - load_labels(label_file, labels); - - int topk = 5; - if (argc > 4) { - topk = atoi(argv[4]); - } - - int test_iter = 10; - if (argc > 5) { - test_iter = atoi(argv[5]); - } - - int threads = 1; - if (argc > 6) { - threads = atoi(argv[6]); - } - - test_net(model_file, image_path, labels, topk, threads, test_iter); - return 0; -} - diff --git a/examples/arm/ssd_detection.cpp b/examples/arm/ssd_detection.cpp deleted file mode 100644 index 50b02b396..000000000 --- a/examples/arm/ssd_detection.cpp +++ /dev/null @@ -1,233 +0,0 @@ -#include "graph_base.h" -#include "graph.h" -#include "scheduler.h" -#include "net.h" -#include "worker.h" -#include "tensor_op.h" -#include "timer.h" - -using namespace anakin::saber; -using namespace anakin::graph; -using namespace anakin; -typedef Tensor Tensor4hf; - -#ifdef USE_OPENCV -#include "opencv2/opencv.hpp" - -using namespace cv; - -struct Object{ - int batch_id; - cv::Rect rec; - int class_id; - float prob; -}; - -const char* class_names[] = {"background", - "aeroplane", "bicycle", "bird", "boat", - "bottle", "bus", "car", "cat", "chair", - "cow", "diningtable", "dog", "horse", - "motorbike", "person", "pottedplant", - "sheep", "sofa", "train", "tvmonitor"}; - -void fill_tensor_with_cvmat(const Mat& img_in, Tensor4hf& tout, const int num, \ - const int width, const int height, const float* mean, const float* scale) { - cv::Mat im; - cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); - float* ptr_data_in = tout.mutable_data(); - int stride = width * height; - for (int i = 0; i < num; i++) { - float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); - for (int r = 0; r < height; r++) { - for (int c = 0; c < width; c++) { - ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; - ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; - ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; - } - } - } -} - -void detect_object(Tensor4hf& tout, const float thresh, Mat& image) { - std::vector objects; - const float* dout = tout.data(); - for (int iw = 0; iw < tout.height(); iw++) { - Object object; - const float *values = dout + iw * tout.width(); - int batch_id = static_cast(values[0]); - int oriw = image.cols; - int orih = image.rows; - object.batch_id = batch_id; - object.class_id = (int)values[1]; - object.prob = values[2]; - object.rec.x = (int)(values[3] * oriw); - object.rec.y = (int)(values[4] * orih); - object.rec.width = 
(int)(values[5] * oriw - object.rec.x); - object.rec.height = (int)(values[6] * orih - object.rec.y); - objects.push_back(object); - } - - for (int i = 0; i< objects.size(); ++i) { - Object object = objects.at(i); - if (object.prob > thresh) { - cv::rectangle(image, object.rec, cv::Scalar(255, 0, 0)); - std::ostringstream pro_str; - pro_str << object.prob; - std::string label = std::string(class_names[object.class_id]) + ": " + pro_str.str(); - cv::putText(image, label, cv::Point(object.rec.x, object.rec.y), \ - cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); - LOG(INFO) << "detection in batch: " << object.batch_id << ", image size: " << image.cols << ", " << image.rows << \ - ", detect object: " << class_names[object.class_id] << ", location: x=" << object.rec.x << ", y=" << object.rec.y << \ - ", width=" << object.rec.width << ", height=" << object.rec.height; - cv::imwrite("detection_output.jpg", image); - } - } -} -#endif - -void test_net(const std::string model_file_name, const std::string image_file_name, float thresh, \ - int threads, int test_iter) { - - int batch_size = 1; - - //! create runtime context - LOG(INFO) << "create runtime context"; - std::shared_ptr> ctx1 = std::make_shared>(); - ctx1->set_run_mode(SABER_POWER_HIGH, threads); - LOG(INFO) << omp_get_num_threads() << " threads is activated"; - - //! load model - LOG(WARNING) << "load anakin model file from " << model_file_name << " ..."; - Graph graph; - auto status = graph.load(model_file_name); - if (!status) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - - //! set batch size - graph.ResetBatchSize("input_0", batch_size); - - //! optimize the graph - LOG(INFO) << "optimize the graph"; - graph.Optimize(); - - //! get output name - std::vector& vout_name = graph.get_outs(); - LOG(INFO) << "output size: " << vout_name.size(); - - //! constructs the executer net - LOG(INFO) << "create net to execute"; - Net net_executer(graph, ctx1, true); - - //! get in - LOG(INFO) << "get input"; - auto d_tensor_in_p = net_executer.get_in("input_0"); - auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i = 0; i < valid_shape_in.size(); i++) { - LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; - } - Tensor4hf thin(valid_shape_in); - - //! feed input image to input tensor -#ifdef USE_OPENCV - LOG(INFO) << "loading image " << image_file_name << " ..."; - Mat img = imread(image_file_name, CV_LOAD_IMAGE_COLOR); - if (img.empty()) { - LOG(FATAL) << "opencv read image " << image_file_name << " failed"; - } - float mean_mb[3] = {127.5f, 127.5f, 127.5f}; - float scale_mb[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; - fill_tensor_with_cvmat(img, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb); -#else - fill_tensor_host_const(thin, 1.f); -#endif - - //! do inference - Context ctx(0, 0, 0); - anakin::saber::SaberTimer my_time; - LOG(INFO) << "run prediction "; - - double to = 0; - double tmin = 1000000; - double tmax = 0; - my_time.start(ctx); - saber::SaberTimer t1; - for (int i = 0; i < test_iter; i++) { - d_tensor_in_p->copy_from(thin); - t1.clear(); - t1.start(ctx); - net_executer.prediction(); - t1.end(ctx); - double tdiff = t1.get_average_ms(); - if (tdiff > tmax) { - tmax = tdiff; - } - if (tdiff < tmin) { - tmin = tdiff; - } - to += tdiff; - } - my_time.end(ctx); - - - LOG(INFO) << model_file_name << " batch_size " << batch_size << \ - " average time " << to / test_iter << \ - ", min time: " << tmin << "ms, max time: " << tmax << " ms"; - - //! 
fixme get output - //std::vector vout = net_executer.get_out_list(); - std::vector vout; - for (auto& it : vout_name) { - vout.push_back(net_executer.get_out(it)); - } - Tensor4hf* tensor_out = vout[0]; - LOG(INFO) << "output size: " << vout.size(); -#if 0 //print output data - LOG(INFO) << "extract data: size: " << tensor_out->valid_size() << \ - ", width=" << tensor_out->width() << ", height=" << tensor_out->height(); - const float* ptr_out = tensor_out->data(); - for (int i = 0; i < tensor_out->valid_size(); i++) { - printf("%0.4f ", ptr_out[i]); - if ((i + 1) % 7 == 0) { - printf("\n"); - } - } - printf("\n"); -#endif -#ifdef USE_OPENCV - detect_object(*tensor_out, thresh, img); -#endif -} - -int main(int argc, char** argv){ - - LOG(INFO) << "initialized the device"; - Env::env_init(); - - if (argc < 2) { - LOG(ERROR) << "usage: " << argv[0] << ": model_file image_name [detect_thresh] [test_iter] [threads]"; - return -1; - } - char* model_file = argv[1]; - - char* image_path = argv[2]; - - float thresh = 0.6; - if(argc > 3) { - thresh = (float)atof(argv[3]); - } - - int test_iter = 10; - if (argc > 4) { - test_iter = atoi(argv[4]); - } - - int threads = 1; - if (argc > 5) { - threads = atoi(argv[5]); - } - - test_net(model_file, image_path, thresh, threads, test_iter); - return 0; -} - diff --git a/examples/cuda/example_nv_cnn_net.cpp b/examples/cuda/example_nv_cnn_net.cpp index be7ec6497..b1753c063 100644 --- a/examples/cuda/example_nv_cnn_net.cpp +++ b/examples/cuda/example_nv_cnn_net.cpp @@ -1,7 +1,7 @@ #include "utils/logger/logger.h" -#include "graph.h" -#include "net.h" +#include "framework/graph/graph.h" +#include "framework/core/net/net.h" #ifdef USE_CUDA /*util to fill tensor*/ @@ -11,56 +11,65 @@ using namespace anakin::graph; using namespace anakin::saber; int main(int argc, const char** argv) { + logger::init(argv[0]); + if (argc < 2) { + LOG(ERROR) << "usage: ./" << argv[0] << " [model path] "; + return 0; + } + const char* model_path = argv[1]; /*init graph object, graph is the skeleton of model*/ - Graph graph; + Graph graph; /*load model from file to init the graph*/ - auto status = graph.load("Resnet50.anakin.bin"); + auto status = graph.load(model_path); if (!status) { LOG(FATAL) << " [ERROR] " << status.info(); } /*set net input shape and use this shape to optimize the graph(fusion and init operator),shape is n,c,h,w*/ - graph.Reshape("input_0", {1, 3, 224, 224}); +// graph.Reshape("input_0", {1, 3, 224, 224}); graph.Optimize(); /*net_executer is the executor object of model. use graph to init Net*/ - Net net_executer(graph, true); + Net net_executer(graph, true); /*use input string to get the input tensor of net. 
for we use NV as target, the tensor of net_executer is on GPU memory*/ - auto d_tensor_in_p = net_executer.get_in("input_0"); - auto valid_shape_in = d_tensor_in_p->valid_shape(); + auto d_tensor_in_p = net_executer.get_in_list(); + for (auto& d_tensor : d_tensor_in_p) { + auto valid_shape_in = d_tensor->valid_shape(); - /*create tensor located in host*/ - Tensor4d h_tensor_in; + /*create tensor located in host*/ + Tensor4d h_tensor_in; - /*alloc for host tensor*/ - h_tensor_in.re_alloc(valid_shape_in); + /*alloc for host tensor*/ + h_tensor_in.re_alloc(valid_shape_in); - /*init host tensor by random*/ - fill_tensor_host_rand(h_tensor_in, -1.0f, 1.0f); + /*init host tensor by random*/ + fill_tensor_rand(h_tensor_in, -1.0f, 1.0f); - /*use host tensor to int device tensor which is net input*/ - d_tensor_in_p->copy_from(h_tensor_in); + /*use host tensor to int device tensor which is net input*/ + d_tensor->copy_from(h_tensor_in); + } /*run infer*/ net_executer.prediction(); LOG(INFO)<<"infer finash"; + auto d_out=net_executer.get_out_list(); /*get the out put of net, which is a device tensor*/ - auto d_out=net_executer.get_out("prob_out"); - - /*create another host tensor, and copy the content of device tensor to host*/ - Tensor4d h_tensor_out; - h_tensor_out.re_alloc(d_out->valid_shape()); - h_tensor_out.copy_from(*d_out); - - /*show output content*/ - for(int i=0;i workers("Resnet50.anakin.bin", 10); + Worker workers(model_path, 10); workers.register_inputs({"input_0"}); workers.register_outputs({"prob_out"}); /*set input shape*/ - workers.Reshape("input_0", {1, 3, 224, 224}); +// workers.Reshape("input_0", {1, 3, 224, 224}); /*start workers*/ workers.launch(); /*fill input*/ - std::vector::type, AK_FLOAT> > host_tensor_p_in_list; + std::vector::type>> host_tensor_p_in_list; saber::Shape valid_shape_in({1, 3, 224, 224}); - Tensor4dPtr::type, AK_FLOAT> h_tensor_in = new Tensor4d::type, AK_FLOAT>(valid_shape_in); - float* h_data = h_tensor_in->mutable_data(); - for (int i=0; isize(); i++) { + Tensor4d::type> h_tensor_in(valid_shape_in); + float* h_data = static_cast(h_tensor_in.mutable_data()); + for (int i = 0; i < h_tensor_in.valid_size(); i++) { h_data[i] = 1.0f; } host_tensor_p_in_list.push_back(h_tensor_in); @@ -37,14 +43,14 @@ int main(int argc, const char** argv) { /*run infer,send input to worker queue*/ int epoch = 1000; - for(int i=0; i +#include "framework/core/net/net.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#include +using namespace anakin::saber; +using namespace anakin::graph; +using namespace anakin; +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif +typedef Tensor Tensor4hf; + +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" + +using namespace cv; + +struct Object{ + int batch_id; + cv::Rect rec; + int class_id; + float prob; +}; + +const char* class_names[] = {"background", + "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", + "motorbike", "person", "pottedplant", + "sheep", "sofa", "train", "tvmonitor"}; + +void fill_tensor_with_cvmat(const std::vector& img_in, Tensor4hf& tout, const int num, \ + const int width, const int height, const float* mean, const float* scale) { + CHECK_GE(img_in.size(), 1) << "must have at least one image"; + 
cv::Mat im; + auto shape = tout.valid_shape(); + shape.set_height(height); + shape.set_width(width); + tout.reshape(shape); + float* ptr_data_in = tout.mutable_data(); + int cstride = width * height; + int nstride = tout.channel() * cstride; + + for (int i = 0; i < num; i++) { + float* ptr_in = ptr_data_in + i * nstride; + if (i < img_in.size()) { + cv::resize(img_in[i], im, cv::Size(width, height), 0.f, 0.f); + for (int r = 0; r < height; r++) { + float* ptr_in_c0 = ptr_in + r * width; + float* ptr_in_c1 = ptr_in_c0 + cstride; + float* ptr_in_c2 = ptr_in_c1 + cstride; + for (int c = 0; c < width; c++) { + ptr_in_c0[c] = (im.at(r, c)[0] - mean[0]) * scale[0]; + ptr_in_c1[c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_in_c2[c] = (im.at(r, c)[2] - mean[2]) * scale[2]; + } + } + } else { + memcpy(ptr_in, ptr_in - nstride, nstride * sizeof(float)); + } + } +} + +void detect_object(Tensor4hf& tout, const float thresh, std::vector& image) { + int img_num = image.size(); + const float* dout = static_cast(tout.data()); + std::vector objects; + for (int iw = 0; iw < tout.height(); iw++) { + Object object; + const float *values = dout + iw * tout.width(); + int batch_id = static_cast(values[0]); + int oriw = image[batch_id].cols; + int orih = image[batch_id].rows; + object.batch_id = batch_id; + object.class_id = (int)values[1]; + object.prob = values[2]; + object.rec.x = (int)(values[3] * oriw); + object.rec.y = (int)(values[4] * orih); + object.rec.width = (int)(values[5] * oriw - object.rec.x); + object.rec.height = (int)(values[6] * orih - object.rec.y); + objects.push_back(object); + } + + for (int i = 0; i < objects.size(); ++i) { + Object object = objects.at(i); + if (object.prob > thresh && object.batch_id < image.size()) { + cv::rectangle(image[object.batch_id], object.rec, cv::Scalar(255, 0, 0)); + std::ostringstream pro_str; + pro_str << object.prob; + std::string label = std::string(class_names[object.class_id]) + ": " + pro_str.str(); + cv::putText(image[object.batch_id], label, cv::Point(object.rec.x, object.rec.y), \ + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + LOG(INFO) << "detection in batch: " << object.batch_id << ", image size: " << \ + image[object.batch_id].cols << ", " << image[object.batch_id].rows << \ + ", detect object: " << class_names[object.class_id] << ", location: x=" << \ + object.rec.x << ", y=" << object.rec.y << ", width=" << object.rec.width << \ + ", height=" << object.rec.height; + } + } + for (int j = 0; j < image.size(); ++j) { + std::ostringstream str; + str << "detection_out_" << j << ".jpg"; + cv::imwrite(str.str(), image[j]); + } +} +#endif + +void test_net(const std::string model_file_name, const std::string image_file_name, float thresh, \ + int batch_size, int device_id) { + + Env::env_init(); + Env::env_init(); + TargetWrapper::set_device(device_id); + + //! load model + LOG(INFO) << "load anakin model file from " << model_file_name << " ..."; + Graph graph; + auto status = graph.load(model_file_name); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + auto ins_name = graph.get_ins(); + //! set batch size + for (auto& in : ins_name) { + graph.ResetBatchSize(in, batch_size); + } + + //! optimize the graph + LOG(INFO) << "optimize the graph"; + graph.Optimize(); + + //! get output name + std::vector& vout_name = graph.get_outs(); + LOG(INFO) << "output size: " << vout_name.size(); + + //! 
constructs the executer net + LOG(INFO) << "create net to execute"; + Net net_executer(graph, true); + +#ifdef USE_OPENCV + std::vector img_list; +#endif + + //! get in + auto d_tensor_in_p = net_executer.get_in_list(); + auto d_tensor_out_p = net_executer.get_out_list(); + for (auto& din : d_tensor_in_p) { + auto valid_shape_in = din->valid_shape(); + for (int i = 0; i < valid_shape_in.size(); i++) { + LOG(INFO) << "detect input dims[" << i << "]" << valid_shape_in[i]; + } + Tensor4hf thin(valid_shape_in); + //! feed input image to input tensor +#ifdef USE_OPENCV + std::fstream fp(image_file_name); + std::string line; + std::vector img_file_list; + while (getline(fp, line)) { + img_file_list.push_back(line); + } + LOG(INFO) << "total test image number: " << img_file_list.size(); + for (int i = 0; i < img_file_list.size(); ++i) { + LOG(INFO) << "loading image : " << img_file_list[i]; + Mat img = imread(img_file_list[i], CV_LOAD_IMAGE_COLOR); + if (img.empty()) { + LOG(FATAL) << "opencv read image " << image_file_name << " failed"; + } + img_list.push_back(img); + } + float mean_mb[3] = {104.f, 117.f, 123.f}; + float scale_mb[3] = {1.f, 1.f, 1.f}; + fill_tensor_with_cvmat(img_list, thin, batch_size, thin.width(), thin.height(), mean_mb, scale_mb); + din->copy_from(thin); +#else + fill_tensor_const(*din, 1.f); +#endif + } + + + //! do inference + LOG(INFO) << "run prediction "; + net_executer.prediction(); + + + LOG(INFO) << "finish infer: " << model_file_name << ", batch_size " << batch_size; + + //! fixme get output + std::vector vout; + for (int i = 0; i < d_tensor_out_p.size(); i++) { + Tensor4hf hout(d_tensor_out_p[i]->valid_shape()); + hout.copy_from(*d_tensor_out_p[i]); + vout.push_back(hout); + } + Tensor4hf tensor_out = vout[0]; + LOG(INFO) << "output size: " << vout.size(); +#if 1 //print output data + LOG(INFO) << "extract data: size: " << tensor_out.valid_size() << \ + ", width=" << tensor_out.width() << ", height=" << tensor_out.height(); + const float* ptr_out = static_cast(tensor_out.data()); + for (int i = 0; i < tensor_out.valid_size(); i++) { + printf("%0.4f ", ptr_out[i]); + if ((i + 1) % 7 == 0) { + printf("\n"); + } + } + printf("\n"); +#endif +#ifdef USE_OPENCV + detect_object(tensor_out, thresh, img_list); +#endif +} + +int main(int argc, char** argv){ + + logger::init(argv[0]); + if (argc < 2) { + LOG(ERROR) << "usage: " << argv[0] << ": model_file image_name [detect_thresh] [batch size] [device id]"; + return -1; + } + char* model_file = argv[1]; + + char* image_path = argv[2]; + + float thresh = 0.6; + if(argc > 3) { + thresh = (float)atof(argv[3]); + } + + int batch_size = 1; + if (argc > 4) { + batch_size = atoi(argv[4]); + } + + int device_id = 0; + if (argc > 5) { + device_id = atoi(argv[5]); + } + + test_net(model_file, image_path, thresh, batch_size, device_id); + return 0; +} + diff --git a/examples/x86/example_x86_rnn_net.cpp b/examples/x86/example_x86_rnn_net.cpp index 3ba2d61da..a047b9346 100644 --- a/examples/x86/example_x86_rnn_net.cpp +++ b/examples/x86/example_x86_rnn_net.cpp @@ -1,7 +1,7 @@ #include "utils/logger/logger.h" -#include "graph.h" -#include "net.h" +#include "framework/graph/graph.h" +#include "framework/core/net/net.h" #ifdef USE_X86_PLACE /*util to fill tensor*/ @@ -12,45 +12,51 @@ using namespace anakin::saber; int main(int argc, const char** argv) { /*init graph object, graph is the skeleton of model*/ - Graph graph; + logger::init(argv[0]); + if (argc < 2) { + LOG(ERROR) << "usage: ./" << argv[0] << " [model path] "; + return 0; + } + 
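The detection example just above and the X86 RNN example that follows both drive the same end-to-end Anakin sequence: load the graph, reset the batch size, optimize, build the Net, fill every input, predict, then read the outputs. The sketch below condenses that flow; it assumes a CUDA build and the usual Graph<Target, Precision> / Net<Target, Precision> template form, and run_once/model_path are illustrative names rather than part of this patch.

    #include "utils/logger/logger.h"
    #include "framework/graph/graph.h"
    #include "framework/core/net/net.h"
    #include "saber/core/tensor_op.h"
    using namespace anakin;
    using namespace anakin::saber;

    // Condensed inference flow used by the examples in this patch (NV target assumed).
    void run_once(const std::string& model_path, int batch_size) {
        graph::Graph<NV, Precision::FP32> graph;
        auto status = graph.load(model_path);
        if (!status) { LOG(FATAL) << " [ERROR] " << status.info(); }
        for (auto& in : graph.get_ins()) {        // reset batch on every registered input
            graph.ResetBatchSize(in, batch_size);
        }
        graph.Optimize();                         // fusion and operator init
        Net<NV, Precision::FP32> net(graph, true);
        for (auto& din : net.get_in_list()) {     // fill all inputs, as the reworked examples do
            fill_tensor_rand(*din, -1.0f, 1.0f);
        }
        net.prediction();
        for (auto& dout : net.get_out_list()) {
            LOG(INFO) << "output valid_size: " << dout->valid_size();
        }
    }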
const char* model_path = argv[1]; + Graph graph; /*load model from file to init the graph*/ - auto status = graph.load("language_model.anakin2.bin"); + auto status = graph.load(model_path); if (!status) { LOG(FATAL) << " [ERROR] " << status.info(); } /*set net input shape and use this shape to optimize the graph(fusion and init operator), shape is n,c,h,w. n=sum of words*/ - graph.Reshape("input_0", {30, 1, 1, 1}); +// graph.Reshape("input_0", {30, 1, 1, 1}); graph.Optimize(); /*net_executer is the executor object of model. use graph to init Net*/ - Net net_executer(graph, true); + Net net_executer(graph, true); /*use input string to get the input tensor of net. for we use X86 as target, the tensor of net_executer is on host memory*/ - auto h_tensor_in_p = net_executer.get_in("input_0"); - - /*init host tensor by continue int*/ - fill_tensor_host_seq(*h_tensor_in_p); - - /*seq offset of tensor means offset of sentence, 0,10,15,30 means sentence0 = 0-9, sentence 1 = 10-14, sentence2 = 15-29*/ - h_tensor_in_p->set_seq_offset({0,10,15,30}); - + auto d_tensor_in_p = net_executer.get_in_list(); + for (auto& d_tensor : d_tensor_in_p) { + /*init host tensor by random*/ + fill_tensor_rand(*d_tensor, -1.0f, 1.0f); + } /*run infer*/ net_executer.prediction(); - LOG(INFO)<<"infer finash"; - - /*get the out put of net, which is a host tensor*/ - auto h_out=net_executer.get_out("fc_1.tmp_2_out"); + LOG(INFO)<<"infer finish"; - - /*show some output content*/ - for(int i=0;i<10;i++){ - LOG(INFO)<<"out ["<data()[i]; + auto d_out=net_executer.get_out_list(); + /*get the out put of net, which is a device tensor*/ + for (auto& out : d_out) { + /*show output content*/ + for(int i = 0; i < out->valid_size(); i++) { + LOG(INFO) << "out [" << i << "] = " << ((const float*)(out->data()))[i]; + } } } #else -int main(){} +int main() { + printf("nothing to do~~\n"); + return 0; +} #endif \ No newline at end of file diff --git a/framework/.DS_Store b/framework/.DS_Store new file mode 100644 index 000000000..f9008e2be Binary files /dev/null and b/framework/.DS_Store differ diff --git a/framework/CMakeLists.txt b/framework/CMakeLists.txt index b4cdfac95..1a2c3abb6 100644 --- a/framework/CMakeLists.txt +++ b/framework/CMakeLists.txt @@ -11,6 +11,56 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +anakin_fetch_files_with_suffix(${ANAKIN_MODEL_PARSER}/proto "proto" ANAKIN_PROTO_SRC) + +set(PROTOC_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/model_parser/proto") +file(MAKE_DIRECTORY ${PROTOC_OUT_DIR}) +include_directories(${PROTOC_OUT_DIR}) + +set(ANAKIN_BASE_SRC "") + +if(USE_NANOPB) + include_directories(${NANOPB_DIR}) + + anakin_fetch_files_with_suffix(${ANAKIN_MODEL_PARSER}/parser/nanopb/ "cpp" ANAKIN_BASE_SRC) + + add_definitions(-DPB_FIELD_16BIT) + + add_custom_command( + OUTPUT "${PROTOBUF_PROTOC_EXECUTABLE}" "${NANOPB_DIR}/pb_decode.c" "${NANOPB_DIR}/pb_common.c" + COMMAND ${CMAKE_COMMAND} -E make_directory ${NANOPB_DIR} + COMMAND bash ARGS -c "wget -qO- ${NANOPB_DOWNLOAD_URL} | tar xz -C ${NANOPB_DIR} --strip 1" + COMMENT "Downlaoding prebuilt nanopb-${NANOPB_VERSION}..." 
+ VERBATIM) + + set(PROTOC_OUT_ARGS "--nanopb_out=-I${ANAKIN_MODEL_PARSER}/proto:") + + list(APPEND ANAKIN_SRC "${NANOPB_DIR}/pb_decode.c" "${NANOPB_DIR}/pb_common.c") +else() + set(PROTOC_OUT_ARGS "--cpp_out=") +endif() + +foreach(__file ${ANAKIN_PROTO_SRC}) + get_filename_component(__file_name ${__file} NAME_WE) + if(USE_NANOPB) + set(__out_src_name "${PROTOC_OUT_DIR}/${__file_name}.pb.c") + if(EXISTS "${ANAKIN_MODEL_PARSER}/proto/${__file_name}.options") + set(__proto_options "${ANAKIN_MODEL_PARSER}/proto/${__file_name}.options") + endif() + else() + set(__out_src_name "${PROTOC_OUT_DIR}/${__file_name}.pb.cc") + endif() + set(__out_header_name "${PROTOC_OUT_DIR}/${__file_name}.pb.h") + + add_custom_command( + OUTPUT "${__out_src_name}" "${__out_header_name}" + COMMAND "${PROTOBUF_PROTOC_EXECUTABLE}" + ARGS "-I${ANAKIN_MODEL_PARSER}/proto" ${__file} "${PROTOC_OUT_ARGS}${PROTOC_OUT_DIR}" + DEPENDS ${PROTOBUF_PROTOC_EXECUTABLE} ${__file} ${__proto_options} + COMMENT "Compiling ${__file_name}.proto using ${PROTOBUF_PROTOC_EXECUTABLE}...") + list(APPEND ANAKIN_SRC "${__out_src_name}") +endforeach() + anakin_fetch_include_recursively(${ANAKIN_SABER}) anakin_fetch_include_recursively(${ANAKIN_MODEL_PARSER}) anakin_fetch_include_recursively(${ANAKIN_UTILS}) @@ -22,9 +72,6 @@ if(BUILD_RPC) anakin_fetch_include_recursively(${ANAKIN_SERVICE}) endif() - -set(ANAKIN_BASE_SRC "") - # add ak_base_source files anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/c_api "cpp" ANAKIN_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/core "cpp" ANAKIN_BASE_SRC) @@ -40,38 +87,41 @@ anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/operators/fusion_ops "cpp" AN anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/utils "cpp" ANAKIN_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/utils/logger "cpp" ANAKIN_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/utils/unit_test "cpp" ANAKIN_BASE_SRC) +anakin_fetch_files_with_suffix(${ANAKIN_FRAMEWORK}/model_parser/parser "cpp" ANAKIN_BASE_SRC) list(APPEND ANAKIN_SRC ${ANAKIN_BASE_SRC}) unset(ANAKIN_BASE_SRC) # add library to shared or static if(UNIX OR APPLE) - if(BUILD_SHARED) - add_library(${anakin_lib_so} SHARED ${ANAKIN_SRC}) - add_dependencies(${anakin_lib_so} ${ANAKIN_SABER_LIB_TARGET}) - # set shared lib version - set_target_properties(${anakin_lib_so} PROPERTIES VERSION ${VERSION}) - - target_link_libraries(${anakin_lib_so} ${ANAKIN_SABER_LIB_TARGET} ${ANAKIN_LINKER_LIBS}) - set_target_properties(${anakin_lib_so} PROPERTIES LINK_FLAGS "") - set_target_properties(${anakin_lib_so} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/) - install(DIRECTORY ${ANAKIN_FRAMEWORK} ${ANAKIN_SABER} ${ANAKIN_UTILS} - DESTINATION ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/ - FILES_MATCHING - PATTERN "*.h" - PATTERN "*.inl") - endif() - if(BUILD_STATIC) - add_library(${anakin_lib_static} STATIC ${ANAKIN_SRC}) - add_dependencies(${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET})# ${anakin_framework_static}) - #set_target_properties(${anakin_lib_static} PROPERTIES VERSION ${VERSION}) - target_link_libraries(${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET} ${ANAKIN_LINKER_LIBS}) - set_target_properties(${anakin_lib_static} PROPERTIES LINK_FLAGS "") - set_target_properties(${anakin_lib_static} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/) - install(DIRECTORY ${ANAKIN_FRAMEWORK} ${ANAKIN_SABER} ${ANAKIN_UTILS} - DESTINATION ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/ - FILES_MATCHING - PATTERN "*.h" - 
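When USE_NANOPB is enabled, the *.pb.c sources generated above are decoded at runtime through nanopb's C API, which is why pb_decode.c and pb_common.c are appended to ANAKIN_SRC. A minimal decoding sketch, assuming a hypothetical message type NetProto produced by the --nanopb_out step (nanopb's generator emits a matching NetProto_fields descriptor); the helper name and header are illustrative only:

    #include <pb_decode.h>            // from the nanopb sources compiled into ANAKIN_SRC
    #include "net.pb.h"               // hypothetical header produced by --nanopb_out
    #include "utils/logger/logger.h"

    // Decode a serialized model blob with nanopb (sketch; NetProto is illustrative).
    bool parse_model_blob(const uint8_t* buf, size_t len, NetProto* out) {
        pb_istream_t stream = pb_istream_from_buffer(buf, len);
        if (!pb_decode(&stream, NetProto_fields, out)) {
            LOG(ERROR) << "nanopb decode failed: " << PB_GET_ERROR(&stream);
            return false;
        }
        return true;
    }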
PATTERN "*.inl") - endif() + if(BUILD_SHARED) + add_library(${anakin_lib_so} SHARED ${ANAKIN_SRC}) + add_dependencies(${anakin_lib_so} ${ANAKIN_SABER_LIB_TARGET}) + # set shared lib version + set_target_properties(${anakin_lib_so} PROPERTIES VERSION ${VERSION}) + + target_link_libraries(${anakin_lib_so} ${ANAKIN_SABER_LIB_TARGET} ${ANAKIN_LINKER_LIBS}) + set_target_properties(${anakin_lib_so} PROPERTIES LINK_FLAGS "") + set_target_properties(${anakin_lib_so} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/) + install(DIRECTORY ${ANAKIN_FRAMEWORK} ${ANAKIN_SABER} ${ANAKIN_UTILS} + DESTINATION ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/ + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.inl") + endif() + if(BUILD_STATIC) + add_library(${anakin_lib_static} STATIC ${ANAKIN_SRC}) + add_dependencies(${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET})# ${anakin_framework_static}) + target_link_libraries(${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET} ${ANAKIN_LINKER_LIBS}) + if(USE_SGX) + target_link_libraries(${anakin_lib_static} ${SGX_CONFIG_INTERFACE}) + endif() + set_target_properties(${anakin_lib_static} PROPERTIES LINK_FLAGS "") + set_target_properties(${anakin_lib_static} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/) + install(DIRECTORY ${ANAKIN_FRAMEWORK} ${ANAKIN_SABER} ${ANAKIN_UTILS} + DESTINATION ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/ + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.inl") + endif() endif() diff --git a/framework/c_api/anakin_runner.cpp b/framework/c_api/anakin_runner.cpp index 50b8a556c..363592c7d 100644 --- a/framework/c_api/anakin_runner.cpp +++ b/framework/c_api/anakin_runner.cpp @@ -333,11 +333,11 @@ char* get_ak_cpu_arch_string() { #ifdef USE_X86_PLACE -#include "omp.h" +#include "anakin_thread.h" #include "mkl_service.h" void set_ak_cpu_parallel() { - omp_set_dynamic(0); - omp_set_num_threads(1); + anakin_set_dynamic(0); + anakin_set_num_threads(1); mkl_set_num_threads(1); } #endif diff --git a/framework/core/.DS_Store b/framework/core/.DS_Store new file mode 100644 index 000000000..c14b87445 Binary files /dev/null and b/framework/core/.DS_Store differ diff --git a/framework/core/any.h b/framework/core/any.h index 34ffc16df..109e24219 100644 --- a/framework/core/any.h +++ b/framework/core/any.h @@ -162,11 +162,11 @@ ValueType any_cast(any& operand) { } // not FATAL error if(operand.type() == "") { - LOG(WARNING)<< "The type hold by any is None" + DLOG(WARNING)<< "The type hold by any is None" << " , but you cast to type " << anakin::type_id().type_info() << ", and you will get a empty vector."; } else { - LOG(ERROR)<< "The type hold by any is " <().type_info(); } diff --git a/framework/core/factory.h b/framework/core/factory.h index 1bc12be5b..24a111335 100644 --- a/framework/core/factory.h +++ b/framework/core/factory.h @@ -5,16 +5,16 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_FACTORY_H -#define ANAKIN_FACTORY_H +#define ANAKIN_FACTORY_H #include #include @@ -23,17 +23,26 @@ #include "framework/core/thread_safe_macros.h" #include "framework/core/singleton.h" #include "utils/logger/logger.h" +#include "anakin_config.h" +#ifdef USE_SGX +#include +#endif namespace anakin { -template> class FactoryBase { public: PolicyType* Create(const TypeIdentifier& type_id){ + if (_container.count(type_id) == 0) { +// LOG(INFO)<<"create "<first << " : " ; +// } LOG(FATAL) << type_id << " has not been registered! "; } //LOG(INFO) << "create " << type_id << " fuction " << &_container.at(type_id); @@ -52,17 +61,17 @@ class FactoryBase { std::vector& GetTypeIdentifierList() { return _type_id_list; } - bool Register(TypeIdentifier type_id, PolicyCreator creator) + bool Register(TypeIdentifier type_id, PolicyCreator creator) EXCLUSIVE_LOCKS_REQUIRED(container_mutex_) { std::lock_guard guard(container_mutex_); - //LOG(ERROR) << "register " << type_id; + // LOG(ERROR) << "register " << type_id; if (_container.count(type_id) == 0) { _type_id_list.push_back(type_id); _container[type_id] = creator; } return true; } - void UnRegister(const TypeIdentifier& type_id) + void UnRegister(const TypeIdentifier& type_id) EXCLUSIVE_LOCKS_REQUIRED(container_mutex_) { std::lock_guard guard(container_mutex_); _type_id_list.erase(std::remove(_type_id_list.begin(), _type_id_list.end(), type_id), _type_id_list.end()); @@ -94,15 +103,15 @@ class Factory: } /// Add another alias to the type_id. virtual void __alias__(const std::string& ori_name, const std::string& alias_name) { - this->__ALIAS__(ori_name, alias_name); + this->__ALIAS__(ori_name, alias_name); } }; -/** +/** * \brief Object register base class. */ -template> class ObjectRegisterBase { public: @@ -122,8 +131,8 @@ class ObjectRegisterBase { std::vector& GetTypeIdentifierList() { return _type_id_list; } - PolicyType& Register(TypeIdentifier type_id) EXCLUSIVE_LOCKS_REQUIRED(_container_mutex) { - std::lock_guard guard(_container_mutex); + PolicyType& Register(TypeIdentifier type_id) EXCLUSIVE_LOCKS_REQUIRED(_container_mutex) { + std::lock_guard guard(_container_mutex); //CHECK_EQ(_container.count(type_id), 0) << type_id << " has been registered! "; if (_container.count(type_id) == 0) { PolicyType* object= new PolicyType(); @@ -149,7 +158,7 @@ class ObjectRegisterBase { ContainerType _container GUARDED_BY(_container_mutex); }; -/** +/** * \brief Object register class. * */ @@ -166,7 +175,7 @@ class ObjectRegister : public ObjectRegisterBase { } /// Add another alias to the type_id virtual void __alias__(const std::string& ori_name, const std::string& alias_name) { - this->__ALIAS__(ori_name, alias_name); + this->__ALIAS__(ori_name, alias_name); } }; diff --git a/framework/core/net/auto_layout_config.cpp b/framework/core/net/auto_layout_config.cpp new file mode 100644 index 000000000..76326a396 --- /dev/null +++ b/framework/core/net/auto_layout_config.cpp @@ -0,0 +1,325 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "framework/core/net/auto_layout_config.h" +#include +#include "framework/graph/node.h" +namespace anakin { + +template +void AutoLayoutConfigHelper::init() { + _node_layout_hint["Input"]["nchw"] = {"nchw"}; + _node_layout_hint["Convolution"]["nchw"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["Convolution"]["nchw_c8r"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["ConvRelu"]["nchw"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["ConvRelu"]["nchw_c8r"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["ConvBatchnormScaleRelu"]["nchw"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["ConvBatchnormScaleRelu"]["nchw_c8r"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["ConvBatchnormScale"]["nchw"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["ConvBatchnormScale"]["nchw_c8r"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["Pooling"]["nchw"] = {"nchw"}; + _node_layout_hint["Pooling"]["nchw_c8r"] = {"nchw_c8r", "nchw"}; + _node_layout_hint["Dense"]["nchw_c8r"] = {"nchw"}; + _node_layout_hint["Dense"]["nchw"] = {"nchw"}; + _node_layout_hint["ReLU"]["nchw_c8r"] = {"nchw_c8r"}; + _node_layout_hint["ReLU"]["nchw"] = {"nchw"}; + _node_layout_hint["Activation"]["nchw_c8r"] = {"nchw_c8r"}; + _node_layout_hint["Activation"]["nchw"] = {"nchw"}; + _node_layout_hint["Softmax"]["nchw"] = {"nchw"}; + _node_layout_hint["Split"]["nchw_c8r"] = {"nchw_c8r"}; + _node_layout_hint["Split"]["nchw"] = {"nchw"}; + _node_layout_hint["Gather"]["nchw_c8r"] = {"nchw_c8r"}; + _node_layout_hint["Gather"]["nchw"] = {"nchw"}; + _node_layout_hint["ConvEltwise"]["nchw_c8r"] = {"nchw_c8r"}; + _node_layout_hint["ConvEltwise"]["nchw"] = {"nchw"}; + _node_layout_hint["Eltwise"]["nchw_c8r"] = {"nchw_c8r"}; + _node_layout_hint["Eltwise"]["nchw"] = {"nchw"}; + _node_layout_hint["Concat"]["nchw_c8r"] = {"nchw_c8r"}; + _node_layout_hint["Concat"]["nchw"] = {"nchw"}; + + _node_layout_hint["Reshape"]["nchw"] = {"nchw"}; + _node_layout_hint["PriorBox"]["nchw"] = {"nchw"}; + _node_layout_hint["DetectionOutput"]["nchw"] = {"nchw"}; + _node_layout_hint["Permute"]["nchw"] = {"nchw"}; + _node_layout_hint["Flatten"]["nchw"] = {"nchw"}; + + for (auto node : _node_layout_hint) { + std::string node_name = node.first; + auto in_out_map = node.second; + std::unordered_map >out_in_map; + + for (auto in_layout_obj : in_out_map) { + std::string in_layout = in_layout_obj.first; + auto out_layout_vec = in_layout_obj.second; + + for (auto out_layout : out_layout_vec) { + if (std::count(out_in_map[out_layout].begin(), out_in_map[out_layout].end(), in_layout) == 0) { + out_in_map[out_layout].push_back(in_layout); + } + } + } + + _node_layout_hint_reverse[node_name] = out_in_map; + } +} + +template +std::unordered_map AutoLayoutConfigHelper:: +auto_config_int8_edge_layout(graph::Graph& graph) { + std::unordered_map result; + std::unordered_set relu_op = {"ConvBatchnormScaleRelu", "ConvRelu"}; + auto int8_edge_config = [&, this](graph::Edge& edge) { + auto bottom_node = graph[edge.bottom()]; + bottom_node->bit_type(); + auto edge_name = edge.name(); + + if (edge.scale().size() > 0 || relu_op.count(bottom_node->get_op_name()) > 0) { + result[edge.name()] = "nhwc"; + } else { + result[edge.name()] = "nchw"; + } + }; + graph.Scanner->BFS_Edge(int8_edge_config); + return result; +}; + +template +std::unordered_map AutoLayoutConfigHelper:: +auto_config_node_dtype(graph::Graph& graph) { + + std::unordered_map result; + std::unordered_set relu_op = 
{"ConvBatchnormScaleRelu", "ConvRelu", "ConvEltwise"}; + auto uint8_node_config = [&, this](graph::NodePtr target_node) { + if (target_node->bit_type() == AK_INT8) { + if (relu_op.count(target_node->get_op_name()) > 0) { + if (target_node->get_op_name() == "ConvEltwise") { + + for (auto k : target_node->attr()) { + LOG(INFO) << "ConvEltwise attr :" << k.first; + } + + } + + result[target_node->name()] = "uint8"; + return; + } else { + result[target_node->name()] = "int8"; + return; + } + } + }; + graph.Scanner->BFS(uint8_node_config); + return result; +}; + +template +void AutoLayoutConfigHelper::scane_dfs_int8_node(graph::Graph& graph, + graph::NodePtr& node, + std::string last_node_dtype) { + LOG(FATAL) << "not impl"; +} + +template +std::vector AutoLayoutConfigHelper::get_node_out_layout( + std::string node_type, + std::string in_layout) { + if (_node_layout_hint.count(node_type) > 0) { + return _node_layout_hint[node_type][in_layout]; + } else { + LOG(INFO) << "not find op prefer layout " << node_type; + + if (in_layout == "nchw") { + return {"nchw"}; + } else { + return {}; + } + } +} + +template +std::vector> AutoLayoutConfigHelper::get_node_output_arcs( +graph::Graph& graph, graph::NodePtr& node) { + std::vector> result; + + for (auto out_edge : graph.get_out_arc_its(node->name())) { + result.push_back(*out_edge); + } + + return result; +} + +template +std::vector AutoLayoutConfigHelper::get_node_output_nodes( + graph::Graph& graph, graph::NodePtr& node) { + std::vector result; + + for (auto out_edge : graph.get_out_arc_its(node->name())) { + result.push_back(graph[out_edge->top()]); + } + + return result; +} +template +void AutoLayoutConfigHelper::scane_dfs_from_input(graph::Graph& graph) { + for (auto out_name : graph.get_outs()) { + for (auto next_arc : graph.get_in_arc_its(out_name)) { + _layout_map_bynode[next_arc->name()] = "nchw"; + _edge_done_map[out_name] = "nchw"; + } + + } + + for (auto in_name : graph.get_ins()) { + for (auto next_arc : graph.get_out_arc_its(in_name)) { + scane_dfs(graph, *next_arc, "nchw", true); + } + } +} + +template +bool AutoLayoutConfigHelper::scane_dfs(graph::Graph& graph, + graph::Edge& edge, + std::string suggest_layout, bool frozen_layout, + std::unordered_map* return_layout_map) { + if (_layout_map_bynode.count(edge.name()) > 0) { + return _layout_map_bynode[edge.name()] == suggest_layout; + } + + auto node = graph[edge.top()]; + + auto layout_prefer_vec = get_node_out_layout(node->get_op_name(), suggest_layout); + + if (layout_prefer_vec.size() > 0) { + std::unordered_map retire_layout_map; + + for (auto layout_prefer : layout_prefer_vec) { + bool accept = true; + bool multi_output = get_node_output_arcs(graph, node).size() > 1; + + for (auto next_arc : get_node_output_arcs(graph, node)) { + std::string next_node_name = graph[next_arc.top()]->name(); + + bool ck = false; + + if (multi_output) { + if (return_layout_map == nullptr) { + ck = scane_dfs(graph, next_arc, layout_prefer, false, &retire_layout_map); + } else { + ck = scane_dfs(graph, next_arc, layout_prefer, false, return_layout_map); + } + } else { + ck = scane_dfs(graph, next_arc, layout_prefer, true); + } + + accept = accept && ck; + + if (!accept) { + break; + } + } + + if (accept) { + if (frozen_layout) { + _layout_map_bynode[edge.name()] = suggest_layout; + + if (multi_output) { + if (return_layout_map == nullptr) { + for (auto next_arc : retire_layout_map) { + _layout_map_bynode[next_arc.first] = next_arc.second; + } + } + } + } else { + (*return_layout_map)[edge.name()] = 
suggest_layout; + } + + return true; + } + } + + } + + return false; + +} +template +bool AutoLayoutConfigHelper::check_merge(graph::Graph& graph) { + bool result = true; + auto check_merge = [&, this](graph::Edge& edge) { + auto node = graph[edge.top()]; + auto layout = _layout_map_bynode[edge.name()]; + + if (layout == "") { + LOG(ERROR) << "layout for " << edge.name() << " is empty, auto layout config failed"; + result = false; + return; + } + + if (graph.get_in_arc_its(node->name()).size() > 1) { + for (auto in_edge : graph.get_in_arc_its(node->name())) { + if (_layout_map_bynode[(*in_edge).name()] != layout) { + result = false; + LOG(ERROR) << "layout not equal " << (*in_edge).name() << "," << node->name() << + _layout_map_bynode[(*in_edge).name()] << "!= " << layout; + return; + } + } + } + }; + graph.Scanner->BFS_Edge(check_merge); + return result; +} +template +void AutoLayoutConfigHelper::print_layout() { + for (auto k : _layout_map_bynode) { + LOG(INFO) << "layout " << k.first << " = " << k.second; + } +} + + +#ifdef USE_CUDA +template class AutoLayoutConfigHelper; +template class AutoLayoutConfigHelper; +template class AutoLayoutConfigHelper; + +#endif + +#ifdef USE_X86_PLACE +template class AutoLayoutConfigHelper; +template class AutoLayoutConfigHelper; +template class AutoLayoutConfigHelper; +#endif + +#ifdef AMD_GPU +template class AutoLayoutConfigHelper; +template class AutoLayoutConfigHelper; +template class AutoLayoutConfigHelper; +#endif + +#ifdef USE_ARM_PLACE +#ifdef ANAKIN_TYPE_FP32 +template class AutoLayoutConfigHelper; +#endif + +#ifdef ANAKIN_TYPE_FP16 +template class AutoLayoutConfigHelper; +#endif + +#ifdef ANAKIN_TYPE_INT8 +template class AutoLayoutConfigHelper; +#endif //int8 + +#endif //arm +} diff --git a/framework/core/net/auto_layout_config.h b/framework/core/net/auto_layout_config.h new file mode 100644 index 000000000..3a4a6047f --- /dev/null +++ b/framework/core/net/auto_layout_config.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_FRAMEWORK_CORE_NET_AUTO_LAYOUT_CONFIG_H +#define ANAKIN_FRAMEWORK_CORE_NET_AUTO_LAYOUT_CONFIG_H + +#include "framework/graph/graph.h" +#include "framework/core/net/operator_func.h" +#include "framework/core/net/calibrator_factory.h" +namespace anakin { +template +class AutoLayoutConfigHelper { +public: + AutoLayoutConfigHelper() { + init(); + } + bool check_merge(graph::Graph& graph); + void print_layout(); + void scane_dfs_from_input(graph::Graph& graph); + std::unordered_map get_config_layout(){ + return _layout_map_bynode; + }; + std::unordered_map auto_config_int8_edge_layout(graph::Graph& graph); + std::unordered_map auto_config_node_dtype(graph::Graph& graph); + +private: + void init(); + std::vector get_node_out_layout(std::string node_type, std::string in_layout); + std::vector get_node_output_nodes(graph::Graph& graph, graph::NodePtr& node); + std::vector> get_node_output_arcs(graph::Graph& graph, + graph::NodePtr& node); + + bool scane_dfs(graph::Graph& graph, graph::Edge& edge, + std::string suggest_layout, bool frozen_layout, + std::unordered_map* return_layout_map = nullptr); + + void scane_dfs_int8_node(graph::Graph& graph, graph::NodePtr& node, std::string last_node_dtype); + + + std::unordered_map _lock_node_out_edge_map; + std::unordered_map _lock_node_in_edge_map; + std::unordered_map>> + _node_layout_hint; + std::unordered_map>> + _node_layout_hint_reverse; + std::unordered_map _edge_done_map; + std::unordered_map _layout_map_bynode; +}; +} +#endif //ANAKIN_AUTO_LAYOUT_CONFIG_H diff --git a/framework/core/net/batch_stream.cpp b/framework/core/net/batch_stream.cpp index b8d3db7a7..4f2623c0f 100644 --- a/framework/core/net/batch_stream.cpp +++ b/framework/core/net/batch_stream.cpp @@ -1,5 +1,3 @@ - - /* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,12 +14,15 @@ */ #include "framework/core/net/batch_stream.h" + +#ifndef USE_SGX + #include "saber/core/tensor_op.h" namespace anakin { using namespace anakin::saber; #ifdef USE_OPENCV using namespace cv; -void fill_tensor_with_cvmat(const Mat& img_in, Tensor& tout, const int num, \ +void fill_tensor_with_cvmat(const Mat& img_in, Tensor& tout, int num, \ const int width, const int height, const float* mean, const float* scale, float& max_val) { cv::Mat im; max_val = 0.f; @@ -49,13 +50,18 @@ void fill_tensor_with_cvmat(const Mat& img_in, Tensor& tout, const int num, } } #endif + +template +BatchStream::BatchStream(Tensor* (*inner_producer)()){ + _inner_producer=inner_producer; +} /** * \brief Net class used for execution of graph and it is thread safety. 
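The AutoLayoutConfigHelper declared above is driven the same way Net::load_calibrator_config() does later in this patch: propagate layouts from the graph inputs, verify that merging nodes agree, then read back the per-edge layout strings. A minimal sketch, assuming an X86/FP32 graph and the same <Target, Precision> template pair that Net uses:

    // Sketch of driving AutoLayoutConfigHelper (template arguments assumed).
    void config_layouts(graph::Graph<X86, Precision::FP32>& graph) {
        AutoLayoutConfigHelper<X86, Precision::FP32> helper;
        helper.scane_dfs_from_input(graph);   // DFS from every input, assigning preferred layouts
        helper.print_layout();                // dump edge -> layout decisions
        if (helper.check_merge(graph)) {      // all in-edges of every node agree on one layout
            auto layout_map = helper.get_config_layout();  // edge name -> "nchw", "nchw_c8r", ...
            for (auto& kv : layout_map) {
                LOG(INFO) << "edge " << kv.first << " uses layout " << kv.second;
            }
        } else {
            LOG(ERROR) << "auto layout config cancelled, keeping default layouts";
        }
    }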
*/ template BatchStream::BatchStream(std::string file, int batch_size):_batch_size(batch_size) { - std::ifstream ifs(file, std::ofstream::out|std::ofstream::binary); - CHECK(ifs.is_open()) << file << "can not be opened"; + std::ifstream ifs(file, std::ifstream::in); + CHECK(ifs.is_open()) << file << " can not be opened"; while (ifs.good()) { std::string new_file; std::getline(ifs, new_file); @@ -69,35 +75,36 @@ BatchStream::BatchStream(std::string file, int batch_size):_batch_size(ba _ifs.read((char*)(&_height), 4); _ifs.read((char*)(&_width), 4); Shape shape = std::vector {batch_size, _channel, _height, _width}; - auto tensor = new Tensor(shape); - _cpu_tensors.push_back(tensor); + _host_tensor.reshape(shape); _flag_from_image = false; } template void BatchStream::reset() { _file_id = 0; - _ifs.open(_file_list[_file_id++]); - CHECK(_ifs.is_open()) << _file_list[_file_id -1] << "can not be opened"; - _ifs.read((char*)(&_num), 4); - _ifs.read((char*)(&_channel), 4); - _ifs.read((char*)(&_height), 4); - _ifs.read((char*)(&_width), 4); - + if (_file_list.size() > 0) { + _ifs.open(_file_list[_file_id++]); + CHECK(_ifs.is_open()) << _file_list[_file_id - 1] << "can not be opened"; + _ifs.read((char *) (&_num), 4); + _ifs.read((char *) (&_channel), 4); + _ifs.read((char *) (&_height), 4); + _ifs.read((char *) (&_width), 4); + } } +template +BatchStream::~BatchStream() {} + #ifdef USE_OPENCV template -BatchStream::BatchStream(std::string image_list, int num, int channel, int height, int width, \ +BatchStream::BatchStream(std::string image_list, int channel, int height, int width, \ std::vector mean, std::vector scale) { - if (num != 1) { - LOG(FATAL) << "only support batchsize = 1 for image"; - } + if (channel != mean.size() || channel != scale.size()) { LOG(FATAL) << "channel size must = mean size && scale size"; } - _num = std::max(1, num); - _batch_size = num; + _num = 1; + _batch_size = 1; _channel = std::max(1, channel); _height = std::max(1, height); _width = std::max(1, width); @@ -122,11 +129,21 @@ BatchStream::BatchStream(std::string image_list, int num, int channel, in template int BatchStream::get_batch_data(std::vector> outs) { Shape shape = std::vector{_batch_size, _height, _width, _channel}; - //_cpu_tensors[0]->reshape(shape); int num = std::min(_num, _batch_size); int image_size = _channel * _height * _width; + if (_inner_producer!= nullptr){ + Tensor* host_tensor=_inner_producer(); + if (host_tensor== nullptr){ + return 0; + } + outs[0]->reshape(host_tensor->valid_shape()); + outs[0]->copy_from(*host_tensor); + outs[0]->set_seq_offset(host_tensor->get_seq_offset()); + return host_tensor->num(); + } + #ifdef USE_CUDA - auto data = static_cast(_host_tensor.mutable_data());//_cpu_tensors[0]->mutable_data(); + auto data = static_cast(_host_tensor.mutable_data()); #else auto data = static_cast(outs[0]->mutable_data()); #endif @@ -154,11 +171,10 @@ int BatchStream::get_batch_data(std::vector> outs) { if (num != 0) { //outs[0]->reshape(Shape{num, _channel, _height,_width}); Shape shape = std::vector{num, _height,_width, _channel}; - //_cpu_tensors[0]->reshape(shape); _host_tensor.reshape(shape); outs[0]->reshape(shape); #ifdef USE_CUDA - outs[0]->copy_from(*_cpu_tensors[0]); + outs[0]->copy_from(_host_tensor); #endif } return num; @@ -178,7 +194,7 @@ int BatchStream::get_batch_data(std::vector> outs) { LOG(INFO) << "load image " << _file_list.back() << " successed, with mean value: " << mean_val << ", max_val: " << max_val; _file_list.pop_back(); Shape shape = std::vector{_num, 
_channel, _height,_width}; - outs[0]->reshape(shape); + outs[0]->reshape(shape); outs[0]->copy_from(_host_tensor); return 1; } @@ -193,3 +209,5 @@ template class BatchStream; template class BatchStream; #endif } + +#endif // USE_SGX diff --git a/framework/core/net/batch_stream.h b/framework/core/net/batch_stream.h index 221af4fca..f2feffb48 100644 --- a/framework/core/net/batch_stream.h +++ b/framework/core/net/batch_stream.h @@ -18,6 +18,9 @@ #ifndef ANAKIN_BATCH_STREAM_H #define ANAKIN_BATCH_STREAM_H +#include "anakin_config.h" + +#ifndef USE_SGX #include "framework/core/parameter.h" #include "framework/core/data_types.h" #include "saber/saber_types.h" @@ -33,13 +36,14 @@ namespace anakin { template class BatchStream { public: + BatchStream(Tensor* (*inner_producer)()); BatchStream(std::string file, int batch_size); #ifdef USE_OPENCV - BatchStream(std::string image_list, int num, int channel, int height, int width, \ + BatchStream(std::string image_list, int channel, int height, int width, \ std::vector mean = {1.f, 1.f, 1.f}, std::vector scale = {1.f, 1.f, 1.f}); #endif - ~BatchStream() {} + ~BatchStream(); void reset(); @@ -47,7 +51,6 @@ class BatchStream { private: int _batch_size; std::vector _file_list; - std::vector> _cpu_tensors; Tensor _host_tensor; std::ifstream _ifs; int _num; @@ -58,8 +61,10 @@ class BatchStream { std::vector _mean; std::vector _scale; bool _flag_from_image{false}; + Tensor* (*_inner_producer)(){nullptr}; }; } +#endif // USE_SGX #endif diff --git a/framework/core/net/calibrator.h b/framework/core/net/calibrator.h index 717a4f0bd..5c961853c 100644 --- a/framework/core/net/calibrator.h +++ b/framework/core/net/calibrator.h @@ -17,6 +17,10 @@ #ifndef ANAKIN_CALIBRATOR_H #define ANAKIN_CALIBRATOR_H +#include "anakin_config.h" + +#ifndef USE_SGX + #include "framework/core/net/batch_stream.h" #include "framework/core/base.h" #include "framework/core/operator/operator.h" @@ -77,4 +81,6 @@ class Calibrator { }; } +#endif // USE_SGX + #endif diff --git a/framework/core/net/calibrator_factory.h b/framework/core/net/calibrator_factory.h index 321257667..7b09c30cc 100644 --- a/framework/core/net/calibrator_factory.h +++ b/framework/core/net/calibrator_factory.h @@ -1,11 +1,11 @@ /* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,12 +16,11 @@ #ifndef ANAKIN_NET_CALIBRATOR_FACTORY_H #define ANAKIN_NET_CALIBRATOR_FACTORY_H -#include - -#include "framework/core/operator/operator.h" #include "framework/core/net/calibrator_parse.h" +#include "framework/core/operator/operator.h" #include "utils/logger/logger.h" #include "framework/core/types.h" +#include namespace anakin{ @@ -29,10 +28,11 @@ OperatorBase* create_op_with_pt(std::string op_name, std::string precision, std: template OperatorBase* create_precision_op(std::string op_name, std::string precision){ + LOG(INFO) << "creating op:" << op_name << "( precision:" << precision << ")"; if (precision == "fp32"){ return OpFactory::Global()[op_name]; } - if (precision == "int8"){ + if (precision == "int8" || precision == "uint8"){ return OpFactory::Global()[op_name]; } LOG(FATAL) << "unsupport precision! (opname: " << op_name << ", precision:" << precision << ")"; @@ -41,7 +41,7 @@ OperatorBase* create_precision_op(std::string op_name, std::string precision){ template OperatorBase* calibrator_op(std::string op_name, std::string name, const CalibratorParser& parser){ std::string prec = parser.get_precision(name); - std::string target = parser.get_target(name); +// LOG(INFO)<<"name = "<(op_name, prec); diff --git a/framework/core/net/calibrator_parse.cpp b/framework/core/net/calibrator_parse.cpp index a2275062a..4ff58a860 100644 --- a/framework/core/net/calibrator_parse.cpp +++ b/framework/core/net/calibrator_parse.cpp @@ -1,162 +1,510 @@ #include "framework/core/net/calibrator_parse.h" + +#ifndef USE_SGX #include #include +#include +#endif + +#include + +namespace anakin { + +std::string layout2str(saber::LayoutType type) { + switch (type) { + case Layout_NCHW: + return "nchw"; + + case Layout_NHWC: + return "nhwc"; + + case Layout_NCHW_C8: + return "nchw_c8"; + + case Layout_NCHW_C8R: + return "nchw_c8r"; + + case Layout_NCHW_C4: + return "nchw_c4"; + + default: + return "nchw"; + } +} +saber::LayoutType str2layout(const std::string& str) { + if (str == "nchw") { + return Layout_NCHW; + } else if (str == "nchw_c8") { + return Layout_NCHW_C8; + } else if (str == "nchw_c4") { + return Layout_NCHW_C4; + } else if (str == "nhwc") { + return Layout_NHWC; + } else if (str == "nchw_c8r") { + return Layout_NCHW_C8R; + } else { + return Layout_NCHW; + } +} + + -namespace anakin{ - std::string CalibratorParser::get_precision(std::string name) const { //if not exist, return fp32 - if (_node_precision_map.find(name) == _node_precision_map.end()){ + if (_node_precision_map.find(name) == _node_precision_map.end()) { return "fp32"; } + return _node_precision_map.at(name); } -saber::DataType CalibratorParser::get_dtype(std::string name0, std::string name1) const { +saber::DataType CalibratorParser::get_dtype_of_precision(std::string name) const { + std::string pre_str = "fp32"; + + if (_node_precision_map.find(name) != _node_precision_map.end()) { + pre_str = _node_precision_map.at(name); + } else { + + } + + if (pre_str == "fp32") { + return AK_FLOAT; + } else if (pre_str == "int8") { + return AK_INT8; + } else if (pre_str == "uint8") { + return AK_UINT8; + } else { + LOG(FATAL) << "unsupport precision type of " << pre_str; + } + + return AK_FLOAT; +} + +saber::DataType CalibratorParser::get_dtype(std::string name0, std::string name1, + std::string bottom_op_type, + std::string top_op_type, std::string dev_name, graph::NodePtr bottom_node) const { + static std::unordered_set layout_pass_op = {"Split", "Gather", "Pooling"}; + static std::unordered_set conv_name_set = 
{"ConvBatchnormScaleRelu", "ConvRelu", + "ConvEltwise", "Convolution"}; std::string str0 = get_precision(name0); std::string str1 = get_precision(name1); - bool bint8 = (str0 == "int8") && (str1 == "int8"); - if (!bint8){ - return saber::AK_FLOAT; + bool bint8 = ((str0 == "int8") && (str1 == "int8")) || ((str0 == "int8") && (str1 == "uint8")); + //uint8 now use for x86, and x86 is 8bit perfer + bool buint8 = str0 == "uint8" || (conv_name_set.count(bottom_op_type) && str1 == "uint8"); + LOG(INFO) << "get dtype string " << name0 << "," << str0 << "||" << name1 << "," << str1 << "||" << + bottom_op_type << "," << top_op_type; + + if (dev_name == "X86") { +#if defined(USE_X86_PLACE) + bool top_8bit = (str1 == "int8" || str1 == "uint8"); + + if (top_8bit && (conv_name_set.count(bottom_op_type) > 0)) { + using pblock_type = PBlock; + auto conv_weights = bottom_node->template get_attr("weight_1"); + auto group = bottom_node->template get_attr("group"); + bool is_inchannel_1_or_3 = conv_weights.shape().channel() == 1 + || conv_weights.shape().channel() == 3; + + if (is_inchannel_1_or_3 && group == 1) { + bint8 = str1 == "int8"; + buint8 = str1 == "uint8"; + } + } +#endif + } + + if (bottom_op_type == "Input") { + bint8 = (str0 == "int8"); + buint8 = (str0 == "uint8"); + } + + if (bint8) { + return saber::AK_INT8; + } else if (buint8) { + return saber::AK_UINT8; } else { + return saber::AK_FLOAT; + } +} + +saber::DataType CalibratorParser::get_dtype(std::string name0, std::string name1) const { + std::string str0 = get_precision(name0); + std::string str1 = get_precision(name1); + bool bint8 = ((str0 == "int8") && (str1 == "int8")) || ((str0 == "int8") && (str1 == "uint8")); + //uint8 now use for x86, and x86 is 8bit perfer + bool buint8 = str0 == "uint8" ; + + + if (bint8) { return saber::AK_INT8; + } else if (buint8) { + return saber::AK_UINT8; + } else { + return saber::AK_FLOAT; } } std::string CalibratorParser::get_target(std::string name) const { //if not exist, return NV - if (_node_target_map.find(name) == _node_target_map.end()){ + if (_node_target_map.find(name) == _node_target_map.end()) { +#ifdef USE_CUDA return "NV"; +#endif +#ifdef USE_X86_PLACE + return "X86"; +#endif +#ifdef USE_ARM_PLACE + return "ARM"; +#endif } + return _node_target_map.at(name); } +saber::LayoutType CalibratorParser::get_layout(const std::string name) const { + //if not exist, return nchw + if (_layout_map.find(name) == _layout_map.end()) { + return Layout_NCHW; + } + + return str2layout(_layout_map.at(name)); +} + float CalibratorParser::get_calibrator(std::string name) const { //if not exist, return 1.0f - if (_node_calibrator_map.find(name) == _node_calibrator_map.end()){ + if (_node_calibrator_map.find(name) == _node_calibrator_map.end()) { return 1.0f; } + return _node_calibrator_map.at(name); } -saber::LayoutType CalibratorParser::get_layout(std::string name0, std::string name1, saber::LayoutType old_layout) const { +saber::LayoutType CalibratorParser::get_layout(std::string name0, std::string name1, + saber::LayoutType old_layout) const { std::string str0 = get_precision(name0); std::string str1 = get_precision(name1); bool bint8 = (str0 == "int8") && (str1 == "int8"); - if (!bint8){ + + if (!bint8) { return old_layout; } else { return saber::Layout_NCHW_C4; } - + +} + +saber::LayoutType CalibratorParser::get_layout(std::string name0, std::string name1, + saber::LayoutType old_layout, std::string target_type, + std::string bottom_op_name, std::string top_op_name, graph::NodePtr bottom_node) const { + 
static std::unordered_set conv_name_set = {"ConvBatchnormScaleRelu", "ConvRelu", "ConvEltwise", "Convolution"}; + + if (target_type == "x86") { +#if defined(USE_X86_PLACE) + std::string str0 = get_precision(name0); + std::string str1 = get_precision(name1); + bool bottom_8bit = (str0 == "int8" || str0 == "uint8"); + bool top_8bit = (str1 == "int8" || str1 == "uint8"); + bool bint8 = bottom_8bit && top_8bit; + + if (top_8bit && (conv_name_set.count(bottom_op_name) > 0)) { + + using pblock_type = PBlock; + auto conv_weights = bottom_node->template get_attr("weight_1"); + auto group = bottom_node->template get_attr("group"); + bool is_inchannel_1_or_3 = conv_weights.shape().channel() == 1 + || conv_weights.shape().channel() == 3; + + if (is_inchannel_1_or_3 && group == 1) { + bint8 = true; + } + + } + + if (bottom_8bit && bottom_op_name == "Pooling"){ + bint8 = true; + } + + LOG(INFO) << "get get_layout " << str0 << "," << str1 << " old layout " << old_layout << ",bint8 " + << bint8; + + if (!bint8) { + return old_layout; + } else { + return saber::Layout_NHWC; + } +#endif + } else { + LOG(FATAL) << "not support target type " << target_type; + } + + return old_layout; + +} + +void CalibratorParser::set_precision(std::string name, saber::DataType type) { + std::string str = "fp32"; + + switch (type) { + case AK_FLOAT: + str = "fp32"; + break; + + case AK_INT8: + str = "int8"; + break; + + case AK_UINT8: + str = "uint8"; + break; + + default: + break; + } + + _node_precision_map[name] = str; } - +void CalibratorParser::set_precision(std::string name, std::string type) { + _node_precision_map[name] = type; +} +void CalibratorParser::set_scale(std::string name, float scale) { + _node_calibrator_map[name] = scale; +} +void CalibratorParser::set_layout(std::string name, saber::LayoutType layout) { + _layout_map[name] = layout2str(layout); +} +void CalibratorParser::set_layout(std::string name, std::string layout_name) { + _layout_map[name] = layout_name; +} +#ifndef USE_SGX void CalibratorParser::auto_config(const std::vector& exe_nodes, - const std::vector& op_names, std::string dst){ - /* + const std::vector& op_names, std::string dst, + std::string precision, std::string target) { std::fstream fs; fs.open(dst, std::ios::in); - if (fs){ + + if (fs) { fs.close(); LOG(WARNING) << "config file already existed, will not be created "; return; } + LOG(WARNING) << "config file not existed, creating it "; - */ - LOG(WARNING) << "creating config file"; std::ofstream ofs(dst); - if (!ofs.is_open()) - { + + if (!ofs.is_open()) { LOG(FATAL) << "open file " << dst << "failed"; } - for (int i=0; i& names, + const std::vector& layouts, std::string dst) { + std::fstream fs; + fs.open(dst, std::ios::in); + + if (fs) { + fs.close(); + LOG(WARNING) << "config file already existed, will not be created "; + return; + } -void CalibratorParser::parse_from_file(std::string config, std::string calibrator) -{ + LOG(WARNING) << "config file not existed, creating it "; + std::ofstream ofs(dst); + + if (!ofs.is_open()) { + LOG(FATAL) << "open file " << dst << "failed"; + } + + for (int i = 0; i < names.size(); ++i) { + std::string name = names[i]; + + if (!name.empty()) { + std::string layout = layout2str(layouts[i]); + ofs << name << " " << layout << " \n"; + } + } + + ofs.close(); +} + +void CalibratorParser::parse_from_file(std::string config, std::string calibrator) { _config_parse(config); _calibrator_parse(calibrator); } - -void CalibratorParser::_config_parse(std::string config){ + +void 
CalibratorParser::_config_parse(std::string config) { std::ifstream ifs(config); - if (!ifs.is_open()) - { + + if (!ifs.is_open()) { LOG(ERROR) << "open file " << config << " failed, will use default config"; return; } + std::string line; - while (ifs.good()){ + + while (ifs.good()) { std::getline(ifs, line); - if (!line.empty()){ + + if (!line.empty()) { auto str_vec = _line_config_parse(line); std::string node_name; - if (str_vec.size()>=1){ + + // LOG(INFO)<<"read config "<= 1) { node_name = str_vec[0]; - node_name.erase(node_name.find("(")); + node_name.erase(node_name.find_last_of("(")); } - if (str_vec.size() >= 3){ + + if (str_vec.size() >= 3) { _node_target_map[node_name] = str_vec[2]; } - if (str_vec.size() >= 2){ + + if (str_vec.size() >= 2) { _node_precision_map[node_name] = str_vec[1]; + // LOG(INFO)<<"parser _node_precision_map "<<_node_precision_map[node_name]; } + + LOG(INFO) << "parse " << line << ", get " << node_name << ",size = " << str_vec.size(); } } + ifs.close(); } -void CalibratorParser::_calibrator_parse(std::string calibrator){ + +void CalibratorParser::_calibrator_parse(std::string calibrator) { std::ifstream ifs(calibrator); - if (!ifs.is_open()) - { + + if (!ifs.is_open()) { LOG(WARNING) << "open file " << calibrator << "failed!, will use default calibrator"; return; } + std::string line; - while (ifs.good()){ + + while (ifs.good()) { std::getline(ifs, line); - if (!line.empty()){ + + if (!line.empty()) { _line_calibrator_parse(line); } } + ifs.close(); } +#ifdef BUILD_LITE +std::string convert2underline(std::string& name) { + char* target = strdup(name.c_str()); + + for (char* p = target; *p != '\0'; ++p) { + if (*p == '-') { + *p = '_'; + } else if (*p == '/') { + *p = '_'; + } + } -std::vector CalibratorParser::_line_config_parse(std::string line){ - line.erase(line.find_last_not_of("\n")+1); - line.erase(line.find_last_not_of(" ")+1); + std::string str_tmp = target; + free(target); + return str_tmp; +}; +#endif +std::vector CalibratorParser::_line_config_parse(std::string line) { + line.erase(line.find_last_not_of("\n") + 1); + line.erase(line.find_last_not_of(" ") + 1); std::istringstream iss(line); std::string temp; std::vector str_vec; - while (iss.good()){ + + while (iss.good()) { iss >> temp; str_vec.push_back(temp); } + +#ifdef BUILD_LITE + str_vec[0] = convert2underline(str_vec[0]); +#endif return str_vec; } -void CalibratorParser::_line_calibrator_parse(std::string line){ - line.erase(line.find_last_not_of("\n")+1); - line.erase(line.find_last_not_of(" ")+1); +void CalibratorParser::_line_calibrator_parse(std::string line) { + line.erase(line.find_last_not_of("\n") + 1); + line.erase(line.find_last_not_of(" ") + 1); std::istringstream iss(line); std::string name; float value = 1.0f; - if (iss.good()){ + + if (iss.good()) { iss >> name; } + try { - if (iss.good()){ + if (iss.good()) { iss.precision(7); iss >> value; } } catch (std::exception& e) { LOG(FATAL) << "calibrator load wrong!! 
line:" << line; - } + } + +#ifdef BUILD_LITE + name = convert2underline(name); +#endif _node_calibrator_map[name] = value; } - + +void CalibratorParser::layout_parse(std::string layout) { + std::ifstream ifs(layout); + + if (!ifs.is_open()) { + LOG(WARNING) << "open file " << layout << " failed!, will use default calibrator"; + return; + } else { + LOG(INFO) << "open file layout config success " << layout; + } + + std::string line; + + while (ifs.good()) { + std::getline(ifs, line); + + if (!line.empty()) { + _line_layout_parse(line); + } + } + + ifs.close(); +} +void CalibratorParser::_line_layout_parse(std::string line) { + line.erase(line.find_last_not_of("\n") + 1); + line.erase(line.find_last_not_of(" ") + 1); + std::istringstream iss(line); + std::string temp; + std::vector str_vec; + + while (iss.good()) { + iss >> temp; + str_vec.push_back(temp); + } + + if (str_vec.size() >= 2) { + _layout_map[str_vec[0]] = str_vec[1]; + } +} +#endif // USE_SGX + +void CalibratorParser::clear_data() { + _node_precision_map.clear(); + _node_calibrator_map.clear(); + _node_target_map.clear(); + _layout_map.clear(); } + + +} + diff --git a/framework/core/net/calibrator_parse.h b/framework/core/net/calibrator_parse.h index 3bb018c07..31e550f56 100644 --- a/framework/core/net/calibrator_parse.h +++ b/framework/core/net/calibrator_parse.h @@ -5,47 +5,77 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef FRAMEWORK_CORE_NET_CALIBRATOR_PARSE_H #define FRAMEWORK_CORE_NET_CALIBRATOR_PARSE_H +#include "anakin_config.h" #include #include #include -#include #include "utils/logger/logger.h" #include "framework/core/types.h" #include "saber/saber_types.h" #include "framework/graph/graph.h" -namespace anakin{ -class CalibratorParser{ +namespace anakin { +class CalibratorParser { public: - CalibratorParser() = default; - ~CalibratorParser() = default; - void parse_from_file(std::string config, std::string calibrator); - static void auto_config(const std::vector& exe_nodes, const std::vector& op_names ,std::string dst); - std::string get_precision(std::string name) const; - saber::DataType get_dtype(std::string name0, std::string name1) const; - std::string get_target(std::string name) const; - saber::LayoutType get_layout(std::string name0, std::string name1, saber::LayoutType old_layout) const; - float get_calibrator(std::string edge_name) const; + CalibratorParser() = default; + ~CalibratorParser() = default; + void clear_data(); + +#ifndef USE_SGX + void parse_from_file(std::string config, std::string calibrator); + static void auto_config(const std::vector& exe_nodes, + const std::vector& op_names, std::string dst, + std::string precision, std::string target); +#endif + std::string get_precision(std::string name) const; + + saber::DataType get_dtype_of_precision(std::string name) const; + saber::DataType get_dtype(std::string name0, std::string name1) const; + saber::DataType get_dtype(std::string name0, std::string name1, std::string bottom_op_type, + std::string top_op_type, std::string dev_name, graph::NodePtr bottom_node) const; + void set_precision(std::string name, saber::DataType); + void set_precision(std::string name, std::string type); + void set_scale(std::string name, float scale); + void set_layout(std::string name, saber::LayoutType layout); + void set_layout(std::string name, std::string layout_name); + std::string get_target(std::string name) const; + saber::LayoutType get_layout(std::string name0, std::string name1, + saber::LayoutType old_layout) const; + saber::LayoutType get_layout(std::string name0, std::string name1, saber::LayoutType old_layout, + std::string target_type, std::string bottom_op_name, std::string top_op_name, + graph::NodePtr bottom_node) const; + float get_calibrator(std::string edge_name) const; + + saber::LayoutType get_layout(std::string name) const; +#ifndef USE_SGX + void layout_parse(std::string); + static void auto_config_layout(const std::vector& tensor_names, + const std::vector& layouts, std::string dst); +#endif private: - std::unordered_map _node_precision_map; - std::unordered_map _node_target_map; - std::unordered_map _node_calibrator_map; + std::unordered_map _node_precision_map; + std::unordered_map _node_target_map; + std::unordered_map _node_calibrator_map; + std::unordered_map _layout_map; private: - void _config_parse(std::string); - void _calibrator_parse(std::string); - std::vector _line_config_parse(std::string); - void _line_calibrator_parse(std::string); +#ifndef USE_SGX + void _config_parse(std::string); + void _calibrator_parse(std::string); + std::vector _line_config_parse(std::string); + void _line_calibrator_parse(std::string); + void _line_layout_parse(std::string); +#endif }; } diff --git a/framework/core/net/entropy_calibrator.cpp b/framework/core/net/entropy_calibrator.cpp index 24b01673a..513552b90 100644 --- a/framework/core/net/entropy_calibrator.cpp +++ b/framework/core/net/entropy_calibrator.cpp @@ -15,6 
+15,9 @@ */ #include "framework/core/net/entropy_calibrator.h" + +#ifndef USE_SGX + #include "framework/utils/data_common.h" #include namespace anakin { @@ -167,7 +170,7 @@ void EntropyCalibrator::write_calibrator() { char buf[200]; typename std::map::iterator it; for (it = _scale_map.begin(); it != _scale_map.end(); ++it) { - int n = sprintf(buf, "%s %f\n", it->first.c_str(), float(it->second)); + int n = snprintf(buf, sizeof(buf), "%s %f\n", it->first.c_str(), float(it->second)); ofs.write(buf, n); } ofs.close(); @@ -356,7 +359,10 @@ void EntropyCalibrator::generate_calibrator_table() { init_statistics(tensor_num); auto exec_funcs = this->get_exec_funcs(); std::vector > in_vec = this->get_in_vec(); - get_max_values(in_vec, exec_funcs); + get_max_values(in_vec, exec_funcs); + for (auto i :_max_vec){ + LOG(INFO)<<"max vec "<; } +#endif // USE_SGX diff --git a/framework/core/net/entropy_calibrator.h b/framework/core/net/entropy_calibrator.h index 4fd343f1b..01495f823 100644 --- a/framework/core/net/entropy_calibrator.h +++ b/framework/core/net/entropy_calibrator.h @@ -17,6 +17,10 @@ #ifndef ANAKIN_ENTROPY_CALIBRATOR_H #define ANAKIN_ENTROPY_CALIBRATOR_H +#include "anakin_config.h" + +#ifndef USE_SGX + #include "framework/core/net/calibrator.h" namespace anakin { @@ -95,4 +99,6 @@ class EntropyCalibrator: public Calibrator { int _bin_num; }; } +#endif // USE_SGX + #endif diff --git a/framework/core/net/net.cpp b/framework/core/net/net.cpp index 49c3ab5d4..9d316d55a 100644 --- a/framework/core/net/net.cpp +++ b/framework/core/net/net.cpp @@ -1,7 +1,10 @@ #include "framework/core/net/net.h" -#include "saber/funcs/timer.h" #include "saber/funcs/debug.h" #include "framework/core/mem_info.h" +#include "framework/core/net/auto_layout_config.h" +#ifdef ENABLE_OP_TIMER +#include "saber/funcs/timer.h" +#endif namespace anakin { @@ -28,39 +31,109 @@ Net::Net(graph::Graph& graph, bool need_sum template Net::Net(\ - graph::Graph& graph, OpContextPtr ctx, bool need_summary) { + graph::Graph& graph, OpContextPtr ctx, bool need_summary) { _graph_p = new graph::Graph(); _need_summary = need_summary; //init_env(graph); init(graph, ctx); } + +#ifndef USE_SGX +template +void Net:: +load_calibrator_config(graph::Graph& graph, bool load_layout_from_graph, + bool auto_layout_config) { + //clear calibrator info + //load node precision + auto load_node_precision = [&, this](graph::NodePtr & node_p) { + auto type = node_p -> bit_type(); + _calibrator_parser.set_precision(node_p -> name(), type); + }; + graph.Scanner -> BFS(load_node_precision); + //load edge scale + auto load_edge_scale = [&, this](graph::Edge& edge) { + if (edge.scale().size() > 0) { + float scale = edge.scale()[0]; + _calibrator_parser.set_scale(edge.name(), scale); + } + }; + graph.Scanner -> BFS_Edge(load_edge_scale); + + if (load_layout_from_graph) { + //load edge layout + auto load_edge_layout = [&, this](graph::Edge& edge) { + auto layout = edge.layout(); + _calibrator_parser.set_layout(edge.name(), layout); + }; + graph.Scanner->BFS_Edge(load_edge_layout); + } + + if (auto_layout_config && std::is_same::value) { + bool is_all_nchw = true; + auto search_layout = [&, this](graph::Edge& edge) { + is_all_nchw = is_all_nchw && (_calibrator_parser.get_layout(edge.name()) == Layout_NCHW); + }; + graph.Scanner->BFS_Edge(search_layout); + bool is_edge_scale = false; + auto search_scale = [&, this](graph::Edge& edge) { + if (edge.scale().size() > 0 && edge.scale()[0] != 1.f) { + is_edge_scale = true; + } + }; + graph.Scanner->BFS_Edge(search_scale); + 
LOG(INFO) << "is_edge_scale " << is_edge_scale; + + if (is_edge_scale) { + AutoLayoutConfigHelper helper; + auto layout_map = helper.auto_config_node_dtype(graph); + + for (auto k : layout_map) { + LOG(INFO)<<"deduce "< helper; + helper.scane_dfs_from_input(graph); + helper.print_layout(); + + if (helper.check_merge(graph)) { + auto configed_layout = helper.get_config_layout(); + auto set_edge_layout = [&, this](graph::Edge& edge) { + auto layout = configed_layout[edge.name()]; + DLOG(ERROR) << edge.name() << " loaded layout: " << layout; + CHECK(layout != ""); + _calibrator_parser.set_layout(edge.name(), layout); + }; + + graph.Scanner->BFS_Edge(set_edge_layout); + } else { + LOG(ERROR) << "auto layout config cancel"; + } + + } + } +} +#endif + template void Net::init(graph::Graph& graph, \ - OpContextPtr ctx) { + OpContextPtr ctx, bool auto_config_layout) { init_env(graph); // shallow copy _graph_p->CopyFrom(graph); auto node_names_in_exec_order = graph.get_nodes_in_order(); - //**generate net_pt_config.txt - std::vector op_names; - for (auto& node_name : node_names_in_exec_order) { - auto node_ptr = (*_graph_p)[node_name]; - op_names.push_back(node_ptr->get_op_name()); - } - //autogen config file - _calibrator_parser.auto_config(node_names_in_exec_order, op_names, "net_pt_config.txt"); - //_calibrator_parser.parse_from_file("net_config.txt", "cal_file"); + +#ifndef USE_SGX + load_calibrator_config(graph,!_has_loaded_layout_from_file,auto_config_layout); +#endif // infer basic shape and parsing parameter from graph for (auto& node_name : node_names_in_exec_order) { auto node_ptr = (*_graph_p)[node_name]; - //LOG(ERROR) << "get node " << node_name << ", op type " << node_ptr->get_op_name(); - /*if (node_ptr->get_op_name() == "Output") { - continue; - }*/ - // create operations //auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; auto* op_pointer = calibrator_op(node_ptr->get_op_name(), node_ptr->name(), _calibrator_parser); @@ -88,6 +161,10 @@ void Net::init(graph::Graph& graph, \ _exec_funcs.resize(node_names_in_exec_order.size()); + + std::vector tensor_names; + std::vector layouts; + for (int i = 0; i < node_names_in_exec_order.size(); i++) { auto& node_name = node_names_in_exec_order[i]; auto& op_func = _exec_funcs[i]; @@ -97,9 +174,6 @@ void Net::init(graph::Graph& graph, \ for (auto& edge_it : edge_in_its) { DLOG(INFO) << " => find in arc : " << edge_it->bottom() << " --> " << edge_it->top(); DLOG(INFO)<<"set "<name()<<" scale :"<< _calibrator_parser.get_calibrator(edge_it->name()); - edge_it->weight()->set_scale({_calibrator_parser.get_calibrator(edge_it->name())});//set calibrator - edge_it->weight()->set_layout(_calibrator_parser.get_layout(edge_it->bottom(), edge_it->top(), edge_it->weight()->get_layout()));//set tensor layout - edge_it->weight()->set_dtype(_calibrator_parser.get_dtype(edge_it->bottom(), edge_it->top()));//set tensor precision op_func.ins.push_back(edge_it->weight().get()); op_func.in_lanes.push_back(edge_it->lane()); _tensor_name_list.push_back(edge_it->name()); @@ -108,6 +182,12 @@ void Net::init(graph::Graph& graph, \ auto& edge_out_its = _graph_p->get_out_arc_its(node_name); for (auto& edge_it : edge_out_its) { DLOG(INFO) << " <= find out arc : " << edge_it->bottom() << " --> " << edge_it->top(); + + tensor_names.push_back(edge_it->name()); + layouts.push_back(edge_it->weight()->get_layout()); +#ifndef USE_SGX + set_calibrator_info(edge_it); +#endif op_func.outs.push_back(edge_it->weight().get()); op_func.out_lanes.push_back(edge_it->lane()); } 
@@ -123,14 +203,13 @@ void Net::init(graph::Graph& graph, \ op_func.op->_helper->InferShape(op_func.ins, op_func.outs); op_func.op->_helper->Init(*(op_func.ctx_p), op_func.ins, op_func.outs); } - // init memory of _graph_p init_memory(); } template -void Net::init(graph::Graph& graph) { +void Net::init(graph::Graph& graph,bool auto_config_layout) { init_env(graph); // shallow copy _graph_p->CopyFrom(graph); @@ -138,20 +217,14 @@ void Net::init(graph::Graph& graph) { double curr_mem_in_mb_start = MemoryInfo::Global().get_used_mem_in_mb(); auto node_names_in_exec_order = graph.get_nodes_in_order(); - //**generate net_pt_config.txt - std::vector op_names; - for (auto& node_name : node_names_in_exec_order) { - auto node_ptr = (*_graph_p)[node_name]; - op_names.push_back(node_ptr->get_op_name()); - } - //load config - _calibrator_parser.auto_config(node_names_in_exec_order, op_names, "net_pt_config.txt"); - //_calibrator_parser.parse_from_file("net_config.txt", "cal_file"); + +#ifndef USE_SGX + load_calibrator_config(graph,!_has_loaded_layout_from_file,auto_config_layout); +#endif // infer basic shape and parsing parameter from graph for (auto& node_name : node_names_in_exec_order) { auto node_ptr = (*_graph_p)[node_name]; - #ifdef ENABLE_OP_TIMER if ((std::string::npos != (node_ptr->get_op_name()).find("Conv") @@ -196,63 +269,14 @@ void Net::init(graph::Graph& graph) { } #endif - - // create operations - -// if (std::is_same::value) { -// if (node_ptr->get_op_name() == "ConvBatchnormScale" || -// node_ptr->get_op_name() == "ConvBatchnormScaleRelu" || node_ptr->get_op_name() == "ConvRelu" || -// node_ptr->get_op_name() == "Convolution") { -// std::string group = "group"; -// auto group_val = node_ptr->template get_attr(group); -// std::string dilation = "dilation_rate"; -// auto dilation_rate_val = node_ptr->template get_attr >(dilation); -// std::string weight_name = "weight_1"; -// auto weights = node_ptr->template get_attr >(weight_name); -// -// int k_w = weights.d_tensor().width(); -// int k_h = weights.d_tensor().height(); -// int dil_h = dilation_rate_val.vector()[0]; -// int dil_w = dilation_rate_val.vector()[1]; - -// if ((group_val == 1) && (k_w == 3 && k_h == 3 && dil_h == 1 && dil_w == 1)) { -// //node_ptr->set_op(OpFactory::Global()["Sass"+node_ptr->get_op_name()]); -// auto* op_pointer = calibrator_op("Sass"+node_ptr->get_op_name(), node_ptr->name(), _calibrator_parser); -// if (op_pointer == nullptr) { -// LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; -// } -// node_ptr->set_op(op_pointer); -// -// node_ptr->get_op_name() = "Sass" + node_ptr->get_op_name(); -// } else { -// LOG(WARNING) << node_ptr->get_op_name() <<" sass not support yet."; -// //auto *op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; -// auto* op_pointer = calibrator_op(node_ptr->get_op_name(), node_ptr->name(), _calibrator_parser); -// if (op_pointer == nullptr) { -// LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; -// } -// node_ptr->set_op(op_pointer); -// } -// } else { -// //auto *op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; -// auto* op_pointer = calibrator_op(node_ptr->get_op_name(), node_ptr->name(), _calibrator_parser); -// if (op_pointer == nullptr) { -// LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; -// } -// node_ptr->set_op(op_pointer); -// } -// } else { - //auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; - auto* op_pointer = calibrator_op(node_ptr->get_op_name(), 
node_ptr->name(), _calibrator_parser); - - if (op_pointer == nullptr) { - CHECK(false) << node_name << ", type " << node_ptr->get_op_name() << " is null"; - LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; - } - - node_ptr->set_op(op_pointer); -// } - + //* create operations with target the same as this net + //auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + auto* op_pointer = calibrator_op(node_ptr->get_op_name(), node_ptr->name(), _calibrator_parser); + if (op_pointer == nullptr) { + CHECK(false) << node_name << ", type " << node_ptr->get_op_name() << " is null"; + LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; + } + node_ptr->set_op(op_pointer); // bind parameter structure static_cast*>(node_ptr->Op())->_helper->BindParam(node_ptr); // parsing parameter @@ -270,6 +294,11 @@ void Net::init(graph::Graph& graph) { _exec_funcs.resize(node_names_in_exec_order.size()); + + std::vector tensor_names; + std::vector layouts; + + //_calibrator_parser.layout_parse(_layout_config_path); for (int i = 0; i < node_names_in_exec_order.size(); i++) { auto& node_name = node_names_in_exec_order[i]; auto& op_func = _exec_funcs[i]; @@ -280,6 +309,7 @@ void Net::init(graph::Graph& graph) { for (auto& edge_it : edge_in_its) { DLOG(INFO) << " => find in arc : " << edge_it->bottom() << " --> " << edge_it->top(); DLOG(INFO)<<"set "<name()<<" scale :"<< _calibrator_parser.get_calibrator(edge_it->name()); + op_func.ins.push_back(edge_it->weight().get()); op_func.in_lanes.push_back(edge_it->lane()); } @@ -288,6 +318,12 @@ void Net::init(graph::Graph& graph) { for (auto& edge_it : edge_out_its) { DLOG(INFO) << " <= find out arc : " << edge_it->bottom() << " --> " << edge_it->top(); + + tensor_names.push_back(edge_it->name()); + layouts.push_back(edge_it->weight()->get_layout()); +#ifndef USE_SGX + set_calibrator_info(edge_it); +#endif op_func.outs.push_back(edge_it->weight().get()); op_func.out_lanes.push_back(edge_it->lane()); _tensor_name_list.push_back(edge_it->name()); @@ -307,13 +343,13 @@ void Net::init(graph::Graph& graph) { #ifdef ENABLE_DEBUG for (auto& in : op_func.ins) { - LOG(INFO) << " => [layout]: " << in->get_layout(); + LOG(INFO) << " => [dtype]: " << in->get_dtype(); LOG(INFO) << " => [shape]: " << in->valid_shape(); LOG(INFO) << "in offset size = " << in->get_seq_offset().size(); } for (auto& out : op_func.outs) { - LOG(INFO) << " <= [layout]: " << out->get_layout(); + LOG(INFO) << " <= [dtype]: " << out->get_dtype(); LOG(INFO) << " <= [shape]: " << out->valid_shape(); LOG(INFO) << "out offset size = " << out->get_seq_offset().size(); } @@ -326,35 +362,21 @@ void Net::init(graph::Graph& graph) { } double curr_mem_in_mb_end = MemoryInfo::Global().get_used_mem_in_mb(); - this->_graph_p->statistics.template set_info(curr_mem_in_mb_end - - curr_mem_in_mb_start); + this->_graph_p->statistics.template set_info(curr_mem_in_mb_end - curr_mem_in_mb_start); // init memory of _graph_p init_memory(); graph.statistics = _graph_p->statistics; // copy statistic back LOG(INFO) << "Temp mem used: " << this->_graph_p->statistics.template - get_info() << " MB"; + get_info() << " MB"; LOG(INFO) << "Original mem used: " << this->_graph_p->statistics.template - get_info() << " MB"; + get_info() << " MB"; LOG(INFO) << "Model mem used: " << this->_graph_p->statistics.template - get_info() << " MB"; + get_info() << " MB"; LOG(INFO) << "System mem used: " << this->_graph_p->statistics.template - get_info() << " MB"; + get_info() << " MB"; - // 
set new precision/layout/scale for edge of graph - for (int i = 0; i < node_names_in_exec_order.size(); i++) { - auto& node_name = node_names_in_exec_order[i]; - auto& edge_in_its = _graph_p->get_in_arc_its(node_name); - for (auto& edge_it : edge_in_its) { - edge_it->weight()->set_dtype(_calibrator_parser.get_dtype(edge_it->bottom(), - edge_it->top()));//set tensor dtype - edge_it->weight()->set_layout(_calibrator_parser.get_layout(edge_it->bottom(), - edge_it->top(), - edge_it->weight()->get_layout()));//set tensor layout - edge_it->weight()->set_scale({_calibrator_parser.get_calibrator(edge_it->name())});//set tensor calibrator - } - } #ifdef ENABLE_OP_TIMER _op_time = std::vector(_exec_funcs.size(), 0.0f); @@ -393,7 +415,9 @@ void Net::prediction() { #ifdef ENABLE_OP_TIMER int op_id = 0; #endif - +#ifdef ENABLE_DEBUG + int op_cnt = 0; +#endif for (auto& executer : _exec_funcs) { if (RunType == OpRunType::SYNC || executer.need_sync || executer.op_name == "Output") { for (int i = 0; i < executer.ins.size(); i++) { @@ -403,10 +427,10 @@ void Net::prediction() { } #ifdef ENABLE_DEBUG - LOG(WARNING) << " executer: " << executer.name << " (" << executer.op_name << ") "; + LOG(WARNING) << "[Num: "<< op_cnt++ << "] executer: " << executer.name << " (" << executer.op_name << ") "; for (auto in : executer.ins) { - LOG(INFO) << " \\ in shape (" << in->valid_shape() << ")" + LOG(INFO) << " \\ in shape (" << in->valid_shape() << ")"<<",data type "<get_dtype()<<" , " << " valid_size: " << in->valid_size() << " realsize: " << in->size() << " offset_size " << in->get_seq_offset().size(); @@ -414,7 +438,6 @@ void Net::prediction() { #endif - #ifdef ENABLE_OP_TIMER Context ctx(0, 0, 0); saber::SaberTimer my_time; @@ -432,30 +455,35 @@ void Net::prediction() { #ifdef ENABLE_DEBUG #ifdef USE_CUDA - CUDA_CHECK(cudaDeviceSynchronize()); + if (std::is_same::value) { + CUDA_CHECK(cudaDeviceSynchronize()); + } #endif for (auto out : executer.outs) { if (executer.name=="detection_out"){ print_tensor(*out); LOG(INFO)<<"==============================="; } - LOG(INFO) << " \\ out shape (" << out->valid_shape() << ") " + LOG(INFO) << " \\ out shape (" << out->valid_shape() << ") "<<",data type "<get_dtype()<<" , " << "executer name:"<< executer.name << " avg: " << tensor_mean_value_valid(*out); } - -#ifdef NVIDIA_GPU - CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaPeekAtLastError()); +#ifdef USE_CUDA + if (std::is_same::value) { + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaPeekAtLastError()); + } #endif -//#ifdef RECORD_TENSOR_IN_NET +#ifndef USE_SGX +#if defined(RECORD_TENSOR_IN_NET) for (int i = 0; i < executer.ins.size(); i++) { record_tensor_in_format(*executer.ins[i], executer.op_name,executer.name,false,i); } for (int i = 0; i < executer.outs.size(); i++) { record_tensor_in_format(*executer.outs[i], executer.op_name,executer.name,true,i); } -//#endif +#endif +#endif #endif @@ -474,10 +502,136 @@ void Net::prediction() { #endif } // for +} + + +template +std::unique_ptr > Net::Clone() { + auto ret_net = std::unique_ptr >(new Net); + ret_net->_graph_p->CopyFrom(*(this->_graph_p)); + return ret_net; +} + +template +void Net::init() { + init_env(*_graph_p); + + double curr_mem_in_mb_start = MemoryInfo::Global().get_used_mem_in_mb(); + + auto node_names_in_exec_order = _graph_p->get_nodes_in_order(); + + load_calibrator_config(*_graph_p,!_has_loaded_layout_from_file); + + // infer basic shape and parsing parameter from graph + for (auto& node_name : node_names_in_exec_order) { + auto node_ptr = 
(*_graph_p)[node_name]; + + //* create operations with target the same as this net + //auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; + auto* op_pointer = calibrator_op(node_ptr->get_op_name(), node_ptr->name(), _calibrator_parser); + if (op_pointer == nullptr) { + CHECK(false) << node_name << ", type " << node_ptr->get_op_name() << " is null"; + LOG(FATAL) << node_name << ", type " << node_ptr->get_op_name() << " is null"; + } + node_ptr->set_op(op_pointer); + // bind parameter structure + static_cast*>(node_ptr->Op())->_helper->BindParam(node_ptr); + // parsing parameter + static_cast*>(node_ptr->Op())->_helper->InitParam(); + } + + // remove null op node + for (auto it = node_names_in_exec_order.begin(); it != node_names_in_exec_order.end();) { + if (!(*_graph_p)[*it]->Op()) { + it = node_names_in_exec_order.erase(it); + } else { + ++it; + } + } + _exec_funcs.resize(node_names_in_exec_order.size()); + + + std::vector tensor_names; + std::vector layouts; + + //_calibrator_parser.layout_parse(_layout_config_path); + for (int i = 0; i < node_names_in_exec_order.size(); i++) { + auto& node_name = node_names_in_exec_order[i]; + auto& op_func = _exec_funcs[i]; + op_func.name = node_name; + auto& edge_in_its = _graph_p->get_in_arc_its(node_name); + DLOG(WARNING) << " node : " << op_func.name << " (" << (*_graph_p)[node_name]->get_op_name() << ") "; + + for (auto& edge_it : edge_in_its) { + DLOG(INFO) << " => find in arc : " << edge_it->bottom() << " --> " << edge_it->top(); + DLOG(INFO)<<"set "<name()<<" scale :"<< _calibrator_parser.get_calibrator(edge_it->name()); + + op_func.ins.push_back(edge_it->weight().get()); + op_func.in_lanes.push_back(edge_it->lane()); + } + + auto& edge_out_its = _graph_p->get_out_arc_its(node_name); + + for (auto& edge_it : edge_out_its) { + DLOG(INFO) << " <= find out arc : " << edge_it->bottom() << " --> " << edge_it->top(); + + tensor_names.push_back(edge_it->name()); + layouts.push_back(edge_it->weight()->get_layout()); + + set_calibrator_info(edge_it); + + op_func.outs.push_back(edge_it->weight().get()); + op_func.out_lanes.push_back(edge_it->lane()); + _tensor_name_list.push_back(edge_it->name()); + } + + op_func.current_lane = (*_graph_p)[node_name]->lane(); + op_func.need_sync = (*_graph_p)[node_name]->need_wait(); + op_func.op = static_cast* >((*_graph_p)[node_name]->Op()); + op_func.op_name = (*_graph_p)[node_name]->get_op_name(); + op_func.ctx_p = std::make_shared>(TargetWrapper::get_device_id(), + op_func.current_lane, + op_func.current_lane); + // call init of operator + CHECK_NOTNULL(op_func.op) << "Node(node_name) doesn't have op pointer! 
"; + op_func.op->_helper->InferShape(op_func.ins, op_func.outs); +#ifdef ENABLE_DEBUG + + for (auto& in : op_func.ins) { + LOG(INFO) << " => [layout]: " << in->get_layout(); + LOG(INFO) << " => [shape]: " << in->valid_shape(); + LOG(INFO) << "in offset size = " << in->get_seq_offset().size(); + } + + for (auto& out : op_func.outs) { + LOG(INFO) << " <= [layout]: " << out->get_layout(); + LOG(INFO) << " <= [shape]: " << out->valid_shape(); + LOG(INFO) << "out offset size = " << out->get_seq_offset().size(); + } + +#endif + op_func.op->_helper->Init(*(op_func.ctx_p), op_func.ins, op_func.outs); + } + + double curr_mem_in_mb_end = MemoryInfo::Global().get_used_mem_in_mb(); + this->_graph_p->statistics.template set_info(curr_mem_in_mb_end - curr_mem_in_mb_start); + // init memory of _graph_p + init_memory(); + + LOG(INFO) << "Temp mem used: " << this->_graph_p->statistics.template + get_info() << " MB"; + LOG(INFO) << "Original mem used: " << this->_graph_p->statistics.template + get_info() << " MB"; + LOG(INFO) << "Model mem used: " << this->_graph_p->statistics.template + get_info() << " MB"; + LOG(INFO) << "System mem used: " << this->_graph_p->statistics.template + get_info() << " MB"; } + + template void Net::execute_stop_at_node(std::string node_name) { if (_suspended_point == -1) { @@ -615,17 +769,39 @@ std::vector > Net::get_in_list() { template Tensor4dPtr Net::get_tensor_from_edge(const char* from, - const char* to) { + const char* to) { return _graph_p->get_arc(std::string(from), std::string(to)).weight().get(); } +template +Status Net::alloc_memory_first(graph::Graph& graph) { + _graph_p->CopyFrom(graph); + auto alloc_memory = [this](graph::Edge& edge) { + auto& tensor_p = edge.weight(); + + if (!edge.shared()) { + if(tensor_p->mutable_data() == nullptr) { + anakin::saber::Shape tmp_shape({1, 1 , 1, 1}); + tensor_p->re_alloc(tmp_shape, saber::AK_FLOAT); + return Status::EXIT(); + } + } + + return Status::OK(); + }; + _graph_p->Scanner->BFS_Edge(alloc_memory); + return Status::OK(); +} + template Status Net::init_memory() { auto alloc_memory = [this](graph::Edge& edge) { auto& tensor_p = edge.weight(); if (!edge.shared()) { - tensor_p->re_alloc(tensor_p->shape(), tensor_p->get_dtype()); + if(tensor_p->mutable_data() == nullptr) { + tensor_p->re_alloc(tensor_p->shape(), tensor_p->get_dtype()); + } } return 0; @@ -644,12 +820,23 @@ Status Net::init_memory() { edge_name = inner_edge.share_from(); return Status::EXIT(" Continue to find next."); } - - if (inner_edge.weight()->size() < edge.weight()->valid_size()) { - auto inner_original_shape = inner_edge.weight()->valid_shape(); - inner_edge.weight()->re_alloc(edge.weight()->valid_shape(), - edge.weight()->get_dtype()); - inner_edge.weight()->set_shape(inner_original_shape, inner_edge.weight()->shape()); + if ((inner_edge.weight()->size() * inner_edge.weight()->get_buf_dtype_size() + < edge.weight()->valid_size() * edge.weight()->get_dtype_size()) || + (inner_edge.weight()->capacity() < edge.weight()->valid_size() * edge.weight()->get_dtype_size())) { + if(inner_edge.weight()->size() * inner_edge.weight()->get_buf_dtype_size() > + edge.weight()->valid_size() * edge.weight()->get_dtype_size()) { + // this will be invoked when use API(alloc_memory_first) + inner_edge.weight()->re_alloc(inner_edge.weight()->valid_shape(), + inner_edge.weight()->get_dtype()); + } else { + // normal mode + auto inner_original_shape = inner_edge.weight()->valid_shape(); + auto inner_edge_dtype = inner_edge.weight()->get_dtype(); + 
inner_edge.weight()->re_alloc(edge.weight()->valid_shape(), + edge.weight()->get_dtype()); + inner_edge.weight()->set_dtype(inner_edge_dtype); + inner_edge.weight()->set_shape(inner_original_shape, inner_edge.weight()->shape()); + } } edge.weight()->share_from(*(inner_edge.weight())); @@ -673,12 +860,12 @@ Status Net::init_memory() { if (!edge.shared()) { temp_mem_in_mbytes += (tensor_p->size() * tensor_p->get_dtype_size()); - DLOG(WARNING) << "Edge("<< edge.bottom() << " ==> " - << edge.top() << ") shape(" + DLOG(WARNING) << "Edge("<< edge.bottom() << " ==> " + << edge.top() << ") shape(" << tensor_p->shape()[0] <<", " << tensor_p->shape()[1] <<", " << tensor_p->shape()[2] <<", " - << tensor_p->shape()[3] <<") . size: " + << tensor_p->shape()[3] <<") . size: " << tensor_p->size() * tensor_p->get_dtype_size() / 1024.0 / 1024.0 << " MB"; } @@ -733,21 +920,13 @@ template class Net; #endif #ifdef USE_ARM_PLACE -#ifdef ANAKIN_TYPE_FP32 template class Net; -template class Net; -#endif - -#ifdef ANAKIN_TYPE_FP16 template class Net; -template class Net; -#endif - -#ifdef ANAKIN_TYPE_INT8 template class Net; -template class Net; -#endif //int8 +template class Net; +template class Net; +template class Net; #endif //arm } /* namespace anakin */ diff --git a/framework/core/net/net.h b/framework/core/net/net.h index f85d030da..2ea980aea 100644 --- a/framework/core/net/net.h +++ b/framework/core/net/net.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_NET_H @@ -19,14 +19,17 @@ #include "framework/graph/graph.h" #include "framework/core/net/operator_func.h" #include "framework/core/net/calibrator_factory.h" +#include "framework/utils/csv.h" #include "saber/core/tensor_op.h" - namespace anakin { + +#ifndef USE_SGX template class Calibrator; +#endif -/** +/** * \brief Net class used for execution of graph and it is thread safety. */ template @@ -34,8 +37,8 @@ class Net { public: explicit Net(bool need_summary = false); - /** - * \brief Construct a net by graph. + /** + * \brief Construct a net by graph. * This construction should be use in thread call and make sure thread safety. */ explicit Net(graph::Graph&, bool need_summary = false); @@ -53,43 +56,56 @@ class Net { * \brief init execute net from graph, init with specified context. * you can use Net(Graph&) instead. */ - void init(graph::Graph& graph, OpContextPtr ctx); + void init(graph::Graph& graph, OpContextPtr ctx, + bool auto_config_layout = false); /** * \brief init execute net from graph. * you can use Net(Graph&) instead. */ - void init(graph::Graph&); - - /** - * \brief do inference. + void init(graph::Graph&, bool auto_config_layout = false); + + /** + * \brief init execute net. + * this api assumes that the net have cloned graph inside + */ + void init(); + + + /** + * \brief do inference. */ void prediction(); - /** - * \brief Running model from inputs to target edge - * - * We support some api for partly running mode. 
- * For example, you can execute part of the model by using api - * execute_stop_at_edge(node name), then anakin will run the model - * in order from input to the node(its computation is not invoked) - * and other computation is suspended. Beside, anakin supply an api - * running from target node throughtout end of model. - * NOTE: - * Those api should be carefully used, if you want to get edge - * tensors after target node you stop at, you need to register - * the edges at graph optimizing stage at first. - */ - void execute_stop_at_node(std::string node_name); - - /** - * \brief running from edge to end - */ - void execute_start_from_node(std::string node_name); - /** - * \brief generate calibration + /** + * \brief clone new execute net engine + */ + std::unique_ptr > Clone(); + + /** + * \brief Running model from inputs to target edge + * + * We support some api for partly running mode. + * For example, you can execute part of the model by using api + * execute_stop_at_edge(node name), then anakin will run the model + * in order from input to the node(its computation is not invoked) + * and other computation is suspended. Beside, anakin supply an api + * running from target node throughtout end of model. + * NOTE: + * Those api should be carefully used, if you want to get edge + * tensors after target node you stop at, you need to register + * the edges at graph optimizing stage at first. + */ + void execute_stop_at_node(std::string node_name); + + /** + * \brief running from edge to end + */ + void execute_start_from_node(std::string node_name); + /** + * \brief generate calibration */ - void generate_calibrator_table(); + void generate_calibrator_table(); /** * \brief load calibrator table; */ @@ -97,15 +113,77 @@ class Net { //! get time for each op; #ifdef ENABLE_OP_TIMER - void print_and_reset_optime_summary(int epoch){ - for (int i =0;i<_op_param.size();i++){ - LOG(INFO)<<"[SUMMARY OP TIMER] name = "<<_exec_funcs[i].name << " param "<< _op_param[i]<<" , time = "<<_op_time[i]/epoch<<" ms"; + void print_and_reset_optime_summary(int epoch) { + for (int i = 0; i < _op_param.size(); i++) { + LOG(INFO) << "[SUMMARY OP TIMER] name = " << _exec_funcs[i].name << " param " << _op_param[i] << + " , time = " << _op_time[i] / epoch << " ms"; + } + + std::map op_type_time_map; + std::map::iterator it; + + for (int i = 0; i < _op_param.size(); i++) { + it = op_type_time_map.find(_op_param[i]); + + if (it != op_type_time_map.end()) { + op_type_time_map[_op_param[i]] += (_op_time[i]); + } else { + op_type_time_map[_op_param[i]] = (_op_time[i]); + } } + + for (it = op_type_time_map.begin(); it != op_type_time_map.end(); it++) { + LOG(INFO) << " PARAM " << it->first \ + << " MS " << it->second / epoch; + } + reset_op_time(); } - void reset_op_time() {_op_time = std::vector(_exec_funcs.size(), 0.0f);} - std::vector get_op_time() {return _op_time;} - std::vector get_op_param() {return _op_param;} + void print_and_reset_optime_summary(int epoch, std::string const& file, bool app_mode = false) { + try { + Csvfile csvfile(file, app_mode); + float sum_time = 0; + csvfile << "EPOCH" << epoch << endrow; + + for (int i = 0; i < _op_param.size(); i++) { + csvfile << "NAME" << _exec_funcs[i].name << "PARAM" << _op_param[i] \ + << "MS" << _op_time[i] / epoch << endrow; + sum_time += _op_time[i] / epoch; + } + + csvfile << "SUM" << sum_time << endrow; + std::map op_type_time_map; + std::map::iterator it; + + for (int i = 0; i < _op_param.size(); i++) { + it = op_type_time_map.find(_op_param[i]); + + if (it != 
op_type_time_map.end()) { + op_type_time_map[_op_param[i]] += _op_time[i] / epoch; + } else { + op_type_time_map[_op_param[i]] = _op_time[i] / epoch; + } + } + + for (it = op_type_time_map.begin(); it != op_type_time_map.end(); it++) { + csvfile << "PARAM" << it->first \ + << "MS" << it->second / epoch << endrow; + } + } catch (const std::exception& ex) { + LOG(FATAL) << "Exception was thrown: " << ex.what(); + } + + reset_op_time(); + } + void reset_op_time() { + _op_time = std::vector(_exec_funcs.size(), 0.0f); + } + std::vector get_op_time() { + return _op_time; + } + std::vector get_op_param() { + return _op_param; + } std::vector > get_exec_funcs() { return _exec_funcs; } @@ -118,7 +196,7 @@ class Net { */ Tensor4dPtr get_out(std::string out_name); std::vector > get_out_list(); - + /** * \brief Get in by name. */ @@ -129,15 +207,67 @@ class Net { * \brief Get tensor from a given edge. */ Tensor4dPtr get_tensor_from_edge(const char* from, const char* to); - + +#ifndef USE_SGX /** * \brief Get tensor from a given edge. */ - void load_calibrator_config(std::string config, std::string calibrator){ - _calibrator_parser.parse_from_file(config, calibrator); + + void load_calibrator_config(graph::Graph& graph, bool load_layout_from_config = true, + bool auto_layout_config = false); + void load_x86_layout_config(std::string config) { + _calibrator_parser.layout_parse(config); + _layout_config_path = config; + + _has_loaded_layout_from_file = true; + } + + void set_calibrator_info(typename graph::Graph::Edge_it_t& edge_it) { + //set tensor dtype + auto bottom_op_name = (*_graph_p)[edge_it->bottom()]->get_op_name(); + auto top_op_name = (*_graph_p)[edge_it->top()]->get_op_name(); + + if (std::is_same::value) { + edge_it->weight()->set_dtype(_calibrator_parser.get_dtype(edge_it->bottom(), edge_it->top(), + bottom_op_name, top_op_name, "X86", (*_graph_p)[edge_it->bottom()])); + } else { + edge_it->weight()->set_dtype(_calibrator_parser.get_dtype(edge_it->bottom(), edge_it->top(), + bottom_op_name, top_op_name, "NV", (*_graph_p)[edge_it->bottom()])); + }; + + DLOG(ERROR) << "set " << edge_it->name() << "dtype:" << edge_it->weight()->get_dtype(); + + //set tensor calibrator + edge_it->weight()->set_scale({_calibrator_parser.get_calibrator(edge_it->name())}); + DLOG(WARNING) << "set " << edge_it->name() << " scale:" << _calibrator_parser.get_calibrator( + edge_it->name()); + + //set tensor layout + if (std::is_same::value) { + //set tensor layout + LayoutType layout = _calibrator_parser.get_layout(edge_it->bottom(), edge_it->top(), + _calibrator_parser.get_layout(edge_it->name()), "x86", bottom_op_name, top_op_name, + (*_graph_p)[edge_it->bottom()]); + DLOG(WARNING) << "set x86_layout " << edge_it->name() << "," << layout << ",in edge "; + edge_it->weight()->set_layout(layout); + } else { + edge_it->weight()->set_layout(_calibrator_parser.get_layout(edge_it->bottom(), + edge_it->top(), edge_it->weight()->get_layout())); + } } friend class Calibrator; +#endif + +public: + /** + * \brief Allocate memory before you invoke the Net::init. + * + * Note: + * This api should be carefully called, its only + * used and tested in anakin subgraph mode. + */ + Status alloc_memory_first(graph::Graph&); private: /** @@ -151,12 +281,15 @@ class Net { Status init_env(graph::Graph&); private: + ///< layout config file path , layout config will be load or create + std::string _layout_config_path{""}; + bool _has_loaded_layout_from_file{false}; ///< executor for operators in node. 
std::vector > _exec_funcs; - ///< suspended point is set when you invoke execute_stop_at_node - int _suspended_point{-1}; - ///< start point is set when you invoke execute_start_from_node - int _start_point{-1}; + ///< suspended point is set when you invoke execute_stop_at_node + int _suspended_point{-1}; + ///< start point is set when you invoke execute_start_from_node + int _start_point{-1}; ///< The pointer to Context. OpContextPtr _ctx_p; graph::Graph* _graph_p{nullptr}; @@ -166,7 +299,7 @@ class Net { std::vector > _out_tensor_list; //calibrator parser CalibratorParser _calibrator_parser; - ///< all tensor names + ///< all tensor names std::vector _tensor_name_list; bool _need_summary{false}; diff --git a/framework/core/net/operator_func.cpp b/framework/core/net/operator_func.cpp index 42a402d5a..e34248d3a 100644 --- a/framework/core/net/operator_func.cpp +++ b/framework/core/net/operator_func.cpp @@ -24,24 +24,16 @@ template class OperatorFunc; template class OperatorFunc; #endif -#ifdef AMD_GPU +#ifdef AMD_GPU template class OperatorFunc; template class OperatorFunc; template class OperatorFunc; #endif #ifdef USE_ARM_PLACE -#ifdef ANAKIN_TYPE_FP32 template class OperatorFunc; -#endif - -#ifdef ANAKIN_TYPE_FP16 template class OperatorFunc; -#endif - -#ifdef ANAKIN_TYPE_INT8 template class OperatorFunc; -#endif #endif //arm } /* namespace */ diff --git a/framework/core/net/rt_net.cpp b/framework/core/net/rt_net.cpp deleted file mode 100644 index 10208d9bb..000000000 --- a/framework/core/net/rt_net.cpp +++ /dev/null @@ -1,333 +0,0 @@ -#ifdef USE_TENSORRT -#include "framework/core/net/rt_net.h" -#include -using namespace nvinfer1; - -namespace anakin { - -class RTLogger : public ILogger -{ - void log(Severity severity, const char* msg) override - { - if (severity != Severity::kINFO) - LOG(INFO) << msg; - } -} rt_gLogger; - -class ICaffePoolOutputDimensionsFormula: public IOutputDimensionsFormula -{ -public: - virtual DimsHW compute(DimsHW inputDims, DimsHW kernelSize, DimsHW stride, DimsHW padding, DimsHW dilation, const char* layerName) const { - const int kernel_extent_h = dilation.d[0] * (kernelSize.d[0] - 1) + 1; - const int kernel_extent_w = dilation.d[1] * (kernelSize.d[1] - 1) + 1; - int h = ceil((inputDims.d[0] + 2* padding.d[0] - kernel_extent_h)*1.0 /stride.d[0]) + 1; - int w = ceil((inputDims.d[1] + 2* padding.d[1] - kernel_extent_w)*1.0 /stride.d[1]) + 1; - return DimsHW(h, w); - } - - ICaffePoolOutputDimensionsFormula() {} - ~ICaffePoolOutputDimensionsFormula() {} -}; - -//template -RTNet::~RTNet() { - if (_graph) { - delete _graph; - _network->destroy(); - _builder->destroy(); - _graph = nullptr; - } -} - -RTNet::RTNet(graph::Graph& graph, nvinfer1::IInt8Calibrator* calibrator) { - _builder = nvinfer1::createInferBuilder(rt_gLogger); - _network = _builder->createNetwork(); - ICaffePoolOutputDimensionsFormula poolFormula; - _network->setPoolingOutputDimensionsFormula(&poolFormula); - std::map tensor_map; - std::map tensor_dims_map; - std::map _input_dims_map; - auto node_name_in_exec_order = graph.get_nodes_in_order(); - - /*prepare inputs*/ - for(auto input : graph.get_ins()){ - auto input_dim = graph[input]->template get_attr>("input_shape"); - _batch_size = input_dim[0]; - DimsCHW dims = nvinfer1::DimsCHW{input_dim[1], input_dim[2], input_dim[3]}; - _input_dims_map.insert(std::pair(input, dims)); - auto data = _network->addInput(input.c_str(), nvinfer1::DataType::kFLOAT, dims); - CHECK(data != nullptr) << "rt input is not valid"; - auto node_ptr = graph[input]; - auto 
edge_out_its = graph.get_out_arc_its(input); - data->setName(edge_out_its[0]->name().c_str()); - tensor_dims_map.insert(std::pair(input, _input_dims_map[input])); - tensor_map.insert(std::pair(edge_out_its[0]->name().c_str(), data)); - _input_names.push_back(edge_out_its[0]->name().c_str()); - } - - for (auto output : graph.get_outs()) { - auto edge_in_its = graph.get_in_arc_its(output); - _output_names.push_back(edge_in_its[0]->name().c_str()); - } - /*construct net**/ - for(int i = 0; i < node_name_in_exec_order.size(); i++ ){ - auto node_name = node_name_in_exec_order[i]; - auto node_ptr = graph[node_name]; - auto edge_in_its = graph.get_in_arc_its(node_name); - auto edge_out_its = graph.get_out_arc_its(node_name); - auto bottom_size = edge_in_its.size(); - //node_ptr->template get_attr>(bottom_size); - ITensor* inputs[bottom_size]; - for (int j = 0; j < bottom_size; j++) { - CHECK(tensor_map[edge_in_its[j]->name()] != nullptr) << " " << node_name << "input tensor does not exist"; - inputs[j] = tensor_map[edge_in_its[j]->name()]; - } - if (node_ptr->get_op_name() == "Input") { - continue; - } - addLayer(node_ptr, edge_in_its, edge_out_its, inputs, bottom_size, _network, tensor_map, tensor_dims_map); - } - - /*trt output*/ - - for (auto& s : _output_names) { - _network->markOutput(*tensor_map[s]); - } - cudaStreamCreate(&_stream); - _workspace_size = 1<<20; - - _builder->setMaxBatchSize(_batch_size); - _builder->setMaxWorkspaceSize(_workspace_size); - _builder->setInt8Mode(calibrator != nullptr); - _builder->setInt8Calibrator(calibrator); - _builder->setDebugSync(true); - bool mode = calibrator != nullptr; - LOG(INFO)<<"int8 mode"<< mode; - - ICudaEngine * engine = _builder->buildCudaEngine(*_network); - _context = engine->createExecutionContext(); - _engine = &(_context->getEngine()); - - _buffers.resize(_input_names.size() + _output_names.size()); - int num = _engine->getNbBindings(); - LOG(INFO) << "binging num" << num; - for (auto input: _input_names) { - size_t bindingIndex = _engine->getBindingIndex(input.c_str()); - CHECK_LT(bindingIndex, _buffers.size()); - DimsCHW dims = static_cast(_engine->getBindingDimensions((int)bindingIndex)); - int count = dims.c() * dims.h() * dims.w() * _batch_size; - Shape shape({_batch_size, dims.c(), dims.h(), dims.w()}, Layout_NCHW); - Tensor* tensor = new Tensor(shape); - _input_tensors.push_back(tensor); - _buffers[bindingIndex] = tensor->data(); - } - - for (auto output: _output_names) { - size_t bindingIndex = _engine->getBindingIndex(output.c_str()); - CHECK_LT(bindingIndex, _buffers.size()); - DimsCHW dims = static_cast(_engine->getBindingDimensions((int)bindingIndex)); - int count = dims.c() * dims.h() * dims.w() * _batch_size; - Shape shape({_batch_size, dims.c(), dims.h(), dims.w()}, Layout_NCHW); - Tensor* tensor = new Tensor(shape); - _output_tensors.push_back(tensor); - _buffers[bindingIndex] = tensor->data(); - } -} - -void RTNet::prediction() { - _context->enqueue(_batch_size, &_buffers[0], _stream, nullptr); -} - - -Tensor4dPtr RTNet::get_out(std::string out_name) { - return _output_tensors[_output_names_id_map[out_name]]; -} - -std::vector > RTNet::get_out_list() { - return _output_tensors; - -} - -Tensor4dPtr RTNet::get_in(std::string in_name) { - return _input_tensors[_input_names_id_map[in_name]]; -} - -std::vector > RTNet::get_in_list() { - return _input_tensors; -} - - -void RTNet::addConvLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - 
INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map) { - //ConvParam param; - //parser_conv_param(conv, param); - auto num_output = edge_out_its.size(); - auto paddings = node_ptr->template get_attr>("padding"); - auto strides = node_ptr->template get_attr>("strides"); - auto dilation = node_ptr->template get_attr>("dilation_rate"); - auto filter_num = node_ptr->template get_attr("filter_num"); - auto kernel_size = node_ptr->template get_attr>("kernel_size"); - auto group = node_ptr->template get_attr("group"); - auto bias_term = node_ptr->template get_attr("bias_term"); - - using pblock_type = PBlock; - auto weights = node_ptr->template get_attr>("weight_1"); - Weights filter_weight{nvinfer1::DataType::kFLOAT, weights.d_tensor().data(), weights.d_tensor().valid_size()}; - IConvolutionLayer* convLayer = NULL; - if (bias_term) { - auto bias = node_ptr->template get_attr("weight_2"); - nvinfer1::Weights bias_weight{nvinfer1::DataType::kFLOAT, bias.d_tensor().data(), bias.count()}; - convLayer = net->addConvolution(*inputs[0], filter_num, DimsHW{kernel_size[0], kernel_size[1]}, filter_weight, bias_weight); - } else { - nvinfer1::Weights bias_weight{nvinfer1::DataType::kFLOAT, nullptr, 0}; - convLayer = net->addConvolution(*inputs[0], filter_num, DimsHW{kernel_size[0], kernel_size[1]}, filter_weight, bias_weight); - } - convLayer->setStride(DimsHW{strides[0], strides[1]}); - convLayer->setPadding(DimsHW{paddings[1], paddings[1]}); - convLayer->setNbGroups(group); - convLayer->setName(node_ptr->name().c_str()); - convLayer->setDilation(DimsHW{dilation[0], dilation[1]}); - auto top_name = (*edge_out_its[0]).name(); - convLayer->getOutput(0)->setName(top_name.c_str()); - tensor_map.insert(std::pair(top_name, convLayer->getOutput(0))); -} - -void RTNet::addPoolLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map) { - //ConvParam param; - //parser_conv_param(conv, param); - auto num_output = edge_out_its.size(); - auto paddings = node_ptr->template get_attr>("padding"); - auto strides = node_ptr->template get_attr>("strides"); - auto kernel_size = node_ptr->template get_attr>("pool_size"); - auto pool_type = node_ptr->template get_attr("method"); - auto global_pooling = node_ptr->template get_attr("global_pooling"); - - IPoolingLayer* poolLayer = NULL; - nvinfer1::PoolingType pooling_type; - if (pool_type == "AVG") { - pooling_type = nvinfer1::PoolingType::kAVERAGE; - } else if (pool_type == "MAX") - pooling_type = nvinfer1::PoolingType::kMAX; - else { - LOG(FATAL) << "pooling type is not valid"; - } - poolLayer = net->addPooling(*inputs[0], pooling_type, DimsHW{kernel_size[0], kernel_size[1]}); - poolLayer->setStride(DimsHW{strides[0], strides[1]}); - poolLayer->setPadding(DimsHW{paddings[1], paddings[1]}); - poolLayer->setName(node_ptr->name().c_str()); - auto top_name = (*edge_out_its[0]).name(); - poolLayer->getOutput(0)->setName(top_name.c_str()); - tensor_map.insert(std::pair(top_name, poolLayer->getOutput(0))); -} - -void RTNet::addActiveLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map) { - //nvinfer1::ActivationType type = nvinfer1::ActivationType::kSIGMOID; - //auto ak_type = node_ptr->template get_attr("type"); - //if (ak_type == 
"Sigmoid") { - // type = nvinfer1::ActivationType::kSIGMOID; - //} else if (ak_type == "TanH") { - // type = nvinfer1::ActivationType::kTANH; - //} else if (ak_type == "ReLU") { - // type = nvinfer1::ActivationType::kRELU; - //} else { - // LOG(FATAL) << "unknown type"; - //} - IActivationLayer* layer = net->addActivation(*inputs[0], ActivationType::kRELU); - layer->setName(node_ptr->name().c_str()); - auto top_name = (*edge_out_its[0]).name(); - layer->getOutput(0)->setName(top_name.c_str()); - tensor_map.insert(std::pair(top_name, layer->getOutput(0))); -} - -void RTNet::addSoftmaxLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map) { - ISoftMaxLayer* layer = net->addSoftMax(*inputs[0]); - layer->setName(node_ptr->name().c_str()); - auto top_name = (*edge_out_its[0]).name(); - layer->getOutput(0)->setName(top_name.c_str()); - tensor_map.insert(std::pair(top_name, layer->getOutput(0))); -} - - -void RTNet::addInnerProductLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map) { - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; - auto axis = node_ptr->template get_attr( "axis"); - auto out_dim = node_ptr->template get_attr( "out_dim"); - auto bias_term = node_ptr->template get_attr( "bias_term"); - using pblock_type = PBlock; - auto ak_weights = node_ptr->template get_attr("weight_1"); - nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, ak_weights.d_tensor().data(), ak_weights.count()}; - - IFullyConnectedLayer* layer = net->addFullyConnected(*inputs[0], out_dim, weights, bias); - layer->setName(node_ptr->name().c_str()); - if (bias_term) { - auto ak_bias = node_ptr->template get_attr("weight_2"); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, ak_bias.d_tensor().data(), ak_bias.count()}; - layer->setBiasWeights(bias); - } - auto top_name = (*edge_out_its[0]).name(); - layer->getOutput(0)->setName(top_name.c_str()); - tensor_map.insert(std::pair(top_name, layer->getOutput(0))); - -} - -void RTNet::addLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor* const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map) { - if (node_ptr->get_op_name() == "Convolution") { - addConvLayer(node_ptr, edge_in_its, edge_out_its, inputs, nbInputs, net, tensor_map, tensor_dims_map); - } else if (node_ptr->get_op_name() == "Pooling") { - addPoolLayer(node_ptr, edge_in_its, edge_out_its, inputs, nbInputs, net, tensor_map, tensor_dims_map); - } else if (node_ptr->get_op_name() == "Activation" || node_ptr->get_op_name() == "ReLU") { - addActiveLayer(node_ptr, edge_in_its, edge_out_its, inputs, nbInputs, net, tensor_map, tensor_dims_map); - } else if (node_ptr->get_op_name() == "Softmax") { - addSoftmaxLayer(node_ptr, edge_in_its, edge_out_its, inputs, nbInputs, net, tensor_map, tensor_dims_map); - } else if (node_ptr->get_op_name() == "Dense") { - addInnerProductLayer(node_ptr, edge_in_its, edge_out_its, inputs, nbInputs, net, tensor_map, tensor_dims_map); - } else if (node_ptr->get_op_name() == "Input" || node_ptr->get_op_name() == "Output"){ - } else { - std::cout << "unknown layer type:" << node_ptr->get_op_name() << std::endl; - } -} - -} -#endif - /* 
namespace anakin_rt */ diff --git a/framework/core/net/rt_net.h b/framework/core/net/rt_net.h deleted file mode 100644 index 83620ea83..000000000 --- a/framework/core/net/rt_net.h +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifdef USE_TENSORRT -#ifndef ANAKIN_RTNET_H -#define ANAKIN_RTNET_H - -#include "framework/graph/graph.h" -#include "framework/core/net/operator_func.h" -#include "framework/core/net/calibrator_factory.h" -#include "saber/core/tensor_op.h" -#include "third-party/tensorrt5/include/NvInfer.h" - -using namespace nvinfer1; - -namespace anakin { - -using namespace anakin::graph; - -typedef std::map> WeightMap; -typedef std::map TensorMap; -typedef std::map TensorDimsMap; - -template -class Calibrator; - -/** - * \brief Net class used for execution of graph and it is thread safety. - */ -class RTNet { -public: - typedef std::vector, - Edge > > ArcsIteratorList; - - RTNet(graph::Graph&, nvinfer1::IInt8Calibrator* calibrator); - - ~RTNet(); - -public: - - /** - * \brief do inference. - */ - void prediction(); - -public: - - /** - * \brief Get out by name. - */ - Tensor4dPtr get_out(std::string out_name); - std::vector > get_out_list(); - - /** - * \brief Get in by name. - */ - Tensor4dPtr get_in(std::string in_name); - - std::vector > get_in_list(); - -private: -void addConvLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map); - -void addPoolLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map); - -void addActiveLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map); - -void addSoftmaxLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map); - -void addInnerProductLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map); - -void addLayer(NodePtr node_ptr, - ArcsIteratorList& edge_in_its, - ArcsIteratorList& edge_out_its, - ITensor*const* inputs, - int nbInputs, - INetworkDefinition* net, - TensorMap& tensor_map, - TensorDimsMap& tensor_dims_map); - - -private: - ///< executor for operators in node. - //std::vector > _exec_funcs; - ///< The pointer to Context. 
- OpContextPtr _ctx_p; - - graph::Graph* _graph{nullptr}; - ///< Input - std::vector _input_names; - ///< Output - std::vector _output_names; - - std::map _input_names_id_map; - std::map _output_names_id_map; - - ///< A list of in tensor. - std::vector > _input_tensors; - ///< A list of out tensor. - std::vector > _output_tensors; - - ///< all tensor names - std::vector _tensor_name_list; - ///< network definition - INetworkDefinition* _network; - ///< create an optimized engine - IBuilder* _builder; - ///< engine - ICudaEngine* _engine; - IExecutionContext* _context; - //< inference - //void doInference(ICudaEngine& engine); - int _batch_size; - int _workspace_size; - std::vector _buffers; - cudaStream_t _stream; - IInt8Calibrator* _calibrator; -}; - -} -#endif -#endif - diff --git a/framework/core/net/worker.cpp b/framework/core/net/worker.cpp index ee462bc38..838a1dcbc 100644 --- a/framework/core/net/worker.cpp +++ b/framework/core/net/worker.cpp @@ -1,4 +1,6 @@ #include "framework/core/net/worker.h" + +#ifndef USE_SGX #include "saber/funcs/timer.h" namespace anakin { @@ -268,3 +270,4 @@ template class Worker; } /* namespace */ +#endif diff --git a/framework/core/net/worker.h b/framework/core/net/worker.h index b64f1cdbb..fa0de91ee 100644 --- a/framework/core/net/worker.h +++ b/framework/core/net/worker.h @@ -16,6 +16,10 @@ #ifndef ANAKIN_WORKER_H #define ANAKIN_WORKER_H +#include "anakin_config.h" + +#ifndef USE_SGX + #include #include #include @@ -199,4 +203,5 @@ using GlobalWorker = Singleton>; } /* namespace */ +#endif // ifndef USE_SGX #endif diff --git a/framework/core/operator/operator.h b/framework/core/operator/operator.h index 0f3076e68..a35eaafd0 100644 --- a/framework/core/operator/operator.h +++ b/framework/core/operator/operator.h @@ -108,7 +108,7 @@ class OperatorHelper { // Note: We can also use deep copy by using node operator=, // but if change the node attrs through net class, // the base graph can't detect it. - _node_p = node_p; + _node_p = node_p.get(); } /** @@ -152,7 +152,7 @@ class OperatorHelper { private: ///< Pointer to graph node. - graph::NodePtr _node_p; + graph::Node* _node_p; }; /** diff --git a/framework/core/parameter.h b/framework/core/parameter.h index b3b59c756..b128a6f44 100644 --- a/framework/core/parameter.h +++ b/framework/core/parameter.h @@ -262,6 +262,11 @@ class PBlock { return _d_inner_tensor->valid_shape(); } + ///get data type + DataType data_type(){ + return _h_inner_tensor -> get_dtype(); + } + /// get real shape Shape4d real_shape() { return _d_inner_tensor->shape(); @@ -353,6 +358,11 @@ class PBlock { return _d_inner_tensor->valid_shape(); } + ///get data type + DataType data_type(){ + return _h_inner_tensor -> get_dtype(); + } + /// get real shape Shape4d real_shape() { return _d_inner_tensor->shape(); @@ -431,7 +441,10 @@ class PBlock { Shape4d shape() { return _inner_tensor->valid_shape(); } - + ///get data type + DataType data_type(){ + return _inner_tensor -> get_dtype(); + } /// get real shape Shape4d real_shape() { return _inner_tensor->shape(); @@ -504,6 +517,10 @@ class PBlock { _inner_tensor->re_alloc(shape); } + ///get data type + DataType data_type(){ + return _inner_tensor -> get_dtype(); + } /// Get shape. 
Shape4d shape() { return _inner_tensor->valid_shape(); diff --git a/framework/core/singleton.h b/framework/core/singleton.h index b79b94042..7b8429db9 100644 --- a/framework/core/singleton.h +++ b/framework/core/singleton.h @@ -16,8 +16,12 @@ #ifndef ANAKIN_SINGLETON_H #define ANAKIN_SINGLETON_H -#include +#include "anakin_config.h" #include "framework/core/thread_safe_macros.h" +#include +#ifdef USE_SGX +#include +#endif namespace anakin { diff --git a/framework/core/thread_pool.h b/framework/core/thread_pool.h index ef5d842cd..dd20f0bbd 100644 --- a/framework/core/thread_pool.h +++ b/framework/core/thread_pool.h @@ -26,6 +26,10 @@ #include "framework/core/thread_safe_macros.h" #include "framework/core/type_traits_extend.h" #include "utils/logger/logger.h" +#include "anakin_config.h" +#ifdef USE_SGX +#include +#endif namespace anakin { diff --git a/framework/graph/graph.cpp b/framework/graph/graph.cpp index 90bbf31b5..a829d6c1e 100644 --- a/framework/graph/graph.cpp +++ b/framework/graph/graph.cpp @@ -38,6 +38,7 @@ Status Graph::load(const char* buffer, size_t len) EXCLUSIVE_LOCKS return ret; } +#ifndef USE_NANOPB template Status Graph::save(std::string model_path) { return parser::save(this, model_path); @@ -47,6 +48,7 @@ template Status Graph::save(const char* model_path) { return parser::save(this, model_path); } +#endif template std::vector& Graph::get_nodes_in_order() { @@ -61,7 +63,7 @@ void Graph::Reshape(std::string in_name, std::string in_shape = "input_shape"; auto input_dim = input_node_p->template get_attr>(in_shape); CHECK_EQ(input_dim.size(), shape.size()) << "Target shape parameter's dim should equal to " << - input_dim.size(); + input_dim.size(); for (int i = 0; i < input_dim.size(); i++) { input_dim[i] = shape[i]; @@ -84,41 +86,253 @@ void Graph::ResetBatchSize(std::string in_name, } template -void Graph::change_name() { - auto convert2underline = [&](std::string& name, char converter_char) -> std::string { - char* target_p = strdup(name.c_str()); - for (char* p = strchr(target_p + 1, converter_char); p!=NULL; p = strchr(p + 1, converter_char)) { - *p = '_'; +Status Graph::AddOp(const std::string& name, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + NodePtr node_p = std::make_shared(); + node_p->set_name(name); + node_p->get_op_name() = type; + this->add_vertex(name, node_p); + node_ins[name] = inputs; + node_outs[name] = outputs; + return Status::OK(); +} + +template +Status Graph::RegistBlock(PBlock * block_p) { + graph::GraphGlobalMem::Global().register_block(block_p); + return Status::OK(); +} + +template +Status Graph::SetOpPrec(const std::string& name, DataType dtype) { + if(this->has_vertex(name)) { + NodePtr node_p = (*this)[name]; + node_p->set_bit_type(dtype); + return Status::OK(); + } + return Status::ANAKINFAIL("[EEROR]: SetOpPrec is called on an unknown op name"); +} + +template +Status Graph::SetWeightsScale(const std::string& name, const std::vector& scales, bool is_bias) { + if(this->has_vertex(name)) { + NodePtr node_p = (*this)[name]; + if(is_bias) { + bool bias_term = node_p->get_attr("bias_term"); + if(bias_term) { + auto bias = node_p->get_attr>("weight_2"); + bias.d_tensor().set_scale(scales); + bias.h_tensor().set_scale(scales); + return Status::OK(); + } + return Status::OK("[WARNING]: SetWeightsScale is called to set bias scales in node which doesn't have it."); + } else { // is weight + if(node_p->inspect_attr("weight_1")) { + auto weight = node_p->get_attr>("weight_1"); + 
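+            // keep the device- and host-side copies of the weight scales in sync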
weight.d_tensor().set_scale(scales); + weight.h_tensor().set_scale(scales); + return Status::OK(); + } + return Status::OK("[WARNING]: SetWeightsScale is called to set weight scales in node which doesn't have it."); } - return std::string(target_p); - }; - auto change_node_name = [&, this](graph::NodePtr& node_p) { - auto & name = node_p->name(); - // add_alias is an important api for changing node's name and edge - // and add_alias is useful only at this place so far. - this->add_alias(name, convert2underline(name, '/')); - name = convert2underline(name, '/'); - this->add_alias(name, convert2underline(name, '-')); - name = convert2underline(name, '-'); + } + return Status::ANAKINFAIL("[EEROR]: SetOpPrec is called on an unknown op name"); +} + +template +Status Graph::SetVarScale(const std::string& var, float scale) { + std::unordered_map > in_to_op_map; + std::unordered_map > out_to_op_map; + for(const auto& pair: node_ins) { + for(auto& in : pair.second) { + in_to_op_map[in].push_back(pair.first); + } + } + for(const auto& pair: node_outs) { + for(auto& out : pair.second) { + out_to_op_map[out].push_back(pair.first); + } + } + for(const auto& pair : in_to_op_map) { + if(in_to_op_map.count(var) > 0) { + for(auto top : in_to_op_map[var]) { + auto bottom = out_to_op_map[var][0]; + if(this->has_arc(bottom, top)) { + auto& edge = this->get_arc(bottom, top); + edge.set_scale({scale}); + NodePtr node_p = (*this)[top]; + if(node_p->get_op_name() == "Split") { + for(auto& edge_it : this->get_out_arc_its(top)) { + edge_it->set_scale({scale}); + } + } + } + } + } + } + return Status::OK(); +} + +template +Status Graph::RegistVar(const std::string& var) { + auto regist_new_output = [&, this] () { + this->add_out(var); + this->AddOp(var, "Output", {var}, {}); }; - this->Scanner->BFS(change_node_name); - - auto change_edge_name = [&, this](graph::Edge& edge) { - auto & first = edge.first(); - auto & second = edge.second(); - first = convert2underline(first, '/'); - second = convert2underline(second, '/'); - first = convert2underline(first, '-'); - second = convert2underline(second, '-'); + + std::unordered_map > in_to_op_map; + std::unordered_map > out_to_op_map; + for(const auto& pair: node_ins) { + for(auto& in : pair.second) { + in_to_op_map[in].push_back(pair.first); + } + } + for(const auto& pair: node_outs) { + for(auto& out : pair.second) { + out_to_op_map[out].push_back(pair.first); + } + } + for(const auto& pair : in_to_op_map) { + if(in_to_op_map.count(var) > 0) { + for(auto top : in_to_op_map[var]) { + auto bottom = out_to_op_map[var][0]; + std::pair tmp_pair(bottom, top); + _registed_outs.push_back(tmp_pair); + regist_new_output(); + } + } + } + return Status::OK(); +} + +template +Status Graph::Freeze() { + std::unordered_map > in_to_op_map; + std::unordered_map > out_to_op_map; + for(const auto& pair: node_ins) { + for(auto& in : pair.second) { + in_to_op_map[in].push_back(pair.first); + } + } + for(const auto& pair: node_outs) { + for(auto& out : pair.second) { + out_to_op_map[out].push_back(pair.first); + } + } + std::unordered_map > op_map_ins; + std::unordered_map > op_map_outs; + std::unordered_map > split_map_ins; + std::unordered_map > split_map_outs; + + for(const auto& pair: node_ins) { + for(auto& in : pair.second) { + if(out_to_op_map.count(in) <= 0) { + op_map_ins[in] = std::vector{}; + op_map_outs[in] = std::vector{in}; + } + } + } + for(const auto& pair: op_map_ins) { + auto op_name = pair.first; + if(!this->has_vertex(op_name)) { + this->add_in(op_name); + 
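+            // a variable that no op produces is treated as a network input:
+            // register it and synthesize an Input op that emits it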
this->AddOp(op_name, "Input", op_map_ins[op_name], op_map_outs[op_name]); + } + } + op_map_ins.clear(); + op_map_outs.clear(); + auto auto_replace_split_ins = [&, this](const std::string split_variable, + const std::vector& outputs, + const std::vector& split_nexts) { + for(int i=0; i < split_nexts.size(); i++) { + for(auto& in : node_ins[split_nexts[i]]) { + if(in == split_variable) { + in = outputs[i]; + } + } + } }; - this->Scanner->BFS_Edge(change_edge_name); + + // automatically add Split and Output + for(const auto& pair : node_outs) { + for(auto& out : pair.second) { + if(in_to_op_map.count(out) <=0) { + op_map_ins[out] = std::vector{out}; + op_map_outs[out] = std::vector{}; + continue; + } + if (in_to_op_map[out].size() > 1) { + // find one to multi edge + std::vector inputs; + std::vector outputs; + inputs.push_back(out); + int split_num = in_to_op_map[out].size(); + for(int i=0; i < split_num; i++) { + std::ostringstream oss; + oss << out << "_split_" << i; + outputs.push_back(oss.str()); + } + std::string split_name = out + std::string("split"); + split_map_ins[split_name] = inputs; + split_map_outs[split_name] = outputs; + auto_replace_split_ins(out, outputs, in_to_op_map[out]); + } + } + } + for(const auto& pair: op_map_ins) { + auto op_name = pair.first; + if(!this->has_vertex(op_name)) { + this->add_out(op_name); + this->AddOp(op_name, "Output", op_map_ins[op_name], op_map_outs[op_name]); + } + } + for(const auto& pair : split_map_ins) { + auto split_name = pair.first; + if(!this->has_vertex(split_name)) { + this->AddOp(split_name, "Split", split_map_ins[split_name], split_map_outs[split_name]); + this->AddOpAttr(split_name, "split_num", (int)(split_map_outs[split_name].size())); + } + } + + in_to_op_map.clear(); + out_to_op_map.clear(); + for(const auto& pair: node_ins) { + for(auto& in : pair.second) { + in_to_op_map[in].push_back(pair.first); + } + } + for(const auto& pair: node_outs) { + for(auto& out : pair.second) { + out_to_op_map[out].push_back(pair.first); + } + } + // those code logic with loop belown can't merge with that above + for (const auto& pair: node_ins) { + for (auto& in : pair.second) { + if(out_to_op_map.count(in) > 0) { + graph::Edge edge(out_to_op_map[in][0], pair.first); + this->add_in_arc(edge); + } + } + } + for (const auto& pair: node_outs) { + for (auto& out : pair.second) { + if(in_to_op_map.count(out) > 0) { + graph::Edge edge(pair.first, in_to_op_map[out][0]); + this->add_out_arc(edge); + } + } + } + return Status::OK(); } + template Status Graph::RegistOut(std::string node_bottom_name, std::string node_top_name) { std::pair tmp_pair(node_bottom_name, node_top_name); _registed_outs.push_back(tmp_pair); - return Status::OK();; + return Status::OK(); } template @@ -135,7 +349,7 @@ Status Graph::RegistAllOut() { } template -Status Graph::Optimize(bool use_tensorrt) EXCLUSIVE_LOCKS_REQUIRED(_mut) { +Status Graph::Optimize(bool with_fusion) EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); if (!_has_graph_optimized) { @@ -145,8 +359,9 @@ Status Graph::Optimize(bool use_tensorrt) EXCLUSIVE_LOCKS_REQUIRED //! 
decide wheter the vgraph is optimized auto is_optimized = statistics.get_info(); + is_optimized = false; - if (is_optimized && (_registed_outs.size() == 0) || use_tensorrt) { + if (is_optimized && (_registed_outs.size() == 0)) { // schedule for exec order Scheduler scheduler; scheduler.RegIOResource(_vgraph); @@ -154,14 +369,32 @@ Status Graph::Optimize(bool use_tensorrt) EXCLUSIVE_LOCKS_REQUIRED // get node exec in order _nodes_exec_order = scheduler.get_exec_node_in_order(); } else { - DLOG(WARNING) << "Exe the graph fusion and combination [ SUPPORT IN-ORDER PATTERM ]"; - // TODO ... - auto in_ordered_fusion_op_name_vec = FusionOpRegister::Global().get_list_op_name_in_fusion_order_of(IN_ORDER); - for (auto& fusion_name : in_ordered_fusion_op_name_vec) { - LOG(INFO) << " processing in-ordered fusion : " << fusion_name; - _vgraph->Match(FusionOpRegister::Global()[fusion_name]); - } + if (with_fusion) { + // Xiaogang asked me to add this + DLOG(WARNING) << "Execute the graph fusion and combination [ SUPPORT IN-ORDER PATTERN ]"; + // TODO ... + auto in_ordered_fusion_op_name_vec = FusionOpRegister::Global().get_list_op_name_in_fusion_order_of( + IN_ORDER); + for (auto &fusion_name : in_ordered_fusion_op_name_vec) { + // on x86, we ignore two fusion patterns + if (std::is_same::value && + (fusion_name == "ConvReluPool" || fusion_name == "ConvBatchnormScaleReluPool")) { + continue; + } + + if (std::is_same::value && Precision::INT8 == Ptype && + (fusion_name == "ConvReluPool" || fusion_name == "ConvBatchnormScaleReluPool")) { + continue; + } + if (std::is_same::value && Precision::INT8 == Ptype && + (fusion_name == "ConvReluPool" || fusion_name == "ConvBatchnormScaleReluPool")) { + continue; + } + DLOG(INFO) << " processing in-ordered fusion : " << fusion_name; + _vgraph->Match(FusionOpRegister::Global()[fusion_name]); + } + } DLOG(WARNING) << "Schedule the vgraph for memory optimization and exec lanes ,as well as sync flags."; @@ -172,18 +405,25 @@ Status Graph::Optimize(bool use_tensorrt) EXCLUSIVE_LOCKS_REQUIRED //LOG(ERROR) << "gen exe order"; - _nodes_exec_order = scheduler.get_exec_node_in_order(); - - + _nodes_exec_order = scheduler.get_exec_node_in_order(); +//#if 0 #ifndef BUILD_LITE // enable conv+eltwise fusion +#ifndef USE_ARM_PLACE // optimization - ConvElsFusionScheduler conv_eltwise_fusion_scheduler; - conv_eltwise_fusion_scheduler.RegIOResource(_vgraph); - conv_eltwise_fusion_scheduler.Run(); - // get node exec in order - _nodes_exec_order = conv_eltwise_fusion_scheduler.get_exec_node_in_order(); + // Xiaogang asked me to add this + if (with_fusion) { + if ((std::is_same::value||std::is_same::value) && Precision::INT8 == Ptype) { + } else { + ConvElsFusionScheduler conv_eltwise_fusion_scheduler; + conv_eltwise_fusion_scheduler.RegIOResource(_vgraph); + conv_eltwise_fusion_scheduler.Run(); + // get node exec in order + _nodes_exec_order = conv_eltwise_fusion_scheduler.get_exec_node_in_order(); + } + } +#endif #endif - // optimization again + // optimization again ParallScheduler para_scheduler; para_scheduler.RegIOResource(_vgraph); para_scheduler.Run(); @@ -268,9 +508,72 @@ VGraph& Graph::get_vgraph() { return *_vgraph; } +//get graph scale maps +template +std::unordered_map> +Graph::get_scale_map(){ + std::unordered_map> scale_map; + auto get_scale = [&, this](NodePtr& node_p){ + auto& arc_its = this->get_in_arc_its(node_p->name()); + for (auto arc : arc_its){ + std::string edge_s = arc -> name(); + std::vector scales = arc -> scale(); + scale_map[edge_s] = scales; + } + }; + +
this->Scanner->BFS(get_scale); + return scale_map; +} +//get graph scale maps +template +std::unordered_map +Graph::get_layout_map(){ + std::unordered_map layout_map; + auto get_layout = [&, this](Edge& edge){ + layout_map[edge.name()] = edge.layout(); + }; + + this->Scanner->BFS_Edge(get_layout); + return layout_map; +} + +template +void Graph::load_calibrator_config( + std::string config_file, std::string cal_file){ + CalibratorParser cal_parser; +#ifndef USE_SGX + cal_parser.parse_from_file(config_file, cal_file); +#endif + + auto set_node_info = [&](NodePtr& node_p){ + node_p->set_bit_type(cal_parser.get_dtype_of_precision(node_p->name())); + }; + this->Scanner->BFS(set_node_info); + + auto set_edge_scale = [&](Edge& edge){ + edge.set_scale({cal_parser.get_calibrator(edge.name())}); + }; + this->Scanner->BFS_Edge(set_edge_scale); +} +template +void Graph::load_layout_config(std::string config_file){ + CalibratorParser cal_parser; + cal_parser.layout_parse(config_file); + + auto set_edge_info = [&](Edge& edge){ + LOG(ERROR)<<"load layout :: " << edge.name() <<","<< cal_parser.get_layout(edge.name()); + edge.set_layout(cal_parser.get_layout(edge.name())); + }; + this->Scanner->BFS_Edge(set_edge_info); +} + template Status Graph::restore_from_vgraph(VGraph* vgraph) { //! need to clear graph edge first + auto graph_scale_map = this->get_scale_map(); + auto graph_layout_map = this->get_layout_map(); + this->arcs_clear(); auto interpreter_io_in = [&, this](node& target_node) { @@ -333,7 +636,7 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { node & target_node) -> Status { if (node_p->name() == target_node.name) { CHECK_EQ(target_node.mergeNodes.size(), target_node.mergeNodeNames.size()) - << "Merge node must have same size with merged pattern name"; + << "Merge node must have same size with merged pattern name"; if (target_node.mergeNodes.size()) { // target node is merged nodes. 
for (int i = 0; i < target_node.mergeNodes.size(); i++) { @@ -341,11 +644,11 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { this->_pattern_name_merges[target_node.name].push_back(target_node.mergeNodeNames[i]); } } - if (target_node.idx_keep_in_merge_nodes.size()) { - for (auto& idx : target_node.idx_keep_in_merge_nodes) { - this->_node_merges_keep[target_node.name].push_back(idx); - } - } + if (target_node.idx_keep_in_merge_nodes.size()) { + for (auto& idx : target_node.idx_keep_in_merge_nodes) { + this->_node_merges_keep[target_node.name].push_back(idx); + } + } auto& need_wait = node_p->need_wait(); need_wait = target_node.need_wait; @@ -374,24 +677,101 @@ Status Graph::restore_from_vgraph(VGraph* vgraph) { (*node_p).Merge(*tmp_node_p, this->_pattern_name_merges[target_node_name][i]); // add the merge node's attr - // detect if the i-th node in _node_merges should be saved in Graph - auto ret = std::find(this->_node_merges_keep[target_node_name].begin(), - this->_node_merges_keep[target_node_name].end(), - i); - if (ret == this->_node_merges_keep[target_node_name].end()) { - this->remove(this->_node_merges[target_node_name][i]); // remove merge node which is useless - } + // detect if the i-th node in _node_merges should be saved in Graph + auto ret = std::find(this->_node_merges_keep[target_node_name].begin(), + this->_node_merges_keep[target_node_name].end(), + i); + if (ret == this->_node_merges_keep[target_node_name].end()) { + this->remove(this->_node_merges[target_node_name][i]); // remove merge node which is useless + } } } return Status::OK(); }; this->Scanner->BFS(merge_node_attrs); + + //recover scales to edge + auto recover_scale = [&, this](Edge& edge){ + std::string edge_name = edge.name(); + std::string old_name = vgraph -> get_fusion_old_edge(edge_name); + if (old_name != ""){ + edge_name = old_name; + } + if (graph_scale_map.count(edge_name) > 0){ + auto scales = graph_scale_map[edge_name]; + edge.set_scale(scales); + } else { + LOG(ERROR) << "when recover scale: the edge has no scale to map:" << edge_name; + } + + }; + this->Scanner->BFS_Edge(recover_scale); + + //recover layout to edge + auto recover_layout = [&, this](Edge& edge){ + std::string edge_name = edge.name(); + std::string old_name = vgraph -> get_fusion_old_edge(edge_name); + if (old_name != ""){ + edge_name = old_name; + } + if (graph_layout_map.count(edge_name) > 0){ + auto layout = graph_layout_map[edge_name]; + edge.set_layout(layout); + } else { + LOG(ERROR) << "when recover layout: the edge has no layout to map:" << edge_name; + } + + }; + this->Scanner->BFS_Edge(recover_layout); + + //for conv_eltwise, we deal scale to one node + auto conv_eltwise_deal_scale = [this](NodePtr& node_p) -> Status { + if (node_p->get_op_name() == "Gather"){ + auto in_edge_its = this->get_in_arc_its(node_p->name()); + float scale_0 = 1.f; + float scale_3 = 1.f; + DataType be_eltwise_dtype = AK_INVALID; + CHECK_EQ(in_edge_its.size(), 2); + auto eltwise_node_name = in_edge_its[0]->bottom(); + + if ((*this)[in_edge_its[0]->bottom()]->get_op_name() == "ConvEltwise"){ + if (in_edge_its[1]->scale().size() > 0){ + scale_0 = in_edge_its[1]->scale()[0]; + } + be_eltwise_dtype = (*this)[in_edge_its[1]->bottom()]->bit_type(); + } else { + if (in_edge_its[0]->scale().size() > 0){ + scale_0 = in_edge_its[0]->scale()[0]; + } + be_eltwise_dtype = (*this)[in_edge_its[0]->bottom()]->bit_type(); + eltwise_node_name = in_edge_its[1]->bottom(); + } + auto out_edge_its = this->get_out_arc_its(node_p->name()); + 
CHECK_EQ(out_edge_its.size(), 1); + if (in_edge_its[1]->scale().size() > 0){ + scale_3 = out_edge_its[0]->scale()[0]; + + } + auto eltwise_node = (*this)[eltwise_node_name]; + eltwise_node->template set_attr("scale_0", scale_0); + eltwise_node->template set_attr("scale_3", scale_3); + eltwise_node->template set_attr("be_eltwise_dtype", be_eltwise_dtype); + } + + return Status::OK(); + }; + this->Scanner->BFS(conv_eltwise_deal_scale); + + return Status::OK(); } template Status Graph::CopyFrom(Graph& graph) { + if(this->size() == graph.size()) { + return Status::OK(); + } // this clear all the edges and nodes this->all_clear(); auto shallow_copy_node = [&, this](NodePtr& node_p) { @@ -421,11 +801,11 @@ Status Graph::CopyFrom(Graph& graph) { graph.Scanner->BFS(shallow_copy_edge); // get node execution order _nodes_exec_order = graph.get_nodes_in_order(); - // get graph inputs and outputs - _ins = graph._ins; - _outs = graph._outs; - // get statistic - statistics = graph.statistics; + // get graph inputs and outputs + _ins = graph._ins; + _outs = graph._outs; + // get statistic + statistics = graph.statistics; return Status::OK(); } @@ -455,16 +835,10 @@ template class Graph; #endif #ifdef USE_ARM_PLACE -#ifdef ANAKIN_TYPE_FP32 template class Graph; -#endif -#ifdef ANAKIN_TYPE_FP16 template class Graph; -#endif -#ifdef ANAKIN_TYPE_INT8 template class Graph; #endif -#endif #ifdef AMD_GPU template class Graph; diff --git a/framework/graph/graph.h b/framework/graph/graph.h index 398660636..eae10e5b7 100644 --- a/framework/graph/graph.h +++ b/framework/graph/graph.h @@ -5,16 +5,16 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_GRAPH_H -#define ANAKIN_GRAPH_H +#define ANAKIN_GRAPH_H #include "framework/graph/graph_base.h" #include "framework/graph/node.h" @@ -22,6 +22,7 @@ #include "framework/graph/llvm/virtual_graph.h" #include "framework/core/thread_safe_macros.h" #include "framework/graph/graph_global_mem.h" +#include "framework/core/net/calibrator_parse.h" namespace anakin { @@ -32,22 +33,24 @@ namespace graph { * public inherit GraphBase */ template -class Graph : public GraphBase, +class Graph : public GraphBase, Edge > { public: - Graph():GraphBase, + typedef Arc_iterator, Edge > Edge_it_t; +public: + Graph():GraphBase, Edge >() {} - Graph(size_t size):GraphBase, + Graph(size_t size):GraphBase, Edge >(size) {} ~Graph() { - if(_vgraph) { + if (_vgraph) { delete _vgraph; _vgraph = nullptr; } @@ -68,12 +71,14 @@ class Graph : public GraphBase& get_nodes_in_order(); @@ -82,25 +87,71 @@ class Graph : public GraphBase shape); void ResetBatchSize(std::string in_name, const int batch_size); + +public: + + /** + * \brief add operation manually + */ + Status AddOp(const std::string& name, const std::string& type, + const std::vector& inputs, + const std::vector& outputs); - /// change graph node and edge name to standard of c(or others)variable name - void change_name(); + /** + * \brief set operation's attributes manually + */ + template + Status AddOpAttr(const std::string& op_name, const std::string& attr_name, const T& attr_value); + + /** + * \brief register external block pointer + */ + Status RegistBlock(PBlock * block_p); + + /** + * \brief set operation's running precision manually + */ + Status SetOpPrec(const std::string& name, DataType dtype); + + /** + * \brief set operation's weights scale factor manually + */ + Status SetWeightsScale(const std::string& name, const std::vector& scales, bool is_bias); + + /** + * \brief set operation's variable scale factor manually + */ + Status SetVarScale(const std::string& var, float scale); + + /** + * \brief freeze the graph + * + * note: this function should only be used after AddOp is called + */ + Status Freeze(); + + /** + * \brief register variable with corresponding edges + * + * note: this api should only be called before Freeze() + */ + Status RegistVar(const std::string& var); public: - /** + /** * \brief register out * - * Note: + * Note: * The outs is the same as edge weight from node_bottom_name to node_top_name * When register the out edge, all the fusion pattern that have the edge can't be combined * and maybe have an bad impact on the perfermance */ Status RegistOut(std::string node_bottom_name, std::string node_top_name); - - /** + + /** * \brief register all outs * - * Note: + * Note: * All the outs will be registered. * This api should be used when you test you model and want to test some edge's tensor inside the graph. 
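 *
 * An illustrative sketch of how the manual graph-building entry points declared
 * above (AddOp / AddOpAttr / RegistBlock / SetOpPrec / SetWeightsScale / SetVarScale /
 * RegistVar / Freeze) are expected to chain together. The template arguments, op
 * types, attribute names and scale values below are assumptions for illustration
 * only and are not taken from this patch:
 *
 *     Graph<Target, Ptype> graph;                      // substitute the build's target/precision arguments
 *     graph.AddOp("conv1", "Convolution", {"image"}, {"conv1_out"});
 *     graph.AddOp("relu1", "ReLU", {"conv1_out"}, {"relu1_out"});
 *     graph.AddOpAttr("conv1", "group", 1);            // hypothetical attribute
 *     graph.SetOpPrec("conv1", AK_INT8);               // optional per-op precision
 *     graph.SetWeightsScale("conv1", {0.05f}, false);  // optional weight scales
 *     graph.SetVarScale("conv1_out", 0.017f);          // optional per-variable scale
 *     graph.RegistVar("relu1_out");                    // must be called before Freeze()
 *     graph.Freeze();                                  // derives Input/Output/Split nodes and wires the edges
 *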
*/ @@ -108,7 +159,7 @@ class Graph : public GraphBase& graph); + //get all edge scales in graph + std::unordered_map> + get_scale_map(); + std::unordered_map + get_layout_map(); + ///< statistics stand for Statistics info of anakin graph Statistics statistics; @@ -135,9 +192,9 @@ class Graph : public GraphBase _ins; - ///< graph output node name - std::vector _outs; + std::vector _ins; + ///< graph output node name + std::vector _outs; ///< graph node execute list std::vector _nodes_exec_order; ///< node_merges map: target node map to all its fusion node @@ -151,16 +208,27 @@ class Graph : public GraphBase> _registed_outs; + ///< temporary map for node inputs and outputs + std::unordered_map > node_ins; + std::unordered_map > node_outs; private: /// this used to holder the name of target parsed model. std::string _model_path{"None"} GUARDED_BY(this->_mut); /// this make the graph optimized. - bool _has_graph_optimized{false}; GUARDED_BY(this->_mut); + bool _has_graph_optimized{false} GUARDED_BY(this->_mut); std::mutex _mut; -}; - +}; +template +template +Status Graph::AddOpAttr(const std::string& op_name, + const std::string& attr_name, const T& attr_value) { + if (this->has_vertex(op_name)) { + return (*this)[op_name]->set_attr(attr_name, attr_value); + } + return Status::ANAKINFAIL((op_name+std::string(" op doesn't exist!")).c_str()); +} } /* graph */ diff --git a/framework/graph/graph_base.inl b/framework/graph/graph_base.inl index e188da1db..b81febb11 100644 --- a/framework/graph/graph_base.inl +++ b/framework/graph/graph_base.inl @@ -1,3 +1,5 @@ +#include "framework/graph/node.h" +#include "saber/core/tensor.h" namespace anakin { namespace graph { @@ -16,7 +18,7 @@ GraphBase::GraphBase(size_t siz template GraphBase::~GraphBase() { - all_clear(); + all_clear(); delete Scanner; Scanner = nullptr; } @@ -33,6 +35,27 @@ void GraphBase::vertices_clear( _vertices.clear(); } +template <> +inline void GraphBase, Tensor4dPtr, Edge>::vertices_clear(){ + for (auto iter=_vertices.begin(); iter != _vertices.end(); iter++){ + if(iter->second.use_count()>1){ + LOG(INFO)<<"force destory node "<first<<",count = "<second.use_count(); +// delete iter->second.get(); + } + } + _vertices.clear(); +}; +template <> +inline void GraphBase, Tensor4dPtr, Edge>::vertices_clear(){ + for (auto iter=_vertices.begin(); iter != _vertices.end();iter++){ + if(iter->second.use_count()>1){ + LOG(INFO)<<"force destory node "<first<<",count = "<second.use_count(); +// delete iter->second.get(); + } + } + _vertices.clear(); +}; + template void GraphBase::all_clear() { arcs_clear(); @@ -73,7 +96,8 @@ template::add_in_arc(ArcType& arc) { if(!this->has_arc(arc)){ _arcs.push_back(arc); - CHECK(this->has_vertex(arc.bottom()) && this->has_vertex(arc.top())) << " The arc's top or bottom is not vertex! "; + CHECK(this->has_vertex(arc.bottom()) && this->has_vertex(arc.top())) + << " The arc("<< arc.bottom() <<", "<< arc.top() << ")'s top or bottom is not vertex! 
"; } Arc_iterator arc_iterator = find(arc.bottom(), arc.top()); auto top_in_arcs = _graph_in_arcs[arc.top()]; @@ -329,6 +353,9 @@ VertexType& GraphBase::operator template inline std::string GraphBase::to_string() { +#ifdef USE_SGX + return "GrahBase.to_string() not implemented in SGX mode"; +#else std::ostringstream vertices_ss; vertices_ss << "Graph infrastructure: \n-- Vertices: (sum " << size() << ") \n"; int index = 0; @@ -343,7 +370,8 @@ inline std::string GraphBase::t for(; it!=it_end; it++) { arcs_ss << " |-- (arc: " << it->bottom() << " --> " << it->top() << ") \n"; } - return vertices_ss.str() + arcs_ss.str(); + return vertices_ss.str() + arcs_ss.str(); +#endif } diff --git a/framework/graph/graph_global_mem.h b/framework/graph/graph_global_mem.h index 98448ef63..ca3339d59 100644 --- a/framework/graph/graph_global_mem.h +++ b/framework/graph/graph_global_mem.h @@ -5,29 +5,34 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_GRAPH_GLOBAL_MEM_H -#define ANAKIN_GRAPH_GLOBAL_MEM_H +#define ANAKIN_GRAPH_GLOBAL_MEM_H #include -#include #include "framework/core/singleton.h" #include "framework/core/parameter.h" #include "utils/logger/logger.h" +#include +#include "anakin_config.h" + +#ifdef USE_SGX +#include +#endif namespace anakin { using namespace saber; /** -* \brief global resource level +* \brief global resource level */ enum Level { Level_0 = 0, @@ -35,7 +40,7 @@ enum Level { Level_2, Level_3, Level_4, - Level_5 + Level_5 }; namespace graph { @@ -53,20 +58,17 @@ struct LevelStage { * \brief global resource multi level stage and restraint */ template -struct GlobalResRestrain : public LevelStage... { - GlobalResRestrain() {} - GlobalResRestrain& operator=(const GlobalResRestrain& other){ - return *this; - } - +struct GlobalResRestrain : public LevelStage ... { template - std::mutex& get_mut() { + std::mutex &get_mut() { return LevelStage::_mut; } + template - bool& check_access() { + bool &check_access() { return LevelStage::accessible; } + template void use() { LevelStage::accessible = false; @@ -78,66 +80,88 @@ struct GlobalResRestrain : public LevelStage... 
{ */ template class GraphGlobalMemBase { +private: + typedef GlobalResRestrain LevelList; + + static inline std::unique_ptr make_lock() noexcept { + return std::unique_ptr(new LevelList()); + } + public: - GraphGlobalMemBase() {} + GraphGlobalMemBase() { + _res_guard.emplace(nullptr, make_lock()); + } + ~GraphGlobalMemBase() {} /// create Block memory template - PBlock* new_block(saber::Shape& shape) EXCLUSIVE_LOCKS_REQUIRED(_mut) { - std::unique_lock lock(this->_mut); - PBlock* block_p = new PBlock(shape, Dtype); + PBlock *new_block(saber::Shape &shape) EXCLUSIVE_LOCKS_REQUIRED(_mut) { + std::unique_lock lock(this->_mut); + PBlock *block_p = new PBlock(shape, Dtype); // register new block_p for resource guard - _res_guard[block_p->h_tensor().data()] = LevelList(); - _push_mem_pool(block_p, DataTypeWarpper()); + _res_guard[block_p->d_tensor().data()].reset(new LevelList()); + _push_mem_pool(block_p, DataTypeWarpper()); return block_p; } + /// register external block + void register_block(PBlock * block_p) EXCLUSIVE_LOCKS_REQUIRED(_mut) { + std::unique_lock lock(this->_mut); + _res_guard[block_p->d_tensor().data()].reset(new LevelList()); + // we don't push block to the mem pool when use this api + //_push_mem_pool(block_p, DataTypeWarpper()); + } + /// apply arbitrary function to two memory block /// note: that args may contain target PBlock pointer /// so we need to set mutex for mem management template - void apply(functor func, PBlock tensor_1 , PBlock tensor_2, ParamTypes ...args) { + void apply(functor func, PBlock tensor_1, PBlock tensor_2, ParamTypes &&...args) { std::unique_lock lock(this->_mut); - void* key_1 = tensor_1.h_tensor().data(); - void* key_2 = tensor_1.h_tensor().data(); - if(_res_guard[key_1].template check_access()) { - std::unique_lock lock(_res_guard[key_1].template get_mut()); - _res_guard[key_1].template use(); - _res_guard[key_2].template use(); + void *key_1 = tensor_1.d_tensor().data(); + void *key_2 = tensor_2.d_tensor().data(); + if (_res_guard[key_1]->template check_access() && _res_guard[key_2]->template check_access()) { + std::unique_lock lock1(_res_guard[key_1]->template get_mut()); + if(key_1 != key_2) { + std::unique_lock lock2(_res_guard[key_2]->template get_mut()); + } + _res_guard[key_1]->template use(); + _res_guard[key_2]->template use(); func(tensor_1, tensor_2, std::forward(args)...); - void* new_key_1 = tensor_1.h_tensor().data(); - void* new_key_2 = tensor_2.h_tensor().data(); - if(new_key_1 != key_1) { - _res_guard[new_key_1] = _res_guard[key_1]; - if(_res_guard.erase(key_1) != 1) { // delete old key-vale - LOG(FATAL) << "target key_1(" << key_1 << ") doesn't exist."; + void *new_key_1 = tensor_1.d_tensor().data(); + void *new_key_2 = tensor_2.d_tensor().data(); + if (new_key_1 != key_1) { + _res_guard.emplace(new_key_1, make_lock()).first->second.swap(_res_guard[key_1]); + if (key_1 && _res_guard.erase(key_1) != 1) { // delete old key-vale + LOG(FATAL) << "target key_1(" << key_1 << ") doesn't exist."; } } - if(new_key_2 != key_2) { - _res_guard[new_key_2] = _res_guard[key_2]; - if(_res_guard.erase(key_2) != 1) { // delete old key-vale - LOG(FATAL) << "target key_2(" << key_2 << ") doesn't exist."; + if (new_key_2 != key_2) { + _res_guard.emplace(new_key_2, make_lock()).first->second.swap(_res_guard[key_2]); + if (key_2 && _res_guard.erase(key_2) != 1) { // delete old key-vale + LOG(FATAL) << "target key_2(" << key_2 << ") doesn't exist."; } } } } + /// apply arbitrary function to one memory block /// note: that args may contain target 
PBlock pointer /// so we need to set mutex for mem management template - void apply(functor func, PBlock tensor , ParamTypes ...args) { + void apply(functor func, PBlock tensor, ParamTypes &&...args) { std::unique_lock lock(this->_mut); - void* key = tensor.h_tensor().data(); - if(_res_guard[key].template check_access()) { - std::unique_lock lock(_res_guard[key].template get_mut()); - _res_guard[key].template use(); + void *key = tensor.d_tensor().data(); + if (_res_guard[key]->template check_access()) { + std::unique_lock lock(_res_guard[key]->template get_mut()); + _res_guard[key]->template use(); func(tensor, std::forward(args)...); - void* new_key = tensor.data(); - if(new_key != key) { - _res_guard[new_key] = _res_guard[key]; - if(_res_guard.erase(key) != 1) { // delete old key-vale - LOG(FATAL) << "target key(" << key << ") doesn't exist."; + void *new_key = tensor.data(); + if (new_key != key) { + _res_guard.emplace(new_key, make_lock()).first->second.swap(_res_guard[key]); + if (key && _res_guard.erase(key) != 1) { // delete old key-vale + LOG(FATAL) << "target key(" << key << ") doesn't exist."; } } } @@ -147,67 +171,73 @@ class GraphGlobalMemBase { /// note: that args may contain target PBlock pointer /// so we need to set mutex for mem management template - void apply(functor func, Tensor4d& tensor , ParamTypes ...args) { + void apply(functor func, Tensor4d &tensor, ParamTypes &&...args) { std::unique_lock lock(this->_mut); - void* key = tensor.data(); - if(_res_guard[key].template check_access()) { - std::unique_lock lock(_res_guard[key].template get_mut()); - _res_guard[key].template use(); + void *key = tensor.data(); + if (_res_guard[key]->template check_access()) { + std::unique_lock lock(_res_guard[key]->template get_mut()); + _res_guard[key]->template use(); func(tensor, std::forward(args)...); - void* new_key = tensor.data(); // check if tensor data has changed - if(key != new_key) { - _res_guard[new_key] = _res_guard[key]; - if(_res_guard.erase(key) != 1) { // delete old key-vale - LOG(FATAL) << "target key(" << key << ") doesn't exist."; + void *new_key = tensor.data(); // check if tensor data has changed + if (key != new_key) { + _res_guard.emplace(new_key, make_lock()).first->second.swap(_res_guard[key]); + if (key && _res_guard.erase(key) != 1) { // delete old key-vale + LOG(FATAL) << "target key(" << key << ") doesn't exist."; } } } - if(key == nullptr) { + if (key == nullptr) { func(tensor, std::forward(args)...); } } -template -void apply(functor func, Tensor4d& tensor1 , Tensor4d& tensor2, ParamTypes ...args) { + + template + void apply(functor func, Tensor4d &tensor1, Tensor4d &tensor2, ParamTypes &&...args) { std::unique_lock lock(this->_mut); - void* key1 = tensor1.data(); - void* key2 = tensor2.data(); - if (_res_guard[key1].template check_access()) { - std::unique_lock lock(_res_guard[key1].template get_mut()); - _res_guard[key1].template use(); - _res_guard[key2].template use(); - func(tensor1, tensor2, std::forward(args)...); - void* new_key1 = tensor1.data(); // check if tensor data has changed - void* new_key2 = tensor2.data(); // check if tensor data has changed - if (key1 != new_key1) { - _res_guard[new_key1] = _res_guard[key1]; - if (_res_guard.erase(key1) != 1) { // delete old key-vale - LOG(FATAL) << "target key(" << key1 << ") doesn't exist."; + void *key1 = tensor1.data(); + void *key2 = tensor2.data(); + if(_res_guard.count(key1) > 0 && _res_guard.count(key2) > 0) { + if (_res_guard[key1]->template check_access() || _res_guard[key2]->template 
check_access()) { + std::unique_lock lock1(_res_guard[key1]->template get_mut()); + if (key2 != key1) { + std::unique_lock lock2(_res_guard[key2]->template get_mut()); } - } - if (key2 != new_key2) { - _res_guard[new_key2] = _res_guard[key2]; - if (_res_guard.erase(key2) != 1) { // delete old key-vale - LOG(FATAL) << "target key(" << key2 << ") doesn't exist."; + _res_guard[key1]->template use(); + _res_guard[key2]->template use(); + func(tensor1, tensor2, std::forward(args)...); + void *new_key1 = tensor1.data(); // check if tensor data has changed + void *new_key2 = tensor2.data(); // check if tensor data has changed + if (key1 != new_key1) { + _res_guard.emplace(new_key1, make_lock()).first->second.swap(_res_guard[key1]); + if (key1 && _res_guard.erase(key1) != 1) { // delete old key-vale + LOG(FATAL) << "target key(" << key1 << ") doesn't exist."; + } + } + if (key2 != new_key2) { + _res_guard.emplace(new_key2, make_lock()).first->second.swap(_res_guard[key2]); + if (key2 && _res_guard.erase(key2) != 1) { // delete old key-vale + LOG(FATAL) << "target key(" << key2 << ") doesn't exist."; + } } } } if (key1 == nullptr && key2 == nullptr) { func(tensor1, tensor2, std::forward(args)...); } -} + } /// get sum size in m-btyes size_t get_sum_mbyte() EXCLUSIVE_LOCKS_REQUIRED(_mut) { - std::unique_lock lock(this->_mut); + std::unique_lock lock(this->_mut); size_t sum = 0; for (auto block_p : _int8_mem_pool) { sum += block_p->count(); } for (auto block_p : _fp16_mem_pool) { - sum += block_p->count()*2; + sum += block_p->count() * 2; } for (auto block_p : _fp32_mem_pool) { - sum += block_p->count()*4; + sum += block_p->count() * 4; } return sum / 1e6; } @@ -215,15 +245,15 @@ void apply(functor func, Tensor4d& tensor1 , Tensor4d& tensor2, Pa /// clean all void clean_all() EXCLUSIVE_LOCKS_REQUIRED(_mut) { std::unique_lock lock(this->_mut); - for(auto block_p : _int8_mem_pool) { + for (auto block_p : _int8_mem_pool) { delete block_p; } _int8_mem_pool.clear(); - for(auto block_p : _fp16_mem_pool) { + for (auto block_p : _fp16_mem_pool) { delete block_p; } _fp16_mem_pool.clear(); - for(auto block_p : _fp32_mem_pool) { + for (auto block_p : _fp32_mem_pool) { delete block_p; } _fp32_mem_pool.clear(); @@ -234,16 +264,18 @@ void apply(functor func, Tensor4d& tensor1 , Tensor4d& tensor2, Pa size_t get_pool_size() { return _get_pool_size(DataTypeWarpper()); } private: - /// push int8_mem operaiton - void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { + /// push int8_mem operaiton + void _push_mem_pool(PBlock *block_p, DataTypeWarpper) { _int8_mem_pool.push_back(block_p); } - /// push fp16_mem operaiton - void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { + + /// push fp16_mem operaiton + void _push_mem_pool(PBlock *block_p, DataTypeWarpper) { _fp16_mem_pool.push_back(block_p); } - /// push fp32_mem operaiton - void _push_mem_pool(PBlock* block_p, DataTypeWarpper) { + + /// push fp32_mem operaiton + void _push_mem_pool(PBlock *block_p, DataTypeWarpper) { _fp32_mem_pool.push_back(block_p); } @@ -251,24 +283,25 @@ void apply(functor func, Tensor4d& tensor1 , Tensor4d& tensor2, Pa size_t _get_pool_size(DataTypeWarpper) { return _int8_mem_pool.size(); } + /// get fp16_mem pool size size_t _get_pool_size(DataTypeWarpper) { return _fp16_mem_pool.size(); } + /// get fp32_mem pool size size_t _get_pool_size(DataTypeWarpper) { return _fp32_mem_pool.size(); } private: - typedef GlobalResRestrain LevelList; - std::unordered_map _res_guard; + std::unordered_map> _res_guard; ///< _int8_mem_pool stand for int8 
type memory - std::vector* > _int8_mem_pool GUARDED_BY(_mut); + std::vector *> _int8_mem_pool GUARDED_BY(_mut); ///< _fp16_mem_pool stand for fp16 type memory - std::vector* > _fp16_mem_pool GUARDED_BY(_mut); + std::vector *> _fp16_mem_pool GUARDED_BY(_mut); ///< _fp32_mem_pool stand for fp32 type memory - std::vector* > _fp32_mem_pool GUARDED_BY(_mut); + std::vector *> _fp32_mem_pool GUARDED_BY(_mut); ///< _mut std::mutex _mut; }; @@ -277,11 +310,11 @@ void apply(functor func, Tensor4d& tensor1 , Tensor4d& tensor2, Pa template using GraphGlobalMem = Singleton>; -/** +/** * \brief InFO enum - * using number to stand for memory and other info of anakin + * using number to stand for memory and other info of anakin */ -enum INFO{ +enum INFO { TEMP_MEM = 0, ///< 0 stand for TEMP_MEM ORI_TEMP_MEM, ///< 1 stand for ORI_TEMP_MEM MODEL_MEM, ///< 2 stand for MODEL_MEM @@ -290,14 +323,15 @@ enum INFO{ }; template -struct Decide{ - typedef int type; +struct Decide { + typedef float type; }; template<> struct Decide { typedef bool type; }; + /** * \brief Statistics struct * used for memory information set and get @@ -307,27 +341,33 @@ struct Statistics { void set_info(typename Decide::type value) { _set_info(value, Info_to_type()); } - + template typename Decide::type get_info() { return _get_info(Info_to_type()); } + private: template - struct Info_to_type {}; + struct Info_to_type { + }; inline void _set_info(int mem_in_mbytes, Info_to_type) { temp_mem_used = mem_in_mbytes; } + inline void _set_info(int mem_in_mbytes, Info_to_type) { original_temp_mem_used = mem_in_mbytes; } + inline void _set_info(int mem_in_mbytes, Info_to_type) { model_mem_used = mem_in_mbytes; } + inline void _set_info(int mem_in_mbytes, Info_to_type) { system_mem_used = mem_in_mbytes; } + inline void _set_info(bool whether_optimized, Info_to_type) { is_optimized = whether_optimized; } @@ -335,15 +375,19 @@ struct Statistics { inline typename Decide::type _get_info(Info_to_type) { return temp_mem_used; } + inline typename Decide::type _get_info(Info_to_type) { return original_temp_mem_used; } + inline typename Decide::type _get_info(Info_to_type) { return model_mem_used; } + inline typename Decide::type _get_info(Info_to_type) { return system_mem_used; } + inline typename Decide::type _get_info(Info_to_type) { return is_optimized; } diff --git a/framework/graph/llvm/fusion/fusion_op_register.cpp b/framework/graph/llvm/fusion/fusion_op_register.cpp index 0c4a8a6d9..31be275bf 100644 --- a/framework/graph/llvm/fusion/fusion_op_register.cpp +++ b/framework/graph/llvm/fusion/fusion_op_register.cpp @@ -1,5 +1,5 @@ #include "framework/graph/llvm/fusion/graph_pattern.h" - +#include "anakin_config.h" namespace anakin { namespace graph { @@ -13,6 +13,36 @@ REGISTER_GRAPH_FUSION_PATTERN(DeconvRelu) .AddConnect("conv_0", "relu_0") .CreatePattern([](VGraph* graph) {}); +//* +REGISTER_GRAPH_FUSION_PATTERN(DeconvBatchnormScaleRelu) +.Type(IN_ORDER) +.AddOpNode("conv_0", "Deconvolution") +.AddOpNode("batchnorm_0", "BatchNorm") +.AddOpNode("scale_0", "Scale") +.AddOpNode("relu_0", "ReLU") +.AddConnect("conv_0", "batchnorm_0") +.AddConnect("batchnorm_0", "scale_0") +.AddConnect("scale_0", "relu_0") +.CreatePattern([](VGraph* graph) {}); + +REGISTER_GRAPH_FUSION_PATTERN(DeconvBatchnormScale) +.Type(IN_ORDER) +.AddOpNode("conv_0", "Deconvolution") +.AddOpNode("batchnorm_0", "BatchNorm") +.AddOpNode("scale_0", "Scale") +.AddConnect("conv_0", "batchnorm_0") +.AddConnect("batchnorm_0", "scale_0") +.CreatePattern([](VGraph* graph) {}); + +//* 
+REGISTER_GRAPH_FUSION_PATTERN(DeconvBatchnorm) +.Type(IN_ORDER) +.AddOpNode("conv_0", "Deconvolution") +.AddOpNode("batchnorm_0", "BatchNorm") +.AddConnect("conv_0", "batchnorm_0") +.CreatePattern([](VGraph* graph) {}); +//*/ + REGISTER_GRAPH_FUSION_PATTERN(ConvRelu) .Type(IN_ORDER) .AddOpNode("conv_0", "Convolution") @@ -36,6 +66,7 @@ REGISTER_GRAPH_FUSION_PATTERN(ConvReluPool) .AddConnect("relu_0", "pooling_0") .CreatePattern([](VGraph* graph) {}); + REGISTER_GRAPH_FUSION_PATTERN(ConvBatchnormScaleReluPool) .Type(IN_ORDER) .AddOpNode("conv_0", "Convolution") @@ -67,6 +98,22 @@ REGISTER_GRAPH_FUSION_PATTERN(ConvBatchnormScale) .AddOpNode("scale_0", "Scale") .AddConnect("conv_0", "batchnorm_0") .AddConnect("batchnorm_0", "scale_0") +.CreatePattern([](VGraph* graph) {}); + +REGISTER_GRAPH_FUSION_PATTERN(ConvScale) +.Type(IN_ORDER) +.AddOpNode("conv_0", "Convolution") +.AddOpNode("scale_0", "Scale") +.AddConnect("conv_0", "scale_0") +.CreatePattern([](VGraph* graph) {}); + +REGISTER_GRAPH_FUSION_PATTERN(ConvScaleRelu) +.Type(IN_ORDER) +.AddOpNode("conv_0", "Convolution") +.AddOpNode("scale_0", "Scale") +.AddOpNode("relu_0", "ReLU") +.AddConnect("conv_0", "scale_0") +.AddConnect("scale_0", "relu_0") .CreatePattern([](VGraph* graph) {}); //* @@ -91,6 +138,31 @@ REGISTER_GRAPH_FUSION_PATTERN(EltwiseActivation) .AddConnect("eltwise_0", "prelu_0") .CreatePattern([](VGraph* graph) {}); +REGISTER_GRAPH_FUSION_PATTERN(ConvAffineChannel) +.Type(IN_ORDER) +.AddOpNode("conv_0", "Convolution") +.AddOpNode("affine_channel_0", "AffineChannel") +.AddConnect("conv_0", "affine_channel_0") +.CreatePattern([](VGraph* graph) {}); + +REGISTER_GRAPH_FUSION_PATTERN(ConvAffineChannelRelu) +.Type(IN_ORDER) +.AddOpNode("conv_0", "Convolution") +.AddOpNode("affine_channel_0", "AffineChannel") +.AddOpNode("relu_0", "ReLU") +.AddConnect("conv_0", "affine_channel_0") +.AddConnect("affine_channel_0", "relu_0") +.CreatePattern([](VGraph* graph) {}); + +REGISTER_GRAPH_FUSION_PATTERN(SeqConcatSeqPoolSoftSign) +.Type(IN_ORDER) +.AddOpNode("seq_concat_0", "SequenceConcat") +.AddOpNode("seq_pool_0", "SequencePool") +.AddOpNode("soft_sign_0", "SoftSign") +.AddConnect("seq_concat_0", "seq_pool_0") +.AddConnect("seq_pool_0", "soft_sign_0") +.CreatePattern([](VGraph* graph) {}); + } /* namespace graph */ } /* namespace anakin */ diff --git a/framework/graph/llvm/fusion/graph_pattern.cpp b/framework/graph/llvm/fusion/graph_pattern.cpp index 89af9898a..1d1b4559e 100644 --- a/framework/graph/llvm/fusion/graph_pattern.cpp +++ b/framework/graph/llvm/fusion/graph_pattern.cpp @@ -78,17 +78,21 @@ std::unordered_map, FusionHash> Fu vgraph->remove(node_temp.name); } + auto old_bottom = vgraph_next_node.name; for (int tops_idx = 0; tops_idx < pattern_tops.size(); tops_idx++) { Arc arc(node_merge.name, pattern_tops[tops_idx]); auto& io_tmp = arc.weight(); io_tmp.name = arc.name(); vgraph->add_out_arc(arc); + //here,we record the map from origin edge to new edge after fusion + std::string old_e = old_bottom + "_" + pattern_tops[tops_idx]; + std::string new_e = node_merge.name + "_" + pattern_tops[tops_idx]; + vgraph->add_fusion_edge_map(new_e, old_e); } node_merge.mergeNodeNames = pattern_node_name_saves; param_node = node_merge; - return 0; } else { return 0; // continue searching diff --git a/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.cpp b/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.cpp index 91d2e7e8c..a4bcbca64 100644 --- a/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.cpp +++ 
b/framework/graph/llvm/optimizer/conv_elewise_fusion_scheduler.cpp @@ -48,7 +48,7 @@ bool ConvElsFusionScheduler::callable(node& node_arg) { } _helper.set_holder(io_in, _vgraph);*/ //_helper.register_pair(node_arg.name, node_next.name); - if ((*_vgraph)[it->bottom()].opName == "Split") { + if ((*_vgraph)[it->bottom()].opName == "Split" || !_helper.has_node((*_vgraph)[it->bottom()])) { _helper.register_pair(node_arg.name, node_next.name); _force_order[node_arg.name] = (*_vgraph)[it->bottom()]; /* diff --git a/framework/graph/llvm/optimizer/memory_scheduler.cpp b/framework/graph/llvm/optimizer/memory_scheduler.cpp index e614e13b0..e4c4ad794 100644 --- a/framework/graph/llvm/optimizer/memory_scheduler.cpp +++ b/framework/graph/llvm/optimizer/memory_scheduler.cpp @@ -1,10 +1,25 @@ #include "framework/graph/llvm/optimizer/memory_scheduler.h" +#include namespace anakin { namespace graph { void IOBlockResource::reg_self_lock_tree(io& io_in, std::vector& io_out) { + // When traversing the graph in BFS, the sharing relationship + // needs to be completely recorded in the same tree, + // otherwise the release order may be error. + for (auto it = _self_lock_next_tree.begin(); it != _self_lock_next_tree.end(); it++) { + auto& io_vec = it->second; + for (auto io_out_existed : io_vec) { + if (io_in.name == io_out_existed.name) { + auto io_out_new = _self_lock_next_tree[it->first]; + io_out_new.insert(io_out_new.end(), io_out.begin(), io_out.end()); + _self_lock_next_tree[io_in] = io_out_new; + return; + } + } + } if (_self_lock_next_tree.count(io_in) <= 0) { _self_lock_next_tree[io_in] = io_out; } else { @@ -29,7 +44,7 @@ void IOBlockResource::rm_self_lock_tree(io& io_in) { } } -void IOBlockResource::free_self(std::vector& self_shared_edges, VGraph* vgraph_p) { +void IOBlockResource::free_self(std::vector& self_shared_edges, VGraph* vgraph_p, MemoryScheduler* mem_scher = nullptr) { for (auto& io : self_shared_edges) { rm_self_lock_tree(io); } @@ -40,7 +55,7 @@ void IOBlockResource::free_self(std::vector& self_shared_edges, VGraph* vgra } else { if (_self_lock_next_tree[*it].size() == 0) { //_free.push(*it); - push_free(*it, vgraph_p); + push_free(*it, vgraph_p, mem_scher); it = _lock.erase(it); } else { ++it; @@ -89,7 +104,7 @@ bool IOBlockResource::is_same_target(io& one, io& two, VGraph* vgraph_p) { return false; } -void IOBlockResource::push_free(io& io_free, VGraph* vgraph_p) { +void IOBlockResource::push_free(io& io_free, VGraph* vgraph_p, MemoryScheduler* mem_scher = nullptr) { bool io_free_have_regist = false; for (auto it = _free.begin(); it != _free.end();) { @@ -101,18 +116,20 @@ void IOBlockResource::push_free(io& io_free, VGraph* vgraph_p) { } if (!io_free_have_regist) { - _free.push_back(io_free); + if(!mem_scher->is_target_fixed(io_free)) { + _free.push_back(io_free); + } } } -void IOBlockResource::free(std::vector& io_vec, VGraph* vgraph_p) { +void IOBlockResource::free(std::vector& io_vec, VGraph* vgraph_p, MemoryScheduler* mem_scher = nullptr) { for (auto& io_res : io_vec) { for (auto it = _lock.begin(); it != _lock.end();) { io tmp_io; tmp_io.name = io_res.name; if ((*it) == tmp_io) { - push_free(*it, vgraph_p); + push_free(*it, vgraph_p, mem_scher); it = _lock.erase(it); } else { ++it; @@ -167,7 +184,107 @@ void IOBlockResource::map_ios_to_vgraph(std::vector& io_vec, VGraph* vgraph_ vgraph_p->Scanner->BFS_Edge(replace_arc); } } +void MemoryScheduler::Run(){ + //first, we need to get scheduled order of node + auto node_order = _vgraph -> get_exec_order(); + this->_wait_que.clear(); + 
for (int i=0; i < node_order.size(); ++i){ + auto node_arg = (*_vgraph)[node_order[i]]; + this->wait_push(node_arg); + } + + while (!(this->_wait_que.empty())) { + // launch the accessible ops and remove them from the wait queue. + for (auto op_it = this->_wait_que.begin(); op_it != this->_wait_que.end();) { + if (callable(*op_it)) { + launch(*op_it); + op_it = this->_wait_que.erase(op_it); + } else { + ++op_it; + } + } + } + // try to check whether the graph has tensors scheduled in the wrong order + // ** if the graph is correctly scheduled, this function will do nothing + check_memory(); +} + + +// check if memory is shared in the wrong order +/* brief: this function checks whether some nodes' compute order would wrongly overwrite shared tensors +// if the graph has a correct compute order, this function does nothing +// if this check ever has to intervene, the model would have been computed incorrectly without it +*/ +void MemoryScheduler::check_memory(){ + auto node_order = _vgraph -> get_exec_order(); + //for (int i=0; i< node_order.size(); ++i){ + // LOG(ERROR) << "check_memory: " << node_order[i]; + //} + auto connect_table = _vgraph -> connect_table(); + + // check each node's input tensors + auto check_node = [&](node& node_arg){ + int i = 0; + while (node_order[i] != node_arg.name){ + if (connect_table[{node_order[i], node_arg.name}] || + connect_table[{node_arg.name, node_order[i]}]){ + ++i; + continue; + } + auto in_edge_its = _vgraph -> get_in_arc_its(node_arg.name); + auto out_edge_its = _vgraph -> get_out_arc_its(node_order[i]); + if (in_edge_its.size() == 1 && out_edge_its.size() == 1){ + auto in_io = in_edge_its[0]; + auto out_io = out_edge_its[0]; + + // check whether out_io's top node runs before in_io's bottom node + int topi = 0; + bool top_check = false; + while (node_order[topi] != in_io->bottom()){ + if (out_io->top() == node_order[topi]){ + top_check = true; + } + ++topi; + } + // if the order is really wrong, we correct it.
+ if (!top_check && out_io->weight().shared && + (in_io->weight().name == out_io->weight().share_from || + in_io->weight().share_from == out_io->weight().share_from)){ + out_io->weight().shared = false; + LOG(WARNING) << "checked wrong order: " << in_io->weight().name << + "-->" << out_io->weight().name; + //set all output edge need self shared + if (check_self_shared_str((*_vgraph)[out_io->top()].opName)){ + //for recurisive + std::stack connect_nodes; + connect_nodes.push(out_io->top()); + while (!connect_nodes.empty()){ + auto& curnode = connect_nodes.top(); + connect_nodes.pop(); + auto out_edges = _vgraph -> get_out_arc_its(curnode); + for (int i = 0; i < out_edges.size(); ++i){ + if (check_self_shared_str((*_vgraph)[out_edges[i]->top()].opName)){ + connect_nodes.push(out_edges[i]->top()); + } + LOG(ERROR) << "follow correct order: " << out_edges[i]->weight().name; + out_edges[i]->weight().share_from = out_io->weight().name; + } + + } + } + + + } + } + + ++i; + } + }; + + _vgraph -> Scanner -> BFS(check_node); + +} void MemoryScheduler::launch(node& node_arg) { this->exe_push(node_arg); auto& node_arc_out_its = _vgraph->get_out_arc_its(node_arg.name); @@ -289,13 +406,13 @@ void MemoryScheduler::launch(node& node_arg) { } if (node_arg.opName != "Output") { - _io_block_res.free(io_in, _vgraph); + _io_block_res.free(io_in, _vgraph, this); } std::vector self_shared_edges; if (_need_self_shared.last_op_is_self_shared(_vgraph, node_arg, self_shared_edges)) { - _io_block_res.free_self(self_shared_edges, _vgraph); + _io_block_res.free_self(self_shared_edges, _vgraph, this); } } } diff --git a/framework/graph/llvm/optimizer/memory_scheduler.h b/framework/graph/llvm/optimizer/memory_scheduler.h index e9aad3547..94e113ea0 100644 --- a/framework/graph/llvm/optimizer/memory_scheduler.h +++ b/framework/graph/llvm/optimizer/memory_scheduler.h @@ -77,6 +77,8 @@ struct check_self_shared { } }; +class MemoryScheduler; + /** * \brief io block resource class used for scheduler of VGraph memory usage */ @@ -85,7 +87,7 @@ class IOBlockResource { IOBlockResource() {} ~IOBlockResource() {} - void free(std::vector&, VGraph*); + void free(std::vector&, VGraph*, MemoryScheduler*); inline bool has_free(io& target) { for (auto it = _free.begin(); it != _free.end();) { auto& io_tmp = *it; @@ -109,14 +111,14 @@ class IOBlockResource { return io(); } bool is_same_target(io&, io&, VGraph*); - void push_free(io&, VGraph*); + void push_free(io&, VGraph*, MemoryScheduler*); void lock(std::vector&); bool is_locked(io&); inline void push_self_lock(io& io_tmp) { _self_lock.push_back(io_tmp);} void reg_self_lock_tree(io&, std::vector&); void rm_self_lock_tree(io&); bool is_in_self_tree(io&); - void free_self(std::vector&, VGraph*); + void free_self(std::vector&, VGraph*, MemoryScheduler*); void map_ios_to_vgraph(std::vector&, VGraph*); private: @@ -137,6 +139,22 @@ class MemoryScheduler : public Scheduler { /// launch operator and push op to execution queue virtual void launch(node&) final; + virtual void Run(); + void check_memory(); + bool check_self_shared_str(std::string str){ + std::vector ops{ + "Split", + "Reshape", + "Gather", + "Flatten" + }; + for (std::string type : ops){ + if (str == type){ + return true; + } + } + return false; + } /// set fix io void set_fix_io(std::vector&); @@ -146,6 +164,7 @@ class MemoryScheduler : public Scheduler { private: IOBlockResource _io_block_res; check_self_shared _need_self_shared; + std::map io_number_map; }; diff --git a/framework/graph/llvm/scheduler.cpp 
b/framework/graph/llvm/scheduler.cpp index 970f51249..a5050cffd 100644 --- a/framework/graph/llvm/scheduler.cpp +++ b/framework/graph/llvm/scheduler.cpp @@ -77,11 +77,12 @@ void Scheduler::Run() { } } } + auto exec_node_order = this->get_exec_node_in_order(); + _vgraph->set_exec_order(exec_node_order); } bool Scheduler::is_fixed(io& io_arg) { auto it = std::find(_fix_io_res.begin(), _fix_io_res.end(), io_arg); - if (it != _fix_io_res.end()) { return true; } @@ -89,6 +90,23 @@ bool Scheduler::is_fixed(io& io_arg) { return false; } +bool Scheduler::is_target_fixed(io& io_arg) { + io target_io = io_arg; + auto search_target = [&](Arc& arc) { + auto share_from = target_io.share_from; + if(arc.weight().name == share_from) { + target_io = arc.weight(); + return Status::EXIT(" Find the matched target arc io. "); + } + return Status::OK(); + }; + _vgraph->Scanner->BFS_Edge(search_target); + if(is_fixed(target_io)) { + return true; + } + return false; +} + std::vector Scheduler::get_exec_node_in_order() { auto& exec_node_in_order = this->get_exec_que(); std::vector ret; diff --git a/framework/graph/llvm/scheduler.h b/framework/graph/llvm/scheduler.h index 8ffeb6e06..0dffd61ae 100644 --- a/framework/graph/llvm/scheduler.h +++ b/framework/graph/llvm/scheduler.h @@ -71,6 +71,9 @@ class Scheduler : public ScheduleBase { /// check if io is fixed bool is_fixed(io&); + /// check if io's share_from target is fixed + bool is_target_fixed(io&); + /// ...TODO // public: diff --git a/framework/graph/llvm/virtual_graph.cpp b/framework/graph/llvm/virtual_graph.cpp index 07f692564..aa94e1cf1 100644 --- a/framework/graph/llvm/virtual_graph.cpp +++ b/framework/graph/llvm/virtual_graph.cpp @@ -19,6 +19,9 @@ std::string io::ToString() { } std::string node::ToString() { +#ifdef USE_SGX + return "node.ToString not supported in SGX mode"; +#else std::ostringstream msg; if (mergeNodes.size()) { @@ -34,6 +37,7 @@ std::string node::ToString() { } return msg.str(); +#endif } void VGraph::Match(VGraph* vgraph_pattern) { diff --git a/framework/graph/llvm/virtual_graph.h b/framework/graph/llvm/virtual_graph.h index 44b03bab8..3206a422b 100644 --- a/framework/graph/llvm/virtual_graph.h +++ b/framework/graph/llvm/virtual_graph.h @@ -180,11 +180,26 @@ class VGraph : public GraphBase { std::vector& get_exec_order() { return _nodes_exec_order; } + void add_fusion_edge_map(std::string new_e, std::string old_e){ + _fusion_edge_map[new_e] = old_e; + } + std::string get_fusion_old_edge( + std::string new_e){ + if (_fusion_edge_map.count(new_e) > 0){ + return _fusion_edge_map[new_e]; + } else { + //LOG(ERROR) << "fusion map has no key: " << new_e; + return ""; + } + } + private: ///< _registed_outs :outs that needs to be exported std::vector> _registed_outs; ///< node execute order std::vector _nodes_exec_order; + ///< origin edge map to new edge after fusion + std::unordered_map _fusion_edge_map; }; diff --git a/framework/graph/node.h b/framework/graph/node.h index 4f0c3f64a..2b0679c5b 100644 --- a/framework/graph/node.h +++ b/framework/graph/node.h @@ -5,22 +5,26 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_NODE_H -#define ANAKIN_NODE_H +#define ANAKIN_NODE_H #include "framework/graph/arc.h" #include "framework/core/any.h" #include "framework/core/base.h" #include "framework/core/parameter.h" - +#include +#include "anakin_config.h" +#ifdef USE_SGX +#include +#endif namespace anakin { /** @@ -37,7 +41,7 @@ namespace graph { /** * \brief struct of share information for weights */ -class WeightShareCell { +class WeightShareCell { public: WeightShareCell() {} ~WeightShareCell() {} @@ -52,20 +56,20 @@ class WeightShareCell { } void accept_share_pair(const std::string& weight_name, const std::string& share_from) { - if(!has_weight(weight_name)) { - _share_map[weight_name] = share_from; + if (!has_weight(weight_name)) { + _share_map[weight_name] = share_from; } } bool has_weight(const std::string& weight_name) { - auto it_end = _share_map.end(); + auto it_end = _share_map.end(); auto it_find = _share_map.find(weight_name); - if(it_find == it_end) { - return false; + if (it_find == it_end) { + return false; } return true; } -private: +private: std::unordered_map _share_map; }; @@ -75,18 +79,18 @@ class WeightShareCell { struct AttrInfo { public: AttrInfo() { - parameter_p = - std::make_shared >(); + parameter_p = + std::make_shared >(); } inline bool inspect(const std::string& attr_name) { - auto it_end = parameter_p->end(); - auto it_find = parameter_p->find(attr_name); - if(it_find != it_end) { - return true; - } - return false; - } + auto it_end = parameter_p->end(); + auto it_find = parameter_p->find(attr_name); + if (it_find != it_end) { + return true; + } + return false; + } template T get(const std::string& attr_name) { @@ -141,8 +145,8 @@ struct AttrInfo { auto it_end = operand.parameter_p->end(); for(auto it = it_begin; it != it_end; ++it ) { // operand name has been changed! - std::string new_name = pattern_name + "_" + it->first; - (*parameter_p)[new_name] = it->second; + std::string new_name = pattern_name + "_" + it->first; + (*parameter_p)[new_name] = it->second; } } @@ -187,14 +191,15 @@ class Edge : public Arc > { public: Edge():Arc >() {} Edge(const Edge& edge):Arc >(edge) { - _shared = edge._shared; - _share_from = edge._share_from; - _current_lane = edge._current_lane; + _shared = edge._shared; + _share_from = edge._share_from; + _current_lane = edge._current_lane; + _scale = edge._scale; } explicit Edge(std::string first, std::string second):Arc >(first, second) {} explicit Edge(std::string first, std::string second, TensorSharedPtr tensor_ptr) - :Arc >(first, second, tensor_ptr) {} + :Arc >(first, second, tensor_ptr) {} /// Get first node name of the edge. inline std::string& first() { return this->bottom(); } @@ -205,11 +210,23 @@ class Edge : public Arc > { /// get data weigts of the edge. inline TensorSharedPtr data() { return this->weight(); } + inline std::vector scale() const { return _scale; } + + inline void set_scale(const std::vector &scale) { + _scale = scale; + } + + inline saber::LayoutType layout() const {return _layout;} + + inline void set_layout(saber::LayoutType layout){ + _layout = layout; + } + /// If edge's data is shared from the others. 
bool& shared() { return _shared; } std::string& share_from() { return _share_from; } - + /// lane which edge reside in Lane& lane() { return _current_lane; } @@ -228,6 +245,8 @@ class Edge : public Arc > { _shared = edge._shared; _share_from = edge._share_from; _current_lane = edge._current_lane; + _scale = edge._scale; + _layout = edge._layout; Arc >::operator=(edge); } @@ -236,8 +255,14 @@ class Edge : public Arc > { bool _shared{false}; ///< _share_from :the tensor this edge share from std::string _share_from; - ///< _current_lane :Current lane the edge's data resides in. + ///< _current_lane :Current lane the edge's data resides in. Lane _current_lane; + // _scale: Transfer the scale passed by external parser to Net tensor. + std::vector _scale; + + //_layout: the layout from config + + saber::LayoutType _layout{Layout_NCHW}; }; /** @@ -247,11 +272,11 @@ class Node { public: Node() {} ~Node() { - if(_Op) { - delete _Op; - _Op = nullptr; - } - } + if (_Op) { + delete _Op; + _Op = nullptr; + } + } /// print message std::string DebugString(); @@ -266,22 +291,26 @@ class Node { /// Node operator OperatorBase* Op() { return _Op; } - /// set node operator void set_op(OperatorBase* other) { _Op = other; } /// Node need wait bool& need_wait() { return _need_wait; } + + /// get bit type + DataType& bit_type() { return _bit_type; } + void set_bit_type(DataType dtype){_bit_type = dtype;} + /// get op name std::string& get_op_name() { return _op_name; } /// Access to attributes. - AttrInfo& attr() { return _attr; } + AttrInfo& attr() { return _attr; } - /// inspect if node attr have target attr name - inline bool inspect_attr(const std::string& attr_name) { - return this->_attr.inspect(attr_name); - } + /// inspect if node attr have target attr name + inline bool inspect_attr(const std::string& attr_name) { + return this->_attr.inspect(attr_name); + } /** * \brief Get target attr by name @@ -290,7 +319,7 @@ class Node { */ template T get_attr(const std::string& attr_name) { - return this->_attr.get(attr_name); + return this->_attr.get(attr_name); } /** * \brief Get target attr by name @@ -302,7 +331,7 @@ class Node { return this->_attr.get(attr_name,default_data); } /** - * \brief Set target attr by name and value + * \brief Set target attr by name and value * \param attr_name stand for target_attr name * \param val stand for attribute value * \return Status @@ -310,7 +339,7 @@ class Node { template Status set_attr(const std::string& attr_name, const T val) { std::unique_lock lock(this->_mut); - return this->_attr.set(attr_name, val); + return this->_attr.set(attr_name, val); } /** @@ -320,13 +349,13 @@ class Node { */ Status remove_attr(const std::string& attr_name) { std::unique_lock lock(this->_mut); - return this->_attr.remove(attr_name); + return this->_attr.remove(attr_name); } /** * \brief get share target node name of given weight * \param weight name - * \return string + * \return string */ inline std::string get_share_target(const std::string& weight_name) { return _share_weights.get_share_target(weight_name); @@ -352,15 +381,15 @@ class Node { } /** - * \brief check if the node's weights is shared from others + * \brief check if the node's weights is shared from others * \return bool */ - inline bool is_weight_shared() { - for(auto it = _attr.begin(); it != _attr.end(); ++it) { + inline bool is_weight_shared() { + for (auto it = _attr.begin(); it != _attr.end(); ++it) { if(check_shared(it->first)) { return true; } - } + } return false; } @@ -370,10 +399,10 @@ class Node { /** * \brief 
merge for attr * \param operand - * \param pattern_name - * \return Node + * \param pattern_name + * \return Node */ - inline Node& Merge(Node& operand, const std::string& pattern_name) { + inline Node& Merge(Node& operand, const std::string& pattern_name) { std::unique_lock lock(this->_mut); this->_attr.MergeWithPattern(operand.attr(), pattern_name); return *this; @@ -386,21 +415,26 @@ class Node { _Op = nullptr; // Assign the op pointer with operand's should be disabled, because it causes double free after binding the nodeptr by op itself. _op_name = operand._op_name; // shallow copy of attributes - this->_attr = operand.attr(); + this->_attr = operand.attr(); // copy of shared weights this->_share_weights = operand._share_weights; // copy others _need_wait = operand._need_wait; _in_degree = operand._in_degree; _out_degree = operand._out_degree; + _bit_type = operand._bit_type; return *this; } - + /// print message - inline std::string ToString() { - std::ostringstream msg; - msg << _name << " : op(" << _op_name << ") lane(" << _current_lane << ") need_wait(" << _need_wait << ")"; + inline std::string ToString() { +#ifdef USE_SGX + return "**Node.ToString not implemented in SGX mode**"; +#else + std::ostringstream msg; + msg << _name << " : op(" << _op_name << ") lane(" << _current_lane << ") need_wait(" << _need_wait << ")"<<", bit type "<open(path); - } - - // BinaryWritteropen file for code generating. - void open(std::string& path, const char* file_mode = "wb") { - _file_io.open(path, file_mode); - } - - // write data list to file - inline bool write(void* ptr, size_t size, size_t count) { - return _file_io.write(ptr, size, count); - } - - // read data list from file - inline bool read(void* ptr, size_t size, size_t count) { - return _file_io.read(ptr, size, count); - } - -private: - LiteFileIO _file_io; -}; - -/** - * \brief class Weghts - */ -struct WeghtOffset { - struct Offset{ - size_t offset{0}; // offset from start - size_t length{0}; // weight length - }; - std::vector weights; -}; - -/** - * \brief class to help generating model weigth file. - * - */ -class WeightsWritter : public BinaryWritter { -public: - WeightsWritter() {} - ~WeightsWritter() {} - - // set weight - template - void register_weights(const std::string& node_name, PBlock& weight) { - WeghtOffset::Offset offset_tmp; - offset_tmp.offset = _offset; - offset_tmp.length = weight.count(); - _offset += offset_tmp.length; - _node_weights_map[node_name].weights.push_back(offset_tmp); - size_t type_size = weight.h_tensor().get_dtype_size(); - write(weight.h_tensor().mutable_data(), type_size, offset_tmp.length); - } - - bool has_node(std::string node_name) { - return _node_weights_map.count(node_name) > 0 ? 
true : false; - } - - WeghtOffset get_weights_by_name(std::string node_name) { - if (!has_node(node_name)) { - LOG(FATAL) << "WeightsWritter doesn't have target node name: " << node_name; - return WeghtOffset(); - } - return _node_weights_map[node_name]; - } - -private: - size_t _offset{0}; - std::unordered_map _node_weights_map; -}; - - - -} /* namespace lite */ - -} /* namespace anakin */ - -#endif diff --git a/framework/lite/code_gen_base.cpp b/framework/lite/code_gen_base.cpp deleted file mode 100644 index 0c4fe565c..000000000 --- a/framework/lite/code_gen_base.cpp +++ /dev/null @@ -1,208 +0,0 @@ -#include "framework/lite/code_gen_base.h" -#include "framework/graph/graph_global_mem.h" -#include "framework/core/net/net.h" -#include "framework/graph/llvm/scheduler.h" -#include "framework/graph/llvm/optimizer/parall_scheduler.h" -#include "framework/graph/llvm/optimizer/memory_scheduler.h" -namespace anakin { - -namespace lite { - -/** - * this full specialization use for help generating lite device running api - */ -template -bool CodeGenBase::extract_graph(const std::string& model_path, const int batch_size) { - graph::Graph graph; - auto status = graph.load(model_path); - if (!status ) { - LOG(ERROR) << " [ERROR] " << status.info(); - return false; - } - - //add batchsize - std::vector& ins = graph.get_ins(); - for (int i = 0; i < ins.size(); i++){ - graph.ResetBatchSize(ins[i], batch_size); - } - // Optimize -#ifdef USE_ARM_PLACE - auto vgraph = graph.get_vgraph(); - graph::Scheduler scheduler; - // schedule for exec order - scheduler.RegIOResource(&vgraph); - scheduler.Run(); - scheduler.get_exec_node_in_order(); - // optimize mem - graph::MemoryScheduler mem_scheduler; - mem_scheduler.RegIOResource(&vgraph); - mem_scheduler.Run(); - // analyse parallel - graph::ParallScheduler para_scheduler; - para_scheduler.RegIOResource(&vgraph); - para_scheduler.Run(); - // restore from vgraph - graph.restore_from_vgraph(&vgraph); -#else - // Optimize - graph.Optimize(); -#endif - LOG(ERROR) << "finish fusion"; - - // get graph io - _ins = graph.get_ins(); - _outs = graph.get_outs(); - - // copy graph - _graph.CopyFrom(graph); - - // getting execution order - auto& node_names_in_exec_order = _graph.get_nodes_in_order(); - for (auto& node_name : node_names_in_exec_order) { - auto node_ptr = _graph[node_name]; - //if(node_ptr->get_op_name() == "Output") { - // continue; - //} - // op execution order - _exec_node_order.push_back(node_name); - _graph_node_map[node_name].name = node_name; - _graph_node_map[node_name].op_name = node_ptr->get_op_name(); - // set node op pointer - auto* op_pointer = OpFactory::Global()[node_ptr->get_op_name()]; - node_ptr->set_op(op_pointer); - op_pointer = nullptr; - // bind parameter structure - static_cast*>(node_ptr->Op())->_helper->BindParam(node_ptr); - // parsing parameter - static_cast*>(node_ptr->Op())->_helper->InitParam(); - } - // remove null op node - for (auto it = node_names_in_exec_order.begin(); it != node_names_in_exec_order.end(); ){ - if (!_graph[*it]->Op()) { - it = node_names_in_exec_order.erase(it); - } else { - ++it; - } - } - // compute in/out shape and initialize the _graph - std::vector > exec_funcs; - exec_funcs.resize(node_names_in_exec_order.size()); - for (int i = 0; i < node_names_in_exec_order.size(); i++) { - auto& node_name = node_names_in_exec_order[i]; - auto& op_func = exec_funcs[i]; - auto& edge_in_its = _graph.get_in_arc_its(node_name); - DLOG(ERROR) << " node : " << node_name << " (" << _graph[node_name]->get_op_name() << ") "; - 
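[Aside: the WeghtOffset/WeightsWritter pair above keeps a running offset per registered blob so the generated code can later index into one flat weight file. A standalone sketch of that bookkeeping, with assumed names rather than the deleted classes themselves:]

#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Every registered blob is appended at the current running offset, and the
// (offset, length) pair is remembered under the owning node's name.
struct BlobOffset { size_t offset; size_t length; };

class OffsetTable {
public:
    void register_blob(const std::string& node, size_t length) {
        table_[node].push_back(BlobOffset{running_, length});
        running_ += length;  // the next blob starts right after this one
    }
    bool has_node(const std::string& node) const { return table_.count(node) > 0; }
    const std::vector<BlobOffset>& offsets(const std::string& node) { return table_[node]; }
private:
    size_t running_{0};
    std::unordered_map<std::string, std::vector<BlobOffset>> table_;
};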
for (auto& edge_it : edge_in_its) { - DLOG(INFO) << " => find in arc : " << edge_it->bottom() << " --> " << edge_it->top(); - _graph_node_map[node_name].ins.push_back(edge_it->name()); - op_func.ins.push_back(edge_it->weight().get()); - op_func.in_lanes.push_back(edge_it->lane()); - } - auto& edge_out_its = _graph.get_out_arc_its(node_name); - for (auto& edge_it : edge_out_its) { - DLOG(INFO) << " <= find out arc : " << edge_it->bottom() << " --> " << edge_it->top(); - _graph_node_map[node_name].outs.push_back(edge_it->name()); - op_func.outs.push_back(edge_it->weight().get()); - op_func.out_lanes.push_back(edge_it->lane()); - } - op_func.current_lane = _graph[node_name]->lane(); - op_func.need_sync = _graph[node_name]->need_wait(); - op_func.op = static_cast* >(_graph[node_name]->Op()); - op_func.op_name = _graph[node_name]->get_op_name(); - - CHECK_NOTNULL(op_func.op) << "Node(node_name) doesn't have op pointer! "; - LOG(INFO)<<"OPNAME:"<_helper->InferShape(op_func.ins, op_func.outs); - } - // initialize memory info - if (!init_memory_info()) { - return false; - } - return true; -} - -template -bool CodeGenBase::init_memory_info() { - auto alloc_memory = [this](graph::Edge& edge) { - EdgeInfo edge_info; - edge_info.name = edge.name(); - - auto& tensor_p = edge.weight(); - if (!edge.shared()) { - tensor_p->re_alloc(tensor_p->shape()); - - edge_info.valid_shape = tensor_p->shape(); - edge_info.real_shape = tensor_p->shape(); - edge_info.is_shared = false; - } else { - edge_info.is_shared = true; - } - edge_info.in_node = edge.first(); - edge_info.out_node = edge.second(); - _tensor_map[edge_info.name] = edge_info; - return 0; - }; - _graph.Scanner->BFS_Edge(alloc_memory); - - auto share_memory = [this](graph::Edge& edge) { - if (edge.shared()) { - auto& edge_name = edge.share_from(); - - _tensor_map[edge.name()].valid_shape = edge.weight()->valid_shape(); - _tensor_map[edge.name()].real_shape = edge.weight()->shape(); - - bool continue_search = true; - while (continue_search) { - auto match_edge = [&](graph::Edge& inner_edge) { - if (inner_edge.name() == edge_name) { - if (inner_edge.shared()) { - edge_name = inner_edge.share_from(); - return Status::EXIT(" Continue to find next . "); - } - if (inner_edge.weight()->size() < edge.weight()->valid_size()) { - auto inner_original_shape = inner_edge.weight()->valid_shape(); - inner_edge.weight()->re_alloc(edge.weight()->valid_shape()); - inner_edge.weight()->set_shape(inner_original_shape, inner_edge.weight()->shape()); - - _tensor_map[edge_name].valid_shape = inner_edge.weight()->valid_shape(); - _tensor_map[edge_name].real_shape = edge.weight()->valid_shape(); - } - edge.weight()->share_from(*(inner_edge.weight())); - _tensor_map[edge.name()].share_from= edge_name; - continue_search = false; - return Status::EXIT(" Find the matched target edge. 
"); - } - return Status::OK(); - }; - _graph.Scanner->BFS_Edge(match_edge); - } - } - }; - _graph.Scanner->BFS_Edge(share_memory); - return true; -} - -#ifdef USE_CUDA -template class CodeGenBase; -template class CodeGenBase; -template class CodeGenBase; -#endif - -#ifdef USE_X86_PLACE -template class CodeGenBase; -template class CodeGenBase; -template class CodeGenBase; -#endif - -#ifdef USE_ARM_PLACE -template class CodeGenBase; -template class CodeGenBase; -template class CodeGenBase; -#endif - -template class CodeGenBase; - -} /* namespace lite */ - -} /* namespace anakin */ - diff --git a/framework/lite/code_gen_base.h b/framework/lite/code_gen_base.h deleted file mode 100644 index d47469ada..000000000 --- a/framework/lite/code_gen_base.h +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_FRAMEWORK_LITE_CODE_GEN_BASE_H -#define ANAKIN_FRAMEWORK_LITE_CODE_GEN_BASE_H - -#include -#include -#include - -#include "framework/graph/graph.h" - -namespace anakin { - -namespace lite { - -/** - * \brief Node information for generating executor - */ -struct NodeInfo { - std::string name; // node name - std::string op_name; // op name - std::vector ins; // input edge name - std::vector outs; // output edge name -}; - - -/** - * \brief Edge information for generating edge tensors. - */ -struct EdgeInfo { - std::string name; // edge name - std::vector valid_shape; // edge valid shape - std::vector real_shape; // edge real shape - bool is_shared{false}; // if the edge is shared by others - std::string share_from{""}; // if the edge is_shared(true), share_from will hold the target edge name. - std::string in_node; - std::string out_node; -}; - -/** - * \brief class for target language code generator. - * - * The class CodeGenBase hold base information for running model. - * There exists several base info: - * 1. Operatoin name in execution order. - * 2. All the tensor model needs and share info between those tensors. - * 3. 
Model weights - */ -template -class CodeGenBase { -public: - CodeGenBase() {} - virtual ~CodeGenBase(){} - - /** - * \biref extract graph msg - */ - bool extract_graph(const std::string& model_path, const int batch_size = 1); - - /** - * \brief generate all source files - */ - virtual void gen_files(const bool debug_mode) = 0; - - -private: - /** - * \brief analyse the memory reuse info - */ - bool init_memory_info(); - - - /** - * \brief generate ops of graph - */ - virtual void gen_ops() = 0; - -protected: - graph::Graph _graph; - std::vector _exec_node_order; /// running order of operation's name - std::vector _ins; /// graph ins - std::vector _outs; /// graph outs - std::unordered_map _graph_node_map; - /// graph base arch - std::unordered_map _tensor_map; -}; - -} /* namespace lite */ - -} /* namespace anakin */ - -#endif - diff --git a/framework/lite/code_gen_cpp.cpp b/framework/lite/code_gen_cpp.cpp deleted file mode 100644 index bd8c709c5..000000000 --- a/framework/lite/code_gen_cpp.cpp +++ /dev/null @@ -1,763 +0,0 @@ -#include -#include "framework/lite/code_gen_cpp.h" -#include "framework/core/net/calibrator_parse.h" - -namespace anakin { - -namespace lite { - -template -void GenCPP::gen_license() { - _code<< "/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.\n\n Licensed under the Apache License, Version 2.0 (the \"License\");\n you may not use this file except in compliance with the License.\n You may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\n Unless required by applicable law or agreed to in writing, software\n distributed under the License is distributed on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n See the License for the specific language governing permissions and\n limitations under the License.\n*/\n\n"; -} - -template -void GenCPP::gen_header_start() { - _code.Clean(); - gen_license(); - _code.feed("#ifndef ANAKIN_%s_H \n", _code_name.c_str()); - _code.feed("#define ANAKIN_%s_H \n\n", _code_name.c_str()); - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"using namespace anakin;\n"; - _code<<"using namespace anakin::saber;\n"; - _code<<"using namespace anakin::saber::lite;\n\n"; - _code<<"namespace anakin { \n\n"; -} - -template -void GenCPP::gen_header_end() { - _code<<"} /* namespace anakin */\n"; - _code<<"\n#endif\n"; -} - -template -void GenCPP::gen_source_start() { - _code.Clean(); - _code.feed("#include \"%s.h\" \n\n", _code_name.c_str()); - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n"; - _code<<"#include \n\n"; - _code<<"namespace anakin { \n\n"; - // add running impl for model api -} - -template -void GenCPP::gen_source_end() { - _code<<"} /* namespace anakin */\n"; -} - -template -void GenCPP::gen_tensors() { - _code<<"\n// generating tensors \n"; - for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { - auto& edge_name = it->first; - auto& edge_info = it->second; - if(! 
edge_info.is_shared) { - _code.feed("Tensor %s_%s;\n", _code_name.c_str(), edge_name.c_str()); - _code.feed("Shape %s_%s_real_shape(%d,%d,%d,%d);\n", _code_name.c_str(), - edge_name.c_str(), - edge_info.real_shape[0], - edge_info.real_shape[1], - edge_info.real_shape[2], - edge_info.real_shape[3]); - _code.feed("Shape %s_%s_valid_shape(%d,%d,%d,%d);\n", _code_name.c_str(), - edge_name.c_str(), - edge_info.valid_shape[0], - edge_info.valid_shape[1], - edge_info.valid_shape[2], - edge_info.valid_shape[3]); - } - } - for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { - auto& edge_name = it->first; - auto& edge_info = it->second; - if(edge_info.is_shared) { - _code.feed("Tensor %s_%s;\n", _code_name.c_str(), edge_name.c_str()); - _code.feed("Shape %s_%s_valid_shape(%d,%d,%d,%d);\n", _code_name.c_str(), - edge_name.c_str(), - edge_info.valid_shape[0], - edge_info.valid_shape[1], - edge_info.valid_shape[2], - edge_info.valid_shape[3]); - } - } -} - -template -void GenCPP::tensors_init() { - _code<<"\n// initialize tensors \n"; - _code.feed("void %s_tensors_init() {\n", _code_name.c_str()); - for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { - auto& edge_name = it->first; - auto& edge_info = it->second; - if(! edge_info.is_shared) { - _code.feed(" %s_%s.re_alloc(%s_%s_real_shape);\n", _code_name.c_str(), edge_name.c_str(), _code_name.c_str(), edge_name.c_str()); - _code.feed(" %s_%s.set_shape(%s_%s_valid_shape);\n", _code_name.c_str(), edge_name.c_str(), _code_name.c_str(), edge_name.c_str()); - } - } - for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { - auto& edge_name = it->first; - auto& edge_info = it->second; - if(edge_info.is_shared) { - _code.feed(" %s_%s.set_shape(%s_%s_valid_shape);\n", _code_name.c_str(), edge_name.c_str(), _code_name.c_str(), edge_name.c_str()); - _code.feed(" %s_%s.share_from(%s_%s);\n", _code_name.c_str(), edge_name.c_str(), _code_name.c_str(), edge_info.share_from.c_str()); - } - } - _code<<"}\n"; - -} - -template -void GenCPP::gen_model_ios() { - _code<<"\n// generating model's I/O \n"; - _code.feed("std::vector*>> %s_tensor_ins;\n", _code_name.c_str()); - _code.feed("std::vector*>> %s_tensor_outs;\n", _code_name.c_str()); -// for(auto & node_name : this->_exec_node_order) { -// auto& node_info = this->_graph_node_map[node_name]; -// _code.feed("std::vector*> %s_ins;\n", node_name.c_str()); -// _code.feed("std::vector*> %s_outs;\n", node_name.c_str()); -// } -} - -template -void GenCPP::model_ios_init() { - _code<<"\n// initialize model's I/O \n"; - _code.feed("void %s_model_ios_init() {\n", _code_name.c_str()); - _code.feed(" %s_tensor_ins.resize(%d);\n", _code_name.c_str(), this->_exec_node_order.size()); - _code.feed(" %s_tensor_outs.resize(%d);\n", _code_name.c_str(), this->_exec_node_order.size()); - _code.feed(" for(int i = 0; i < %d; i++) {\n", this->_exec_node_order.size()); - _code.feed(" %s_tensor_ins[i].clear();\n", _code_name.c_str()); - _code.feed(" %s_tensor_outs[i].clear();\n", _code_name.c_str()); - _code.feed(" }\n"); - _code.feed(" int i = 0;\n"); - for(auto & node_name : this->_exec_node_order) { - if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { - continue; - } - auto& node_info = this->_graph_node_map[node_name]; - for(auto &edge_in : node_info.ins) { - _code.feed(" %s_tensor_ins[i].push_back(&%s_%s);\n", _code_name.c_str(), _code_name.c_str(), edge_in.c_str()); - } - for(auto &edge_out : 
node_info.outs) { - _code.feed(" %s_tensor_outs[i].push_back(&%s_%s);\n", _code_name.c_str(), _code_name.c_str(), edge_out.c_str()); - } - _code.feed(" i++;\n"); - } - _code<<"}\n"; -} - -template -void GenCPP::gen_ops() { - _code<<"\n// generating model's operations\n"; - _code<<"\n// create vector of ops\n"; - _code.feed("std::vector %s_g_ops;\n", _code_name.c_str()); - _code.feed("void %s_gen_ops() {\n", _code_name.c_str()); - _code.feed(" if (%s_g_ops.size() > 0) {\n", _code_name.c_str()); - _code.feed(" return;\n"); - _code.feed(" }\n"); - for(auto & node_name : this->_exec_node_order) { - if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { - continue; - } - auto& node_info = this->_graph_node_map[node_name]; - if(OPERATION_MAP.count(node_info.op_name) > 0) { - _code.feed(" OpBase* %s = new %s; \n", node_name.c_str(), OPERATION_MAP[node_info.op_name].OpClassName.c_str()); - _code.feed("#if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG) \n"); - _code.feed(" %s->set_op_name(\"%s\"); \n", node_name.c_str(), node_name.c_str()); - _code.feed("#endif \n"); - _code.feed(" %s_g_ops.push_back(%s);\n", _code_name.c_str(), node_name.c_str()); - } - } - _code << "}\n"; -} - -template -void GenCPP::gen_init_impl() { - _code<<"// initial function for model.\n"; - _code.feed("bool %s_init(Context& ctx) {\n", _code_name.c_str()); - _code.feed(" bool flag = false;\n"); - _code.feed(" for (int i = 0; i < %s_g_ops.size(); i++) {\n", _code_name.c_str()); - _code.feed(" %s_g_ops[i]->compute_output_shape(%s_tensor_ins[i], %s_tensor_outs[i]);\n", _code_name.c_str(), _code_name.c_str(), _code_name.c_str()); - _code.feed(" flag = %s_g_ops[i]->init(%s_tensor_ins[i], %s_tensor_outs[i], ctx);\n", _code_name.c_str(), _code_name.c_str(), _code_name.c_str()); - _code.feed(" if (!flag) {\n"); - _code.feed("#if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG) \n"); - _code.feed(" printf(\"%s op init failed;\\n\", %s_g_ops[i]->get_op_name());\n", "%s", _code_name.c_str()); - _code.feed("#endif \n"); - _code.feed(" return false;\n"); - _code.feed(" }\n"); - _code << " }\n"; -// for(auto & node_name : this->_exec_node_order) { -// if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { -// continue; -// } -// auto& node_info = this->_graph_node_map[node_name]; -// if(OPERATION_MAP.count(node_info.op_name) > 0) { -// _code.feed(" %s.compute_output_shape(%s_ins,%s_outs); \n", node_name.c_str(), -// node_name.c_str(), -// node_name.c_str()); -// _code.feed(" %s.init(%s_ins,%s_outs,ctx); \n", node_name.c_str(), -// node_name.c_str(), -// node_name.c_str()); -// } -// } - _code << " return true;\n"; - _code << "}\n"; -} - -template -void GenCPP::gen_run_impl(const bool debug_mode) { - _code << "// Running prediction for model. 
\n"; - _code.feed("bool %s_prediction() {\n", _code_name.c_str()); - _code.feed(" bool flag = false;\n"); - _code.feed(" for (int i = 0; i < %s_g_ops.size(); i++) {\n", _code_name.c_str()); - _code.feed(" flag = %s_g_ops[i]->dispatch(%s_tensor_ins[i], %s_tensor_outs[i]);\n", _code_name.c_str(), _code_name.c_str(), _code_name.c_str()); - _code.feed(" if (!flag) {\n"); - _code.feed("#if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG) \n"); - _code.feed(" printf(\"%s op dispatch failed;\\n\", %s_g_ops[i]->get_op_name());\n", "%s", _code_name.c_str()); - _code.feed("#endif \n"); - _code.feed(" return false;\n"); - _code.feed(" }\n"); - if (debug_mode) { - _code.feed(" for(int j = 0; j < %s_tensor_outs[i].size(); j++) {\n", _code_name.c_str()); - _code.feed(" double mean_val = tensor_mean(*%s_tensor_outs[i][0]); \n", _code_name.c_str()); - _code.feed("#if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG) \n"); - _code.feed(" printf(\"mean_val in %s ops: %s \\n\", %s_g_ops[i]->get_op_name(), mean_val);\n", "%s", "%.6f", _code_name.c_str()); - _code.feed("#else \n"); - _code.feed(" printf(\"mean_val in ops: %s \\n\", mean_val);\n", "%.6f"); - _code.feed("#endif \n"); - _code.feed(" }\n"); - } - _code << " }\n"; - -// for(auto & node_name : this->_exec_node_order) { -// if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { -// continue; -// } -// auto& node_info = this->_graph_node_map[node_name]; -// if(OPERATION_MAP.count(node_info.op_name) > 0) { -// /* -// _code.feed(" %s.compute_output_shape(%s_ins,%s_outs); \n", node_name.c_str(), -// node_name.c_str(), -// node_name.c_str()); -// */ -// _code.feed(" %s.dispatch(%s_ins,%s_outs); \n", node_name.c_str(), -// node_name.c_str(), -// node_name.c_str()); -// if (debug_mode) { -// _code.feed(" double mean_%s = tensor_mean(*%s_outs[0]); \n", node_name.c_str(), node_name.c_str()); -// _code.feed(" printf(\"%s run mean_val: %s %s\", mean_%s);\n", node_name.c_str(), "%.6f", "\\n", node_name.c_str()); -// } -// } -// } - _code << " return true;\n"; - _code << "}\n"; -} - -template -void GenCPP::gen_head_api() { - // gen gloss for graph ins - _code << "/// Model "<< _code_name << " have " << this->_ins.size() << " inputs.\n"; - for(auto in : this->_ins) { - auto& node_info = this->_graph_node_map[in]; - auto& edge_info = this->_tensor_map[node_info.outs[0]]; - _code << "/// |-- input name : " << in << " -- Shape("; - std::string shape_str; - for(int i=0; i 0) { - _code << edge_info.valid_shape[edge_info.valid_shape.size() - 1] << ")\n"; - } else { - _code << ")\n"; - } - } - - // gen api for getting graph input tensor - _code.feed("LITE_EXPORT std::vector*> %s_get_in();\n\n", _code_name.c_str()); - - // gen gloss for graph outs - _code << "/// Model " << _code_name << " have " << this->_outs.size() << " outputs.\n"; - for(auto out : this->_outs) { - auto& node_info = this->_graph_node_map[out]; - auto& edge_info = this->_tensor_map[node_info.ins[0]]; - _code << "/// |-- output name : " << out << " -- Shape("; - for(int i=0; i 0) { - _code << edge_info.valid_shape[edge_info.valid_shape.size() - 1] << ")\n"; - } else { - _code << ")\n"; - } - } - // gen api for getting graph output tensor - - _code.feed("LITE_EXPORT std::vector*> %s_get_out();\n\n", _code_name.c_str()); - - // gen weights loading function - _code.feed("LITE_EXPORT bool %s_load_param(const char* param_path);\n\n", _code_name.c_str()); - - // gen weights loading function from memory - _code.feed("LITE_EXPORT bool 
%s_load_weights(const void* weights);\n\n", _code_name.c_str()); - - // gen api for model init - _code.feed("/// %s_init should only be invoked once when input shape changes.\n", _code_name.c_str()); - _code.feed("LITE_EXPORT bool %s_init(Context& ctx);\n\n", _code_name.c_str()); - - // gen api for model prediction - _code.feed("/// Running prediction for model %s.\n", _code_name.c_str()); - _code.feed("LITE_EXPORT bool %s_prediction();\n\n", _code_name.c_str()); - - // gen free function - _code.feed("/// Release all resource used by model %s.\n", _code_name.c_str()); - _code.feed("LITE_EXPORT void %s_release_resource();\n\n", _code_name.c_str()); - -} - -template -void GenCPP::gen_head_api_impl() { - // gen api for getting graph input tensor - _code << "\n// gen api for getting graph input tensor \n"; - _code.feed("std::vector*> %s_get_in() {\n", _code_name.c_str()); - _code.feed(" std::vector*> vin;\n", this->_ins[0].c_str()); - for(int i = 0; i < this->_ins.size(); i++) { - auto node_info = this->_graph_node_map[this->_ins[i]]; - auto edge_info = this->_tensor_map[node_info.outs[0]]; - _code.feed(" vin.push_back(&%s_%s);\n", _code_name.c_str(), edge_info.name.c_str()); - } - _code.feed(" return vin;\n"); - -// _code.feed(" if(strcmp(in_name, \"%s\") == 0) {\n", this->_ins[0].c_str()); -// auto node_info = this->_graph_node_map[this->_ins[0]]; -// auto edge_info = this->_tensor_map[node_info.outs[0]]; -// _code.feed(" return &%s;\n }", edge_info.name.c_str()); -// for(int i = 1; i < this->_ins.size(); i++) { -// node_info = this->_graph_node_map[this->_ins[i]]; -// edge_info = this->_tensor_map[node_info.outs[0]]; -// _code.feed(" else if(strcmp(in_name, \"%s\") == 0) {\n", this->_ins[i].c_str()); -// _code.feed(" return &%s;\n }\n", edge_info.name.c_str()); -// } -// _code <<" else {\n return nullptr;\n }\n"; - _code <<"}\n"; - - // gen api for getting graph output tensor - _code << "\n// gen api for getting graph output tensor \n"; - _code.feed("std::vector*> %s_get_out() {\n", _code_name.c_str()); - _code.feed(" std::vector*> vout;\n"); - for(int i = 0; i < this->_outs.size(); i++) { - auto node_info = this->_graph_node_map[this->_outs[i]]; - auto edge_info = this->_tensor_map[node_info.ins[0]]; - _code.feed(" vout.push_back(&%s_%s);\n", _code_name.c_str(), edge_info.name.c_str()); - } - _code.feed(" return vout;\n"); - -// _code.feed(" if(strcmp(out_name, \"%s\") == 0) {\n", this->_outs[0].c_str()); -// node_info = this->_graph_node_map[this->_outs[0]]; -// edge_info = this->_tensor_map[node_info.ins[0]]; -// _code.feed(" return &%s;\n }", edge_info.name.c_str()); -// for(int i = 1; i < this->_outs.size(); i++) { -// node_info = this->_graph_node_map[this->_outs[i]]; -// edge_info = this->_tensor_map[node_info.ins[0]]; -// _code.feed(" else if(strcmp(out_name ,\"%s\") == 0) {\n", this->_outs[i].c_str()); -// _code.feed(" return &%s;\n }\n", edge_info.name.c_str()); -// } -// _code <<" else {\n return nullptr;\n }\n"; - _code <<"}\n\n"; - - // gen weights loading function - _code.feed("float *%s = nullptr; // global weights start pointer \n", _g_weights_ptr_name.c_str()); - _code.feed("std::vector %s_g_param; // global vector of param \n", _code_name.c_str()); - - _code.feed("bool %s_load_param(const char* param_path) {\n", _code_name.c_str()); - _code << " FILE *f = fopen(param_path, \"rb\"); \n"; - _code << " if(!f) {\n"; - _code << " return false;\n }\n"; - _code << " fseek(f, 0, SEEK_END);\n"; - _code << " long fsize = ftell(f);\n"; - _code << " fseek(f, 0, SEEK_SET);\n"; - 
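[Aside: the loader being emitted in this stretch follows the usual read-the-whole-file-then-hand-over pattern: seek to the end for the size, rewind, read the bytes, and pass the buffer on to the generated <model>_load_weights. A hypothetical standalone equivalent, not produced by GenCPP, is sketched here.]

#include <cstdio>
#include <vector>

// Sketch only (hypothetical helper): read an entire binary file into memory.
static bool read_whole_file(const char* path, std::vector<char>& out) {
    FILE* f = std::fopen(path, "rb");
    if (!f) return false;
    std::fseek(f, 0, SEEK_END);
    long size = std::ftell(f);
    std::fseek(f, 0, SEEK_SET);
    if (size < 0) { std::fclose(f); return false; }
    out.resize(static_cast<size_t>(size));
    size_t got = std::fread(out.data(), 1, out.size(), f);  // read `size` bytes
    std::fclose(f);
    return got == out.size();
}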
_code.feed(" if(%s) {\n", _g_weights_ptr_name.c_str()); - _code.feed(" delete [] %s;\n", _g_weights_ptr_name.c_str()); - _code.feed(" %s = nullptr;\n", _g_weights_ptr_name.c_str()); - _code.feed(" }\n"); - _code.feed(" %s = new float[fsize + 1];\n", _g_weights_ptr_name.c_str()); - _code.feed(" fread(%s, fsize, sizeof(float), f);\n", _g_weights_ptr_name.c_str()); - _code << " fclose(f);\n"; - _code.feed(" %s_load_weights((const void*)%s);\n", _code_name.c_str(), _g_weights_ptr_name.c_str()); - _code << "}"; - - _code.feed("bool %s_load_weights(const void* weights) {\n", _code_name.c_str()); - _code.feed(" if (weights == nullptr) {\n"); // invoke (model_name)_tensors_init() - _code.feed(" return false;\n"); // invoke (model_name)_tensors_init() - _code.feed(" }\n"); // invoke (model_name)_tensors_init() - _code.feed(" %s_tensors_init();\n", _code_name.c_str()); // invoke (model_name)_tensors_init() - _code.feed(" %s_model_ios_init();\n", _code_name.c_str()); // invoke (model_name)_model_ios_init() - _code.feed(" for (int i = 0; i < %s_g_param.size(); i++) {\n", _code_name.c_str()); - _code.feed(" if (%s_g_param[i]) {\n", _code_name.c_str()); - _code.feed(" delete %s_g_param[i];\n", _code_name.c_str()); - _code.feed(" }\n"); - _code.feed(" %s_g_param[i] = nullptr;\n", _code_name.c_str()); - _code.feed(" }\n"); - _code.feed(" %s_g_param.clear();\n", _code_name.c_str()); - _code.feed(" const float* weights_ptr = (const float*)weights;\n"); - std::string local_weight_string = "weights_ptr"; - - for(auto & node_name : this->_exec_node_order) { - if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { - continue; - } - - auto& node_info = this->_graph_node_map[node_name]; - auto& attr_info = this->_graph[node_name]->attr(); - if(OPERATION_MAP.count(node_info.op_name) > 0) { - LOG(INFO) << "node name: " << node_name; - LOG(INFO) << "Target op type : " << this->_graph_node_map[node_name].op_name << " parsing ..."; - auto str = OPERATION_MAP[node_info.op_name].parse(attr_info, _code_name, - OPERATION_MAP[node_info.op_name].OpClassName, - node_name, - local_weight_string, - _weights, false); - if(!str.empty()) { - _code.feed(" %s", str.c_str()); - } - } else { - LOG(FATAL) << "Target op type : " << this->_graph_node_map[node_name].op_name << " not support"; - } - } - _code.feed(" %s_gen_ops();\n", _code_name.c_str()); - _code.feed(" for (int i = 0; i < %s_g_ops.size(); i++) {\n", _code_name.c_str()); - _code.feed(" SaberStatus state = %s_g_ops[i]->load_param(%s_g_param[i]);\n", _code_name.c_str(), _code_name.c_str()); - _code.feed(" if (state != SaberSuccess) { \n"); - _code.feed(" printf(\"load param failed\\n\");\n"); - _code.feed(" }\n"); - _code.feed(" }\n"); - - _code << " return true;\n"; - _code <<"}\n\n"; - - // release all resource function impl - _code.feed("void %s_release_resource() {\n", _code_name.c_str()); - _code.feed(" for (int i = 0; i < %s_g_ops.size(); i++) {\n", _code_name.c_str()); - _code.feed(" if (%s_g_ops[i]) {\n", _code_name.c_str()); - _code.feed(" delete %s_g_ops[i];\n", _code_name.c_str()); - _code.feed(" %s_g_ops[i] = nullptr;\n", _code_name.c_str()); - _code.feed(" }\n"); - _code.feed(" }\n"); - _code.feed(" %s_g_ops.clear();\n", _code_name.c_str()); - _code.feed(" for (int i = 0; i < %s_g_param.size(); i++) {\n", _code_name.c_str()); - _code.feed(" if (%s_g_param[i]) {\n", _code_name.c_str()); - _code.feed(" delete %s_g_param[i];\n", _code_name.c_str()); - _code.feed(" %s_g_param[i] = nullptr;\n", 
_code_name.c_str()); - _code.feed(" }\n"); - _code.feed(" }\n"); - _code.feed(" %s_g_param.clear();\n", _code_name.c_str()); - _code.feed(" if (%s) {\n", _g_weights_ptr_name.c_str()); - _code.feed(" delete [] %s;\n", _g_weights_ptr_name.c_str()); - _code.feed(" %s = nullptr;\n", _g_weights_ptr_name.c_str()); - _code.feed(" }\n", _g_weights_ptr_name.c_str()); - _code <<"}\n\n"; -} - -template -void GenCPP::gen_header() { - _code.Clean(); - _code.open(_h_file_name); - gen_header_start(); - // gen api - gen_head_api(); - gen_header_end(); - _code.save(); -} - -template -void GenCPP::gen_source(const bool debug_mode) { - _code.Clean(); - _code.open(_cpp_file_name); - gen_source_start(); - // generate tensors - gen_tensors(); - // tensors init - tensors_init(); - // generate i/o - gen_model_ios(); - // initial model i/o - model_ios_init(); - // generate ops - gen_ops(); - // gen head api implement - gen_head_api_impl(); - // gen initial api impl - gen_init_impl(); - // gen running api impl - gen_run_impl(debug_mode); - gen_source_end(); - _code.save(); - gen_opt_model(); - if (!_flag_aot) { - gen_merge_model(); - } -} - -template -void GenCPP::gen_opt_model() { - - //parse config file - bool flag_precision = false; - bool flag_calibrator = false; - CalibratorParser parser; - if (_precision_path == ""){ - flag_precision = false; - }else { - parser.parse_from_file(_precision_path, ""); - flag_precision = true; - } - - if (_calibrator_path == ""){ - flag_calibrator = false; - }else { - parser.parse_from_file("", _calibrator_path); - flag_calibrator = true; - } - - auto get_op_precision = [&](std::string node_name)->std::string{ - if (flag_precision){ - return parser.get_precision(node_name); - } else { - return "fp32"; - } - }; - auto get_tensor_precision = [&](std::string in_node_name, std::string out_node_name)->std::string{ - if (flag_precision){ - auto dtype = parser.get_dtype(in_node_name, out_node_name); - if (dtype == AK_FLOAT){ - return "fp32"; - } else if (dtype == AK_INT8) { - return "int8"; - } else { - LOG(FATAL) << "unsupport precision type"; - return "fp32"; - } - } else { - return "fp32"; - } - return "fp32"; - }; - - auto get_tensor_calibrator = [&](std::string tensor_name)->float{ - if (flag_calibrator){ - auto calibrator_scale = parser.get_calibrator(tensor_name); - return calibrator_scale; - } else { - return 1.f; - } - }; - - //!generate Version Number - int version_num = MAJOR * 100 + MINOR * 10 + REVISION; - _opt_param_write << "Version: " << version_num << "\n"; - //! generate Tensors - LOG(INFO) << "gen opt model tensors"; - _opt_param_write << "Tensor_number " << this->_tensor_map.size() << "\n"; - //! firstly, gen tensor withnot shared - for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { - auto& edge_name = it->first; - auto& edge_info = it->second; - if(! 
edge_info.is_shared) { - //tensor info format: tensor_name tensor_precision valid_shape real_shape is_shared shared_tensor_name - _opt_param_write << edge_name << " "; - //tensor precision info - auto t_precision = get_tensor_precision(edge_info.in_node, edge_info.out_node); - _opt_param_write << t_precision << " "; - //tensor calibrator info - auto t_calibrator = get_tensor_calibrator(edge_name); - _opt_param_write << t_calibrator << " "; - //tensor valid shape - _opt_param_write << edge_info.valid_shape.size() << " "; - for (int i = 0; i < edge_info.valid_shape.size(); ++i) { - _opt_param_write << edge_info.valid_shape[i] << " "; - } - //tensor shape - _opt_param_write << edge_info.real_shape.size() << " "; - for (int i = 0; i < edge_info.real_shape.size(); ++i) { - _opt_param_write << edge_info.real_shape[i] << " "; - } - _opt_param_write << 0 << " " << "null" << "\n"; - } - } - //! then gen tensor shared memory - for(auto it = this->_tensor_map.begin(); it != this->_tensor_map.end(); ++it) { - auto& edge_name = it->first; - auto& edge_info = it->second; - if(edge_info.is_shared) { - //tensor info format: tensor_name valid_shape real_shape is_shared shared_tensor_name - - _opt_param_write << edge_name << " "; - - //tensor precision info - auto t_precision = get_tensor_precision(edge_info.in_node, edge_info.out_node); - _opt_param_write << t_precision << " "; - //tensor calibrator info - auto t_calibrator = get_tensor_calibrator(edge_name); - _opt_param_write << t_calibrator << " "; - //tensor valid shape - _opt_param_write << edge_info.valid_shape.size() << " "; - for (int i = 0; i < edge_info.valid_shape.size(); ++i) { - _opt_param_write << edge_info.valid_shape[i] << " "; - } - //tensor shape - _opt_param_write << edge_info.valid_shape.size() << " "; - for (int i = 0; i < edge_info.valid_shape.size(); ++i) { - _opt_param_write << edge_info.valid_shape[i] << " "; - } - _opt_param_write << 1 << " " << edge_info.share_from << "\n"; - } - } - //! gen inputs and outputs tensor name and precision - _opt_param_write << "inputs " << this->_ins.size(); - for(auto in : this->_ins) { - auto node_info = this->_graph_node_map[in]; - auto edge_info = this->_tensor_map[node_info.outs[0]]; - _opt_param_write << " " << edge_info.name; - _opt_param_write << " " << "fp32"; - } - _opt_param_write << "\n"; - - //! gen outputs and outputs tensor name and precision - _opt_param_write << "outputs " << this->_outs.size(); - for(auto out : this->_outs) { - auto node_info = this->_graph_node_map[out]; - auto edge_info = this->_tensor_map[node_info.ins[0]]; - _opt_param_write << " " << edge_info.name; - _opt_param_write << " " << "fp32"; - } - _opt_param_write << "\n"; - - //! 
gen ops and params - int op_num = this->_exec_node_order.size(); - for(auto & node_name : this->_exec_node_order) { - if (this->_graph_node_map[node_name].op_name == "Input" || - this->_graph_node_map[node_name].op_name == "Output") { - op_num--; - } - } - _opt_param_write << "OPS " << op_num << "\n"; - for(auto & node_name : this->_exec_node_order) { - if(this->_graph_node_map[node_name].op_name == "Input" || this->_graph_node_map[node_name].op_name == "Output") { - continue; - } - auto& node_info = this->_graph_node_map[node_name]; - auto& attr_info = this->_graph[node_name]->attr(); - if(OPERATION_MAP.count(node_info.op_name) > 0) { - LOG(INFO) << "Target op type : " << this->_graph_node_map[node_name].op_name << " parsing ..."; - _opt_param_write << OPERATION_MAP[node_info.op_name].OpClassName << " " << node_name << " "; - _opt_param_write << get_op_precision(node_name) << " "; - _opt_param_write << node_info.ins.size() << " "; - _opt_param_write << node_info.outs.size() << " "; - for(auto &edge_in : node_info.ins) { - _opt_param_write << edge_in << " "; - // auto edge_in_name = this->_tensor_map[edge_in].in_node; - // auto edge_out_name = this->_tensor_map[edge_in].out_node; - // auto t_precision = get_tensor_precision(edge_in_name, edge_out_name); - // _opt_param_write << t_precision << " "; - } - for(auto &edge_out : node_info.outs) { - _opt_param_write << edge_out.c_str() << " "; - // auto edge_in_name = this->_tensor_map[edge_out].in_node; - // auto edge_out_name = this->_tensor_map[edge_out].out_node; - // auto t_precision = get_tensor_precision(edge_in_name, edge_out_name); - // _opt_param_write << t_precision << " "; - } - std::string local_weighs_string = "null"; - auto str = OPERATION_MAP[node_info.op_name].parse(attr_info, _code_name, - OPERATION_MAP[node_info.op_name].OpClassName, - node_name, - local_weighs_string, - _opt_weights, - true); - _opt_param_write << str; - } else { - LOG(FATAL) << "Target op type : " << this->_graph_node_map[node_name].op_name << " not support"; - } - } - - _opt_param_write.save(); -} - -template -void GenCPP::gen_merge_model() { - FILE* fp_merge = fopen(_merge_opt_file.c_str(), "wb"); - FILE* fp_weight = fopen(_model_file_name.c_str(), "rb"); - FILE* fp_info = fopen(_model_opt_file_name.c_str(), "rb"); - fseek(fp_weight, 0, SEEK_END); - long wsize = ftell(fp_weight); - fseek(fp_weight, 0, SEEK_SET); - char* wbuffer = new char[wsize + 1]; - fread(wbuffer, wsize, 1, fp_weight); - - fseek(fp_info, 0, SEEK_END); - long isize = ftell(fp_info); - fseek(fp_info, 0, SEEK_SET); - char* ibuffer = new char[isize + 1]; - fread(ibuffer, isize, 1, fp_info); - - fprintf(fp_merge, "Wsize %lu\n", wsize); - fwrite(wbuffer, wsize, 1, fp_merge); - - fwrite(ibuffer, isize, 1, fp_merge); - - fflush(fp_merge); - fclose(fp_merge); - - fclose(fp_weight); - fclose(fp_info); - - delete [] wbuffer; - delete [] ibuffer; -} - -#ifdef USE_CUDA -template class GenCPP; -template class GenCPP; -template class GenCPP; -#endif - -#ifdef USE_X86_PLACE -template class GenCPP; -template class GenCPP; -template class GenCPP; -#endif - -#ifdef USE_ARM_PLACE -template class GenCPP; -template class GenCPP; -template class GenCPP; -#endif - -template class GenCPP; - -} /* namespace lite */ - -} /* namespace anakin */ - diff --git a/framework/lite/code_gen_cpp.h b/framework/lite/code_gen_cpp.h deleted file mode 100644 index bbadb1832..000000000 --- a/framework/lite/code_gen_cpp.h +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
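[Aside: per gen_merge_model above, the merged .lite.bin is an ASCII "Wsize <bytes>" header line, followed by the raw weight bytes, followed by the textual model-info section. A hypothetical consumer-side reader for that layout (assumed helper, not part of this patch):]

#include <cstdio>
#include <string>
#include <vector>

// Split a merged .lite.bin into its weight blob and its textual info section.
static bool split_merged(const char* path, std::vector<char>& weights, std::string& info) {
    FILE* f = std::fopen(path, "rb");
    if (!f) return false;
    unsigned long wsize = 0;
    if (std::fscanf(f, "Wsize %lu", &wsize) != 1) { std::fclose(f); return false; }
    std::fgetc(f);  // consume the '\n' that terminates the header line
    weights.resize(wsize);
    if (std::fread(weights.data(), 1, wsize, f) != wsize) { std::fclose(f); return false; }
    for (int c; (c = std::fgetc(f)) != EOF; ) info.push_back(static_cast<char>(c));
    std::fclose(f);
    return true;
}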
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_FRAMEWORK_LITE_CODE_GENERATE_CPP_H -#define ANAKIN_FRAMEWORK_LITE_CODE_GENERATE_CPP_H - -#include "saber/lite/core/common_lite.h" -#include "framework/lite/op_map.h" -#include "framework/lite/code_gen_base.h" - -namespace anakin { - -namespace lite { - -/** - * \brief class to generate cpp files. - * - */ -template -class GenCPP : public CodeGenBase { -public: - explicit GenCPP(std::string model_name, std::string model_dir, std::string precision_path, \ - std::string calibrator_path, bool flag_aot) { - - _flag_aot = flag_aot; - if (!flag_aot) { - _cpp_file_name = model_dir + '/' + model_name + ".cpp.tmp"; - _h_file_name = model_dir + '/' + model_name + ".h.tmp"; - _model_file_name = model_dir + '/' + model_name + ".bin"; - _model_opt_file_name = model_dir + '/' + model_name + ".info"; - _weight_opt_file = model_dir + '/' + model_name + ".tmp"; - _weights.open(_model_file_name); - _opt_weights.open(_weight_opt_file); - _opt_param_write.open(_model_opt_file_name); - _code_name = model_name; - _g_weights_ptr_name = _code_name+"_weights_ptr"; - _merge_opt_file = model_dir + '/' + model_name + ".lite.bin"; - _precision_path = precision_path; - _calibrator_path = calibrator_path; - } else { - - _cpp_file_name = model_dir + '/' + model_name + ".cpp"; - _h_file_name = model_dir + '/' + model_name + ".h"; - _model_file_name = model_dir + '/' + model_name + ".bin"; - _model_opt_file_name = model_dir + '/' + model_name + ".lite.tmp"; - _weight_opt_file = model_dir + '/' + model_name + ".tmp"; - - _weights.open(_model_file_name); - _opt_weights.open(_weight_opt_file); - _opt_param_write.open(_model_opt_file_name); - _code_name = model_name; - _g_weights_ptr_name = _code_name+"_weights_ptr"; - - _merge_opt_file = model_dir + '/' + model_name + ".merge.tmp"; - _precision_path = precision_path; - _calibrator_path = calibrator_path; - } - - } - ~GenCPP()=default; - - /// generate all cpp files - virtual void gen_files(const bool debug_mode) { - gen_header(); - gen_source(debug_mode); - } - -private: - void gen_license(); - void gen_header_start(); - void gen_header_end(); - void gen_source_start(); - void gen_source_end(); - - /** - * \brief generator optimized model for lite executer - */ - void gen_opt_model(); - - /** - * \brief merge info and weights to one file - */ - void gen_merge_model(); - - /** - * \brief generate tensors for edges - */ - void gen_tensors(); - - /** - * \brief initialize tensors for edges - */ - void tensors_init(); - - /** - * \brief generate model's inputs and outputs - */ - void gen_model_ios(); - - /** - * \brief initialize model's inputs and outputs - */ - void model_ios_init(); - - /** - * \brief generate operations for model - */ - virtual void gen_ops(); - - /** - * \brief generate initial impl api for model - */ - void gen_init_impl(); - - /** - * \brief generate running api impl for model - */ - void gen_run_impl(const bool debug_mode); - - - /** - * \brief generate api for model - */ - void gen_head_api(); - - /** 
- * \brief generate head api implement - */ - void gen_head_api_impl(); - - /** - * \biref generata header file - */ - void gen_header(); - - /** - * \biref generata source file - */ - void gen_source(const bool debug_mode); - -private: - std::string _cpp_file_name; - std::string _h_file_name; - std::string _model_file_name; - std::string _model_opt_file_name; - std::string _code_name; - std::string _g_weights_ptr_name; - std::string _weight_opt_file; - std::string _merge_opt_file; - std::string _precision_path; - std::string _calibrator_path; - - CodeWritter _code; - CodeWritter _opt_param_write; - WeightsWritter _weights; - WeightsWritter _opt_weights; - - bool _flag_aot{true}; -}; - -} /* namespace lite */ - -} /* namespace anakin */ - -#endif diff --git a/framework/lite/code_writter.h b/framework/lite/code_writter.h deleted file mode 100644 index 9dd03705e..000000000 --- a/framework/lite/code_writter.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_FRAMEWORK_LITE_CODE_WRITTER_H -#define ANAKIN_FRAMEWORK_LITE_CODE_WRITTER_H - -#include -#include "framework/lite/file_stream.h" - -namespace anakin { - -namespace lite { - -/** - * \brief class to help generating code string. - * - */ -class CodeWritter { -public: - CodeWritter() {} - explicit CodeWritter(std::string path) { - this->open(path); - } - - // CodeWritter open file for code generating. - void open(std::string& path, const char* file_mode = "w" ) { - _file_io.open(path, file_mode); - } - - // get CodeWritter's target name - std::string get_code_name() { - auto path = _file_io.get_file_path(); - char* file_path = strdup(path.c_str()); - char* pos_end = file_path + path.size()-1; - char* split_idx = nullptr; - while(*pos_end != '/') { - if(*pos_end == '.') { - *pos_end = '\0'; - split_idx = pos_end; - } - pos_end--; - } - std::string name = std::string(pos_end+1); - *split_idx='/'; - free(file_path); - return name; - } - - /// feed format string for code writter. - void feed(const char* format, ...) 
{ - va_list vlist; - va_start(vlist, format); - auto code_str_p = pick_format(format, vlist); - // get msg - _code< - CodeWritter& operator<<(const T& var) { - _code<open(path, file_mode); - } - - ~LiteFileIO() { - if(_file_p) { - fflush(this->_file_p); - fclose(this->_file_p); - this->_file_p = nullptr; - } - } - - // write msg to file - inline bool write(const std::string& msg) { - fprintf(this->_file_p, "%s\n", msg.c_str()); - fflush(this->_file_p); - return true; - } - - // write data list to file - inline bool write(const void* ptr, size_t size, size_t count) { - size_t ret = fwrite(ptr, size, count, this->_file_p); - fflush(this->_file_p); - if(ret != count) { - LOG(ERROR) << "Writing error " << stderr; - return false; - } - return true; - } - - // read data list from file - inline bool read(void* ptr, size_t size, size_t count) { - size_t ret = fread(ptr, size, count, this->_file_p); - if(ret != count) { - LOG(ERROR) << "Reading error " << stderr; - return false; - } - return true; - } - - inline bool is_file_open() { - return _file_p != nullptr ? true:false; - } - - inline std::string get_file_path() { - return _file_path; - } - - /// open the target file path - void open(const std::string& path, const char* file_mode) { - // close old - if(is_file_open()) { - fflush(this->_file_p); - fclose(this->_file_p); - this->_file_p = nullptr; - } - // open new - if (!this->is_file_open()) { - _file_path = path; - char* file_path = strdup(path.c_str()); - for (char* p = strchr(file_path + 1, '/'); p!=NULL; p = strchr(p + 1, '/')){ - *p = '\0'; - struct stat st; - if ((stat(file_path, &st) == 0) && (((st.st_mode) & S_IFMT) == S_IFDIR)){ - // file_path exists and is a directory. do nothing - *p = '/'; - continue; - } else { - if(mkdir(file_path,0755)==-1){ - LOG(FATAL) << "Failed to ceate the path "<< file_path; - } - } - *p = '/'; - } - free(file_path); - this->_file_p = fopen(path.c_str(), file_mode); - if (!this->_file_p){ - LOG(FATAL)<< "Failed to open " << path.c_str(); - } - } - } - -private: - std::string _file_path{""}; - FILE* _file_p{nullptr}; -}; - -} /* namespace lite */ - -} /* namespace anakin */ - -#endif diff --git a/framework/lite/generator/gen_code.sh b/framework/lite/generator/gen_code.sh deleted file mode 100755 index 401ac72eb..000000000 --- a/framework/lite/generator/gen_code.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash - -################################################# -# -# Usage: sh gen_code.sh -n -m -o -# -################################################# -# print help info -help_gen_code() { - echo "Usage: sh gen_code.sh [-h] [-n MODEL_NAME] [-m MODEL_PATH] [-p PRECISION_PATH] [-c CALIBRATOR_PATH] [-o OUTPUT_PATH] [-a AOT_MODE] [-d LOG_DEBUG_INFO]" - echo "" - echo " Generating lite code for target model." - echo "" - echo "optional arguments:" - echo "" - echo " -h help info" - echo " -n model name used as the name of generating codes." - echo " -m path to model " - echo " -p path to precision file" - echo " -c path to calibrator file" - echo " -o path to save the generating codes." - echo " -a aot mode: >0: aot mode, generate .h and .cpp; 0: general mode, generate .lite.info and .lite.bin" - echo " -d debug mode. [ default 0]" - echo " -b batch_size. 
[ default 1]" - exit 1 -} - -# generating code function -gen_code() { - if [ $# -lt 6 ]; then - exit 1 - fi - mode_name=$1 - mode_path=$2 - out_path=$3 - aot_mode=$4 - debug_mode=$5 - batch_size=$6 - prec_path=$7 - cali_path=$8 - executor="$( cd "$(dirname "$0")"/src ; pwd -P)"/anakin_lite_executer - $executor $mode_name $mode_path $out_path $aot_mode $debug_mode $batch_size $prec_path $cali_path -} - -# get args -if [ $# -lt 6 ]; then - help_gen_code - exit 1 -fi - -mode_name=0 -mode_path=0 -prec_path="" -cali_path="" -out_path="./" -aot_mode=1 -debug_mode=0 -batch_size=1 -while getopts h:n:m:p:c:o:a:d:b:hold opt -do - case $opt in - n) mode_name=$OPTARG;; - m) mode_path=$OPTARG;; - p) prec_path=$OPTARG;; - c) cali_path=$OPTARG;; - o) out_path=$OPTARG;; - a) aot_mode=$OPTARG;; - d) debug_mode=$OPTARG;; - b) batch_size=$OPTARG;; - *) help_gen_code;; - esac -done - -echo "User set model name: $mode_name" -echo "User set model path: $mode_path" -echo "User set out_path: $out_path" -echo "aot mode: $aot_mode" -echo "debug mode: $debug_mode" -echo "batch_size: $batch_size" - - -if [ -f $prec_path ];then - echo "User set precision file path: $prec_path" -fi - -if [ -f $cali_path ];then - echo "User set calibrator file path: $cali_path" -fi - -if [ ! -f $mode_path ];then - echo "mode_path: $mode_path not exists." - exit 1 -fi - -if [ ! -d $out_path ];then - echo "out path: $out_path not exists." - exit 1 -fi - -gen_code $mode_name $mode_path $out_path $aot_mode $debug_mode $batch_size $prec_path $cali_path - -rm $out_path/*.tmp -if [ $aot_mode -lt 1 ]; then - rm $out_path/*.h - rm $out_path/*.cpp -fi diff --git a/framework/lite/generator/src/anakin_lite_executer.cpp b/framework/lite/generator/src/anakin_lite_executer.cpp deleted file mode 100644 index 2c6de0a88..000000000 --- a/framework/lite/generator/src/anakin_lite_executer.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include "saber/saber_types.h" -#include "framework/lite/code_gen_cpp.h" -#include "framework/core/types.h" - -using namespace anakin; -using namespace anakin::saber; -using namespace anakin::lite; - -void anakin_lite_executer(const char* model_name, const char* model_path, const char* precision_path, \ - const char* calibrator_path, const char* output_path, const bool flag_aot, const bool debug_mode = false,\ - const int batch_size = 1) { - // constructs - GenCPP code_gen(model_name, output_path, precision_path, calibrator_path, flag_aot); - if (!code_gen.extract_graph(model_path, batch_size)) { - LOG(ERROR) << "extract error on : " << model_path; - } - // gen - code_gen.gen_files(debug_mode); -} - - -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - if (argc < 6) { - LOG(ERROR) << "Some arguments not supplied!"; - LOG(ERROR) << "usage: " << argv[0] << " model_name model_weights_path(xxx.anakin.bin) output_path aot_mode debug_mode batch_size precision_path calibrator_path"; - LOG(ERROR) << "model_name: output lib and api name"; - LOG(ERROR) << "model_weights_path: path to your anakin model"; - LOG(ERROR) << "output_path: output path"; - LOG(ERROR) << "aot_mode: >0: aot mode, generate .h and .cpp; 0: general mode, generate .lite.info and .lite.bin"; - LOG(ERROR) << "debug_mode: debug mode, only for aot mode, 0:no debug info, 1:with debug info"; - LOG(ERROR) << "batch_size: default 1"; - LOG(ERROR) << "precision_path: precision file path"; - LOG(ERROR) << "calibrator_path: calirator file path"; - - return 1; - } - const char* model_name = argv[1]; - const char* model_path = argv[2]; - const char* 
output_path = argv[3]; - bool flag_aot = atoi(argv[4]) > 0; - bool flag_debug = false; - if (argc > 5) { - flag_debug = atoi(argv[5]) > 0; - } - int batch_size = 1; - if (argc > 6){ - batch_size = atoi(argv[6]); - } - const char* precision_path = ""; - if (argc > 7){ - precision_path = argv[7]; - } - const char* calibrator_path = ""; - if (argc > 8){ - calibrator_path = argv[8]; - } - anakin_lite_executer(model_name, model_path, precision_path, calibrator_path,\ - output_path, flag_aot, flag_debug, batch_size); - return 0; -} diff --git a/framework/lite/op_map.h b/framework/lite/op_map.h deleted file mode 100644 index f4aca6c89..000000000 --- a/framework/lite/op_map.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_FRAMEWORK_LITE_OPERATION_MAP_H -#define ANAKIN_FRAMEWORK_LITE_OPERATION_MAP_H - -#include -#include - -#include "framework/lite/code_writter.h" -#include "framework/lite/binary_writter.h" - -namespace anakin { - -namespace lite { - -template -inline T get_attr(std::string attr_name, graph::AttrInfo& attrs) { - if (!attrs.inspect(attr_name)) { - LOG(FATAL) << "Target attr name(" << attr_name << ") not found."; - return T(); - } - return attrs.get(attr_name); -} - -inline SaberStatus find_attr(std::string attr_name, graph::AttrInfo& attrs) { - if (!attrs.inspect(attr_name)) { - LOG(WARNING) << "Target attr name(" << attr_name << ") not found."; - return SaberUnImplError; - } - return SaberSuccess; -} - -/// function type for parser -typedef std::function ParseParamFunctor; -/** - * \brief class OpParser - */ -struct OpParser { - std::string OpClassName; - ParseParamFunctor parse; -}; - -/// operations map -extern std::unordered_map OPERATION_MAP; - -} /* namespace lite */ - -} /* namespace anakin */ - -#endif diff --git a/framework/lite/op_map_cpp.cpp b/framework/lite/op_map_cpp.cpp deleted file mode 100755 index cad264046..000000000 --- a/framework/lite/op_map_cpp.cpp +++ /dev/null @@ -1,2381 +0,0 @@ -#include "framework/lite/op_map.h" -#include "framework/lite/utils.h" - -namespace anakin { - -namespace lite { - -//using namespace anakin; -//using namespace anakin::lite; - -std::string not_impl_yet(graph::AttrInfo&, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - LOG(INFO) << "Target "<< op_class_name << "Parsing not impl yet. 
continue ..."; - return ""; -} - -// SaberConv2D -std::string ParserConvolution(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - 0, //flag_eltwise - 0, //set flag_act true - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0, - "false", //flag_eltwise - "false", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - /* - if (gen_param) { - // gen cpp code - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0); - } else { - // gen cpp code - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d);\n", node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? 
offset_info.weights[1].offset : 0); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - */ - - return code_w.get_code_string(); -} - // SaberPower -std::string ParserPower(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto power = get_attr("power", attr); - auto scale = get_attr("scale", attr); - auto shift = get_attr("shift", attr); - - // gen cpp code - CodeWritter code_w; - - if (gen_param) { - code_w.feed("%f %f %f\n", scale, shift, power); - } else { - code_w.feed("ParamBase* %s_param = new PowerParam(%f,%f,%f);\n", node_name.c_str(), scale, shift, power); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); - } -// SaberDeconv2D -std::string ParserDeconvolution(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = filter_num;//*weights_shape[1]; - - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - 0, //flag_eltwise - 0, //set flag_act - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? 
offset_info.weights[1].offset : 0, - "false", //flag_eltwise - "false", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - /* - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d);\n", node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - */ - return code_w.get_code_string(); -} - -// ParserDeConvolutionRelu -std::string ParserDeConvolutionRelu(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = filter_num;//*weights_shape[1]; - - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - 0, //flag_eltwise - 1, //set flag_act true - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? 
offset_info.weights[1].offset : 0, - "false", //flag_eltwise - "true", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - /* - // gen cpp code - CodeWritter code_w; - if(gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - (int)Active_relu, - 1, //set flag_relu true - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0); - } else { - code_w.feed("ParamBase* %s_param = new ConvAct2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,Active_relu,%s,%s+%d,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - "true", //set flag_relu true - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - */ - return code_w.get_code_string(); -} - -// ParserConvolutionRelu -std::string ParserConvolutionRelu(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - 0, //flag_eltwise - 1, //set flag_act true - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 
"true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0, - "false", //flag_eltwise - "true", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - /* - // gen cpp code - CodeWritter code_w; - if(gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - (int)Active_relu, - 1, //set flag_relu true - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0); - } else { - code_w.feed("ParamBase* %s_param = new ConvAct2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,Active_relu,%s,%s+%d,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - "true", //set flag_relu true - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - */ - return code_w.get_code_string(); -} - -// ParserConvAct //also with eltwise -std::string ParserConvAct(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - // get act param - ActiveType act_type = Active_unknow; - std::string act_type_str; - bool act_shared = false; - int act_weights_offset = 0; - auto type = get_attr("act_0_type", attr); - if (type == "TanH") { - act_type = Active_tanh; - act_type_str = "Active_tanh"; - //LOG(FATAL) << "Activation TanH not supported now."; - } else if (type == "Sigmoid") { - act_type = Active_sigmoid; - act_type_str = "Active_sigmoid"; - //LOG(FATAL) << "Activation Sigmoid not supported now."; - } else if (type == "PReLU") { - act_type = Active_prelu; - act_shared = get_attr("act_0_channel_shared", attr); - auto prelu_weights = get_attr>("act_0_weight_1", attr); - writter.register_weights(node_name, prelu_weights); - LOG(INFO) << node_name << " write weights: " << prelu_weights.count(); - auto offset_info_1 = writter.get_weights_by_name(node_name); - act_weights_offset = offset_info_1.weights[2].offset; 
- act_type_str = "Active_prelu"; - } else if (type == "Stanh") { - LOG(FATAL) << "Activation Stanh not supported now."; - } else if (type == "Relu") { - act_type = Active_relu; - act_type_str = "Active_relu"; - } else if (type == "ClippedRelu") { - LOG(FATAL) << "Activation ClippedRelu not supported now."; - } else if (type == "Elu") { - LOG(FATAL) << "Activation Elu not supported now."; - } else { - LOG(FATAL) << "Other Activation type" << type << " should be replace by other ops."; - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - 0, //flag_eltwise - 1, //set flag_act true - (int)act_type, - 0.f, //neg slope - 0.f, //act_coef - act_shared, //prelu, channel_shared - act_weights_offset/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0, - "false", //flag_eltwise - "true", //set flag_act true - act_type_str.c_str(), 0.f, 0.f, act_shared? "true" : "false", weights_ptr_name.c_str(), act_weights_offset); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - return code_w.get_code_string(); -} - -// ParserConvolutionRelu -std::string ParserConvolutionReluPool(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - - writter.register_weights(node_name, weights); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - } - - // parsing pooling parameter - auto global_pooling = get_attr("pooling_0_global_pooling", attr); - auto pool_padding = get_attr>("pooling_0_padding", attr); - auto pool_strides = get_attr>("pooling_0_strides", attr); - auto pool_size = get_attr>("pooling_0_pool_size", attr); - auto pool_method = get_attr("pooling_0_method", attr); - - std::string str_pool_method; - - PoolingType pool_type; - if (pool_method == "MAX") { - pool_type = Pooling_max; - str_pool_method = "Pooling_max"; - } - if (pool_method == "AVG") { - pool_type = 
Pooling_average_include_padding; - str_pool_method = "Pooling_average_include_padding"; - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d %d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - 0, //flag_eltwise - 1, //set flag_act true - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0,/*prelu weights*/ - (int)pool_type, - global_pooling? 1 : 0, - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0]); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d,%s,%s,%d,%d,%d,%d,%d,%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0, - "false", //flag_eltwise - "true", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0, - str_pool_method.c_str(), global_pooling? "true" : "false", - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0]); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - /* - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? 1 : 0, - (int)Active_relu, - 1, //set flag_relu true - (int)pool_type, - global_pooling? 1 : 0, - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0], - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0); - } else { - code_w.feed("ParamBase* %s_param = new ConvActPool2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,Active_relu,%s,%s,%s,%d,%d,%d,%d,%d,%d,%s+%d,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - bias_term ? "true":"false", - "true", //set flag_relu true - str_pool_method.c_str(), - global_pooling? "true" : "false", - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0], - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? 
offset_info.weights[1].offset : 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - */ - return code_w.get_code_string(); -} - -//conv batchnorm -std::string ParserConvBatchnorm(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - 1, //BIAS term - offset_info.weights[0].offset, - offset_info.weights[1].offset, - 0, //flag_eltwise - 0, //set flag_act true - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - "true", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - offset_info.weights[1].offset, - "false", //flag_eltwise - "false", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - /* - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - 1,//bias term always true - offset_info.weights[0].offset, - offset_info.weights[1].offset); //always has bias - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d);\n", node_name.c_str(), \ - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - "true",//bias term always true - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - offset_info.weights[1].offset); //always has bias - code_w.feed(" 
%s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - */ - return code_w.get_code_string(); -} - -std::string ParserConvBatchnormScale(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); -// gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - 1, //BIAS term - offset_info.weights[0].offset, - offset_info.weights[1].offset, - 0, //flag_eltwise - 0, //set flag_act false - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - "true", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - offset_info.weights[1].offset, - "false", //flag_eltwise - "false", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - return code_w.get_code_string(); -} - -// SaberConvBatchnormScaleRelu -std::string ParserConvBatchnormScaleRelu(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << 
" write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); - // gen cpp code - - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - 1, //BIAS term - offset_info.weights[0].offset, - offset_info.weights[1].offset, - 0, //flag_eltwise - 1, //set flag_act false - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0/*prelu weights*/); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - "true", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - offset_info.weights[1].offset, - "false", //flag_eltwise - "true", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - return code_w.get_code_string(); -} - -// SaberConvBatchnormScaleRelu -std::string ParserConvBatchnormScaleReluPool(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - auto bias_term = get_attr("bias_term", attr); - auto padding = get_attr>("padding", attr); - auto strides = get_attr>("strides", attr); - auto dilation_rate = get_attr>("dilation_rate", attr); - auto filter_num = get_attr("filter_num", attr); - auto kernel_size = get_attr>("kernel_size", attr); - auto axis = get_attr("axis", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count();//weights_shape[2]*weights_shape[3]; - int num_output = weights_shape[0];//*weights_shape[1]; - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - // parsing pooling parameter - auto global_pooling = get_attr("pooling_0_global_pooling", attr); - auto pool_padding = get_attr>("pooling_0_padding", attr); - auto pool_strides = get_attr>("pooling_0_strides", attr); - auto pool_size = get_attr>("pooling_0_pool_size", attr); - auto pool_method = get_attr("pooling_0_method", attr); - - std::string str_pool_method; - PoolingType pool_type; - if (pool_method == "MAX") { - pool_type = Pooling_max; - str_pool_method = "Pooling_max"; - } - if (pool_method == "AVG") { - pool_type = Pooling_average_include_padding; - str_pool_method = "Pooling_average_include_padding"; - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %f %f %d %d 
%d %d %d %d %d %d %d %d\n", - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - 1, //bias term - offset_info.weights[0].offset, - offset_info.weights[1].offset, - 0, //flag_eltwise - 1, //set flag_act true - (int)Active_relu, - 0.f, //neg slope - 0.f, //act_coef - 0, //prelu, channel_shared - 0,/*prelu weights*/ - (int)pool_type, - global_pooling? 1 : 0, - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0]); - } else { - code_w.feed("ParamBase* %s_param = new Conv2DParam(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s+%d,%s+%d,%s,%s,%s,%f,%f,%s,%s+%d,%s,%s,%d,%d,%d,%d,%d,%d);\n", - node_name.c_str(), - weights_size, - num_output, - group, - kernel_size[1], - kernel_size[0], - strides[1], - strides[0], - padding[1], - padding[0], - dilation_rate[1], - dilation_rate[0], - "true", - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - offset_info.weights[1].offset, - "false", //flag_eltwise - "true", //set flag_act true - "Active_relu", 0.f, 0.f, "false", weights_ptr_name.c_str(), 0, - str_pool_method.c_str(), global_pooling? "true" : "false", - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0]); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - return code_w.get_code_string(); -} - -// SaberConcat -std::string ParserConcat(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto axis = get_attr("axis", attr); - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d\n", axis); - } else { - code_w.feed("ParamBase* %s_param = new ConcatParam(%d);\n", - node_name.c_str(), axis); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberDectionOutput -std::string ParserDectionOutput(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto flag_share_location = get_attr("share_location", attr); - auto flag_var_in_target = get_attr("variance_encode_in_target", attr); - auto classes_num = get_attr("class_num", attr); - auto background_id = get_attr("background_id", attr); - auto keep_top_k = get_attr("keep_top_k", attr); - auto code_type = get_attr("code_type", attr); - auto conf_thresh = get_attr("conf_thresh", attr); - auto nms_top_k = get_attr("nms_top_k", attr); - auto nms_thresh = get_attr("nms_thresh", attr); - auto nms_eta = get_attr("nms_eta", attr); - - CodeType cd_type; - if (code_type == "CORNER") { - cd_type = CORNER; - } else if (code_type == "CORNER_SIZE") { - cd_type = CORNER_SIZE; - } else if (code_type == "CENTER_SIZE") { - cd_type = CENTER_SIZE; - } else { - LOG(FATAL) << "unsupport code type in detection output param: " << code_type; - } - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %f %d %d %d %d %f %f %d %d\n", - classes_num, - conf_thresh, - nms_top_k, - background_id, - keep_top_k, - (int)cd_type, - nms_thresh, - nms_eta, - flag_share_location? 1 : 0, - flag_var_in_target? 
1 : 0); - } else { - code_w.feed("ParamBase* %s_param = new DetectionOutputParam(%d,%f,%d,%d,%d,%s,%f,%f,%s,%s);\n", - node_name.c_str(), - classes_num, - conf_thresh, - nms_top_k, - background_id, - keep_top_k, - code_type.c_str(), - nms_thresh, - nms_eta, - flag_share_location? "true" : "false", - flag_var_in_target? "true" : "false"); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberEltwise -std::string ParserEltwise(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto type = get_attr("type", attr); - auto coeff = get_attr>("coeff", attr); - - std::string eltwise_type_str("Eltwise_unknow"); - EltwiseType et_type; - if (type == "Add") { - eltwise_type_str = "Eltwise_sum"; - et_type = Eltwise_sum; - } else if (type == "Max") { - eltwise_type_str = "Eltwise_max"; - et_type = Eltwise_max; - } else { - eltwise_type_str = "Eltwise_prod"; - et_type = Eltwise_prod; - } - - CodeWritter coeff_vec_code; - coeff_vec_code<<"{"; - for (int i=0; i 0) { - coeff_vec_code<("type", attr); - auto coeff = get_attr>("coeff", attr); - - std::string eltwise_type_str("Eltwise_unknow"); - EltwiseType et_type; - if (type == "Add") { - eltwise_type_str = "Eltwise_sum"; - et_type = Eltwise_sum; - } else if (type == "Max") { - eltwise_type_str = "Eltwise_max"; - et_type = Eltwise_max; - } else { - eltwise_type_str = "Eltwise_prod"; - et_type = Eltwise_prod; - } - - CodeWritter coeff_vec_code; - coeff_vec_code<<"{"; - for (int i=0; i 0) { - coeff_vec_code<("type", attr); - auto coeff = get_attr>("coeff", attr); - - std::string eltwise_type_str("Eltwise_unknow"); - EltwiseType et_type; - if (type == "Add") { - eltwise_type_str = "Eltwise_sum"; - et_type = Eltwise_sum; - } else if (type == "Max") { - eltwise_type_str = "Eltwise_max"; - et_type = Eltwise_max; - } else { - eltwise_type_str = "Eltwise_prod"; - et_type = Eltwise_prod; - } - - CodeWritter coeff_vec_code; - coeff_vec_code<<"{"; - for (int i=0; i 0) { - coeff_vec_code<("prelu_0_channel_shared", attr); - // auto prelu_weights = get_attr("weights", attr); - auto prelu_weights = get_attr>("prelu_0_weight_1", attr); - - writter.register_weights(node_name, prelu_weights); - LOG(INFO) << node_name << " write weights: " << prelu_weights.count(); - - auto offset_info = writter.get_weights_by_name(node_name); - // gen cpp code - CodeWritter code_w; - - if (gen_param) { - code_w.feed("%d %d ", (int)et_type, - coeff.size()); - for (int i = 0; i < coeff.size(); ++i) { - code_w << coeff[i] << " "; - } - code_w << (int)Active_prelu << " " << 0.f << " " << 0.f << " " << \ - (prelu_channel_shared ? 1 : 0) << " " << offset_info.weights[0].offset <<"\n"; - //code_w << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new EltwiseActParam(%s, %s, %s, %f, %f, %s, %s+%d);\n", - node_name.c_str(), - eltwise_type_str.c_str(), - coeff_vec_code.get_code_string().c_str(), - "Active_prelu", - 0.f, - 0.f, - (prelu_channel_shared ? 
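
The eltwise parsers build the coefficient list as a brace-initializer string that is pasted verbatim into the generated constructor call. The helper below is a self-contained sketch of that string builder only; the name is invented, and the real code writes through CodeWritter rather than a std::ostringstream.

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Joins values into a C++ brace-initializer literal, e.g. {1,-1}.
std::string to_brace_list(const std::vector<float>& v) {
    std::ostringstream os;
    os << "{";
    for (std::size_t i = 0; i + 1 < v.size(); ++i) {
        os << v[i] << ",";      // every element except the last gets a comma
    }
    if (!v.empty()) {
        os << v.back();         // last element, no trailing comma
    }
    os << "}";
    return os.str();
}

int main() {
    std::cout << to_brace_list({1.f, -1.f}) << "\n";   // prints {1,-1}
    return 0;
}
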
"true" : "false"), - weights_ptr_name.c_str(), - offset_info.weights[0].offset); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberActivation -std::string ParserActivation(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto type = get_attr("type", attr); - - std::string act_type("Active_unknow"); - - //! ActiveType act_type, float neg_slope = 0.f, float coef = 1.f, bool channel_shared = false, const float* weights = nullptr - // gen cpp code - CodeWritter code_w; - if (type == "TanH") { - if (gen_param) { - code_w << (int)Active_tanh << " " << 0.f << " " << 0.f << " " << 0 << " " << 0 << "\n"; - } else { - act_type = "Active_tanh"; - code_w.feed("ParamBase* %s_param = new ActivationParam(%s);\n", - node_name.c_str(), - act_type.c_str()); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - } else if (type == "Sigmoid") { - if (gen_param) { - code_w << (int)Active_sigmoid << " " << 0.f << " " << 0.f << " " << 0 << " " << 0 << "\n"; - } else { - act_type = "Active_sigmoid"; - code_w.feed("ParamBase* %s_param = new ActivationParam(%s);\n", - node_name.c_str(), - act_type.c_str()); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - } else if (type == "ReLU") { - if (gen_param) { - code_w << (int)Active_relu << " " << 0.f << " " << 0.f << " " << 0 << " " << 0 << "\n"; - } else { - act_type = "Active_relu"; - code_w.feed("ParamBase* %s_param = new ActivationParam(%s);\n", - node_name.c_str(), - act_type.c_str()); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - } else if (type == "PReLU") { - act_type = "Active_prelu"; - auto prelu_channel_shared = get_attr("channel_shared", attr); - // auto prelu_weights = get_attr("weights", attr); - auto prelu_weights = get_attr>("weight_1", attr); - - writter.register_weights(node_name, prelu_weights); - LOG(INFO) << node_name << " write weights: " << prelu_weights.count(); - - auto offset_info = writter.get_weights_by_name(node_name); - if (gen_param) { - code_w << (int)Active_prelu << " " << 0.f << " " << 0.f << " " << \ - (prelu_channel_shared ? 1 : 0) << " " << offset_info.weights[0].offset << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new ActivationParam(%s, %f, %f, %s, %s+%d);\n", - node_name.c_str(), - act_type.c_str(), - 0.f, - 0.f, - prelu_channel_shared ? 
"true" : "false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - } else { - LOG(FATAL) << "Other Activation type" << type << " unknown."; - } - return code_w.get_code_string(); -} - -std::string ParserRelu(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, bool gen_param) { - // parsing parameter - auto alpha = get_attr("alpha", attr); - - std::string act_type("Active_relu"); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w << (int)Active_relu << " " << 0.f << " " << 0.f << " " << 0 << " " << 0 << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new ActivationParam(%s);\n", - node_name.c_str(), - act_type.c_str()); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - - return code_w.get_code_string(); -} - -// SaberFc -std::string ParserFc(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto axis = get_attr("axis", attr); - auto out_dim = get_attr("out_dim", attr); - auto bias_term = get_attr("bias_term", attr); - - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count(); - - writter.register_weights(node_name, weights); - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - writter.register_weights(node_name, bias); - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d\n", - axis, - out_dim, - bias_term ? 1 : 0, - weights_size, - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - 0); - } else { - code_w.feed("ParamBase* %s_param = new FcParam(%d,%d,%s,%d,%s+%d,%s+%d,%s);\n", - node_name.c_str(), - axis, - out_dim, - bias_term ? "true":"false", - weights_size, - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0, - "false"); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberPermute -std::string ParserPermute(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto dims = get_attr>("dims", attr); - - CodeWritter dims_vec_code; - dims_vec_code<<"{"; - for (int i=0; i 0) { - dims_vec_code<("global_pooling", attr); - auto pool_padding = get_attr>("padding", attr); - auto pool_strides = get_attr>("strides", attr); - auto pool_size = get_attr>("pool_size", attr); - auto pool_method = get_attr("method", attr); - - PoolingType pool_type; - std::string str_pool_method; - if (pool_method == "MAX") { - pool_type = Pooling_max; - str_pool_method = "Pooling_max"; - } - if (pool_method == "AVG") { - pool_type = Pooling_average_include_padding; - str_pool_method = "Pooling_average_include_padding"; - } - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d %d\n", - (int)pool_type, - global_pooling ? 
1 : 0, - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0]); - } else { - code_w.feed("ParamBase* %s_param = new PoolParam(%s,%s,%d,%d,%d,%d,%d,%d);\n", - node_name.c_str(), - str_pool_method.c_str(), - global_pooling ? "true" : "false", - pool_size[1], - pool_size[0], - pool_strides[1], - pool_strides[0], - pool_padding[1], - pool_padding[0]); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberPrelu -std::string ParserPrelu(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto channel_shared = get_attr("channel_shared", attr); - - auto weights = get_attr>("weight_1", attr); - writter.register_weights(node_name, weights); - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w << (int)Active_prelu << " " << 0.f << " " << 0.f << " " << \ - (channel_shared ? 1 : 0) << " " << offset_info.weights[0].offset << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new ActivationParam(%s, %f, %f, %s, %s+%d);\n", - node_name.c_str(), - "Active_prelu", - 0.f, - 0.f, - channel_shared ? "true" : "false", - weights_ptr_name.c_str(), - offset_info.weights[0].offset); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberPriorBox -std::string ParserPriorBox(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto min_size = get_attr>("min_size", attr); - auto max_size = get_attr>("max_size", attr); - auto as_ratio = get_attr>("aspect_ratio", attr); - //add - std::vector fixed_size, fixed_ratio, density; - if (find_attr("fixed_size", attr) == SaberSuccess) { - auto fix_size = get_attr>("fixed_size", attr); - fixed_size = fix_size.vector(); - } - - if (find_attr("fixed_ratio", attr) == SaberSuccess) { - auto fix_ratio = get_attr>("fixed_ratio", attr); - fixed_ratio = fix_ratio.vector(); - } - - if (find_attr("density", attr) == SaberSuccess) { - auto den = get_attr>("density", attr); - density = den.vector(); - } - - auto flip_flag = get_attr("is_flip", attr); - auto clip_flag = get_attr("is_clip", attr); - auto var = get_attr>("variance", attr); - auto image_h = get_attr("img_h", attr); - auto image_w = get_attr("img_w", attr); - auto step_h = get_attr("step_h", attr); - auto step_w = get_attr("step_w", attr); - auto offset = get_attr("offset", attr); - auto order = get_attr>("order", attr); - - std::vector order_; - CodeWritter order_string; - order_string << "{"; - - int order_size = order.size(); - for (int i = 0; i < order_size - 1; i++) { - if (order[i] == "MIN") { - order_.push_back(PRIOR_MIN); - order_string << "PRIOR_MIN, "; - } else if (order[i] == "MAX") { - order_.push_back(PRIOR_MAX); - order_string << "PRIOR_MAX, "; - } else if (order[i] == "COM") { - order_.push_back(PRIOR_COM); - order_string << "PRIOR_COM, "; - } - } - if (order[order_size - 1] == "MIN") { - order_.push_back(PRIOR_MIN); - order_string << "PRIOR_MIN"; - } else if (order[order_size - 1] == "MAX") { - order_.push_back(PRIOR_MAX); - order_string << "PRIOR_MAX"; - } else if (order[order_size - 1] == 
"COM") { - order_.push_back(PRIOR_COM); - order_string << "PRIOR_COM"; - } - - order_string << "}"; - - auto gen_vec_code_0 = [](PTuple ptuple) -> std::string { - CodeWritter dims_vec_code; - dims_vec_code<<"{"; - for (int i=0; i 0) { - dims_vec_code< ptuple) -> std::string { - CodeWritter dims_vec_code; - dims_vec_code<<"{"; - for (int i=0; i 0) { - dims_vec_code<("slice_dim", attr); - auto slice_point = get_attr>("slice_point", attr); - auto axis = get_attr("axis", attr); - - CodeWritter slice_point_vec_code; - slice_point_vec_code<<"{"; - for (int i=0; i 0) { - slice_point_vec_code<("num_axes", attr); - auto axis = get_attr("axis", attr); - auto bias_term = get_attr("bias_term", attr); - auto weights = get_attr>("weight_1", attr); - auto weights_shape = weights.shape(); - int weights_size = weights_shape.count(); - - writter.register_weights(node_name, weights); - LOG(INFO) << node_name << " write weights: " << weights.count(); - - int bias_size = 0; - if (bias_term) { - auto bias = get_attr>("weight_2", attr); - auto bias_shape = bias.shape(); - bias_size = bias_shape.count(); - writter.register_weights(node_name, bias); - LOG(INFO) << node_name << " write bias: " << bias.count(); - } - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d\n", - offset_info.weights[0].offset, - bias_term ? offset_info.weights[1].offset : 0, - weights_size, - bias_size, - bias_term ? 1 : 0, - axis, - num_axes); - } else { - code_w.feed("ParamBase* %s_param = new ScaleParam(%s+%d, %s+%d, %d, %d, %s, %d, %d);\n", - node_name.c_str(), - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - bias_term ? offset_info.weights[1].offset : 0, - weights_size, - bias_size, - bias_term ? "true":"false", - axis, - num_axes); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberScale -std::string ParserBatchNorm(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - - // get batchnorm param - auto eps = get_attr("epsilon", attr); - auto momentum = get_attr("momentum", attr); - auto mean = get_attr>("weight_1", attr); - auto mean_vec = mean.vector(); - auto var = get_attr>("weight_2", attr); - auto var_vec = var.vector(); - auto scale_factor = get_attr>("weight_3", attr); - auto scale_factor_vec = scale_factor.vector(); - - std::vector scale; - std::vector bias; - scale.resize(mean.count()); - bias.resize(mean.count()); - auto scale_val = scale_factor_vec[0] == 0 ? 
0 : 1 / scale_factor_vec[0]; - - for (int i = 0; i < mean.count(); i++) { - scale[i] = 1.0f / std::sqrt(var_vec[i] * scale_val + eps); - bias[i] = - mean_vec[i] * scale_val / std::sqrt(var_vec[i] * scale_val + eps); - } - - Shape sh1({1, 1, 1, scale.size()}); - Shape sh2({1, 1, 1, bias.size()}); - PBlock pscale(sh1); - PBlock pbias(sh2); - float* pscale_ptr = (float*)pscale.h_tensor().mutable_data(); - for (int j = 0; j < scale.size(); ++j) { - pscale_ptr[j] = scale[j]; - } - float* pbias_ptr = (float*)pbias.h_tensor().mutable_data(); - for (int j = 0; j < bias.size(); ++j) { - pbias_ptr[j] = bias[j]; - } - writter.register_weights(node_name, pscale); - LOG(INFO) << node_name << " write weights: " << pscale.count(); - - writter.register_weights(node_name, pbias); - LOG(INFO) << node_name << " write bias: " << pbias.count(); - - auto weights_shape = pscale.shape(); - int weights_size = weights_shape.count(); - - auto bias_shape = pbias.shape(); - int bias_size = bias_shape.count(); - - auto offset_info = writter.get_weights_by_name(node_name); - - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w.feed("%d %d %d %d %d %d %d\n", - offset_info.weights[0].offset, - offset_info.weights[1].offset, - weights_size, - bias_size, - 1, - 1, - 1); - } else { - code_w.feed("ParamBase* %s_param = new ScaleParam(%s+%d, %s+%d, %d, %d, %s, %d, %d);\n", - node_name.c_str(), - weights_ptr_name.c_str(), - offset_info.weights[0].offset, - weights_ptr_name.c_str(), - offset_info.weights[1].offset, - weights_size, - bias_size, - "true", - 1, - 1); - - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberSoftmax -std::string ParserSoftmax(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto axis = get_attr("axis", attr); - - // gen cpp code - CodeWritter code_w; - - if (gen_param) { - code_w << axis; - code_w << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new SoftmaxParam(%d);\n", - node_name.c_str(), - axis); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberShuffleChannel -std::string ParserShuffleChannel(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto group = get_attr("group", attr); - - // gen cpp code - CodeWritter code_w; - - if (gen_param) { - code_w << group; - code_w << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new ShuffleChannelParam(%d);\n", - node_name.c_str(), - group); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberSplit -std::string ParserSplit(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - // no param - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new SplitParam;\n", - node_name.c_str()); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// 
SaberFlatten -std::string ParserFlatten(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - // no param - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new FlattenParam;\n", - node_name.c_str()); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// Parser reshape -std::string ParserReshape(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto dims = get_attr>("dims", attr); - std::vector vdims = dims.vector(); - - CodeWritter reshape_dims_vec_code; - reshape_dims_vec_code << "{"; - for (int i = 0; i < vdims.size() - 1; i++) { - reshape_dims_vec_code << vdims[i] << ","; - } - if (vdims.size() > 0) { - reshape_dims_vec_code << vdims[vdims.size() - 1] << "}"; - } else { - reshape_dims_vec_code<< "}"; - } - - CodeWritter code_w; - if (gen_param) { - code_w << dims.size() << " "; - for (int i = 0; i < dims.size(); ++i) { - code_w << dims[i] << " "; - } - code_w << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new ReshapeParam(%s);\n", node_name.c_str(), reshape_dims_vec_code.get_code_string().c_str()); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} - -// SaberResize -std::string ParserResize(graph::AttrInfo& attr, - std::string& code_name, - std::string& op_class_name, - std::string& node_name, - std::string& weights_ptr_name, - WeightsWritter& writter, - bool gen_param) { - // parsing parameter - auto width_scale = get_attr("width_scale", attr); - auto height_scale = get_attr("height_scale", attr); - // gen cpp code - CodeWritter code_w; - if (gen_param) { - code_w << width_scale << " " << height_scale; - code_w << "\n"; - } else { - code_w.feed("ParamBase* %s_param = new ResizeParam(%f, %f);\n", - node_name.c_str(), - width_scale, - height_scale); - code_w.feed(" %s_g_param.push_back(%s_param);\n", code_name.c_str(), node_name.c_str()); - } - return code_w.get_code_string(); -} -std::unordered_map OPERATION_MAP({ - {"Input", {"Input", not_impl_yet} }, - {"Convolution", {"SaberConv2D", ParserConvolution} }, // done - {"Deconvolution", {"SaberDeconv2D", ParserDeconvolution}}, //done - {"DeconvRelu", {"SaberDeconv2D", ParserDeConvolutionRelu}}, //done - {"Activation", {"SaberActivation", ParserActivation} }, // done - {"ReLU", {"SaberActivation",ParserRelu}}, // done - {"ConvRelu", {"SaberConv2D", ParserConvolutionRelu} }, // done - {"ConvAct", {"SaberConv2D", ParserConvAct} }, // done - {"ConvReluPool", {"SaberConvPooling2D", ParserConvolutionReluPool} }, // done - {"ConvBatchnormScaleRelu", {"SaberConv2D", ParserConvBatchnormScaleRelu}}, // done have question ?? - {"ConvBatchnormScaleReluPool", {"SaberConvPooling2D", ParserConvBatchnormScaleReluPool}}, // done have question ?? 
- {"ConvBatchnormScale", {"SaberConv2D", ParserConvBatchnormScale}}, //done - {"ConvBatchnorm", {"SaberConv2D", ParserConvBatchnorm}}, //done - {"Concat", {"SaberConcat", ParserConcat} }, // done - {"DetectionOutput", {"SaberDetectionOutput", ParserDectionOutput} }, // done - {"Eltwise", {"SaberEltwise", ParserEltwise} }, //done - {"EltwiseRelu", {"SaberEltwiseAct", ParserEltwiseRelu}}, // done - {"EltwiseActivation", {"SaberEltwiseAct", ParserEltwisePRelu}}, // done - {"Dense", {"SaberFc", ParserFc} }, // done - {"Permute", {"SaberPermute", ParserPermute} }, // done - {"Pooling", {"SaberPooling", ParserPooling} }, // done - {"PReLU", {"SaberPrelu", ParserPrelu} }, // done - {"PriorBox", {"SaberPriorBox", ParserPriorBox} }, // done - {"Power", {"SaberPower", ParserPower} }, // done - {"Scale", {"SaberScale", ParserScale} }, // done - {"BatchNorm", {"SaberScale", ParserBatchNorm} }, // done - {"Slice", {"SaberSlice", ParserSlice} }, // done - {"Flatten", {"SaberFlatten", ParserFlatten}}, //done - {"Reshape", {"SaberReshape", ParserReshape}}, //done - {"Softmax", {"SaberSoftmax", ParserSoftmax}}, //done - {"Split", {"SaberSplit", ParserSplit}}, // done - {"ShuffleChannel", {"SaberShuffleChannel", ParserShuffleChannel}}, // done - {"Resize", {"SaberResize", ParserResize}}, //done -}); - -} /* namespace lite */ - -} /* namespace anakin */ - diff --git a/framework/lite/utils.h b/framework/lite/utils.h deleted file mode 100644 index 56d63ca46..000000000 --- a/framework/lite/utils.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_FRAMEWORK_LITE_UTILS_H -#define ANAKIN_FRAMEWORK_LITE_UTILS_H - -#include -#include - -namespace anakin { - -namespace lite { - -/** - * \brief update conv weights with batchnorm and scale parameters. - */ -template -void update_weights(PBlock weights, PBlock bias, - int n, int c, int h, int w, bool conv_bias_term, - float batchnorm_scale, float batchnorm_eps, - std::vector batchnorm_mean, - std::vector batchnorm_variance, - std::vector scale_w, - std::vector scale_b, - bool scale_bias_term) { - float* weights_p = (float*)weights.h_tensor().mutable_data(); - size_t type_size = weights.h_tensor().get_dtype_size(); - if (!conv_bias_term) { - bias.re_alloc(Shape({1,batchnorm_mean.size(),1,1})); - void* new_bias_data = bias.h_tensor().mutable_data(); - memset(new_bias_data, 0, type_size * bias.h_tensor().size()); - } - float* bias_p = (float*)bias.h_tensor().mutable_data(); - - batchnorm_scale = (batchnorm_scale == 0) ? 
1.f : 1.f / batchnorm_scale; - int chw = c*h*w; - for (int i=0; i -void update_weights(PBlock weights, PBlock bias, - int n, int c, int h, int w, bool conv_bias_term, - float batchnorm_scale, float batchnorm_eps, - std::vector batchnorm_mean, - std::vector batchnorm_variance) { - float* weights_p = (float*)weights.h_tensor().mutable_data(); - size_t type_size = weights.h_tensor().get_dtype_size(); - if (!conv_bias_term) { - bias.re_alloc(Shape({1,batchnorm_mean.size(),1,1})); - void* new_bias_data = bias.h_tensor().mutable_data(); - memset(new_bias_data, 0, type_size * bias.h_tensor().size()); - } - float* bias_p = (float*)bias.h_tensor().mutable_data(); - batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; - int chw = c * h * w; - for (int i = 0; i < n; i++) { - float alpha = 1.f; - float beta = 0.f; - // insert batchnorm parameters - alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; - alpha = 1.f / sqrtf(alpha); - beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); - beta = beta * alpha; - for (int j = 0; j < chw; j++) { - weights_p[i * chw + j] *= alpha; - } - bias_p[i] *= alpha; - bias_p[i] += beta; - } -} - -} /* namespace lite */ - -} /* namespace anakin */ - -#endif diff --git a/framework/model_parser/CMakeLists.txt b/framework/model_parser/CMakeLists.txt deleted file mode 100644 index c6bc3e721..000000000 --- a/framework/model_parser/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
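Editor's note on the update_weights helpers deleted from framework/lite/utils.h above: they fold batch-norm statistics (and optionally a scale layer) directly into the convolution weights and bias. The following compact restatement of that fold is an illustrative sketch only, not part of the patch; the function name and flat-vector signature are hypothetical.

    #include <cmath>
    #include <vector>

    // Fold batchnorm into conv: w'[c] = alpha_c * w[c], b'[c] = alpha_c * b[c] + beta_c,
    // with alpha_c = 1 / sqrt(var[c] * s + eps), beta_c = -mean[c] * s * alpha_c, and
    // s = (scale_factor == 0) ? 1 : 1 / scale_factor, matching the deleted helper.
    void fold_batchnorm(std::vector<float>& w, std::vector<float>& b, int chw,
                        const std::vector<float>& mean, const std::vector<float>& var,
                        float s, float eps) {
        for (size_t c = 0; c < b.size(); ++c) {
            const float alpha = 1.f / std::sqrt(var[c] * s + eps);
            const float beta  = -mean[c] * s * alpha;
            for (int j = 0; j < chw; ++j) {
                w[c * chw + j] *= alpha;   // rescale the c-th output channel's weights
            }
            b[c] = b[c] * alpha + beta;    // rescale and shift the bias
        }
    }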
-set(ANAKIN_BASE_SRC "") - -# add ak_base_source files -anakin_fetch_files_with_suffix(${ANAKIN_MODEL_PARSER}/parser "cpp" ANAKIN_BASE_SRC) -anakin_fetch_files_with_suffix(${ANAKIN_MODEL_PARSER}/proto "cpp" ANAKIN_BASE_SRC) - -list(APPEND ANAKIN_SRC ${ANAKIN_BASE_SRC}) -set(ANAKIN_SRC ${ANAKIN_SRC} PARENT_SCOPE) -unset(ANAKIN_BASE_SRC) diff --git a/framework/model_parser/parser/model_io.cpp b/framework/model_parser/parser/model_io.cpp index ba62848e6..04b6b6d8e 100644 --- a/framework/model_parser/parser/model_io.cpp +++ b/framework/model_parser/parser/model_io.cpp @@ -19,8 +19,15 @@ NodeIO& NodeIO::operator>>(const NodeProto& node_pro node_p->name() = node_proto.name(); node_p->need_wait() = node_proto.need_wait(); node_p->lane() = node_proto.lane(); + switch (node_proto.bit_type()) { + case INT8: node_p->bit_type() = AK_INT8; break; + case FLOAT: node_p->bit_type() = AK_FLOAT; break; + default: node_p->bit_type() = AK_INVALID; break; + } + DLOG(INFO) << "read node: " << node_p->name() << \ + " (type: " << node_p->bit_type() << " )"; + auto it = node_proto.attr().begin(); - DLOG(INFO)<<"read :"<name(); for (; it != node_proto.attr().end(); ++it) { auto& key = it->first; auto& value = it->second; @@ -140,13 +147,18 @@ NodeIO& NodeIO::operator>>(const NodeProto& node_pro if(tensor.shared()) { // cope with shared weights(tensor) auto target_node = _node_name2ptr_map[tensor.share_from()]-> template get_attr >(key); node_p->set_attr(key, target_node); - // record share info of weights + // record share info of weights node_p->set_share_pair(key, tensor.share_from()); } else { auto& real_shape = tensor.shape(); auto& valid_shape = tensor.valid_shape(); CHECK_EQ(real_shape.dim().size(), 4) << "Weights parameter's shape len must equal to 4."; auto& data = tensor.data(); + auto& scale = tensor.scale().f(); + std::vector scale_vector; + for (const float val: scale) { + scale_vector.push_back(val); + } switch (data.type()) { case FLOAT: { /* At so far, we only support weights saved as float. */ @@ -164,8 +176,10 @@ NodeIO& NodeIO::operator>>(const NodeProto& node_pro for (int i = 0; i < data.size(); i++) { cpu_data[i] = data.f()[i]; } + block->d_tensor().set_scale(scale_vector); + block->h_tensor().set_scale(scale_vector); -#if defined( USE_CUDA) || defined(AMD_GPU) +#if defined( USE_CUDA) || defined(AMD_GPU) // map cpu data to GPU block->d_tensor().set_shape(saber_shape); block->d_tensor().copy_from(block->h_tensor()); @@ -175,7 +189,7 @@ NodeIO& NodeIO::operator>>(const NodeProto& node_pro block->d_tensor().set_shape(saber_shape); block->h_tensor().set_shape(saber_shape); } else { - saber::Shape saber_valid_shape({1, 1, 1, 1}); + saber::Shape saber_valid_shape({1, 1, 1, 1}); for (int i=0; i < 4; i++) { saber_valid_shape[i] = valid_shape.dim().value()[i]; } @@ -187,7 +201,45 @@ NodeIO& NodeIO::operator>>(const NodeProto& node_pro node_p->set_attr(key, *block); } break; + case INT8: { /* At so far, we only support weights saved as float. 
*/ + saber::Shape saber_shape({1, 1, 1, 1}); + + // get real_shape + for (int i = 0; i < 4; i++) { + saber_shape[i] = real_shape.dim().value()[i]; + } + + auto* block = graph::GraphGlobalMem::Global().template new_block(saber_shape); + // fill data to block + char* cpu_data = static_cast(block->h_tensor().mutable_data()); + for (int i = 0; i < data.size(); i++) { + cpu_data[i] = data.c().data()[i]; + } + block->d_tensor().set_scale(scale_vector); + block->h_tensor().set_scale(scale_vector); + +#if defined( USE_CUDA) || defined(AMD_GPU) + // map cpu data to GPU + block->d_tensor().set_shape(saber_shape); + block->d_tensor().copy_from(block->h_tensor()); +#endif + if (valid_shape.dim().size() == 0) { + // set valid shape (== real shape) for host and device + block->d_tensor().set_shape(saber_shape); + block->h_tensor().set_shape(saber_shape); + } else { + saber::Shape saber_valid_shape({1, 1, 1, 1}); + for (int i = 0; i < 4; i++) { + saber_valid_shape[i] = valid_shape.dim().value()[i]; + } + // set valid shape for host and device + block->d_tensor().set_shape(saber_valid_shape); + block->h_tensor().set_shape(saber_valid_shape); + } + node_p->set_attr(key, *block); + } + break; default : { LOG(FATAL) << "UnSupport data type(DateTypeProto:" << data.type() << ") in list "; } @@ -244,6 +296,12 @@ Status NodeIO::operator<<(GraphProto& graph) { node_proto->set_name(node_p->name()); node_proto->set_lane(node_p->lane()); node_proto->set_need_wait(node_p->need_wait()); + + switch (node_p->bit_type()) { + case AK_INT8: node_proto->set_bit_type(INT8); break; + case AK_FLOAT: node_proto->set_bit_type(FLOAT); break; + default: node_proto->set_bit_type(FLOAT); break; + } // set node proto's op proto OpProto* op = node_proto->mutable_op(); op->set_name(node_p->get_op_name()); @@ -328,10 +386,10 @@ Status NodeIO::operator<<(GraphProto& graph) { (*node_proto_attr)[key].mutable_cache_list()->set_size(tuple_bool.size()); } else if (value.type() == "anakin_block") { // default block have float data // cope with shared weights - if(node_p->check_shared(key)) { + if (node_p->check_shared(key)) { auto share_target = node_p->get_share_target(key); - (*node_proto_attr)[key].mutable_tensor()->set_shared(true); - (*node_proto_attr)[key].mutable_tensor()->set_share_from(share_target); + (*node_proto_attr)[key].mutable_tensor()->set_shared(true); + (*node_proto_attr)[key].mutable_tensor()->set_share_from(share_target); (*node_proto_attr)[key].set_type(TENSOR); } else { auto block_float = any_cast>(value); @@ -371,7 +429,6 @@ Status NodeIO::operator<<(GraphProto& graph) { (*node_proto_attr)[key].mutable_tensor()->mutable_shape()->mutable_dim()->set_size( real_shape.size()); - // set proto tensor data for (int i = 0; i < real_shape.count(); i++) { (*node_proto_attr)[key].mutable_tensor()->mutable_data()->add_f(cpu_data[i]); @@ -381,14 +438,14 @@ Status NodeIO::operator<<(GraphProto& graph) { (*node_proto_attr)[key].mutable_tensor()->mutable_data()->set_size(real_shape.count()); (*node_proto_attr)[key].set_type(TENSOR); } - } + } } else { auto tuple_float = any_cast>(value); (*node_proto_attr)[key].set_type(CACHE_LIST); (*node_proto_attr)[key].mutable_cache_list()->set_type(FLOAT); (*node_proto_attr)[key].mutable_cache_list()->set_size(tuple_float.size()); - LOG(ERROR) << "node: " << node_p->name() << " (" << node_p->get_op_name() << ") \ + //LOG(ERROR) << "node: " << node_p->name() << " (" << node_p->get_op_name() << ") \ key : " << key << " value_type: " << value.type(); } } @@ -405,7 +462,7 @@ template class NodeIO; 
template class NodeIO; #endif -#ifdef AMD_GPU +#ifdef AMD_GPU template class NodeIO; template class NodeIO; template class NodeIO; @@ -418,20 +475,11 @@ template class NodeIO; #endif #ifdef USE_ARM_PLACE -#ifdef ANAKIN_TYPE_FP32 template class NodeIO; -#endif - -#ifdef ANAKIN_TYPE_FP16 template class NodeIO; -#endif - -#ifdef ANAKIN_TYPE_INT8 template class NodeIO; #endif -#endif - } /* parser */ } /* anakin */ diff --git a/framework/model_parser/parser/model_io.h b/framework/model_parser/parser/model_io.h index a1e33b37c..57507317c 100644 --- a/framework/model_parser/parser/model_io.h +++ b/framework/model_parser/parser/model_io.h @@ -22,10 +22,17 @@ #include "framework/graph/node.h" #include "framework/graph/algorithm.h" #include "framework/model_parser/parser/parser.h" -#include "framework/model_parser/proto/graph.pb.h" -#include "framework/model_parser/proto/node.pb.h" -#include "framework/model_parser/proto/operator.pb.h" -#include "framework/model_parser/proto/tensor.pb.h" +#ifdef USE_NANOPB +#include "graph.pb.hpp" +#include "node.pb.hpp" +#include "operator.pb.hpp" +#include "tensor.pb.hpp" +#else +#include "graph.pb.h" +#include "node.pb.h" +#include "operator.pb.h" +#include "tensor.pb.h" +#endif namespace anakin { diff --git a/framework/model_parser/parser/nanopb/graph.pb.cpp b/framework/model_parser/parser/nanopb/graph.pb.cpp new file mode 100644 index 000000000..6712839b5 --- /dev/null +++ b/framework/model_parser/parser/nanopb/graph.pb.cpp @@ -0,0 +1,253 @@ +#include + +#include +#include + +#include "graph.pb.hpp" + +#include + +namespace nanopb_cpp { + +void Version::fill(Nanopb *pb) { + + // major: optional int32 + + // minor: optional int32 + + // patch: optional int32 + + // version: optional int64 + +} + +void Version::retrieve(const Nanopb *pb) { + + // major: optional int32 + _major = static_cast(pb->major); + + // minor: optional int32 + _minor = static_cast(pb->minor); + + // patch: optional int32 + _patch = static_cast(pb->patch); + + // version: optional int64 + _version = static_cast(pb->version); + +} + +IMPLEMENT_PARSING_WRAPPERS(Version); + +void Info::fill(Nanopb *pb) { + + // temp_mem_used: optional int32 + + // original_temp_mem_used: optional int32 + + // system_mem_used: optional int32 + + // model_mem_used: optional int32 + + // is_optimized: optional bool + +} + +void Info::retrieve(const Nanopb *pb) { + + // temp_mem_used: optional int32 + _temp_mem_used = static_cast(pb->temp_mem_used); + + // original_temp_mem_used: optional int32 + _original_temp_mem_used = static_cast(pb->original_temp_mem_used); + + // system_mem_used: optional int32 + _system_mem_used = static_cast(pb->system_mem_used); + + // model_mem_used: optional int32 + _model_mem_used = static_cast(pb->model_mem_used); + + // is_optimized: optional bool + _is_optimized = static_cast(pb->is_optimized); + +} + +IMPLEMENT_PARSING_WRAPPERS(Info); + +void TargetProto::fill(Nanopb *pb) { + + // node: optional string + pb->node.funcs.decode = decode_string; + pb->node.arg = &_node; + + // scale: repeated float + pb->scale.funcs.decode = decode_repeated>; + pb->scale.arg = &_scale; + + // layout: optional LayoutProto + +} + +void TargetProto::retrieve(const Nanopb *pb) { + + // node: optional string + + // scale: repeated float + + // layout: optional LayoutProto + _layout = static_cast(pb->layout); + +} + +IMPLEMENT_PARSING_WRAPPERS(TargetProto); + +void List::fill(Nanopb *pb) { + + // val: repeated string + pb->val.funcs.decode = decode_repeated; + pb->val.arg = &_val; + + // target: repeated 
TargetProto + pb->target.funcs.decode = decode_repeated>; + pb->target.arg = &_target; + +} + +void List::retrieve(const Nanopb *pb) { + + // val: repeated string + + // target: repeated TargetProto + +} + +IMPLEMENT_PARSING_WRAPPERS(List); + +void GraphProto::EdgesInEntry::fill(Nanopb *pb) { + + // key: optional string + pb->key.funcs.decode = decode_string; + pb->key.arg = &_key; + + // value: optional List + _value.fill(&pb->value); + +} + +void GraphProto::EdgesInEntry::retrieve(const Nanopb *pb) { + + // key: optional string + + // value: optional List + _value.retrieve(&pb->value); + +} + + +void GraphProto::EdgesOutEntry::fill(Nanopb *pb) { + + // key: optional string + pb->key.funcs.decode = decode_string; + pb->key.arg = &_key; + + // value: optional List + _value.fill(&pb->value); + +} + +void GraphProto::EdgesOutEntry::retrieve(const Nanopb *pb) { + + // key: optional string + + // value: optional List + _value.retrieve(&pb->value); + +} + + +void GraphProto::EdgesInfoEntry::fill(Nanopb *pb) { + + // key: optional string + pb->key.funcs.decode = decode_string; + pb->key.arg = &_key; + + // value: optional TensorProto + _value.fill(&pb->value); + +} + +void GraphProto::EdgesInfoEntry::retrieve(const Nanopb *pb) { + + // key: optional string + + // value: optional TensorProto + _value.retrieve(&pb->value); + +} + + +void GraphProto::fill(Nanopb *pb) { + + // name: optional string + pb->name.funcs.decode = decode_string; + pb->name.arg = &_name; + + // nodes: repeated NodeProto + pb->nodes.funcs.decode = decode_repeated>; + pb->nodes.arg = &_nodes; + + // edges_in: repeated GraphProto.EdgesInEntry + pb->edges_in.funcs.decode = decode_map; + pb->edges_in.arg = &_edges_in; + + // edges_out: repeated GraphProto.EdgesOutEntry + pb->edges_out.funcs.decode = decode_map; + pb->edges_out.arg = &_edges_out; + + // edges_info: repeated GraphProto.EdgesInfoEntry + pb->edges_info.funcs.decode = decode_map; + pb->edges_info.arg = &_edges_info; + + // ins: repeated string + pb->ins.funcs.decode = decode_repeated; + pb->ins.arg = &_ins; + + // outs: repeated string + pb->outs.funcs.decode = decode_repeated; + pb->outs.arg = &_outs; + + // version: optional Version + _version.fill(&pb->version); + + // summary: optional Info + _summary.fill(&pb->summary); + +} + +void GraphProto::retrieve(const Nanopb *pb) { + + // name: optional string + + // nodes: repeated NodeProto + + // edges_in: repeated GraphProto.EdgesInEntry + + // edges_out: repeated GraphProto.EdgesOutEntry + + // edges_info: repeated GraphProto.EdgesInfoEntry + + // ins: repeated string + + // outs: repeated string + + // version: optional Version + _version.retrieve(&pb->version); + + // summary: optional Info + _summary.retrieve(&pb->summary); + +} + +IMPLEMENT_PARSING_WRAPPERS(GraphProto); + +} // namespace nanopb_cpp diff --git a/framework/model_parser/parser/nanopb/graph.pb.hpp b/framework/model_parser/parser/nanopb/graph.pb.hpp new file mode 100644 index 000000000..9896dcfc7 --- /dev/null +++ b/framework/model_parser/parser/nanopb/graph.pb.hpp @@ -0,0 +1,158 @@ +#ifndef NANOPB_CPP_GRAPH_PROTO_HPP +#define NANOPB_CPP_GRAPH_PROTO_HPP + +#include + +#include "node.pb.hpp" +#include "tensor.pb.hpp" + +#define Version Nanopb_Version +#define Info Nanopb_Info +#define TargetProto Nanopb_TargetProto +#define List Nanopb_List +#define GraphProto Nanopb_GraphProto +#define GraphProto_EdgesInEntry Nanopb_GraphProto_EdgesInEntry +#define GraphProto_EdgesOutEntry Nanopb_GraphProto_EdgesOutEntry +#define GraphProto_EdgesInfoEntry 
Nanopb_GraphProto_EdgesInfoEntry +#define valueType Nanopb_valueType +#define NodeProto Nanopb_NodeProto +#define NodeProto_AttrEntry Nanopb_NodeProto_AttrEntry +#define TensorShape Nanopb_TensorShape +#define CacheDate Nanopb_CacheDate +#define TensorProto Nanopb_TensorProto +#define TensorShape_Dim Nanopb_TensorShape_Dim +#include "graph.pb.h" +#undef Version +#undef Info +#undef TargetProto +#undef List +#undef GraphProto +#undef GraphProto_EdgesInEntry +#undef GraphProto_EdgesOutEntry +#undef GraphProto_EdgesInfoEntry +#undef valueType +#undef NodeProto +#undef NodeProto_AttrEntry +#undef TensorShape +#undef CacheDate +#undef TensorProto +#undef TensorShape_Dim + +namespace nanopb_cpp { + +enum LayoutProto { + Invalid = 0, + LP_W = 1, + LP_HW = 2, + LP_WH = 3, + LP_NC = 4, + LP_NH = 5, + LP_NW = 6, + LP_NHW = 7, + LP_NCHW = 8, + LP_NHWC = 9, + LP_NCHW_C4 = 10, + LP_NCHW_C8 = 11, + LP_NCHW_C16 = 12, + LP_OIHW16I16O = 13, + LP_GOIHW16I16O = 14, + LP_NCHW_C8R = 15, + LP_NCHW_C16R = 16, +}; + +class Version { + PROTO_FIELD(int32_t, major); + PROTO_FIELD(int32_t, minor); + PROTO_FIELD(int32_t, patch); + PROTO_FIELD(int64_t, version); + + PARSING_MEMBERS(Version); +}; // end class Version; + +class Info { + PROTO_FIELD(int32_t, temp_mem_used); + PROTO_FIELD(int32_t, original_temp_mem_used); + PROTO_FIELD(int32_t, system_mem_used); + PROTO_FIELD(int32_t, model_mem_used); + PROTO_FIELD(bool, is_optimized); + + PARSING_MEMBERS(Info); +}; // end class Info; + +class TargetProto { + PROTO_FIELD(std::string, node); + REPEATED_PROTO_FIELD(float, scale); + PROTO_FIELD(nanopb_cpp::LayoutProto, layout); + + PARSING_MEMBERS(TargetProto); +}; // end class TargetProto; + +class List { + REPEATED_PROTO_FIELD(std::string, val); + REPEATED_PROTO_FIELD(nanopb_cpp::TargetProto, target); + + PARSING_MEMBERS(List); +}; // end class List; + +class GraphProto { + class EdgesInEntry { + PROTO_MAP_ENTRY_KEY_FIELD(std::string); + PROTO_MAP_ENTRY_VALUE_FIELD(nanopb_cpp::List); + + PROTO_MAP_ENTRY_MEMBERS(GraphProto_EdgesInEntry); + }; // end class EdgesInEntry; + + class EdgesOutEntry { + PROTO_MAP_ENTRY_KEY_FIELD(std::string); + PROTO_MAP_ENTRY_VALUE_FIELD(nanopb_cpp::List); + + PROTO_MAP_ENTRY_MEMBERS(GraphProto_EdgesOutEntry); + }; // end class EdgesOutEntry; + + class EdgesInfoEntry { + PROTO_MAP_ENTRY_KEY_FIELD(std::string); + PROTO_MAP_ENTRY_VALUE_FIELD(nanopb_cpp::TensorProto); + + PROTO_MAP_ENTRY_MEMBERS(GraphProto_EdgesInfoEntry); + }; // end class EdgesInfoEntry; + + PROTO_FIELD(std::string, name); + REPEATED_PROTO_FIELD(nanopb_cpp::NodeProto, nodes); + PROTO_FIELD((std::map), edges_in); + PROTO_FIELD((std::map), edges_out); + PROTO_FIELD((std::map), edges_info); + REPEATED_PROTO_FIELD(std::string, ins); + REPEATED_PROTO_FIELD(std::string, outs); + PROTO_FIELD(nanopb_cpp::Version, version); + PROTO_FIELD(nanopb_cpp::Info, summary); + + PARSING_MEMBERS(GraphProto); +}; // end class GraphProto; + +} // namespace nanopb_cpp + +using nanopb_cpp::Version; +using nanopb_cpp::Info; +using nanopb_cpp::TargetProto; +using nanopb_cpp::List; +using nanopb_cpp::GraphProto; + +using nanopb_cpp::Invalid; +using nanopb_cpp::LP_W; +using nanopb_cpp::LP_HW; +using nanopb_cpp::LP_WH; +using nanopb_cpp::LP_NC; +using nanopb_cpp::LP_NH; +using nanopb_cpp::LP_NW; +using nanopb_cpp::LP_NHW; +using nanopb_cpp::LP_NCHW; +using nanopb_cpp::LP_NHWC; +using nanopb_cpp::LP_NCHW_C4; +using nanopb_cpp::LP_NCHW_C8; +using nanopb_cpp::LP_NCHW_C16; +using nanopb_cpp::LP_OIHW16I16O; +using nanopb_cpp::LP_GOIHW16I16O; +using 
nanopb_cpp::LP_NCHW_C8R; +using nanopb_cpp::LP_NCHW_C16R; + +#endif diff --git a/framework/model_parser/parser/nanopb/node.pb.cpp b/framework/model_parser/parser/nanopb/node.pb.cpp new file mode 100644 index 000000000..1ebcf5a64 --- /dev/null +++ b/framework/model_parser/parser/nanopb/node.pb.cpp @@ -0,0 +1,136 @@ +#include + +#include +#include + +#include "node.pb.hpp" + +#include + +namespace nanopb_cpp { + +void valueType::fill(Nanopb *pb) { + + // s: optional bytes + pb->s.funcs.decode = decode_string; + pb->s.arg = &_s; + + // i: optional int32 + + // f: optional float + + // b: optional bool + + // cache_list: optional CacheDate + _cache_list.fill(&pb->cache_list); + + // tensor: optional TensorProto + _tensor.fill(&pb->tensor); + + // type: optional DateTypeProto + +} + +void valueType::retrieve(const Nanopb *pb) { + + // s: optional bytes + + // i: optional int32 + _i = static_cast(pb->i); + + // f: optional float + _f = static_cast(pb->f); + + // b: optional bool + _b = static_cast(pb->b); + + // cache_list: optional CacheDate + _cache_list.retrieve(&pb->cache_list); + + // tensor: optional TensorProto + _tensor.retrieve(&pb->tensor); + + // type: optional DateTypeProto + _type = static_cast(pb->type); + +} + +IMPLEMENT_PARSING_WRAPPERS(valueType); + +void NodeProto::AttrEntry::fill(Nanopb *pb) { + + // key: optional string + pb->key.funcs.decode = decode_string; + pb->key.arg = &_key; + + // value: optional valueType + _value.fill(&pb->value); + +} + +void NodeProto::AttrEntry::retrieve(const Nanopb *pb) { + + // key: optional string + + // value: optional valueType + _value.retrieve(&pb->value); + +} + + +void NodeProto::fill(Nanopb *pb) { + + // name: optional string + pb->name.funcs.decode = decode_string; + pb->name.arg = &_name; + + // ins: repeated string + pb->ins.funcs.decode = decode_repeated; + pb->ins.arg = &_ins; + + // outs: repeated string + pb->outs.funcs.decode = decode_repeated; + pb->outs.arg = &_outs; + + // attr: repeated NodeProto.AttrEntry + pb->attr.funcs.decode = decode_map; + pb->attr.arg = &_attr; + + // lane: optional int32 + + // need_wait: optional bool + + // Op: optional OpProto + _op.fill(&pb->Op); + + // bit_type: optional DateTypeProto + +} + +void NodeProto::retrieve(const Nanopb *pb) { + + // name: optional string + + // ins: repeated string + + // outs: repeated string + + // attr: repeated NodeProto.AttrEntry + + // lane: optional int32 + _lane = static_cast(pb->lane); + + // need_wait: optional bool + _need_wait = static_cast(pb->need_wait); + + // Op: optional OpProto + _op.retrieve(&pb->Op); + + // bit_type: optional DateTypeProto + _bit_type = static_cast(pb->bit_type); + +} + +IMPLEMENT_PARSING_WRAPPERS(NodeProto); + +} // namespace nanopb_cpp diff --git a/framework/model_parser/parser/nanopb/node.pb.hpp b/framework/model_parser/parser/nanopb/node.pb.hpp new file mode 100644 index 000000000..fc0eaf6aa --- /dev/null +++ b/framework/model_parser/parser/nanopb/node.pb.hpp @@ -0,0 +1,67 @@ +#ifndef NANOPB_CPP_NODE_PROTO_HPP +#define NANOPB_CPP_NODE_PROTO_HPP + +#include + +#include "operator.pb.hpp" +#include "tensor.pb.hpp" + +#define valueType Nanopb_valueType +#define NodeProto Nanopb_NodeProto +#define NodeProto_AttrEntry Nanopb_NodeProto_AttrEntry +#define OpProto Nanopb_OpProto +#define TensorShape Nanopb_TensorShape +#define CacheDate Nanopb_CacheDate +#define TensorProto Nanopb_TensorProto +#define TensorShape_Dim Nanopb_TensorShape_Dim +#include "node.pb.h" +#undef valueType +#undef NodeProto +#undef NodeProto_AttrEntry +#undef 
OpProto +#undef TensorShape +#undef CacheDate +#undef TensorProto +#undef TensorShape_Dim + +namespace nanopb_cpp { + +class valueType { + PROTO_FIELD(std::string, s); + PROTO_FIELD(int32_t, i); + PROTO_FIELD(float, f); + PROTO_FIELD(bool, b); + PROTO_FIELD(nanopb_cpp::CacheDate, cache_list); + PROTO_FIELD(nanopb_cpp::TensorProto, tensor); + PROTO_FIELD(nanopb_cpp::DateTypeProto, type); + + PARSING_MEMBERS(valueType); +}; // end class valueType; + +class NodeProto { + class AttrEntry { + PROTO_MAP_ENTRY_KEY_FIELD(std::string); + PROTO_MAP_ENTRY_VALUE_FIELD(nanopb_cpp::valueType); + + PROTO_MAP_ENTRY_MEMBERS(NodeProto_AttrEntry); + }; // end class AttrEntry; + + PROTO_FIELD(std::string, name); + REPEATED_PROTO_FIELD(std::string, ins); + REPEATED_PROTO_FIELD(std::string, outs); + PROTO_FIELD((std::map), attr); + PROTO_FIELD(int32_t, lane); + PROTO_FIELD(bool, need_wait); + PROTO_FIELD(nanopb_cpp::OpProto, op); + PROTO_FIELD(nanopb_cpp::DateTypeProto, bit_type); + + PARSING_MEMBERS(NodeProto); +}; // end class NodeProto; + +} // namespace nanopb_cpp + +using nanopb_cpp::valueType; +using nanopb_cpp::NodeProto; + + +#endif diff --git a/framework/model_parser/parser/nanopb/operator.pb.cpp b/framework/model_parser/parser/nanopb/operator.pb.cpp new file mode 100644 index 000000000..795fb346e --- /dev/null +++ b/framework/model_parser/parser/nanopb/operator.pb.cpp @@ -0,0 +1,49 @@ +#include + +#include +#include + +#include "operator.pb.hpp" + +#include + +namespace nanopb_cpp { + +void OpProto::fill(Nanopb *pb) { + + // name: optional string + pb->name.funcs.decode = decode_string; + pb->name.arg = &_name; + + // is_commutative: optional bool + + // in_num: optional int32 + + // out_num: optional int32 + + // description: optional string + pb->description.funcs.decode = decode_string; + pb->description.arg = &_description; + +} + +void OpProto::retrieve(const Nanopb *pb) { + + // name: optional string + + // is_commutative: optional bool + _is_commutative = static_cast(pb->is_commutative); + + // in_num: optional int32 + _in_num = static_cast(pb->in_num); + + // out_num: optional int32 + _out_num = static_cast(pb->out_num); + + // description: optional string + +} + +IMPLEMENT_PARSING_WRAPPERS(OpProto); + +} // namespace nanopb_cpp diff --git a/framework/model_parser/parser/nanopb/operator.pb.hpp b/framework/model_parser/parser/nanopb/operator.pb.hpp new file mode 100644 index 000000000..156717c5a --- /dev/null +++ b/framework/model_parser/parser/nanopb/operator.pb.hpp @@ -0,0 +1,28 @@ +#ifndef NANOPB_CPP_OPERATOR_PROTO_HPP +#define NANOPB_CPP_OPERATOR_PROTO_HPP + +#include + + +#define OpProto Nanopb_OpProto +#include "operator.pb.h" +#undef OpProto + +namespace nanopb_cpp { + +class OpProto { + PROTO_FIELD(std::string, name); + PROTO_FIELD(bool, is_commutative); + PROTO_FIELD(int32_t, in_num); + PROTO_FIELD(int32_t, out_num); + PROTO_FIELD(std::string, description); + + PARSING_MEMBERS(OpProto); +}; // end class OpProto; + +} // namespace nanopb_cpp + +using nanopb_cpp::OpProto; + + +#endif diff --git a/framework/model_parser/parser/nanopb/pb_cpp_common.h b/framework/model_parser/parser/nanopb/pb_cpp_common.h new file mode 100644 index 000000000..d5e0a64f2 --- /dev/null +++ b/framework/model_parser/parser/nanopb/pb_cpp_common.h @@ -0,0 +1,84 @@ +#ifndef _PB_CPP_COMMON_ +#define _PB_CPP_COMMON_ + +#include +#include +#include +#include + +template struct bool_adaptor {}; +template<> struct bool_adaptor<1> { using type = uint8_t; }; +template<> struct bool_adaptor<2> { using type = uint16_t; }; 
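Editor's note on the bool_adaptor trait introduced here (its remaining specializations follow just below): std::vector<bool> is a bit-packed specialization whose elements are not individually addressable, so a nanopb decode callback cannot write a decoded value through a pointer into it. A minimal illustration of the workaround, assuming a typical ABI where sizeof(bool) == 1; this snippet is illustrative and not part of the patch.

    #include <cstdint>
    #include <vector>

    // std::vector<bool> exposes no addressable elements for a C callback to write into,
    // so repeated bool fields are stored in a same-sized unsigned integer instead.
    std::vector<uint8_t> bools;   // what vec_functor<bool>::type resolves to when sizeof(bool) == 1
    void push_decoded(bool v) { bools.push_back(v ? 1 : 0); }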
+template<> struct bool_adaptor<4> { using type = uint32_t; }; +template<> struct bool_adaptor<8> { using type = uint64_t; }; + +template +struct vec_functor { + using type = std::vector; +}; + +template<> +struct vec_functor { + using type = std::vector::type>; +}; + +template struct argument_type {}; +template struct argument_type { + using type = T; +}; + +#define PROTO_TY(TYPE) typename argument_type::type + +#define PROTO_FIELD(TYPE, NAME) \ +private: \ + PROTO_TY(TYPE) _##NAME; \ +public: \ + PROTO_TY(TYPE) *mutable_##NAME() { return &_##NAME; } \ + void set_##NAME(const PROTO_TY(TYPE) &x) { _##NAME = x; } \ + const PROTO_TY(TYPE) &NAME() const { return _##NAME; } + +#define REPEATED_PROTO_FIELD(TYPE, NAME) \ + PROTO_FIELD(vec_functor::type, NAME) \ + const TYPE &NAME(int idx) const { \ + auto *ptr = &_##NAME.at(idx); \ + return *reinterpret_cast(ptr); \ + } \ + TYPE *add_##NAME() { \ + _##NAME.push_back(TYPE()); \ + return reinterpret_cast(&_##NAME.back()); \ + } \ + TYPE *add_##NAME(const TYPE &x) { \ + _##NAME.push_back(x); \ + return reinterpret_cast(&_##NAME.back()); \ + } \ + size_t NAME##_size() const { return _##NAME.size(); } + +#define PARSING_MEMBERS(NANOPB_NAME) \ +public: \ + using Nanopb = ::Nanopb_##NANOPB_NAME; \ + static constexpr const pb_field_t *PBFields = NANOPB_NAME##_fields; \ + bool parse_from_buffer(const char *bytes, size_t len); \ + bool parse_from_file(FILE *f); \ + void fill(Nanopb *p); \ + void retrieve(const Nanopb *p); \ + bool parse(pb_istream_t *stream); + +#define PROTO_MAP_ENTRY_MEMBERS(NANOPB_NAME) \ +public: \ + using Nanopb = ::Nanopb_##NANOPB_NAME; \ + static constexpr const pb_field_t *PBFields = NANOPB_NAME##_fields; \ + void fill(Nanopb *p); \ + void retrieve(const Nanopb *p); + +#define PROTO_MAP_ENTRY_KEY_FIELD(TYPE) \ +public: \ + using KeyType = TYPE; \ + PROTO_FIELD(TYPE, key) + +#define PROTO_MAP_ENTRY_VALUE_FIELD(TYPE) \ +public: \ + using ValueType = TYPE; \ + PROTO_FIELD(TYPE, value) + +#endif // _NANOPB_CPP_COMMON_ + diff --git a/framework/model_parser/parser/nanopb/pb_cpp_decode.h b/framework/model_parser/parser/nanopb/pb_cpp_decode.h new file mode 100644 index 000000000..c9df51209 --- /dev/null +++ b/framework/model_parser/parser/nanopb/pb_cpp_decode.h @@ -0,0 +1,137 @@ +#ifndef NANOPB_DECODE_CPP_H +#define NANOPB_DECODE_CPP_H + +#include + +#include +#include +#include +#include + +#include + +#include "anakin_config.h" + +template +static bool decode_varint(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto dest = static_cast(*arg); +#ifndef PB_WITHOUT_64BIT + uint64_t delegate; + if (!pb_decode_varint(stream, &delegate)) return false; +#else + uint32_t delegate; + if (!pb_decode_varint32(stream, &delegate)) return false; +#endif + *dest = static_cast(delegate); + return true; +} + +template +static bool decode_svarint(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto dest = static_cast(*arg); +#ifndef PB_WITHOUT_64BIT + int64_t delegate; +#else + int32_t delegate; +#endif + if (!pb_decode_svarint(stream, &delegate)) return false; + *dest = static_cast(delegate); + return true; +} + +template +bool decode_fixed32(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto dest = static_cast(*arg); + auto ret = pb_decode_fixed32(stream, dest); + return ret; +} + +#ifndef PB_WITHOUT_64BIT +template +bool decode_fixed64(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto dest = static_cast(*arg); + return pb_decode_fixed64(stream, dest); +} +#endif + +template +bool 
decode_message(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto *dest = static_cast(*arg); + return dest->parse(stream); +} + +using decoder_t = bool (*)(pb_istream_t *, const pb_field_t *, void **); + +template +bool decode_repeated(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto *repeated = static_cast::type *>(*arg); + repeated->push_back(T()); + void *sub_arg = &repeated->back(); + return D(stream, field, &sub_arg); +} + +template +bool decode_map(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto *mapping = static_cast *>(*arg); + T adapter_entry; + typename T::Nanopb pb_entry; + adapter_entry.fill(&pb_entry); + if (!pb_decode(stream, T::PBFields, &pb_entry)) + return false; + adapter_entry.retrieve(&pb_entry); + mapping->emplace(std::move(*adapter_entry.mutable_key()), + std::move(*adapter_entry.mutable_value())); + return true; +} + +template +bool decode_string(pb_istream_t *stream, const pb_field_t *field, void **arg) { + auto str = static_cast(*arg); + const size_t len = stream->bytes_left; + str->resize(len); + std::string::iterator it(str->begin()); + if (!pb_read(stream, reinterpret_cast(&*str->begin()), len)) + return false; + return true; +} + +static size_t file_size(FILE *f) { + size_t file_len; + + fseek(f, 0, SEEK_END); + file_len = ftell(f); + fseek(f, 0, SEEK_SET); + + return file_len; +} + +#define IMPLEMENT_PARSING_WRAPPERS(PROTO) \ + bool PROTO::parse_from_file(FILE *f) { \ + size_t file_len = file_size(f); \ + auto callback = [](pb_istream_t *stream, pb_byte_t *buf, \ + size_t count) { \ + FILE *f = static_cast(stream->state); \ + return count == fread(buf, sizeof(pb_byte_t), count, f); \ + }; \ + pb_istream_t stream { \ + .callback = callback, \ + .state = f, \ + .bytes_left = file_len, \ + }; \ + return parse(&stream); \ + } \ + bool PROTO::parse_from_buffer(const char *buffer, size_t len) { \ + auto stream = pb_istream_from_buffer( \ + reinterpret_cast(buffer), len); \ + return parse(&stream); \ + } \ + bool PROTO::parse(pb_istream_t *stream) { \ + Nanopb pb_proto; \ + fill(&pb_proto); \ + if (!pb_decode(stream, PBFields, &pb_proto)) \ + return false; \ + retrieve(&pb_proto); \ + return true; \ + } + +#endif diff --git a/framework/model_parser/parser/nanopb/tensor.pb.cpp b/framework/model_parser/parser/nanopb/tensor.pb.cpp new file mode 100644 index 000000000..2a66cff85 --- /dev/null +++ b/framework/model_parser/parser/nanopb/tensor.pb.cpp @@ -0,0 +1,156 @@ +#include + +#include +#include + +#include "tensor.pb.hpp" + +#include + +namespace nanopb_cpp { + +void TensorShape::Dim::fill(Nanopb *pb) { + + // value: repeated int32 + pb->value.funcs.decode = decode_repeated>; + pb->value.arg = &_value; + + // size: optional int64 + +} + +void TensorShape::Dim::retrieve(const Nanopb *pb) { + + // value: repeated int32 + + // size: optional int64 + _size = static_cast(pb->size); + +} + +IMPLEMENT_PARSING_WRAPPERS(TensorShape::Dim); + +void TensorShape::fill(Nanopb *pb) { + + // dim: optional TensorShape.Dim + _dim.fill(&pb->dim); + +} + +void TensorShape::retrieve(const Nanopb *pb) { + + // dim: optional TensorShape.Dim + _dim.retrieve(&pb->dim); + +} + +IMPLEMENT_PARSING_WRAPPERS(TensorShape); + +void CacheDate::fill(Nanopb *pb) { + + // s: repeated bytes + pb->s.funcs.decode = decode_repeated; + pb->s.arg = &_s; + + // i: repeated int32 + pb->i.funcs.decode = decode_repeated>; + pb->i.arg = &_i; + + // f: repeated float + pb->f.funcs.decode = decode_repeated>; + pb->f.arg = &_f; + + // b: repeated bool + 
pb->b.funcs.decode = decode_repeated>; + pb->b.arg = &_b; + + // l: repeated CacheDate + pb->l.funcs.decode = decode_repeated>; + pb->l.arg = &_l; + + // c: optional bytes + pb->c.funcs.decode = decode_string; + pb->c.arg = &_c; + + // type: optional DateTypeProto + + // size: optional int64 + +} + +void CacheDate::retrieve(const Nanopb *pb) { + + // s: repeated bytes + + // i: repeated int32 + + // f: repeated float + + // b: repeated bool + + // l: repeated CacheDate + + // c: optional bytes + + // type: optional DateTypeProto + _type = static_cast(pb->type); + + // size: optional int64 + _size = static_cast(pb->size); + +} + +IMPLEMENT_PARSING_WRAPPERS(CacheDate); + +void TensorProto::fill(Nanopb *pb) { + + // name: optional bytes + pb->name.funcs.decode = decode_string; + pb->name.arg = &_name; + + // shared: optional bool + + // share_from: optional bytes + pb->share_from.funcs.decode = decode_string; + pb->share_from.arg = &_share_from; + + // shape: optional TensorShape + _shape.fill(&pb->shape); + + // valid_shape: optional TensorShape + _valid_shape.fill(&pb->valid_shape); + + // data: optional CacheDate + _data.fill(&pb->data); + + // scale: optional CacheDate + _scale.fill(&pb->scale); + +} + +void TensorProto::retrieve(const Nanopb *pb) { + + // name: optional bytes + + // shared: optional bool + _shared = static_cast(pb->shared); + + // share_from: optional bytes + + // shape: optional TensorShape + _shape.retrieve(&pb->shape); + + // valid_shape: optional TensorShape + _valid_shape.retrieve(&pb->valid_shape); + + // data: optional CacheDate + _data.retrieve(&pb->data); + + // scale: optional CacheDate + _scale.retrieve(&pb->scale); + +} + +IMPLEMENT_PARSING_WRAPPERS(TensorProto); + +} // namespace nanopb_cpp diff --git a/framework/model_parser/parser/nanopb/tensor.pb.hpp b/framework/model_parser/parser/nanopb/tensor.pb.hpp new file mode 100644 index 000000000..1883c237f --- /dev/null +++ b/framework/model_parser/parser/nanopb/tensor.pb.hpp @@ -0,0 +1,85 @@ +#ifndef NANOPB_CPP_TENSOR_PROTO_HPP +#define NANOPB_CPP_TENSOR_PROTO_HPP + +#include + + +#define TensorShape Nanopb_TensorShape +#define CacheDate Nanopb_CacheDate +#define TensorProto Nanopb_TensorProto +#define TensorShape_Dim Nanopb_TensorShape_Dim +#include "tensor.pb.h" +#undef TensorShape +#undef CacheDate +#undef TensorProto +#undef TensorShape_Dim + +namespace nanopb_cpp { + +enum DateTypeProto { + STR = 0, + INT8 = 2, + INT32 = 4, + FLOAT16 = 8, + FLOAT = 13, + DOUBLE = 14, + BOOLEN = 20, + CACHE_LIST = 30, + TENSOR = 31, +}; + +class TensorShape { + class Dim { + REPEATED_PROTO_FIELD(int32_t, value); + PROTO_FIELD(int64_t, size); + + PARSING_MEMBERS(TensorShape_Dim); + }; // end class Dim; + + PROTO_FIELD(nanopb_cpp::TensorShape::Dim, dim); + + PARSING_MEMBERS(TensorShape); +}; // end class TensorShape; + +class CacheDate { + REPEATED_PROTO_FIELD(std::string, s); + REPEATED_PROTO_FIELD(int32_t, i); + REPEATED_PROTO_FIELD(float, f); + REPEATED_PROTO_FIELD(bool, b); + REPEATED_PROTO_FIELD(nanopb_cpp::CacheDate, l); + PROTO_FIELD(std::string, c); + PROTO_FIELD(nanopb_cpp::DateTypeProto, type); + PROTO_FIELD(int64_t, size); + + PARSING_MEMBERS(CacheDate); +}; // end class CacheDate; + +class TensorProto { + PROTO_FIELD(std::string, name); + PROTO_FIELD(bool, shared); + PROTO_FIELD(std::string, share_from); + PROTO_FIELD(nanopb_cpp::TensorShape, shape); + PROTO_FIELD(nanopb_cpp::TensorShape, valid_shape); + PROTO_FIELD(nanopb_cpp::CacheDate, data); + PROTO_FIELD(nanopb_cpp::CacheDate, scale); + + 
PARSING_MEMBERS(TensorProto); +}; // end class TensorProto; + +} // namespace nanopb_cpp + +using nanopb_cpp::TensorShape; +using nanopb_cpp::CacheDate; +using nanopb_cpp::TensorProto; + +using nanopb_cpp::STR; +using nanopb_cpp::INT8; +using nanopb_cpp::INT32; +using nanopb_cpp::FLOAT16; +using nanopb_cpp::FLOAT; +using nanopb_cpp::DOUBLE; +using nanopb_cpp::BOOLEN; +using nanopb_cpp::CACHE_LIST; +using nanopb_cpp::TENSOR; + +#endif diff --git a/framework/model_parser/parser/parser.cpp b/framework/model_parser/parser/parser.cpp index 73d7f6102..7f94fcf66 100644 --- a/framework/model_parser/parser/parser.cpp +++ b/framework/model_parser/parser/parser.cpp @@ -1,95 +1,101 @@ #include "framework/model_parser/parser/parser.h" #include "framework/model_parser/parser/model_io.h" -#include "framework/model_parser/proto/graph.pb.h" -#include "framework/model_parser/proto/node.pb.h" -#include "framework/model_parser/proto/operator.pb.h" -#include "framework/model_parser/proto/tensor.pb.h" -#include -#include -#include -#include +#ifdef USE_NANOPB +#include "graph.pb.hpp" +#include "node.pb.hpp" +#include "operator.pb.hpp" +#include "tensor.pb.hpp" +#else #include #include #include +#include +#include +#include +#include + +#include "graph.pb.h" +#include "node.pb.h" +#include "operator.pb.h" +#include "tensor.pb.h" +#endif namespace anakin { namespace parser { +const char * WaterMark = "Anakin@right"; + template Status load(graph::Graph* graph, std::string& model_path) { return load(graph, model_path.c_str()); } Status parse_graph_proto(GraphProto& graph_proto, const char* buffer, size_t len) { - google::protobuf::io::ArrayInputStream* raw_input = new google::protobuf::io::ArrayInputStream(buffer, len); - google::protobuf::io::CodedInputStream* coded_input = new google::protobuf::io::CodedInputStream(raw_input); - coded_input->SetTotalBytesLimit(INT_MAX, 536870912); - bool success = graph_proto.ParseFromCodedStream(coded_input) && coded_input->ConsumedEntireMessage(); +#ifdef USE_NANOPB + bool success = graph_proto.parse_from_buffer(buffer, len); +#else + google::protobuf::io::ArrayInputStream raw_input(buffer, len); + google::protobuf::io::CodedInputStream coded_input(&raw_input); + coded_input.SetTotalBytesLimit(INT_MAX, 536870912); + bool success = graph_proto.ParseFromCodedStream(&coded_input) && coded_input.ConsumedEntireMessage(); +#endif if (!success) { - LOG(FATAL) << " Parsing GraphProto " << " ERROR"; - } - - delete coded_input; - delete raw_input; - return Status::OK(); -} - -Status parse_graph_proto(GraphProto& graph_proto, std::istream* instream){ - if (!graph_proto.ParseFromIstream(instream)) { - DLOG(ERROR) << "Fail to parse GraphProto."; - return Status::ANAKINFAIL("Fail to parse GraphProto."); + LOG(ERROR) << " Parsing GraphProto " << " ERROR"; + return Status::ANAKINFAIL("Parsing GraphProto ERROR"); } return Status::OK(); } - Status parse_graph_proto(GraphProto& graph_proto, const char* model_path) { -#if 0 - std::fstream input(model_path, std::ios::in | std::ios::binary); - - if (!input) { - DLOG(ERROR) << model_path << " : File not found. 
"; - return Status::ANAKINFAIL("File not found"); - } - - GraphProto graph_proto; - - // parsing GraphProto from model - if (!graph_proto.ParseFromIstream(&input)) { - DLOG(ERROR) << "Fail to parse GraphProto."; - return Status::ANAKINFAIL("Fail to parse GraphProto."); - } - +#ifdef USE_NANOPB + FILE *f = fopen(model_path, "rb"); + graph_proto.parse_from_file(f); + fclose(f); + return Status::OK(); #else int file_descriptor = open(model_path, O_RDONLY); if (file_descriptor == -1) { - LOG(FATAL) << " Cant open " << model_path; + LOG(FATAL) << " Can't open " << model_path; } - google::protobuf::io::ZeroCopyInputStream* raw_input = new google::protobuf::io::FileInputStream( - file_descriptor); + google::protobuf::io::FileInputStream raw_input(file_descriptor); - google::protobuf::io::CodedInputStream* coded_input = new google::protobuf::io::CodedInputStream( - raw_input); + google::protobuf::io::CodedInputStream coded_input(&raw_input); - coded_input->SetTotalBytesLimit(ProtoReadBytesLimit, 536870912); + coded_input.SetTotalBytesLimit(ProtoReadBytesLimit, 536870912); - bool success = graph_proto.ParseFromCodedStream(coded_input); + bool success = graph_proto.ParseFromCodedStream(&coded_input); if (!success) { - LOG(FATAL) << " Parsing GraphProto " << model_path << " ERROR"; + LOG(ERROR) << " Parsing GraphProto " << model_path << " ERROR"; + return Status::ANAKINFAIL("Parsing GraphProto ERROR"); } - delete coded_input; - delete raw_input; close(file_descriptor); -#endif return Status::OK(); +#endif } +bool InspectAnakin(const std::string& model_path) { + GraphProto graph_proto; + auto ret = parse_graph_proto(graph_proto, model_path.c_str()); + if(ret) { + return true; + } + return false; +} -template +bool InspectAnakin(const char* buffer, size_t len) { + GraphProto graph_proto; + auto ret = parse_graph_proto(graph_proto, buffer, len); + if(ret) { + return true; + } + return false; +} + +template Status generate_graph_with_graph_proto(graph::Graph* graph, GraphProto& graph_proto) { // fill the graph with name LOG(INFO) << "graph name: " << graph_proto.name(); @@ -121,20 +127,34 @@ Status generate_graph_with_graph_proto(graph::Graph* graph, GraphP auto it_in = graph_proto.edges_in().begin(); for (; it_in != graph_proto.edges_in().end(); ++it_in) { -#ifdef ENABLE_DEBUG - LOG(WARNING) << " Parsing in edges of node : " << it_in->first; -#endif auto& key = it_in->first; auto& second = it_in->second; - - for (int i = 0; i < second.val().size(); i++) { - //Tensor4dPtr tensor_p = std::make_shared>(); - graph::Edge edge(second.val()[i], key); - //edge.weight() = new Tensor4d(); - //edge.weight() = std::make_shared >(); - edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); - edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); - graph->add_in_arc(edge); + if (second.target().size() > 0) { + for (int i = 0; i < second.target().size(); i++) { + DLOG(INFO) << "Parsing in edges of node with scale: " << key; + graph::Edge edge(second.target()[i].node(), key); + std::vector scale; + for (int j = 0; j < second.target()[i].scale_size(); j++) { + scale.push_back(second.target()[i].scale(j)); + } + auto layout = second.target()[i].layout(); + if (layout == 0){ + layout = LP_NCHW; + } + edge.set_scale(scale); + edge.set_layout((anakin::saber::LayoutType)layout); + edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); + edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); + graph->add_in_arc(edge); + } + } else { 
+ for (int i = 0; i < second.val().size(); i++) { + DLOG(INFO) << "Parsing in edges of node without scale: " << key; + graph::Edge edge(second.val()[i], key); + edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); + edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); + graph->add_in_arc(edge); + } } } @@ -143,35 +163,35 @@ Status generate_graph_with_graph_proto(graph::Graph* graph, GraphP for (; it_out != graph_proto.edges_out().end(); ++it_out) { auto& key = it_out->first; auto& second = it_out->second; - - for (int i = 0; i < second.val().size(); i++) { - //Tensor4dPtr tensor_p = std::make_shared>(); - graph::Edge edge(key, second.val()[i]); - //edge.weight() = new Tensor4d(); - //edge.weight() = std::make_shared >(); - edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); - edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); - graph->add_out_arc(edge); - } - } - - - // fill the graph with edges - /*for(int i=0; i < node_io.get_node_name_in_order().size(); i++) { - auto& node_name = node_io.get_node_name_in_order()[i]; - if (graph_proto.edges().count(node_name) > 0) { - auto& second_node_name_list = graph_proto.edges().at(node_name); - for(int j = 0; j < second_node_name_list.val().size(); j++) { - graph::Edge edge(node_name, second_node_name_list.val()[j]); - edge.weight() = std::make_shared >(); + if (second.target().size() > 0) { + for (int i = 0; i < second.target().size(); i++) { + DLOG(INFO) << "Parsing out edges of node with scale: " << key; + graph::Edge edge(key, second.target()[i].node()); + std::vector scale; + for (int j = 0; j < second.target()[i].scale_size(); j++) { + scale.push_back(second.target()[i].scale(j)); + } + auto layout = second.target()[i].layout(); + DLOG(ERROR) << "layout:" << layout; + if (layout == 0){ + layout = LP_NCHW; + } + edge.set_scale(scale); + edge.set_layout((anakin::saber::LayoutType)layout); edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); - graph->add_arc(edge); + graph->add_out_arc(edge); } } else { - LOG(FATAL) << " Node : " << node_name << " not found!"; + for (int i = 0; i < second.val().size(); i++) { + DLOG(INFO) << "Parsing in edges of node without scale: " << key; + graph::Edge edge(key, second.val()[i]); + edge.shared() = (*graph_proto.mutable_edges_info())[edge.name()].shared(); + edge.share_from() = (*graph_proto.mutable_edges_info())[edge.name()].share_from(); + graph->add_out_arc(edge); + } } - }*/ + } // fill the graph with info (only use the key value: is_optimized) graph->statistics.template set_info(graph_proto.summary().is_optimized()); @@ -180,7 +200,6 @@ Status generate_graph_with_graph_proto(graph::Graph* graph, GraphP (graph_proto.summary().original_temp_mem_used()); graph->statistics.template set_info(graph_proto.summary().system_mem_used()); graph->statistics.template set_info(graph_proto.summary().model_mem_used()); - graph->change_name(); return Status::OK(); } @@ -193,13 +212,12 @@ Status load(graph::Graph* graph, const char* model_path) { template Status load(graph::Graph* graph, const char* buffer, size_t len) { - GraphProto graph_proto; parse_graph_proto(graph_proto, buffer, len); - return generate_graph_with_graph_proto(graph, graph_proto);; + return generate_graph_with_graph_proto(graph, graph_proto); } - +#ifndef USE_NANOPB template Status save(graph::Graph* graph, std::string& model_path) { return 
save(graph, model_path.c_str()); @@ -252,19 +270,16 @@ Status save(graph::Graph* graph, const char* model_path) { auto edges_in = graph_proto.mutable_edges_in(); auto edges_out = graph_proto.mutable_edges_out(); auto edges_info = graph_proto.mutable_edges_info(); - /*auto insert_edge = [&](graph::Edge& edge) { - (*edges)[edge.first()].add_val(edge.second()); - TensorProto ts; - ts.set_name(edge.name()); - ts.set_shared(edge.shared()); - ts.set_share_from(edge.share_from()); - (*edges_info)[edge.name()].CopyFrom(ts); - };*/ auto insert_edge = [&](graph::NodePtr& node_p) { auto& arcs_it_in = graph->get_in_arc_its(node_p->name()); auto& arcs_it_out = graph->get_out_arc_its(node_p->name()); for (auto& edge_it : arcs_it_in) { - (*edges_in)[edge_it->second()].add_val(edge_it->first()); + auto tg = (*edges_in)[edge_it->second()].add_target(); + tg->set_node(edge_it->first()); + for (auto scale: edge_it->scale()){ + tg->add_scale(scale); + } + tg->set_layout((LayoutProto)edge_it->layout()); TensorProto ts; ts.set_name(edge_it->name()); ts.set_shared(edge_it->shared()); @@ -273,7 +288,12 @@ Status save(graph::Graph* graph, const char* model_path) { } for (auto& edge_it : arcs_it_out) { - (*edges_out)[edge_it->first()].add_val(edge_it->second()); + auto tg = (*edges_out)[edge_it->first()].add_target(); + tg->set_node(edge_it->second()); + for (auto scale: edge_it->scale()){ + tg->add_scale(scale); + } + tg->set_layout((LayoutProto)edge_it->layout()); TensorProto ts; ts.set_name(edge_it->name()); ts.set_shared(edge_it->shared()); @@ -298,7 +318,7 @@ Status save(graph::Graph* graph, const char* model_path) { return Status::OK(); } - +#endif #ifdef USE_CUDA template @@ -344,12 +364,14 @@ Status load(graph::Graph* graph, con template Status load(graph::Graph* graph, const char* model_path); +#ifndef USE_NANOPB template Status save(graph::Graph* graph, std::string& model_path); template Status save(graph::Graph* graph, std::string& model_path); template Status save(graph::Graph* graph, std::string& model_path); +#endif template Status load(graph::Graph* graph, std::string& model_path); @@ -358,12 +380,14 @@ Status load(graph::Graph* graph, std template Status load(graph::Graph* graph, std::string& model_path); +#ifndef USE_NANOPB template Status save(graph::Graph* graph, const char* model_path); template Status save(graph::Graph* graph, const char* model_path); template Status save(graph::Graph* graph, const char* model_path); +#endif template Status load(graph::Graph* graph, const char* buffer, size_t len); @@ -374,48 +398,47 @@ Status load(graph::Graph* graph, con #endif #ifdef USE_ARM_PLACE -#ifdef ANAKIN_TYPE_FP32 template Status load(graph::Graph* graph, const char* model_path); template -Status save(graph::Graph* graph, std::string& model_path); -template Status load(graph::Graph* graph, std::string& model_path); template -Status save(graph::Graph* graph, const char* model_path); +Status load(graph::Graph* graph, const char* buffer, size_t len); +#ifndef USE_NANOPB template -Status load(graph::Graph* graph, std::string& model_path); +Status save(graph::Graph* graph, std::string& model_path); template -Status load(graph::Graph* graph, const char* buffer, size_t len); +Status save(graph::Graph* graph, const char* model_path); #endif -#ifdef ANAKIN_TYPE_FP16 template Status load(graph::Graph* graph, const char* model_path); template -Status save(graph::Graph* graph, std::string& model_path); -template Status load(graph::Graph* graph, std::string& model_path); template -Status save(graph::Graph* graph, 
const char* model_path); -template Status load(graph::Graph* graph, const char* buffer, size_t len); + +#ifndef USE_NANOPB +template +Status save(graph::Graph* graph, std::string& model_path); +template +Status save(graph::Graph* graph, const char* model_path); #endif -#ifdef ANAKIN_TYPE_INT8 template Status load(graph::Graph* graph, const char* model_path); template -Status save(graph::Graph* graph, std::string& model_path); -template Status load(graph::Graph* graph, std::string& model_path); template -Status save(graph::Graph* graph, const char* model_path); -template Status load(graph::Graph* graph, const char* buffer, size_t len); -#endif +#ifndef USE_NANOPB +template +Status save(graph::Graph* graph, const char* model_path); +template +Status save(graph::Graph* graph, std::string& model_path); #endif +#endif // ifdef USE_ARM_PLACE #ifdef AMD_GPU @@ -440,6 +463,7 @@ Status load(graph::Graph* graph, con template Status load(graph::Graph* graph, const char* buffer, size_t len); +#ifndef USE_NANOPB template Status save(graph::Graph* graph, std::string& model_path); template @@ -454,6 +478,7 @@ Status save(graph::Graph* graph, con template Status save(graph::Graph* graph, const char* model_path); #endif +#endif } /* parser */ diff --git a/framework/model_parser/parser/parser.h b/framework/model_parser/parser/parser.h index 28805824f..8486b279c 100644 --- a/framework/model_parser/parser/parser.h +++ b/framework/model_parser/parser/parser.h @@ -33,6 +33,10 @@ Status load(graph::Graph* graph, std::string& model_path); template Status load(graph::Graph* graph, const char* model_path); +bool InspectAnakin(const std::string& model_path); + +bool InspectAnakin(const char* buffer, size_t len); + template Status load(graph::Graph* graph, const char* buffer, size_t len); diff --git a/framework/model_parser/proto/graph.proto b/framework/model_parser/proto/graph.proto index 21120a56d..01ebd5a51 100644 --- a/framework/model_parser/proto/graph.proto +++ b/framework/model_parser/proto/graph.proto @@ -27,9 +27,38 @@ message Info { bool is_optimized = 10; }; +//this proto corresponds to LayoutType +//you shouldn't change the index +enum LayoutProto { + Invalid = 0; + LP_W = 1; + LP_HW = 2; + LP_WH = 3; + LP_NC = 4; + LP_NH = 5; + LP_NW = 6; + LP_NHW = 7; + LP_NCHW = 8; + LP_NHWC = 9; + LP_NCHW_C4 = 10; + LP_NCHW_C8 = 11; + LP_NCHW_C16 = 12; + LP_OIHW16I16O = 13; + LP_GOIHW16I16O = 14; + LP_NCHW_C8R=15; + LP_NCHW_C16R=16; +}; + +message TargetProto { + string node = 1; + repeated float scale = 2; + LayoutProto layout = 3; +}; + // string list message List { - repeated string val = 1; + repeated string val = 1; // Will be deprecated + repeated TargetProto target = 2; }; // Anakin Graph define @@ -44,7 +73,7 @@ repeated NodeProto nodes = 2; // map: node name ---> node name // edges saves storage of anakin model. 
map edges_in = 3; -map edges_out =4; +map edges_out = 4; // edges info [optional] // map: node_name_0 + "_" + node_name_1 ---> edge tensor (tensor not hold data) diff --git a/framework/model_parser/proto/node.options b/framework/model_parser/proto/node.options new file mode 100644 index 000000000..118e1ab9e --- /dev/null +++ b/framework/model_parser/proto/node.options @@ -0,0 +1,2 @@ +# node.proto +valueType.data no_unions:true diff --git a/framework/model_parser/proto/node.proto b/framework/model_parser/proto/node.proto index c926d5cae..284b978a8 100644 --- a/framework/model_parser/proto/node.proto +++ b/framework/model_parser/proto/node.proto @@ -10,10 +10,10 @@ message valueType { int32 i = 2; // int float f = 3; // float bool b = 4; // bool - CacheDate cache_list = 8; // cache list - TensorProto tensor = 10; // tensor + CacheDate cache_list = 8; // cache list + TensorProto tensor = 10; // tensor } - DateTypeProto type = 14; + DateTypeProto type = 14; }; message NodeProto { @@ -28,7 +28,7 @@ message NodeProto { // map :attr name ---> Attributes map attr = 10; - + // op execute lane [optional] // ( only used when anakin generates optimized model ) int32 lane = 11; @@ -39,5 +39,8 @@ message NodeProto { // Operator of node. OpProto Op = 15; + + // Quantization information + DateTypeProto bit_type = 16; }; diff --git a/framework/model_parser/proto/tensor.proto b/framework/model_parser/proto/tensor.proto index f46c643ca..432f7fd50 100644 --- a/framework/model_parser/proto/tensor.proto +++ b/framework/model_parser/proto/tensor.proto @@ -12,13 +12,15 @@ message TensorShape { // anakin data type. // maybe need to be improved enum DateTypeProto { - STR = 0; + STR = 0; // When used as bit type, enum 0 means invalid. + INT8 = 2; INT32 = 4; + FLOAT16 = 8; FLOAT = 13; DOUBLE = 14; BOOLEN = 20; - CACHE_LIST = 30; - TENSOR = 31; + CACHE_LIST = 30; + TENSOR = 31; }; // list data cache @@ -28,7 +30,8 @@ message CacheDate { repeated float f = 3; /// list float repeated bool b = 4; /// list bool repeated CacheDate l = 5; /// list list - DateTypeProto type = 6; + bytes c = 8; /// string for int8 + DateTypeProto type = 6; int64 size = 7; }; @@ -55,6 +58,9 @@ message TensorProto { // tensor data cache. 
CacheDate data = 10; + + // scale for int8 + CacheDate scale = 11; }; diff --git a/framework/operators/activation.cpp b/framework/operators/activation.cpp index 4438ae423..6fd245434 100644 --- a/framework/operators/activation.cpp +++ b/framework/operators/activation.cpp @@ -23,7 +23,7 @@ ActivationHelper::~ActivationHelper() { template Status ActivationHelper::InitParam() { - DLOG(WARNING) << "Parsing Activation op parameter."; + LOG(WARNING) << "Parsing Activation op parameter."; auto type = GET_PARAMETER(std::string, type); if (type == "TanH") { ActivationParam param_activation(Active_tanh); @@ -44,14 +44,21 @@ Status ActivationHelper::InitParam() { ActivationParam param_activation(Active_stanh); _param_activation = param_activation; } else if (type == "Relu") { - ActivationParam param_activation(Active_relu); + auto alpha = GET_PARAMETER(float, alpha); + ActivationParam param_activation(Active_relu, alpha); _param_activation = param_activation; } else if (type == "ClippedRelu") { - ActivationParam param_activation(Active_clipped_relu); + float coef = GET_PARAMETER(float, clip_relu_num); + ActivationParam param_activation(Active_clipped_relu, 0.f, coef); _param_activation = param_activation; } else if (type == "Elu") { ActivationParam param_activation(Active_elu); _param_activation = param_activation; + } else if (type == "Swish") { + //the float beta(=coef) of swish op + float coef = GET_PARAMETER(float, clip_relu_num); + ActivationParam param_activation(Active_swish, 0.f, coef); + _param_activation = param_activation; } else { LOG(FATAL) << "Other Activation type" << type << " should be replace by other ops."; } @@ -76,15 +83,9 @@ Status ActivationHelper::InferShape(const std::vector -Status ActivationHelper::Init(OpContext& ctx, - const std::vector< Tensor4dPtr > & ins, - std::vector< Tensor4dPtr >& outs) { - SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, STATIC, VENDER_IMPL, ctx)); - return Status::OK(); -} ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, NV, Precision::FP32); +INSTANCE_ACTIVATION(NV, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, NV, Precision::INT8); #endif #if defined USE_X86_PLACE || defined BUILD_LITE @@ -99,7 +100,7 @@ ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, X86, Precision::FP32); INSTANCE_ACTIVATION(ARM, Precision::FP32); template class ActivationHelper; ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, ARM, Precision::FP32); -#endif//arm +#endif #ifdef AMD_GPU INSTANCE_ACTIVATION(AMD, Precision::FP32); @@ -113,6 +114,7 @@ ANAKIN_REGISTER_OP(Activation) .Doc("Activation operator") #ifdef USE_CUDA .__alias__("activation") +.__alias__("activation") #endif #ifdef USE_ARM_PLACE .__alias__("activation") diff --git a/framework/operators/affine_channel.cpp b/framework/operators/affine_channel.cpp new file mode 100644 index 000000000..c89329bd2 --- /dev/null +++ b/framework/operators/affine_channel.cpp @@ -0,0 +1,106 @@ +#include "framework/operators/affine_channel.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ACTIVATION(Ttype, Ptype) \ +template<> \ +void AffineChannel::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_affine_channel; \ + impl->_funcs_affine_channel(ins, outs, param, ctx); \ +} + +/// set helper +template +AffineChannelHelper::~AffineChannelHelper() { +} + +template +Status AffineChannelHelper::InitParam() { + 
DLOG(WARNING) << "Parsing AffineChannel op parameter."; + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + auto bias = GET_PARAMETER(pblock_type, weight_2); + AffineChannelParam param_affine_channel(&(weights.d_tensor()), &(bias.d_tensor())); + _param_affine_channel = param_affine_channel; + + return Status::OK(); +} + +template +Status AffineChannelHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_affine_channel.init(ins, outs, _param_affine_channel, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status AffineChannelHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_affine_channel.compute_output_shape(ins, outs, _param_affine_channel)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_ACTIVATION(NV, Precision::FP32); + +template<> +Status AffineChannelHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_affine_channel.init(ins, outs, _param_affine_channel, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(AffineChannel, AffineChannelHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ACTIVATION(X86, Precision::FP32); +INSTANCE_ACTIVATION(X86, Precision::FP16); +INSTANCE_ACTIVATION(X86, Precision::INT8); +template class AffineChannelHelper; +ANAKIN_REGISTER_OP_HELPER(AffineChannel, AffineChannelHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_ACTIVATION(ARM, Precision::FP32); +template class AffineChannelHelper; +ANAKIN_REGISTER_OP_HELPER(AffineChannel, AffineChannelHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ACTIVATION(AMD, Precision::FP32); +template class AffineChannelHelper; +template class AffineChannelHelper; +template class AffineChannelHelper; +ANAKIN_REGISTER_OP_HELPER(AffineChannel, AffineChannelHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(AffineChannel) +.Doc("AffineChannel operator") +#ifdef USE_CUDA +.__alias__("affine_channel") +#endif +#ifdef USE_ARM_PLACE +.__alias__("affine_channel") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("affine_channel") +#endif +#ifdef AMD_GPU +.__alias__("affine_channel") +#endif +.num_in(1) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/affine_channel.h b/framework/operators/affine_channel.h new file mode 100644 index 000000000..7344ebd10 --- /dev/null +++ b/framework/operators/affine_channel.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_AFFINE_CHANNEL_H +#define ANAKIN_OPERATOR_AFFINE_CHANNEL_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/affine_channel.h" + +namespace anakin { + +namespace ops { + +template +class AffineChannelHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class AffineChannel : public Operator { +public: + AffineChannel() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator AffineChannel< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class AffineChannelHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class AffineChannelHelper : public OperatorHelper { +public: + AffineChannelHelper()=default; + + ~AffineChannelHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_affine_channel stand for affine_channel parameter + saber::AffineChannelParam _param_affine_channel; + ///< _funcs_affine_channel stand for affine_channel function + saber::AffineChannel::saber_type> _funcs_affine_channel; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/aligned_mat_mul.cpp b/framework/operators/aligned_mat_mul.cpp new file mode 100644 index 000000000..7d707bf5a --- /dev/null +++ b/framework/operators/aligned_mat_mul.cpp @@ -0,0 +1,109 @@ +#include "framework/operators/aligned_mat_mul.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ALIGNED_MAT_MUL(Ttype, Ptype) \ +template<> \ +void AlignedMatMul::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_aligned_mat_mul; \ + impl->_funcs_aligned_mat_mul(ins, outs, param, ctx); \ +} + +/// set helper +template +AlignedMatMulHelper::~AlignedMatMulHelper() { +} + +template +Status AlignedMatMulHelper::InitParam() { + LOG(WARNING) << "Parsing AlignedMatMul op parameter."; + auto transpose_x = GET_PARAMETER(bool, transpose_x); + auto transpose_y = GET_PARAMETER(bool, transpose_y); + auto scale = GET_PARAMETER(float, coeff); + AlignedMatMulParam param_aligned_mat_mul(transpose_x, transpose_y, scale); + _param_aligned_mat_mul = param_aligned_mat_mul; + + return Status::OK(); +} + +template +Status AlignedMatMulHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_aligned_mat_mul.init(ins, outs, _param_aligned_mat_mul, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status AlignedMatMulHelper::InferShape(const std::vector >& ins, + std::vector >& outs) 
{ + SABER_CHECK(_funcs_aligned_mat_mul.compute_output_shape(ins, outs, _param_aligned_mat_mul)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_ALIGNED_MAT_MUL(NV, Precision::FP32); + +template<> +Status AlignedMatMulHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_aligned_mat_mul.init(ins, outs, _param_aligned_mat_mul, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(AlignedMatMul, AlignedMatMulHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ALIGNED_MAT_MUL(X86, Precision::FP32); +INSTANCE_ALIGNED_MAT_MUL(X86, Precision::FP16); +INSTANCE_ALIGNED_MAT_MUL(X86, Precision::INT8); +template class AlignedMatMulHelper; +ANAKIN_REGISTER_OP_HELPER(AlignedMatMul, AlignedMatMulHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_ALIGNED_MAT_MUL(ARM, Precision::FP32); +template class AlignedMatMulHelper; +ANAKIN_REGISTER_OP_HELPER(AlignedMatMul, AlignedMatMulHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ALIGNED_MAT_MUL(AMD, Precision::FP32); +template class AlignedMatMulHelper; +template class AlignedMatMulHelper; +template class AlignedMatMulHelper; +ANAKIN_REGISTER_OP_HELPER(AlignedMatMul, AlignedMatMulHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(AlignedMatMul) +.Doc("AlignedMatMul operator") +#ifdef USE_CUDA +.__alias__("aligned_mat_mul") +#endif +#ifdef USE_ARM_PLACE +.__alias__("aligned_mat_mul") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("aligned_mat_mul") +#endif +#ifdef AMD_GPU +.__alias__("aligned_mat_mul") +#endif +.num_in(2) +.num_out(1) +.Args("is_transpose_X", "Is X transpose or not") +.Args("is_transpose_Y", "Is Y transpose or not ") +.Args("scale", "Z = scale * X * Y"); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/aligned_mat_mul.h b/framework/operators/aligned_mat_mul.h new file mode 100644 index 000000000..172a5128c --- /dev/null +++ b/framework/operators/aligned_mat_mul.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_ALIGNED_MAT_MUL_H +#define ANAKIN_OPERATOR_ALIGNED_MAT_MUL_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/aligned_mat_mul.h" + +namespace anakin { + +namespace ops { + +template +class AlignedMatMulHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class AlignedMatMul : public Operator { +public: + AlignedMatMul() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator AlignedMatMul< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class AlignedMatMulHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class AlignedMatMulHelper : public OperatorHelper { +public: + AlignedMatMulHelper()=default; + + ~AlignedMatMulHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_aligned_mat_mul stand for aligned_mat_mul parameter + saber::AlignedMatMulParam _param_aligned_mat_mul; + ///< _funcs_aligned_mat_mul stand for aligned_mat_mul function + saber::AlignedMatMul::saber_type> _funcs_aligned_mat_mul; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/anchor_generator.cpp b/framework/operators/anchor_generator.cpp new file mode 100644 index 000000000..d042b5e4d --- /dev/null +++ b/framework/operators/anchor_generator.cpp @@ -0,0 +1,117 @@ +#include "framework/operators/anchor_generator.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ACTIVATION(Ttype, Ptype) \ +template<> \ +void AnchorGenerator::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_anchor_generator; \ + impl->_funcs_anchor_generator(ins, outs, param, ctx); \ +} + +/// set helper +template +AnchorGeneratorHelper::~AnchorGeneratorHelper() { +} + +template +Status AnchorGeneratorHelper::InitParam() { + DLOG(WARNING) << "Parsing AnchorGenerator op parameter."; + auto offset = GET_PARAMETER(float, offset); + auto anchor_sizes = GET_PARAMETER(PTuple, anchor_sizes); + auto aspect_ratios = GET_PARAMETER(PTuple, aspect_ratios); + auto variances = GET_PARAMETER(PTuple, variances); + auto stride = GET_PARAMETER(PTuple, stride); + AnchorGeneratorParam param_anchor_generator(anchor_sizes.vector(), + aspect_ratios.vector(), + variances.vector(), + stride.vector(), + offset); + _param_anchor_generator = param_anchor_generator; + + return Status::OK(); +} + +template +Status AnchorGeneratorHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + 
SABER_CHECK(_funcs_anchor_generator.init(ins, outs, _param_anchor_generator, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status AnchorGeneratorHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_anchor_generator.compute_output_shape(ins, outs, _param_anchor_generator)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_ACTIVATION(NV, Precision::FP32); + +template<> +Status AnchorGeneratorHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_anchor_generator.init(ins, outs, _param_anchor_generator, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(AnchorGenerator, AnchorGeneratorHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ACTIVATION(X86, Precision::FP32); +INSTANCE_ACTIVATION(X86, Precision::FP16); +INSTANCE_ACTIVATION(X86, Precision::INT8); +template class AnchorGeneratorHelper; +ANAKIN_REGISTER_OP_HELPER(AnchorGenerator, AnchorGeneratorHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_ACTIVATION(ARM, Precision::FP32); +template class AnchorGeneratorHelper; +ANAKIN_REGISTER_OP_HELPER(AnchorGenerator, AnchorGeneratorHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ACTIVATION(AMD, Precision::FP32); +template class AnchorGeneratorHelper; +template class AnchorGeneratorHelper; +template class AnchorGeneratorHelper; +ANAKIN_REGISTER_OP_HELPER(AnchorGenerator, AnchorGeneratorHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(AnchorGenerator) +.Doc("AnchorGenerator operator") +#ifdef USE_CUDA +.__alias__("anchor_generator") +#endif +#ifdef USE_ARM_PLACE +.__alias__("anchor_generator") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("anchor_generator") +#endif +#ifdef AMD_GPU +.__alias__("anchor_generator") +#endif +.num_in(1) +.num_out(2) +.Args>("anchor_sizes", " box size in image ") +.Args>("aspect_ratios", " box height and width ratio in image ") +.Args>("variances", " variances ") +.Args>("stride", " stride ") +.Args("offset", " offset "); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/anchor_generator.h b/framework/operators/anchor_generator.h new file mode 100644 index 000000000..670ab78a6 --- /dev/null +++ b/framework/operators/anchor_generator.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_ANCHOR_GENERATOR_H +#define ANAKIN_OPERATOR_ANCHOR_GENERATOR_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/anchor_generator.h" + +namespace anakin { + +namespace ops { + +template +class AnchorGeneratorHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class AnchorGenerator : public Operator { +public: + AnchorGenerator() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator AnchorGenerator< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class AnchorGeneratorHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class AnchorGeneratorHelper : public OperatorHelper { +public: + AnchorGeneratorHelper()=default; + + ~AnchorGeneratorHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_anchor_generator stand for anchor_generator parameter + saber::AnchorGeneratorParam _param_anchor_generator; + ///< _funcs_anchor_generator stand for anchor_generator function + saber::AnchorGenerator::saber_type> _funcs_anchor_generator; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/arg_max.cpp b/framework/operators/arg_max.cpp index c392460fe..b4158817d 100644 --- a/framework/operators/arg_max.cpp +++ b/framework/operators/arg_max.cpp @@ -4,19 +4,6 @@ namespace anakin { namespace ops { -//#ifdef USE_CUDA -//template<> -//void Argmax::operator()( -// OpContext& ctx, -// const std::vector >& ins, -// std::vector >& outs) { -// auto* impl = -// static_cast*>(this->_helper); -// auto& param = impl->_param_argmax; -// impl->_funcs_argmax(ins, outs, param, ctx); -//} -//#endif - /// TODO ... 
specialization other type of operator #define INSTANCE_ARGMAX(Ttype, Ptype) \ template<> \ @@ -76,6 +63,12 @@ template class ArgmaxHelper; ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_ARGMAX(AMD, Precision::FP32); +template class ArgmaxHelper; +ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, AMD, Precision::FP32); +#endif + #if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_ARGMAX(X86, Precision::FP32); template class ArgmaxHelper; @@ -85,21 +78,11 @@ ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE - -#ifdef ANAKIN_TYPE_FP32 INSTANCE_ARGMAX(ARM, Precision::FP32); template class ArgmaxHelper; -ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, ARM, Precision::FP32); -#endif //fp32 - -#ifdef ANAKIN_TYPE_FP16 template class ArgmaxHelper; -#endif //fp16 - -#ifdef ANAKIN_TYPE_INT8 template class ArgmaxHelper; -#endif //int8 - +ANAKIN_REGISTER_OP_HELPER(Argmax, ArgmaxHelper, ARM, Precision::FP32); #endif //arm //! register op @@ -115,6 +98,9 @@ ANAKIN_REGISTER_OP(Argmax) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("Argmax") #endif +#ifdef AMD_GPU +.__alias__("Argmax") +#endif .num_in(1) .num_out(1) .Args("out_max_val", " out_max_val for argmax ") diff --git a/framework/operators/arithmetic.cpp b/framework/operators/arithmetic.cpp new file mode 100644 index 000000000..ff4314f3c --- /dev/null +++ b/framework/operators/arithmetic.cpp @@ -0,0 +1,109 @@ +#include "framework/operators/arithmetic.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ARITHMETIC(Ttype, Ptype) \ +template<> \ +void Arithmetic::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_arithmetic; \ + impl->_funcs_arithmetic(ins, outs, param, ctx); \ +} + +/// set helper +template +ArithmeticHelper::~ArithmeticHelper() { +} + +template +Status ArithmeticHelper::InitParam() { + LOG(WARNING) << "Parsing Arithmetic op parameter."; + auto type = GET_PARAMETER(int, op_type); + if (type <= 3) { + ArithmeticParam param_arithmetic(ArithmeticType(type-1)); + _param_arithmetic = param_arithmetic; + } else { + LOG(FATAL) << "Other Arithmetic type" << type << " should be replace by other ops."; + } + + return Status::OK(); +} + +template +Status ArithmeticHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_arithmetic.init(ins, outs, _param_arithmetic, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status ArithmeticHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_arithmetic.compute_output_shape(ins, outs, _param_arithmetic)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_ARITHMETIC(NV, Precision::FP32); + +template<> +Status ArithmeticHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_arithmetic.init(ins, outs, _param_arithmetic, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Arithmetic, ArithmeticHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ARITHMETIC(X86, Precision::FP32); +INSTANCE_ARITHMETIC(X86, Precision::FP16); +INSTANCE_ARITHMETIC(X86, Precision::INT8); +template class ArithmeticHelper; +ANAKIN_REGISTER_OP_HELPER(Arithmetic, ArithmeticHelper, X86, Precision::FP32); +#endif + 
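// A minimal standalone sketch (not part of the patch) of the op_type convention used by
// ArithmeticHelper<Ttype, Ptype>::InitParam() above: the model stores a 1-based integer
// op_type and the helper forwards ArithmeticType(op_type - 1) to saber. The saber
// enumerator names are not visible in this diff, so only the numeric mapping is mirrored
// here; the lower-bound check is an extra guard that the helper itself does not perform
// (InitParam only rejects op_type > 3 with LOG(FATAL)).
#include <stdexcept>

inline int arithmetic_op_type_to_saber_index(int op_type) {
    if (op_type < 1 || op_type > 3) {
        throw std::invalid_argument("unsupported Arithmetic op_type");
    }
    return op_type - 1; // value passed to saber::ArithmeticType in InitParam
}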
+#ifdef USE_ARM_PLACE +INSTANCE_ARITHMETIC(ARM, Precision::FP32); +template class ArithmeticHelper; +ANAKIN_REGISTER_OP_HELPER(Arithmetic, ArithmeticHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ARITHMETIC(AMD, Precision::FP32); +template class ArithmeticHelper; +template class ArithmeticHelper; +template class ArithmeticHelper; +ANAKIN_REGISTER_OP_HELPER(Arithmetic, ArithmeticHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Arithmetic) +.Doc("Arithmetic operator") +#ifdef USE_CUDA +.__alias__("arithmetic") +#endif +#ifdef USE_ARM_PLACE +.__alias__("arithmetic") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("arithmetic") +#endif +#ifdef AMD_GPU +.__alias__("arithmetic") +#endif +.num_in(2) +.num_out(1) +.Args("op_type", " type of Arithmetic "); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/arithmetic.h b/framework/operators/arithmetic.h new file mode 100644 index 000000000..89ca44351 --- /dev/null +++ b/framework/operators/arithmetic.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_ARITHMETIC_H +#define ANAKIN_OPERATOR_ARITHMETIC_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/arithmetic.h" + +namespace anakin { + +namespace ops { + +template +class ArithmeticHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class Arithmetic : public Operator { +public: + Arithmetic() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Arithmetic< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class ArithmeticHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class ArithmeticHelper : public OperatorHelper { +public: + ArithmeticHelper()=default; + + ~ArithmeticHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_arithmetic stand for arithmetic parameter + saber::ArithmeticParam _param_arithmetic; + ///< _funcs_arithmetic stand for arithmetic function + saber::Arithmetic::saber_type> _funcs_arithmetic; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/attension_lstm.cpp b/framework/operators/attension_lstm.cpp index de947b4dd..f2112bcb5 100644 --- a/framework/operators/attension_lstm.cpp +++ b/framework/operators/attension_lstm.cpp @@ -4,7 +4,7 @@ namespace anakin { namespace ops { -#define INSTANCE_SEQUENCE_EXPAND(Ttype, Ptype) \ +#define INSTANCE_ATTENTION_LSTM(Ttype, Ptype) \ template<> \ void AttensionLstm::operator()(OpContext& ctx, \ const std::vector >& ins, \ @@ -75,18 +75,24 @@ Status AttensionLstmHelper::InferShape(const } #ifdef USE_CUDA -INSTANCE_SEQUENCE_EXPAND(NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(AttensionLstm, AttensionLstmHelper, NV, Precision::FP32); +INSTANCE_ATTENTION_LSTM(NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(AttensionLstm, AttensionLstmHelper, NV, Precision::FP32); #endif #ifdef USE_X86_PLACE -INSTANCE_SEQUENCE_EXPAND(X86, Precision::FP32); +INSTANCE_ATTENTION_LSTM(X86, Precision::FP32); template class AttensionLstmHelper; -ANAKIN_REGISTER_OP_HELPER(AttensionLstm, AttensionLstmHelper, X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(AttensionLstm, AttensionLstmHelper, X86, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_ATTENTION_LSTM(AMD, Precision::FP32); +template class AttensionLstmHelper; +ANAKIN_REGISTER_OP_HELPER(AttensionLstm, AttensionLstmHelper, AMD, Precision::FP32); #endif #ifdef USE_ARM_PLACE -INSTANCE_SEQUENCE_EXPAND(ARM, Precision::FP32); +INSTANCE_ATTENTION_LSTM(ARM, Precision::FP32); template class AttensionLstmHelper; ANAKIN_REGISTER_OP_HELPER(AttensionLstm, AttensionLstmHelper, ARM, Precision::FP32); #endif//arm @@ -103,6 +109,9 @@ ANAKIN_REGISTER_OP(AttensionLstm) #ifdef USE_X86_PLACE .__alias__("attension_lstm") #endif +#ifdef AMD_GPU +.__alias__("attension_lstm") +#endif .num_in(1) .num_out(1) .Args("is_reverse", " is_reverse for lstm.") diff --git a/framework/operators/attention_padding_mask.cpp b/framework/operators/attention_padding_mask.cpp new file mode 100644 index 000000000..1d1b06b71 --- /dev/null +++ b/framework/operators/attention_padding_mask.cpp @@ -0,0 +1,98 @@ +#include "framework/operators/attention_padding_mask.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ATTENTION_PADDING_MASK(Ttype, Ptype) \ +template<> \ +void AttentionPaddingMask::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_attention_padding_mask; \ + impl->_funcs_attention_padding_mask(ins, outs, param, ctx); \ +} + +/// set helper +template +AttentionPaddingMaskHelper::~AttentionPaddingMaskHelper() { +} + +template +Status AttentionPaddingMaskHelper::InitParam() { + LOG(WARNING) << "Parsing AttentionPaddingMask op parameter."; + auto mask = GET_PARAMETER(float, mask); + AttentionPaddingMaskParam param_attention_padding_mask(mask, 12800001); + _param_attention_padding_mask = param_attention_padding_mask; + + return Status::OK(); +} + +template +Status AttentionPaddingMaskHelper::Init(OpContext& ctx, + const std::vector >& 
ins, + std::vector >& outs) { + SABER_CHECK(_funcs_attention_padding_mask.init(ins, outs, _param_attention_padding_mask, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status AttentionPaddingMaskHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_attention_padding_mask.compute_output_shape(ins, outs, _param_attention_padding_mask)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_ATTENTION_PADDING_MASK(NV, Precision::FP32); +template class AttentionPaddingMaskHelper; +ANAKIN_REGISTER_OP_HELPER(AttentionPaddingMask, AttentionPaddingMaskHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ATTENTION_PADDING_MASK(X86, Precision::FP32); +INSTANCE_ATTENTION_PADDING_MASK(X86, Precision::FP16); +INSTANCE_ATTENTION_PADDING_MASK(X86, Precision::INT8); +template class AttentionPaddingMaskHelper; +ANAKIN_REGISTER_OP_HELPER(AttentionPaddingMask, AttentionPaddingMaskHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_ATTENTION_PADDING_MASK(ARM, Precision::FP32); +template class AttentionPaddingMaskHelper; +ANAKIN_REGISTER_OP_HELPER(AttentionPaddingMask, AttentionPaddingMaskHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ATTENTION_PADDING_MASK(AMD, Precision::FP32); +template class AttentionPaddingMaskHelper; +template class AttentionPaddingMaskHelper; +template class AttentionPaddingMaskHelper; +ANAKIN_REGISTER_OP_HELPER(AttentionPaddingMask, AttentionPaddingMaskHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(AttentionPaddingMask) +.Doc("AttentionPaddingMask operator") +#ifdef USE_CUDA +.__alias__("attention_padding_mask") +#endif +#ifdef USE_ARM_PLACE +.__alias__("attention_padding_mask") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("attention_padding_mask") +#endif +#ifdef AMD_GPU +.__alias__("attention_padding_mask") +#endif +.num_in(2) +.num_out(1) +.Args("mask", "padding data need to be set to mask"); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/attention_padding_mask.h b/framework/operators/attention_padding_mask.h new file mode 100644 index 000000000..e8019d3cd --- /dev/null +++ b/framework/operators/attention_padding_mask.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_ATTENTION_PADDING_MASK_H +#define ANAKIN_OPERATOR_ATTENTION_PADDING_MASK_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/attention_padding_mask.h" + +namespace anakin { + +namespace ops { + +template +class AttentionPaddingMaskHelper; + +/// pooling op +/** + * \brief AttentionPaddingMask operation class + * public inheritance Operator + */ +template +class AttentionPaddingMask : public Operator { +public: + AttentionPaddingMask() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator AttentionPaddingMask< Ttype(" + //<< target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class AttentionPaddingMaskHelper; +}; + +/** + * \brief AttentionPaddingMask helper class + * public inherit OperatorHelper + * including init resource and shape size in attention_padding_mask context + */ +template +class AttentionPaddingMaskHelper : public OperatorHelper { +public: + AttentionPaddingMaskHelper()=default; + + ~AttentionPaddingMaskHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for AttentionPaddingMask operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_attention_padding_mask stand for AttentionPaddingMask parameter + saber::AttentionPaddingMaskParam _param_attention_padding_mask; + ///< _funcs_attention_padding_mask stand for AttentionPaddingMask function + saber::AttentionPaddingMask::saber_type> _funcs_attention_padding_mask; + +private: +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/axpy.cpp b/framework/operators/axpy.cpp index 583b27a03..6263497ad 100644 --- a/framework/operators/axpy.cpp +++ b/framework/operators/axpy.cpp @@ -1,22 +1,23 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/axpy.h" namespace anakin { namespace ops { -//#ifdef USE_CUDA -//template<> -//void Axpy::operator()( -// OpContext& ctx, -// const std::vector >& ins, -// std::vector >& outs) { -// auto* impl = -// static_cast*>(this->_helper); -// auto& param = impl->_param_axpy; -// impl->_funcs_axpy(ins, outs, param, ctx); -//} -//#endif - /// TODO ... 
specialization other type of operator #define INSTANCE_AXPY(Ttype, Ptype) \ template<> \ @@ -67,6 +68,14 @@ template class AxpyHelper; ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_AXPY(AMD, Precision::FP32); +template class AxpyHelper; +template class AxpyHelper; +template class AxpyHelper; +ANAKIN_REGISTER_OP_HELPER(Axpy, AxpyHelper, AMD, Precision::FP32); +#endif + #if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_AXPY(X86, Precision::FP32); template class AxpyHelper; @@ -105,6 +114,9 @@ ANAKIN_REGISTER_OP(Axpy) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("axpy") #endif +#ifdef AMD_GPU +.__alias__("axpy") +#endif .num_in(3) .num_out(1); diff --git a/framework/operators/batch_norm.cpp b/framework/operators/batch_norm.cpp index 816b36dfd..d5a2dac81 100644 --- a/framework/operators/batch_norm.cpp +++ b/framework/operators/batch_norm.cpp @@ -1,10 +1,24 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/batch_norm.h" namespace anakin { namespace ops { -#define INSTANCE_BATCHNORM(Ttype, Ptype) \ +#define INSTANCE_BATCH_NORM(Ttype, Ptype) \ template<> \ void BatchNorm::operator()(OpContext& ctx, \ const std::vector >& ins, \ @@ -14,18 +28,6 @@ void BatchNorm::operator()(OpContext& ctx, \ impl->_funcs_scale(ins, outs, param, ctx); \ } -#if 0//def USE_CUDA -template<> -void BatchNorm::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_scale; - impl->_funcs_scale(ins, outs, param, ctx); -} -#endif - template Status BatchNormHelper::InitParam() { DLOG(WARNING) << "Parsing Scale op parameter."; @@ -71,23 +73,29 @@ Status BatchNormHelper::InferShape(const // register helper #ifdef USE_CUDA -INSTANCE_BATCHNORM(NV, Precision::FP32); +INSTANCE_BATCH_NORM(NV, Precision::FP32); template class BatchNormHelper; ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, NV, Precision::FP32); #endif #if defined USE_X86_PLACE || defined BUILD_LITE -INSTANCE_BATCHNORM(X86, Precision::FP32); +INSTANCE_BATCH_NORM(X86, Precision::FP32); template class BatchNormHelper; ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -INSTANCE_BATCHNORM(ARM, Precision::FP32); +INSTANCE_BATCH_NORM(ARM, Precision::FP32); template class BatchNormHelper; ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, ARM, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_BATCH_NORM(AMD, Precision::FP32); +template class BatchNormHelper; +ANAKIN_REGISTER_OP_HELPER(BatchNorm, BatchNormHelper, AMD, Precision::FP32); +#endif + //! 
register op ANAKIN_REGISTER_OP(BatchNorm) .Doc("BatchNorm operator") @@ -100,6 +108,9 @@ ANAKIN_REGISTER_OP(BatchNorm) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("eps") #endif +#ifdef AMD_GPU +.__alias__("eps") +#endif .num_in(1) .num_out(1); diff --git a/framework/operators/box_clip.cpp b/framework/operators/box_clip.cpp new file mode 100644 index 000000000..ad3915160 --- /dev/null +++ b/framework/operators/box_clip.cpp @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "framework/operators/box_clip.h" + +namespace anakin { + +namespace ops { + +template +Status BoxClipHelper::InitParam() { + DLOG(WARNING) << "Parsing BoxClip op parameter."; + EmptyParam param_concat; + _param_concat = param_concat; + return Status::OK(); +} + +template +Status BoxClipHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_concat.init(ins, outs, _param_concat, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status BoxClipHelper::InferShape(const std::vector>& ins, + std::vector>& outs) { + SABER_CHECK(_funcs_concat.compute_output_shape(ins, outs, _param_concat)); + return Status::OK(); +} + + +#define INSTANCE_CONCAT(Ttype, Ptype) \ +template<> \ +void BoxClip::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_concat; \ + impl->_funcs_concat(ins, outs, param, ctx); \ +} + +#ifdef USE_CUDA +INSTANCE_CONCAT(NV, Precision::FP32); +template class BoxClipHelper; +ANAKIN_REGISTER_OP_HELPER(BoxClip, BoxClipHelper, NV, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_CONCAT(AMD, Precision::FP32); +template class BoxClipHelper; +ANAKIN_REGISTER_OP_HELPER(BoxClip, BoxClipHelper, AMD, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_CONCAT(ARM, Precision::FP32); +template class BoxClipHelper; +ANAKIN_REGISTER_OP_HELPER(BoxClip, BoxClipHelper, ARM, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_CONCAT(X86, Precision::FP32); +template class BoxClipHelper; +ANAKIN_REGISTER_OP_HELPER(BoxClip, BoxClipHelper, X86, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(BoxClip) +.Doc("BoxClip operator") +#ifdef USE_CUDA +.__alias__("box_clip") +#endif +#ifdef USE_ARM_PLACE +.__alias__("box_clip") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("box_clip") +#endif +#ifdef AMD_GPU +.__alias__("box_clip") +#endif +.num_in(2) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/box_clip.h b/framework/operators/box_clip.h new file mode 100644 index 000000000..acb47a9fe --- /dev/null +++ b/framework/operators/box_clip.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_FRAMEWORK_OPERATORS_BOX_CLIP_H +#define ANAKIN_FRAMEWORK_OPERATORS_BOX_CLIP_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/box_clip.h" + +namespace anakin { + +namespace ops { + +template +class BoxClipHelper; + +/// pooling op +/** + * \brief contct class + * public inherit Operator + */ +template +class BoxClip : public Operator { +public: + BoxClip() {} + + /// forward impl + virtual void operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator BoxClip< Ttype(" + << target_name::value << "), Precision(" << Ptype << ") >"; + } + + friend class BoxClipHelper; +}; + +/** + * \brief contact helper class + * public inherit OperatorHelper + * including init resource and shape size in contact context + */ +template +class BoxClipHelper : public OperatorHelper { +public: + BoxClipHelper() = default; + + ~BoxClipHelper() {} + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for contact operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_concat stand for contact parameter + saber::EmptyParam _param_concat; + ///< _funcs_concat stand for contact function + saber::BoxClip::saber_type> _funcs_concat; + +private: + ///< _dims stand for contact size + PTuple _dims; +}; + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif //ANAKIN_BOX_CLIP_H diff --git a/framework/operators/box_coder.cpp b/framework/operators/box_coder.cpp new file mode 100644 index 000000000..09fe33fd4 --- /dev/null +++ b/framework/operators/box_coder.cpp @@ -0,0 +1,134 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "framework/operators/box_coder.h" + +namespace anakin { + +namespace ops { + +/// TODO ... 
specialization other type of operator +#define INSTANCE_AXPY(Ttype, Ptype) \ +template<> \ +void BoxCoder::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = impl->_param_box_coder; \ + impl->_funcs_box_coder(ins, outs, param, ctx); \ +} + +/// set helper +template +BoxCoderHelper::~BoxCoderHelper() { +} + +template +Status BoxCoderHelper::InitParam() { + DLOG(WARNING) << "Parsing BoxCoder op parameter."; + auto axis = GET_PARAMETER(int, axis); + auto box_normalized = GET_PARAMETER(bool, box_normalized); + Tensor* variance = nullptr; + + if (FIND_PARAMETER(variance)) { + variance = &((GET_PARAMETER(PBlock, variance)).d_tensor()); + } + + saber::BoxCoderParam box_coder_param(variance, box_normalized, axis); + _param_box_coder = box_coder_param; + return Status::OK(); +} + +template +Status BoxCoderHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_box_coder.init(ins, outs, _param_box_coder, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status BoxCoderHelper::InferShape(const std::vector >& + ins, + std::vector >& outs) { + SABER_CHECK(_funcs_box_coder.compute_output_shape(ins, outs, _param_box_coder)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_AXPY(NV, Precision::FP32); +template class BoxCoderHelper; +template class BoxCoderHelper; +template class BoxCoderHelper; +ANAKIN_REGISTER_OP_HELPER(BoxCoder, BoxCoderHelper, NV, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_AXPY(AMD, Precision::FP32); +template class BoxCoderHelper; +template class BoxCoderHelper; +template class BoxCoderHelper; +ANAKIN_REGISTER_OP_HELPER(BoxCoder, BoxCoderHelper, AMD, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_AXPY(X86, Precision::FP32); +template class BoxCoderHelper; +template class BoxCoderHelper; +template class BoxCoderHelper; +ANAKIN_REGISTER_OP_HELPER(BoxCoder, BoxCoderHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE + +#ifdef ANAKIN_TYPE_FP32 +INSTANCE_AXPY(ARM, Precision::FP32); +template class BoxCoderHelper; +ANAKIN_REGISTER_OP_HELPER(BoxCoder, BoxCoderHelper, ARM, Precision::FP32); +#endif + +#ifdef ANAKIN_TYPE_FP16 +template class BoxCoderHelper; +#endif + +#ifdef ANAKIN_TYPE_INT8 +template class BoxCoderHelper; +#endif + +#endif//arm + +//! register op +ANAKIN_REGISTER_OP(BoxCoder) +.Doc("BoxCoder operator") +#ifdef USE_CUDA +.__alias__("box_coder") +#endif +#ifdef USE_ARM_PLACE +.__alias__("box_coder") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("box_coder") +#endif +#ifdef AMD_GPU +.__alias__("box_coder") +#endif +.num_in(3) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/box_coder.h b/framework/operators/box_coder.h new file mode 100644 index 000000000..92634cc91 --- /dev/null +++ b/framework/operators/box_coder.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef ANAKIN_FRAMEWORK_OPERATORS_BOX_CODER_H
+#define ANAKIN_FRAMEWORK_OPERATORS_BOX_CODER_H
+
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/box_coder.h"
+
+namespace anakin {
+
+namespace ops {
+
+template<typename Ttype, Precision Ptype>
+class BoxCoderHelper;
+
+/// box coder op
+/**
+ * \brief operation of BoxCoder class
+ * public inheritance Operator
+ */
+template<typename Ttype, Precision Ptype>
+class BoxCoder : public Operator<Ttype, Ptype> {
+public:
+    BoxCoder() {}
+
+    /// forward impl
+    virtual void operator()(OpContext<Ttype>& ctx,
+                            const std::vector<Tensor4dPtr<Ttype> >& ins,
+                            std::vector<Tensor4dPtr<Ttype> >& outs) {
+        LOG(ERROR) << "Not Impl Yet Operator BoxCoder< Ttype("
+                   << target_name<Ttype>::value << "), Precision(" << Ptype << ") >";
+    }
+
+    friend class BoxCoderHelper<Ttype, Ptype>;
+};
+
+/**
+ * \brief provide defined help for the BoxCoder operation
+ * public inheritance OperatorHelper
+ * including init operation context and the size of shape
+ */
+template<typename Ttype, Precision Ptype>
+class BoxCoderHelper : public OperatorHelper<Ttype, Ptype> {
+public:
+    BoxCoderHelper() = default;
+
+    ~BoxCoderHelper();
+
+    Status InitParam() override;
+
+    /**
+     * \brief initial all the resource needed by BoxCoder
+     * \param ctx stand for operation context
+     * \param ins stand for input tensor vector
+     * \param outs stand for output tensor vector
+     * \return status
+     */
+    Status Init(OpContext<Ttype>& ctx,
+                const std::vector<Tensor4dPtr<Ttype> >& ins,
+                std::vector<Tensor4dPtr<Ttype> >& outs) override;
+
+    /**
+     * \brief infer the shape of output and input.
+     * \param ins stand for input tensor vector
+     * \param outs stand for output tensor vector
+     * \return status
+     */
+    Status InferShape(const std::vector<Tensor4dPtr<Ttype> >& ins,
+                      std::vector<Tensor4dPtr<Ttype> >& outs) override;
+
+public:
+    ///< _param_box_coder stand for BoxCoder parameter
+    saber::BoxCoderParam<Ttype> _param_box_coder;
+    ///< _funcs_box_coder stand for BoxCoder function
+    saber::BoxCoder<Ttype, PrecisionWrapper<Ptype>::saber_type> _funcs_box_coder;
+
+private:
+    ///< _dims stand for BoxCoder size
+    PTuple<int> _dims;
+};
+
+
+
+} /* namespace ops */
+
+} /* namespace anakin */
+
+#endif
diff --git a/framework/operators/cast.cpp b/framework/operators/cast.cpp
new file mode 100644
index 000000000..53be8757b
--- /dev/null
+++ b/framework/operators/cast.cpp
@@ -0,0 +1,109 @@
+
+#include "framework/operators/cast.h"
+
+namespace anakin {
+
+namespace ops {
+
+#ifdef USE_CUDA
+template<>
+void Cast<NV, Precision::FP32>::operator()(
+    OpContext<NV>& ctx,
+    const std::vector<Tensor4dPtr<NV> >& ins,
+    std::vector<Tensor4dPtr<NV> >& outs) {
+    auto* impl = static_cast<CastHelper<NV, Precision::FP32>*>(
+        this->_helper);
+    auto& param = static_cast<CastHelper<NV, Precision::FP32>*>(
+        this->_helper)->_param_cast;
+    impl->_funcs_cast(ins, outs, param, ctx);
+}
+#endif
+
+#ifdef USE_X86_PLACE
+template<>
+void Cast<X86, Precision::FP32>::operator()(
+    OpContext<X86>& ctx,
+    const std::vector<Tensor4dPtr<X86> >& ins,
+    std::vector<Tensor4dPtr<X86> >& outs) {
+    auto* impl = static_cast<CastHelper<X86, Precision::FP32>*>(
+        this->_helper);
+    auto& param = static_cast<CastHelper<X86, Precision::FP32>*>(
+        this->_helper)->_param_cast;
+    impl->_funcs_cast(ins, outs, param, ctx);
+}
+#endif
+
+/// TODO ...
specialization other type of operator + +/// set helper +template +CastHelper::~CastHelper() { +} + +template +Status CastHelper::InitParam() { + DLOG(WARNING) << "Parsing Cast op parameter."; + auto in_type = GET_PARAMETER(int, in_type); + auto out_type = GET_PARAMETER(int, out_type); + CastParam param_cast(in_type, out_type); + _param_cast = param_cast; + + return Status::OK(); +} + +template +Status CastHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_cast.init(ins, outs, _param_cast, + SPECIFY, SABER_IMPL, ctx)); + + return Status::OK(); +} + +template +Status CastHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_cast.compute_output_shape(ins, outs, _param_cast)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class CastHelper; +template class CastHelper; +template class CastHelper; +#endif +#ifdef USE_X86_PLACE +template class CastHelper; +template class CastHelper; +template class CastHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Cast, CastHelper, NV, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(Cast, CastHelper, X86, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Cast) +.Doc("Cast operator") +#ifdef USE_CUDA +.__alias__("cast") +#endif +#ifdef USE_X86_PLACE +.__alias__("cast") +#endif +.num_in(1) +.num_out(1) +.Args("in_type", "in_type of cast param") +.Args("out_type", "out_type of cast param"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/cast.h b/framework/operators/cast.h new file mode 100644 index 000000000..27c346156 --- /dev/null +++ b/framework/operators/cast.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+#ifndef ANAKIN_OPERATOR_CAST_H
+#define ANAKIN_OPERATOR_CAST_H
+
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/cast.h"
+
+namespace anakin {
+
+namespace ops {
+
+template<typename Ttype, Precision Ptype>
+class CastHelper;
+
+/// cast op
+/**
+ * \brief operation of Cast class
+ * public inheritance Operator
+ */
+template<typename Ttype, Precision Ptype>
+class Cast : public Operator<Ttype, Ptype> {
+public:
+    Cast() {}
+
+    /// forward impl
+    virtual void operator() (OpContext<Ttype> &ctx,
+                             const std::vector<Tensor4dPtr<Ttype> >& ins,
+                             std::vector<Tensor4dPtr<Ttype> >& outs) {
+        LOG(ERROR) << "Not Impl Yet Operator Cast< Ttype("
+                   << target_name<Ttype>::value << "), Precision("
+                   << Ptype << ") >";
+    }
+
+    friend class CastHelper<Ttype, Ptype>;
+};
+
+/**
+ * \brief provide defined help for the Cast operation
+ * public inheritance OperatorHelper
+ * including init operation context and the size of shape
+ */
+template<typename Ttype, Precision Ptype>
+class CastHelper : public OperatorHelper<Ttype, Ptype> {
+public:
+    CastHelper() = default;
+
+    ~CastHelper();
+
+    Status InitParam() override;
+
+    /**
+     * \brief initial all the resource needed by Cast
+     * \param ctx stand for operation context
+     * \param ins stand for input tensor vector
+     * \param outs stand for output tensor vector
+     * \return status
+     */
+    Status Init(OpContext<Ttype> &ctx,
+                const std::vector<Tensor4dPtr<Ttype> >& ins,
+                std::vector<Tensor4dPtr<Ttype> >& outs) override;
+
+    /**
+     * \brief infer the shape of output and input.
+     * \param ins stand for input tensor vector
+     * \param outs stand for output tensor vector
+     * \return status
+     */
+    Status InferShape(const std::vector<Tensor4dPtr<Ttype> >& ins,
+                      std::vector<Tensor4dPtr<Ttype> >& outs) override;
+
+public:
+    ///< _param_cast stand for cast parameter
+    saber::CastParam<Ttype> _param_cast;
+    ///< _funcs_cast stand for cast function
+    saber::Cast<Ttype, PrecisionWrapper<Ptype>::saber_type> _funcs_cast;
+};
+
+} /* namespace ops */
+
+} /* namespace anakin */
+
+#endif
diff --git a/framework/operators/concat.cpp b/framework/operators/concat.cpp
index 7a06ee490..fd1112aaa 100644
--- a/framework/operators/concat.cpp
+++ b/framework/operators/concat.cpp
@@ -1,3 +1,17 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ #include "framework/operators/concat.h" namespace anakin { @@ -46,6 +60,12 @@ template class ConcatHelper; ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_CONCAT(AMD, Precision::FP32); +template class ConcatHelper; +ANAKIN_REGISTER_OP_HELPER(Concat, ConcatHelper, AMD, Precision::FP32); +#endif + #ifdef USE_ARM_PLACE INSTANCE_CONCAT(ARM, Precision::FP32); template class ConcatHelper; @@ -70,6 +90,9 @@ ANAKIN_REGISTER_OP(Concat) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("concat") #endif +#ifdef AMD_GPU +.__alias__("concat") +#endif .num_in(2) .num_out(1) .Args("axis", " axis for concat the input "); diff --git a/framework/operators/conv_3x3.cpp b/framework/operators/conv_3x3.cpp deleted file mode 100644 index fb94497d3..000000000 --- a/framework/operators/conv_3x3.cpp +++ /dev/null @@ -1,222 +0,0 @@ -#include "framework/operators/conv_3x3.h" - -namespace anakin { - -namespace ops { - -#define INSTANCE_SASSCONVOLUTION(Ttype, Ptype) \ -template<> \ -void SassConvolution::operator()(OpContext& ctx, \ - const std::vector >& ins, \ - std::vector >& outs) { \ - auto* impl = static_cast*>(this->_helper); \ - auto& param = static_cast*> \ - (this->_helper)->_param_conv; \ - impl->_funcs_conv(ins, outs, param, ctx); \ -} -/// TODO ... specialization other type of operator - -/// set helper -template -SassConvolutionHelper::~SassConvolutionHelper() {} - -template -Status SassConvolutionHelper::InitParam() { - DLOG(WARNING) << "Parsing SassConvolution op parameter."; - auto group = GET_PARAMETER(int, group); - auto bias_term = GET_PARAMETER(bool, bias_term); - auto padding = GET_PARAMETER(PTuple, padding); - auto strides = GET_PARAMETER(PTuple, strides); - auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); - auto filter_num = GET_PARAMETER(int, filter_num); - auto kernel_size = GET_PARAMETER(PTuple, kernel_size); - auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock; - auto weights = GET_PARAMETER(pblock_type, weight_1); - - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - _param_conv = conv_param; - } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias); - _param_conv = conv_param; - } - - return Status::OK(); -} - -template -Status SassConvolutionHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - //different device pleace change here.. 
- saber::ImplEnum impl_e = SABER_IMPL; - SABER_CHECK(_funcs_conv.init(ins, outs, _param_conv, SPECIFY, impl_e, ctx)); - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - if (bias_term) { - SET_PARAMETER(is_weights_transed, true, bool); - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply(std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), - strides[0], strides[1], _param_conv.pad_h, _param_conv.pad_w, _param_conv.dilation_h, _param_conv.dilation_w, group, impl_e); - weights.map_to_host(); - } else { - SET_PARAMETER(is_weights_transed, true, bool); - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply(std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv, _1, _2, _3, _4, _5, _6 ,_7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv.pad_h, _param_conv.pad_w, _param_conv.dilation_h, _param_conv.dilation_w, - strides[0], strides[1], group, impl_e); - weights.map_to_host(); - } - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply(std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv.pad_h, _param_conv.pad_w, _param_conv.dilation_h, _param_conv.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -//TODO!!! delete me when saber int8 is ready!!!! -#ifdef USE_CUDA -template<> -Status SassConvolutionHelper::Init(OpContext& ctx, - const std::vector >& ins, std::vector >& outs) { - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - //different device pleace change here.. - saber::ImplEnum impl_e = VENDER_IMPL; - SABER_CHECK(_funcs_conv.init(ins, outs, _param_conv, SPECIFY, impl_e, ctx)); - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv.pad_h, _param_conv.pad_w, _param_conv.dilation_h, _param_conv.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv.pad_h, _param_conv.pad_w, _param_conv.dilation_h, _param_conv.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv.pad_h, _param_conv.pad_w, _param_conv.dilation_h, _param_conv.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -#endif -//TODO!!! 
end here - -template -Status SassConvolutionHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_conv.compute_output_shape(ins, outs, _param_conv)); - return Status::OK(); -} - -#ifdef USE_CUDA -template class SassConvolutionHelper; -template class SassConvolutionHelper; -template class SassConvolutionHelper; -#endif - -//#ifdef USE_ARM_PLACE -//template class SassConvolutionHelper; -//template class SassConvolutionHelper; -//template class SassConvolutionHelper; -//#endif - -#ifdef AMD_GPU -template class SassConvolutionHelper; -template class SassConvolutionHelper; -template class SassConvolutionHelper; -#endif - -// register helper -#ifdef USE_CUDA -INSTANCE_SASSCONVOLUTION(NV, Precision::FP32); -INSTANCE_SASSCONVOLUTION(NV, Precision::INT8); -ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, NV, Precision::INT8); -#endif - -#ifdef USE_ARM_PLACE -//ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, ARM, Precision::FP32); -#endif - -#ifdef AMD_GPU -INSTANCE_SASSCONVOLUTION(AMD, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvolution, SassConvolutionHelper, AMD, Precision::FP32); -#endif - -//! register op -ANAKIN_REGISTER_OP(SassConvolution) -.Doc("SassConvolution operator") -#ifdef USE_CUDA -.__alias__("convolution") -.__alias__("convolution") -#endif -#ifdef AMD_GPU -.__alias__("convolution") -#endif -//#ifdef USE_ARM_PLACE -//.__alias__("convolution") -//#endif -.num_in(1) -.num_out(1) -.Args("group", " group of conv ") -.Args("bias_term", " whether conv weights have bias") -.Args>("padding", "padding of conv (x, y)") -.Args>("strides", "strides of conv (x)") -.Args>("dilation_rate", "dilation rate of conv (x)") -.Args("filter_num", "filter(kernel) number of weights") -.Args>("kernel_size", "kernel size of kernel (x, y)") -.Args("axis", "axis of conv"); - -} /* namespace ops */ - -} /* namespace anakin */ - - diff --git a/framework/operators/conv_unpadding_padding.cpp b/framework/operators/conv_unpadding_padding.cpp index 65155e0b6..7ba00c47d 100644 --- a/framework/operators/conv_unpadding_padding.cpp +++ b/framework/operators/conv_unpadding_padding.cpp @@ -27,7 +27,7 @@ Status ConvUnpaddingPaddingHelper::InferShape(const std::vector \ void ConvUnpaddingPadding::operator()(OpContext& ctx, \ const std::vector >& ins, \ @@ -39,19 +39,25 @@ void ConvUnpaddingPadding::operator()(OpContext& ctx, \ } #ifdef USE_CUDA -INSTANCE_CONCAT(NV, Precision::FP32); +INSTANCE_CONV_UNPADDING_PADDING(NV, Precision::FP32); template class ConvUnpaddingPaddingHelper; ANAKIN_REGISTER_OP_HELPER(ConvUnpaddingPadding, ConvUnpaddingPaddingHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_CONV_UNPADDING_PADDING(AMD, Precision::FP32); +template class ConvUnpaddingPaddingHelper; +ANAKIN_REGISTER_OP_HELPER(ConvUnpaddingPadding, ConvUnpaddingPaddingHelper, AMD, Precision::FP32); +#endif + #ifdef USE_ARM_PLACE -INSTANCE_CONCAT(ARM, Precision::FP32); +INSTANCE_CONV_UNPADDING_PADDING(ARM, Precision::FP32); template class ConvUnpaddingPaddingHelper; ANAKIN_REGISTER_OP_HELPER(ConvUnpaddingPadding, ConvUnpaddingPaddingHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE -INSTANCE_CONCAT(X86, Precision::FP32); +INSTANCE_CONV_UNPADDING_PADDING(X86, Precision::FP32); template class ConvUnpaddingPaddingHelper; ANAKIN_REGISTER_OP_HELPER(ConvUnpaddingPadding, ConvUnpaddingPaddingHelper, X86, Precision::FP32); #endif @@ -68,6 +74,9 @@ 
ANAKIN_REGISTER_OP(ConvUnpaddingPadding) #ifdef USE_X86_PLACE .__alias__("conv_unpadding_padding") #endif +#ifdef AMD_GPU +.__alias__("conv_unpadding_padding") +#endif .num_in(1) .num_out(1); diff --git a/framework/operators/convolution.cpp b/framework/operators/convolution.cpp index 04cd302f8..d6e4a1c6c 100644 --- a/framework/operators/convolution.cpp +++ b/framework/operators/convolution.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/convolution.h" namespace anakin { @@ -12,7 +26,7 @@ void Convolution::operator()(OpContext& ctx, \ auto* impl = static_cast*>(this->_helper); \ auto& param = static_cast*> \ (this->_helper)->_param_conv; \ - impl->_funcs_conv(ins, outs, param, ctx); \ + SABER_CHECK(impl->_funcs_conv(ins, outs, param, ctx));\ } template @@ -29,7 +43,13 @@ Status ConvolutionHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); saber::ConvParam conv_param(group, padding[0], padding[1], @@ -57,11 +77,17 @@ Status ConvolutionHelper::Init(OpContext& ctx, auto bias_term = GET_PARAMETER(bool, bias_term); //different device pleace change here.. 
+#ifdef AMD_GPU + saber::ImplEnum impl_e = SABER_IMPL; +#else saber::ImplEnum impl_e = VENDER_IMPL; - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { impl_e = SABER_IMPL; } - bool use_k1s1p0 = true; + if (std::is_same::value && Ptype == Precision::INT8) { + impl_e = SABER_IMPL; + } + bool use_k1s1p0 = (Ptype == Precision::FP32); use_k1s1p0 = use_k1s1p0 && (_param_conv.weight()->height() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv.weight()->width() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv.pad_h == 0); @@ -72,7 +98,7 @@ Status ConvolutionHelper::Init(OpContext& ctx, use_k1s1p0 = use_k1s1p0 && (_param_conv.dilation_w == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv.group == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv.bias()->valid_size() > 0); - bool use_k3s1d1 = true; + bool use_k3s1d1 = (Ptype == Precision::FP32); use_k3s1d1 = use_k3s1d1 && (_param_conv.weight()->height() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv.weight()->width() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv.group == 1); @@ -80,15 +106,17 @@ Status ConvolutionHelper::Init(OpContext& ctx, use_k3s1d1 = use_k3s1d1 && (_param_conv.stride_w == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv.dilation_h == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv.dilation_w == 1); - bool use_depthwise = true; + bool use_depthwise = (Ptype == Precision::FP32); use_depthwise = use_depthwise && (_param_conv.group == ins[0]->channel()); use_depthwise = use_depthwise && (_param_conv.group == outs[0]->channel()); - bool use_direct_k = true; + bool use_direct_k = (Ptype == Precision::FP32); use_direct_k = use_direct_k && (_param_conv.weight()->channel() >= 16); use_direct_k = use_direct_k && (_param_conv.group == 1); - if (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k) { + if (std::is_same::value + && (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k)) { impl_e = SABER_IMPL; } +#endif SABER_CHECK(_funcs_conv.init(ins, outs, _param_conv, SPECIFY, impl_e, ctx)); // check if weights have been transposed @@ -134,12 +162,11 @@ Status ConvolutionHelper::InferShape(const #ifdef USE_CUDA template class ConvolutionHelper; -template class ConvolutionHelper; -template class ConvolutionHelper; + INSTANCE_CONVOLUTION(NV, Precision::FP32); -INSTANCE_CONVOLUTION(NV, Precision::INT8); + ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, NV, Precision::INT8); + #endif @@ -147,12 +174,16 @@ ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, NV, Precision::INT8); INSTANCE_CONVOLUTION(X86, Precision::FP32); template class ConvolutionHelper; ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, X86, Precision::FP32); + #endif #ifdef USE_ARM_PLACE INSTANCE_CONVOLUTION(ARM, Precision::FP32); +INSTANCE_CONVOLUTION(ARM, Precision::INT8); template class ConvolutionHelper; +template class ConvolutionHelper; ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Convolution, ConvolutionHelper, ARM, Precision::INT8); #endif #ifdef AMD_GPU @@ -174,6 +205,7 @@ ANAKIN_REGISTER_OP(Convolution) #endif #ifdef USE_ARM_PLACE .__alias__("convolution") +.__alias__("convolution") #endif #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("convolution") diff --git a/framework/operators/coord2patch.cpp b/framework/operators/coord2patch.cpp new file mode 100644 index 000000000..d3233671f --- /dev/null +++ b/framework/operators/coord2patch.cpp @@ -0,0 +1,85 @@ +#include 
"framework/operators/coord2patch.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_COORD2PATCH(Ttype, Ptype) \ +template<> \ +void Coord2Patch::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>\ + (this->_helper)->_param_coord2patch; \ + impl->_funcs_coord2patch(ins, outs, param, ctx); \ +} +template +Status Coord2PatchHelper::InitParam() { + auto img_h = GET_PARAMETER(int, img_h); + auto output_h = GET_PARAMETER(int, output_h); + auto output_w = GET_PARAMETER(int, output_w); + saber::Coord2PatchParam param(img_h, output_h, output_w); + _param_coord2patch = param; + return Status::OK(); +} + +template +Status Coord2PatchHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_coord2patch.init(ins, outs, _param_coord2patch, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status Coord2PatchHelper::InferShape(const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_coord2patch.compute_output_shape(ins, outs, _param_coord2patch)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_COORD2PATCH(NV, Precision::FP32); +template class Coord2PatchHelper; +ANAKIN_REGISTER_OP_HELPER(Coord2Patch, Coord2PatchHelper, NV, Precision::FP32); +#endif + +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +INSTANCE_COORD2PATCH(X86, Precision::FP32); +template class Coord2PatchHelper; +ANAKIN_REGISTER_OP_HELPER(Coord2Patch, Coord2PatchHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_COORD2PATCH(ARM, Precision::FP32); +template class Coord2PatchHelper; +ANAKIN_REGISTER_OP_HELPER(Coord2Patch, Coord2PatchHelper, ARM, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(Coord2Patch) +.Doc("Coord2Patch operator") +#ifdef USE_CUDA +.__alias__("coord2patch") +#endif +#ifdef AMD_GPU +//.__alias__("coord2patch") +#endif +#ifdef USE_ARM_PLACE +.__alias__("coord2patch") +#endif +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +.__alias__("coord2patch") +#endif +.num_in(1) +.num_out(1) +.Args("img_h", " img_h for coord2patch ") +.Args("output_h", " output_h for coord2patch ") +.Args("output_w", " output_w for coord2patch "); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/coord2patch.h b/framework/operators/coord2patch.h new file mode 100644 index 000000000..63db06ec0 --- /dev/null +++ b/framework/operators/coord2patch.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+#ifndef ANAKIN_OPERATOR_COORD2PATCH_H
+#define ANAKIN_OPERATOR_COORD2PATCH_H
+
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/coord2patch.h"
+
+namespace anakin {
+
+namespace ops {
+
+template<typename Ttype, Precision Ptype>
+class Coord2PatchHelper;
+
+/// coord2patch op
+/**
+ * \brief Coord2Patch implementation class
+ * public inherit Operator
+ */
+template<typename Ttype, Precision Ptype>
+class Coord2Patch : public Operator<Ttype, Ptype> {
+public:
+    Coord2Patch() {}
+
+    /// forward impl
+    virtual void operator() (OpContext<Ttype> &ctx,
+                             const std::vector<Tensor4dPtr<Ttype> >& ins,
+                             std::vector<Tensor4dPtr<Ttype> >& outs) {
+        LOG(ERROR) << "Not Impl Yet Operator Coord2Patch< Ttype("
+                   << target_name<Ttype>::value << "), Precision("<< Ptype <<") >";
+    }
+
+    friend class Coord2PatchHelper<Ttype, Ptype>;
+};
+
+/**
+ * \brief Coord2Patch helper class
+ * public inherit OperatorHelper
+ * including init resource and shape size in Coord2Patch context
+ */
+template<typename Ttype, Precision Ptype>
+class Coord2PatchHelper : public OperatorHelper<Ttype, Ptype> {
+public:
+    Coord2PatchHelper()=default;
+
+    ~Coord2PatchHelper() {}
+
+    Status InitParam() override;
+
+    /**
+     * \brief initial all the resource needed by Coord2Patch
+     * \param ctx stand for Coord2Patch operation context
+     * \param ins stand for input tensor vector
+     * \param outs stand for output tensor vector
+     * \return status
+     */
+    Status Init(OpContext<Ttype> &ctx,
+                const std::vector<Tensor4dPtr<Ttype> >& ins,
+                std::vector<Tensor4dPtr<Ttype> >& outs) override;
+
+    /**
+     * \brief infer the shape of output and input.
+     * \param ins stand for input tensor vector
+     * \param outs stand for output tensor vector
+     * \return status
+     */
+    Status InferShape(const std::vector<Tensor4dPtr<Ttype> >& ins,
+                      std::vector<Tensor4dPtr<Ttype> >& outs) override;
+
+public:
+    ///< _param_coord2patch stand for Coord2Patch parameter
+    saber::Coord2PatchParam<Ttype> _param_coord2patch;
+    ///< _funcs_coord2patch stand for Coord2Patch function
+    saber::Coord2Patch<Ttype, PrecisionWrapper<Ptype>::saber_type> _funcs_coord2patch;
+};
+
+} /* namespace ops */
+
+} /* namespace anakin */
+
+#endif//ANAKIN_OPERATOR_COORD2PATCH_H
diff --git a/framework/operators/cos_sim.cpp b/framework/operators/cos_sim.cpp
new file mode 100644
index 000000000..b479f97b6
--- /dev/null
+++ b/framework/operators/cos_sim.cpp
@@ -0,0 +1,92 @@
+#include "framework/operators/cos_sim.h"
+
+namespace anakin {
+
+namespace ops {
+
+#define INSTANCE_COS_SIM(Ttype, Ptype) \
+template<> \
+void CosSim<Ttype, Ptype>::operator()(OpContext<Ttype>& ctx, \
+    const std::vector<Tensor4dPtr<Ttype> >& ins, \
+    std::vector<Tensor4dPtr<Ttype> >& outs) { \
+    auto* impl = \
+        static_cast<CosSimHelper<Ttype, Ptype>*>(this->_helper); \
+    auto& param = \
+        static_cast<CosSimHelper<Ttype, Ptype>*>(this->_helper)->_param_cos_sim; \
+    impl->_funcs_cos_sim(ins, outs, param, ctx); \
+}
+
+/// set helper
+template<typename Ttype, Precision Ptype>
+CosSimHelper<Ttype, Ptype>::~CosSimHelper() {
+}
+
+template<typename Ttype, Precision Ptype>
+Status CosSimHelper<Ttype, Ptype>::InitParam() {
+    DLOG(WARNING) << "Parsing CosSim op parameter.";
+    CosSimParam<Ttype> param_cos_sim;
+    _param_cos_sim = param_cos_sim;
+
+    return Status::OK();
+}
+
+template<typename Ttype, Precision Ptype>
+Status CosSimHelper<Ttype, Ptype>::Init(OpContext<Ttype>& ctx,
+        const std::vector<Tensor4dPtr<Ttype> >& ins,
+        std::vector<Tensor4dPtr<Ttype> >& outs) {
+    SABER_CHECK(_funcs_cos_sim.init(ins, outs, _param_cos_sim, SPECIFY, SABER_IMPL, ctx));
+    return Status::OK();
+}
+
+template<typename Ttype, Precision Ptype>
+Status CosSimHelper<Ttype, Ptype>::InferShape(const std::vector<Tensor4dPtr<Ttype> >& ins,
+        std::vector<Tensor4dPtr<Ttype> >& outs) {
+    SABER_CHECK(_funcs_cos_sim.compute_output_shape(ins, outs, _param_cos_sim));
+    return Status::OK();
+}
+
+#ifdef USE_CUDA
+INSTANCE_COS_SIM(NV, Precision::FP32);
+template class CosSimHelper<NV, Precision::FP32>;
+ANAKIN_REGISTER_OP_HELPER(CosSim, CosSimHelper, NV, Precision::FP32);
+#endif
+
+#if defined USE_X86_PLACE || defined BUILD_LITE
+INSTANCE_COS_SIM(X86, Precision::FP32); +template class CosSimHelper; +ANAKIN_REGISTER_OP_HELPER(CosSim, CosSimHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_COS_SIM(ARM, Precision::FP32); +template class CosSimHelper; +ANAKIN_REGISTER_OP_HELPER(CosSim, CosSimHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_COS_SIM(AMD, Precision::FP32); +template class CosSimHelper; +ANAKIN_REGISTER_OP_HELPER(CosSim, CosSimHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(CosSim) +.Doc("CosSim operator") +#ifdef USE_CUDA +.__alias__("cos_sim") +#endif +#ifdef USE_ARM_PLACE +.__alias__("cos_sim") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("cos_sim") +#endif +#ifdef AMD_GPU +.__alias__("cos_sim") +#endif +.num_in(1) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/cos_sim.h b/framework/operators/cos_sim.h new file mode 100644 index 000000000..430508fe4 --- /dev/null +++ b/framework/operators/cos_sim.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_COS_SIM_H +#define ANAKIN_OPERATOR_COS_SIM_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/cos_sim.h" + +namespace anakin { + +namespace ops { + +template +class CosSimHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class CosSim : public Operator { +public: + CosSim() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator CosSim< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class CosSimHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class CosSimHelper : public OperatorHelper { +public: + CosSimHelper()=default; + + ~CosSimHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_cos_sim stand for cos_sim parameter + saber::CosSimParam _param_cos_sim; + ///< _funcs_cos_sim stand for cos_sim function + saber::CosSim::saber_type> _funcs_cos_sim; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/crf_decoding.cpp b/framework/operators/crf_decoding.cpp index 24fee68e6..b7b151218 100644 --- a/framework/operators/crf_decoding.cpp +++ b/framework/operators/crf_decoding.cpp @@ -4,17 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_X86_PLACE -template<> -void CrfDecoding::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_crf_decoding; - impl->_funcs_crf_decoding(ins, outs, param, ctx); +#define INSTANCE_CRF_DECODING(Ttype, Ptype) \ +template<> \ +void CrfDecoding::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_crf_decoding; \ + impl->_funcs_crf_decoding(ins, outs, param, ctx); \ } -#endif /// TODO ... specialization other type of operator @@ -53,33 +52,32 @@ Status CrfDecodingHelper::InferShape( } #ifdef USE_CUDA +INSTANCE_CRF_DECODING(NV, Precision::FP32); template class CrfDecodingHelper; template class CrfDecodingHelper; template class CrfDecodingHelper; +ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, NV, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_CRF_DECODING(AMD, Precision::FP32); +template class CrfDecodingHelper; +ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, AMD, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_CRF_DECODING(ARM, Precision::FP32); template class CrfDecodingHelper; template class CrfDecodingHelper; template class CrfDecodingHelper; +ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_CRF_DECODING(X86, Precision::FP32); template class CrfDecodingHelper; template class CrfDecodingHelper; template class CrfDecodingHelper; -#endif - -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, NV, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, ARM, Precision::FP32); -#endif - -#ifdef USE_X86_PLACE ANAKIN_REGISTER_OP_HELPER(CrfDecoding, CrfDecodingHelper, X86, Precision::FP32); #endif @@ -95,6 +93,9 @@ ANAKIN_REGISTER_OP(CrfDecoding) #ifdef USE_X86_PLACE .__alias__("CrfDecoding") #endif +#ifdef AMD_GPU +.__alias__("CrfDecoding") +#endif .num_in(1) .num_out(1); diff --git a/framework/operators/crop.cpp b/framework/operators/crop.cpp index e69de29bb..9fe6487b0 100644 --- a/framework/operators/crop.cpp +++ b/framework/operators/crop.cpp @@ -0,0 +1,111 @@ +#include "framework/operators/crop.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_CROP(Ttype, Ptype) \ +template<> \ +void Crop::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_crop; \ + impl->_funcs_crop(ins, outs, param, ctx); \ +} +/// set helper +template +CropHelper::~CropHelper() { +} + +template 
+Status CropHelper::InitParam() { + DLOG(WARNING) << "Parsing Crop op parameter."; + + using pblock_type = PBlock; + auto axis = GET_PARAMETER(int, axis); + auto offset_in = GET_PARAMETER(PTuple, cropping); + std::vector shape; + shape.push_back(axis); + for(int i = 0; i < offset_in.size(); i++){ + shape.push_back(offset_in[i]); + } + saber::CropParam crop_param(axis, offset_in.vector(), shape); + _param_crop = crop_param; + + return Status::OK(); +} + +template +Status CropHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_crop.init(ins, outs, _param_crop, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status CropHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_crop.compute_output_shape(ins, outs, _param_crop)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class CropHelper; +template class CropHelper; +template class CropHelper; +#endif + +#ifdef USE_ARM_PLACE +template class CropHelper; +template class CropHelper; +template class CropHelper; +#endif + +#if defined USE_X86_PLACE || defined(BUILD_LITE) +template class CropHelper; +template class CropHelper; +template class CropHelper; +#endif + +// register helper +#ifdef USE_CUDA +INSTANCE_CROP(NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Crop, CropHelper, NV, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_CROP(ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Crop, CropHelper, ARM, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined(BUILD_LITE) +INSTANCE_CROP(X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Crop, CropHelper, X86, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(Crop) +.Doc("Crop operator") +#ifdef USE_CUDA +.__alias__("Crop") +#endif +#ifdef USE_ARM_PLACE +.__alias__("Crop") +#endif +#if defined (USE_X86_PLACE) || defined(BUILD_LITE) +.__alias__("Crop") +#endif +.num_in(1) +.num_out(1) +.Args("axis", "axis of crop") +.Args>("offset", "offset_in crop"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/crop.h b/framework/operators/crop.h index e69de29bb..61173d1f4 100644 --- a/framework/operators/crop.h +++ b/framework/operators/crop.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_CROP_H +#define ANAKIN_OPERATOR_CROP_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/crop.h" + +namespace anakin { + +namespace ops { + +template +class CropHelper; + +/// pooling op +/** + * \brief Crop operation class + * public inheritance Operator + */ +template +class Crop : public Operator { +public: + Crop() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + } + + friend class CropHelper; +}; + +/** + * \brief Crop helper class + * public inherit OperatorHelper + * including init resource and shape size in crf_decoding context + */ +template +class CropHelper : public OperatorHelper { +public: + CropHelper()=default; + + ~CropHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for Crop operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_crop stand for Crop parameter + saber::CropParam _param_crop; + ///< _funcs_crop stand for Crop function + saber::Crop::saber_type> _funcs_crop; + +private: + ///< _dims stand for Crop size + PTuple _dims; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/ctc_align.cpp b/framework/operators/ctc_align.cpp index 1d0de9934..87b078a69 100644 --- a/framework/operators/ctc_align.cpp +++ b/framework/operators/ctc_align.cpp @@ -4,16 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void CtcAlign::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_ctc_align; - impl->_funcs_ctc_align(ins, outs, param, ctx); +#define INSTANCE_CTC_ALIGN(Ttype, Ptype) \ +template<> \ +void CtcAlign::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_ctc_align; \ + impl->_funcs_ctc_align(ins, outs, param, ctx); \ } -#endif /// TODO ... 
specialization other type of operator @@ -51,25 +51,24 @@ Status CtcAlignHelper::InferShape(const std::vector; template class CtcAlignHelper; template class CtcAlignHelper; +ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, NV, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_CTC_ALIGN(AMD, Precision::FP32); +template class CtcAlignHelper; +ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, AMD, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_CTC_ALIGN(ARM, Precision::FP32); template class CtcAlignHelper; template class CtcAlignHelper; template class CtcAlignHelper; -#endif - -//template class CtcAlignHelper; -//template class CtcAlignHelper; -//template class CtcAlignHelper; -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE ANAKIN_REGISTER_OP_HELPER(CtcAlign, CtcAlignHelper, ARM, Precision::FP32); #endif @@ -81,6 +80,9 @@ ANAKIN_REGISTER_OP(CtcAlign) #endif #ifdef USE_ARM_PLACE .__alias__("ctc_align") +#endif +#ifdef AMD_GPU + .__alias__("ctc_align") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/deconvolution.cpp b/framework/operators/deconvolution.cpp index 641697710..4520f0f3d 100644 --- a/framework/operators/deconvolution.cpp +++ b/framework/operators/deconvolution.cpp @@ -28,7 +28,13 @@ Status DeconvolutionHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); saber::ConvParam conv_param(group, padding[0], padding[1], @@ -52,6 +58,10 @@ template Status DeconvolutionHelper::Init(OpContext& ctx, const std::vector >& ins, std::vector >& outs) { + if (std::is_same::value){ + SABER_CHECK(_funcs_deconv.init(ins, outs, _param_deconv, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); + } SABER_CHECK(_funcs_deconv.init(ins, outs, _param_deconv, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } diff --git a/framework/operators/deformconvolution.cpp b/framework/operators/deformconvolution.cpp index 4752b5743..d84028928 100644 --- a/framework/operators/deformconvolution.cpp +++ b/framework/operators/deformconvolution.cpp @@ -4,18 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void DeformConvolution::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_deform_conv; - impl->_funcs_deform_conv(ins, outs, param, ctx); +#define INSTANCE_DEFORMCONVOLUTION(Ttype, Ptype) \ +template<> \ +void DeformConvolution::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_deform_conv; \ + impl->_funcs_deform_conv(ins, outs, param, ctx); \ } -#endif /// TODO ... 
specialization other type of operator @@ -76,23 +74,25 @@ Status DeformConvolutionHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_DEFORMCONVOLUTION(NV, Precision::FP32); template class DeformConvolutionHelper; template class DeformConvolutionHelper; template class DeformConvolutionHelper; +ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, NV, Precision::FP32); #endif -#ifdef USE_ARM_PLACE -template class DeformConvolutionHelper; -template class DeformConvolutionHelper; -template class DeformConvolutionHelper; +#ifdef AMD_GPU +INSTANCE_DEFORMCONVOLUTION(AMD, Precision::FP32); +template class DeformConvolutionHelper; +ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, AMD, Precision::FP32); #endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, NV, Precision::FP32); -#endif #ifdef USE_ARM_PLACE +INSTANCE_DEFORMCONVOLUTION(ARM, Precision::FP32); +template class DeformConvolutionHelper; +template class DeformConvolutionHelper; +template class DeformConvolutionHelper; ANAKIN_REGISTER_OP_HELPER(DeformConvolution, DeformConvolutionHelper, ARM, Precision::FP32); #endif @@ -105,6 +105,9 @@ ANAKIN_REGISTER_OP(DeformConvolution) #ifdef USE_ARM_PLACE .__alias__("defromable_convolution") #endif +#ifdef AMD_GPU +.__alias__("deformable_convolution") +#endif .num_in(1) .num_out(1) .Args("group", " group of conv ") diff --git a/framework/operators/dense.cpp b/framework/operators/dense.cpp index 6563dc4aa..e1852deab 100644 --- a/framework/operators/dense.cpp +++ b/framework/operators/dense.cpp @@ -23,7 +23,12 @@ Status DenseHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(out_dim, scale_tmp); + w.set_scale(w_scale); + } if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); saber::FcParam fc_param(&(weights.d_tensor()), &(bias.d_tensor()), out_dim, @@ -44,7 +49,15 @@ Status DenseHelper::Init(OpContext& ctx, SABER_CHECK(_funcs_dense.init(ins, outs, _param_dense, STATIC, SABER_IMPL, ctx)); return Status::OK(); } - +#ifdef USE_CUDA +template<> +Status DenseHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_dense.init(ins, outs, _param_dense, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +#endif template<> Status DenseHelper::Init(OpContext& ctx, const std::vector >& ins, @@ -77,14 +90,17 @@ Status DenseHelper::InferShape(const std::vector; ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, NV, Precision::INT8); template class DenseHelper; template class DenseHelper; #endif #ifdef USE_ARM_PLACE INSTANCE_DENSE(ARM, Precision::FP32); +INSTANCE_DENSE(ARM, Precision::INT8); template<> Status DenseHelper::Init(OpContext &ctx,\ const std::vector >& ins, \ @@ -92,13 +108,24 @@ Status DenseHelper::Init(OpContext &ctx,\ SABER_CHECK(_funcs_dense.init(ins, outs, _param_dense, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } +template<> +Status DenseHelper::Init(OpContext &ctx,\ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_dense.init(ins, outs, _param_dense, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, ARM, Precision::INT8); #endif #if defined 
USE_X86_PLACE || defined BUILD_LITE INSTANCE_DENSE(X86, Precision::FP32); template class DenseHelper; ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, X86, Precision::FP32); +INSTANCE_DENSE(X86, Precision::INT8); +template class DenseHelper; +ANAKIN_REGISTER_OP_HELPER(Dense, DenseHelper, X86, Precision::INT8); #endif #ifdef AMD_GPU @@ -119,10 +146,12 @@ ANAKIN_REGISTER_OP(Dense) #ifdef USE_CUDA .__alias__("fullconnect") .__alias__("fc") + .__alias__("fc") #endif #ifdef USE_ARM_PLACE .__alias__("fullconnect") .__alias__("fc") +.__alias__("fc") #endif #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("fullconnect") diff --git a/framework/operators/detection_output.cpp b/framework/operators/detection_output.cpp index 1340e8bfd..d64dc98be 100644 --- a/framework/operators/detection_output.cpp +++ b/framework/operators/detection_output.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/detection_output.h" namespace anakin { @@ -67,6 +81,11 @@ template class DetectionOutputHelper; ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_DETECTIONOUTPUT(AMD, Precision::FP32); +template class DetectionOutputHelper; +ANAKIN_REGISTER_OP_HELPER(DetectionOutput, DetectionOutputHelper, AMD, Precision::FP32); +#endif #if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_DETECTIONOUTPUT(X86, Precision::FP32); template class DetectionOutputHelper; @@ -91,6 +110,9 @@ ANAKIN_REGISTER_OP(DetectionOutput) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("detectionoutput") #endif +#ifdef AMD_GPU +.__alias__("detectionoutput") +#endif .num_in(1) .num_out(1) .Args("share_location", " flag whether all classes share location ") diff --git a/framework/operators/dfm_ps_roi_align.cpp b/framework/operators/dfm_ps_roi_align.cpp index 8cc292f9f..4605222da 100644 --- a/framework/operators/dfm_ps_roi_align.cpp +++ b/framework/operators/dfm_ps_roi_align.cpp @@ -13,6 +13,18 @@ void DFMBPSROIAlign::operator()( impl->_funcs_dfm_ps_roi_align(ins, outs, param, ctx); } #endif +#ifdef USE_ARM_PLACE +template<> +void DFMBPSROIAlign::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> + (this->_helper)->_param_dfm_ps_roi_align; + impl->_funcs_dfm_ps_roi_align(ins, outs, param, ctx); +} +#endif /// TODO ... 
specialization other type of operator /// set helper template @@ -92,7 +104,10 @@ ANAKIN_REGISTER_OP(DFMBPSROIAlign) .__alias__("rpn_proposal_ssd") #endif #ifdef USE_ARM_PLACE -.__alias__("rpn_proposal_ssd") +//.__alias__("rpn_proposal_ssd") +#endif +#ifdef AMD_GPU +//.__alias__("rpn_proposal_ssd") #endif .num_in(1) .num_out(1) @@ -109,4 +124,4 @@ ANAKIN_REGISTER_OP(DFMBPSROIAlign) .Args("part_height", " of dfmb_psroi_pooling_param") .Args("part_width", " of dfmb_psroi_pooling_param"); } /* namespace ops */ -} /* namespace anakin */ \ No newline at end of file +} /* namespace anakin */ diff --git a/framework/operators/eltwise_op.cpp b/framework/operators/eltwise_op.cpp index d171adfee..c090a5e77 100644 --- a/framework/operators/eltwise_op.cpp +++ b/framework/operators/eltwise_op.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/eltwise_op.h" namespace anakin { @@ -26,8 +40,12 @@ Status EltwiseHelper::InitParam() { elt_type = Eltwise_sum; } else if (type == "Max") { elt_type = Eltwise_max; - } else { + } else if (type == "Prod"){ elt_type = Eltwise_prod; + } else if (type == "Div") { + elt_type = Eltwise_div; + } else { + LOG(FATAL) << "eltwise type is not supported" << elt_type; } saber::EltwiseParam eltwise_param(elt_type, coeff.vector()); _param_eltwise = eltwise_param; @@ -66,6 +84,11 @@ template class EltwiseHelper; ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, ARM, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_ELTWISE(AMD, Precision::FP32); +template class EltwiseHelper; +ANAKIN_REGISTER_OP_HELPER(Eltwise, EltwiseHelper, AMD, Precision::FP32); +#endif //! 
register op ANAKIN_REGISTER_OP(Eltwise) .Doc("Eltwise operator") @@ -78,6 +101,9 @@ ANAKIN_REGISTER_OP(Eltwise) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("eltwise") #endif +#ifdef AMD_GPU +.__alias__("eltwise") +#endif .num_in(1) .num_out(1) .Args("type", " eltwise type( string )") diff --git a/framework/operators/embedding.cpp b/framework/operators/embedding.cpp index 9689ed0c6..f42803210 100644 --- a/framework/operators/embedding.cpp +++ b/framework/operators/embedding.cpp @@ -4,37 +4,19 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Embedding::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_embedding; - impl->_funcs_embedding(ins, outs, param, ctx); +#define INSTANCE_EMBEDDING(Ttype, Ptype) \ +template<> \ +void Embedding::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_embedding; \ + impl->_funcs_embedding(ins, outs, param, ctx); \ } -#endif -#ifdef USE_X86_PLACE -template<> -void Embedding::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_embedding; - impl->_funcs_embedding(ins, outs, param, ctx); -} -#endif /// TODO ... specialization other type of operator - - /// set helper template EmbeddingHelper::~EmbeddingHelper() { @@ -76,30 +58,32 @@ Status EmbeddingHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_EMBEDDING(NV, Precision::FP32); template class EmbeddingHelper; template class EmbeddingHelper; template class EmbeddingHelper; +ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, NV, Precision::FP32); +#endif +#ifdef AMD_GPU +INSTANCE_EMBEDDING(AMD, Precision::FP32); +template class EmbeddingHelper; +ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, AMD, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_EMBEDDING(ARM, Precision::FP32); template class EmbeddingHelper; template class EmbeddingHelper; template class EmbeddingHelper; +ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_EMBEDDING(X86, Precision::FP32); template class EmbeddingHelper; template class EmbeddingHelper; template class EmbeddingHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, ARM, Precision::FP32); -#endif -#ifdef USE_X86_PLACE ANAKIN_REGISTER_OP_HELPER(Embedding, EmbeddingHelper, X86, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(Embedding) .Doc("Embedding operator") @@ -112,6 +96,9 @@ ANAKIN_REGISTER_OP(Embedding) #ifdef USE_X86_PLACE .__alias__("embedding") #endif +#ifdef AMD_GPU +.__alias__("embedding") +#endif .num_in(1) .num_out(1) .Args("word_num", "word_num") diff --git a/framework/operators/flatten.cpp b/framework/operators/flatten.cpp index 85f1866ab..9253be16c 100644 --- a/framework/operators/flatten.cpp +++ b/framework/operators/flatten.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/flatten.h" namespace anakin { @@ -45,6 +59,12 @@ template class FlattenHelper; ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_FLATTEN(AMD, Precision::FP32); +template class FlattenHelper; +ANAKIN_REGISTER_OP_HELPER(Flatten, FlattenHelper, AMD, Precision::FP32); +#endif + #if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_FLATTEN(X86, Precision::FP32); template class FlattenHelper; @@ -69,6 +89,9 @@ ANAKIN_REGISTER_OP(Flatten) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("flatten") #endif +#ifdef AMD_GPU +.__alias__("flatten") +#endif .num_in(1) .num_out(1); diff --git a/framework/operators/fusion_ops/batchnorm_scale.cpp b/framework/operators/fusion_ops/batchnorm_scale.cpp index b6cfe96bd..73cf25379 100644 --- a/framework/operators/fusion_ops/batchnorm_scale.cpp +++ b/framework/operators/fusion_ops/batchnorm_scale.cpp @@ -96,6 +96,12 @@ template class BatchnormScaleHelper; ANAKIN_REGISTER_OP_HELPER(BatchnormScale, BatchnormScaleHelper, ARM, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_BATCHNORMSCALE(AMD, Precision::FP32); +template class BatchnormScaleHelper; +ANAKIN_REGISTER_OP_HELPER(BatchnormScale, BatchnormScaleHelper, AMD, Precision::FP32); +#endif + #ifdef USE_CUDA INSTANCE_BATCHNORMSCALE(NV, Precision::FP32); template<> @@ -124,6 +130,9 @@ ANAKIN_REGISTER_OP(BatchnormScale) #ifdef USE_ARM_PLACE .__alias__("batchnorm_scale") #endif +#ifdef AMD_GPU +.__alias__("batchnorm_scale") +#endif .num_in(1) .num_out(1) .Args("axis", "axis of conv") diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp deleted file mode 100644 index 6d5ba8227..000000000 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.cpp +++ /dev/null @@ -1,285 +0,0 @@ -#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h" - -namespace anakin { - -namespace ops { - -#define INSTANCE_SASSCONVBATCHNORMSCALE(Ttype, Ptype) \ -template<> \ -void SassConvBatchnormScale::operator()(\ - OpContext& ctx,\ - const std::vector >& ins,\ - std::vector >& outs) {\ - auto* impl = static_cast*>(this->_helper);\ - auto& param = static_cast*>\ - (this->_helper)->_param_conv_batchnorm_scale;\ - SABER_CHECK(impl->_funcs_conv_batchnorm_scale(ins, outs, param, ctx));\ -} - -/// TODO ... 
specialization other type of operator - - -/// set helper -template -SassConvBatchnormScaleHelper::~SassConvBatchnormScaleHelper() { -} - -template -Status SassConvBatchnormScaleHelper::InitParam() { - LOG(WARNING) << "Parsing SassConvBatchnormScale op parameter."; - - // get conv param - auto group = GET_PARAMETER(int, group); - auto bias_term = GET_PARAMETER(bool, bias_term); - auto padding = GET_PARAMETER(PTuple, padding); - auto strides = GET_PARAMETER(PTuple, strides); - auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); - auto filter_num = GET_PARAMETER(int, filter_num); - auto kernel_size = GET_PARAMETER(PTuple, kernel_size); - auto axis = GET_PARAMETER(int, axis); - - using pblock_type = PBlock; - auto weights = GET_PARAMETER(pblock_type, weight_1); - auto weights_shape = weights.shape(); - - // get batchnorm param - auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); - auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); - auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, batchnorm_0_weight_1); - auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); - auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, batchnorm_0_weight_2); - auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); - auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); - auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - - // get scale param - auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); - auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); - auto scale_axis = GET_PARAMETER(int, scale_0_axis); - auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); - auto scale_weight_1_vector = scale_weight_1.vector(); - auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); - auto scale_weight_2_vector = scale_weight_2.vector(); - - // check if batchnorm parameters have been optimized - auto is_param_updated = CHECK_PARAMETER(is_param_updated); - if(!is_param_updated) { - SET_PARAMETER(is_param_updated, true, bool); - if(bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - update_weights, weights,bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); - - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - - _param_conv_batchnorm_scale = conv_param; - } else { - pblock_type* bias = new pblock_type(); - SET_PARAMETER(bias_term, true, bool); // set attr bias_term true - SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); - - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias->d_tensor())); - - _param_conv_batchnorm_scale = conv_param; - } - } else { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], 
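The update_weights calls in the deleted fused-conv files (and in the batchnorm/scale fusions that remain) fold the batchnorm statistics and the scale layer into the convolution weights and bias. A hedged sketch of that per-output-channel fold, assuming the running mean and variance have already been divided by the count factor stored in batchnorm_0_weight_3, and using generic names rather than Anakin's update_weights signature:

#include <cmath>
#include <vector>

// Per-output-channel fold of batchnorm + scale into conv weights/bias.
// mean/var are assumed already normalized by the batchnorm count factor;
// beta should be all zeros when the scale layer has no bias term, and
// bias should be all zeros when the conv originally had none (matching
// the bias_term == false branch, which creates a fresh bias block).
void fold_bn_scale(std::vector<float>& w,           // out_c * in_c * kh * kw
                   std::vector<float>& bias,        // out_c
                   const std::vector<float>& mean,  // out_c
                   const std::vector<float>& var,   // out_c
                   const std::vector<float>& gamma, // scale weight, out_c
                   const std::vector<float>& beta,  // scale bias,   out_c
                   float eps, int out_c) {
    const int per_channel = static_cast<int>(w.size()) / out_c;
    for (int c = 0; c < out_c; ++c) {
        const float alpha = gamma[c] / std::sqrt(var[c] + eps);
        for (int i = 0; i < per_channel; ++i) {
            w[c * per_channel + i] *= alpha;
        }
        bias[c] = (bias[c] - mean[c]) * alpha + beta[c];
    }
}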
dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor())); - - _param_conv_batchnorm_scale = conv_param; - } - return Status::OK(); -} - -template -Status SassConvBatchnormScaleHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - //different device please change here!!! - saber::ImplEnum impl_e = SABER_IMPL; - SABER_CHECK(_funcs_conv_batchnorm_scale.init(ins, outs, \ - _param_conv_batchnorm_scale, SPECIFY, impl_e, ctx)); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_batchnorm_scale.pad_h, _param_conv_batchnorm_scale.pad_w, _param_conv_batchnorm_scale.dilation_h, _param_conv_batchnorm_scale.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale.pad_h, _param_conv_batchnorm_scale.pad_w, _param_conv_batchnorm_scale.dilation_h, _param_conv_batchnorm_scale.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(),_param_conv_batchnorm_scale.pad_h, _param_conv_batchnorm_scale.pad_w, _param_conv_batchnorm_scale.dilation_h, _param_conv_batchnorm_scale.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} - -//TODO!!! delete me when saber int8 is ready!!!! -#ifdef USE_CUDA -template<> -Status SassConvBatchnormScaleHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - //different device please change here!!! 
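The Init bodies here hand trans_weights to GraphGlobalMem<Ttype>::apply through std::bind with placeholders _1.._10, so the graph memory manager can invoke the weight transform later with the real tensors. A stripped-down sketch of that deferred-call pattern, with a plain apply() and a three-argument transform standing in for the real ten-argument one:

#include <functional>
#include <iostream>
#include <vector>

using namespace std::placeholders;

// Three-argument transform standing in for the ten-argument trans_weights.
struct Conv {
    void trans_weights(std::vector<float>& w, int pad_h, int pad_w) {
        for (auto& v : w) v += static_cast<float>(pad_h + pad_w);
    }
};

// Generic apply() standing in for GraphGlobalMem<Ttype>::apply, which
// receives the bound callable and supplies the real arguments later.
template <typename F, typename... Args>
void apply(F&& f, Args&&... args) {
    std::forward<F>(f)(std::forward<Args>(args)...);
}

int main() {
    Conv conv;
    std::vector<float> w{1.f, 2.f};
    apply(std::bind(&Conv::trans_weights, &conv, _1, _2, _3), w, 1, 2);
    std::cout << w[0] << std::endl;   // 4
}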
- saber::ImplEnum impl_e = SABER_IMPL; - SABER_CHECK(_funcs_conv_batchnorm_scale.init(ins, outs, \ - _param_conv_batchnorm_scale, SPECIFY, impl_e, ctx)); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(),_param_conv_batchnorm_scale.pad_h, _param_conv_batchnorm_scale.pad_w, _param_conv_batchnorm_scale.dilation_h, _param_conv_batchnorm_scale.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale.pad_h, _param_conv_batchnorm_scale.pad_w, _param_conv_batchnorm_scale.dilation_h, _param_conv_batchnorm_scale.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale.pad_h, _param_conv_batchnorm_scale.pad_w, _param_conv_batchnorm_scale.dilation_h, _param_conv_batchnorm_scale.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -#endif -//TODO!!! end here - -template -Status SassConvBatchnormScaleHelper::InferShape( - const std::vector >& ins, - std::vector >& outs) { - _funcs_conv_batchnorm_scale.compute_output_shape(ins, outs, _param_conv_batchnorm_scale); - return Status::OK(); -} - -#ifdef USE_CUDA -template class SassConvBatchnormScaleHelper; -template class SassConvBatchnormScaleHelper; -template class SassConvBatchnormScaleHelper; -#endif - -#ifdef USE_ARM_PLACE -template class SassConvBatchnormScaleHelper; -template class SassConvBatchnormScaleHelper; -template class SassConvBatchnormScaleHelper; -#endif - -// register helper -#ifdef USE_CUDA -INSTANCE_SASSCONVBATCHNORMSCALE(NV, Precision::FP32); -INSTANCE_SASSCONVBATCHNORMSCALE(NV, Precision::INT8); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, NV, Precision::INT8); -#endif - -#ifdef USE_X86_PLACE -INSTANCE_SASSCONVBATCHNORMSCALE(X86, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, X86, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -INSTANCE_SASSCONVBATCHNORMSCALE(ARM, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScale, SassConvBatchnormScaleHelper, ARM, Precision::FP32); -#endif - -//! 
register op -ANAKIN_REGISTER_OP(SassConvBatchnormScale) -.Doc("SassConvBatchnormScale fusion operator") -#ifdef USE_CUDA -.__alias__("convolution3x3_batchnorm_scale") -.__alias__("convolution3x3_batchnorm_scale") -#endif -#ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale_relu") -#endif -.num_in(1) -.num_out(1) -.Args("group", " group of conv ") -.Args("bias_term", " whether conv weights have bias") -.Args>("padding", "padding of conv (x, y)") -.Args>("strides", "strides of conv (x)") -.Args>("dilation_rate", "dilation rate of conv (x)") -.Args("filter_num", "filter(kernel) number of weights") -.Args>("kernel_size", "kernel size of kernel (x, y)") -.Args("axis", "axis of conv") -.Args("relu_0_alpha", " alpha for relu") -.Args("scale_0_num_axes", " num axes for scale") -.Args("scale_0_bias_term", "whether scale has bias") -.Args("scale_0_axis", "axis for scale") -.Args("batchnorm_0_epsilon", "epsilon for batchnorm") -.Args("batchnorm_0_momentum", "momentum for batchnorm"); - -} /* namespace ops */ - -} /* namespace anakin */ - - diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp deleted file mode 100644 index 6b0a80a93..000000000 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.cpp +++ /dev/null @@ -1,289 +0,0 @@ -#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h" - -namespace anakin { - -namespace ops { - -#define INSTANCE_SASSCONVBATCHNORMSCALERELU(Ttype, Ptype) \ -template<> \ -void SassConvBatchnormScaleRelu::operator()(\ - OpContext& ctx,\ - const std::vector >& ins,\ - std::vector >& outs) {\ - auto* impl = static_cast*>\ - (this->_helper);\ - auto& param = static_cast*>\ - (this->_helper)->_param_conv_batchnorm_scale_relu;\ - SABER_CHECK(impl->_funcs_conv_batchnorm_scale_relu(ins, outs, param, ctx));\ -} - -/// TODO ... 
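ANAKIN_REGISTER_OP builds its entry through a fluent chain of .Doc() / .__alias__() / .Args() calls, each returning the entry itself, so the per-device #ifdef blocks in these registrations simply add or drop alias calls. A minimal imitation of that builder style; the registry class below is invented for illustration and is not Anakin's:

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Every setter returns *this so calls chain like the registration macros above.
class OpRegistryEntry {
public:
    OpRegistryEntry& Doc(const std::string& d)   { doc_ = d; return *this; }
    OpRegistryEntry& Alias(const std::string& a) { aliases_.push_back(a); return *this; }
    OpRegistryEntry& Arg(const std::string& name, const std::string& help) {
        args_[name] = help;
        return *this;
    }
    void dump() const {
        std::cout << doc_ << " (" << aliases_.size() << " aliases, "
                  << args_.size() << " args)" << std::endl;
    }
private:
    std::string doc_;
    std::vector<std::string> aliases_;
    std::map<std::string, std::string> args_;
};

int main() {
    OpRegistryEntry()
        .Doc("SassConvBatchnormScale fusion operator")
        .Alias("convolution3x3_batchnorm_scale")
        .Arg("group", "group of conv")
        .Arg("bias_term", "whether conv weights have bias")
        .dump();
}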
specialization other type of operator - - -/// set helper -template -SassConvBatchnormScaleReluHelper::~SassConvBatchnormScaleReluHelper() { -} - -template -Status SassConvBatchnormScaleReluHelper::InitParam() { - DLOG(WARNING) << "Parsing SassConvBatchnormScaleRelu op parameter."; - - // get conv param - auto group = GET_PARAMETER(int, group); - auto bias_term = GET_PARAMETER(bool, bias_term); - auto padding = GET_PARAMETER(PTuple, padding); - auto strides = GET_PARAMETER(PTuple, strides); - auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); - auto filter_num = GET_PARAMETER(int, filter_num); - auto kernel_size = GET_PARAMETER(PTuple, kernel_size); - auto axis = GET_PARAMETER(int, axis); - - - using pblock_type = PBlock; - auto weights = GET_PARAMETER(pblock_type, weight_1); - auto weights_shape = weights.shape(); - - // get batchnorm param - auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); - auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); - auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, batchnorm_0_weight_1); - auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); - auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, batchnorm_0_weight_2); - auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); - auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); - auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - - // get scale param - auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); - auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); - auto scale_axis = GET_PARAMETER(int, scale_0_axis); - auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); - auto scale_weight_1_vector = scale_weight_1.vector(); - auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); - auto scale_weight_2_vector = scale_weight_2.vector(); - - // get relu param - auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam active_param(Active_relu, alpha); // TEMP - - // check if batchnorm parameters have been optimized - auto is_param_updated = CHECK_PARAMETER(is_param_updated); - if(!is_param_updated) { - SET_PARAMETER(is_param_updated, true, bool); - - if(bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - update_weights, weights,bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); - - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], - dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor()), - active_param); - _param_conv_batchnorm_scale_relu = conv_param; - } else { - pblock_type* bias = new pblock_type(); - SET_PARAMETER(bias_term, true, bool); // set attr bias_term true - SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); - - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias->d_tensor()), active_param); - 
_param_conv_batchnorm_scale_relu = conv_param; - } - } else { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor()), active_param); - _param_conv_batchnorm_scale_relu = conv_param; - } - - return Status::OK(); -} - -template -Status SassConvBatchnormScaleReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - saber::ImplEnum impl_e = SABER_IMPL; - - SABER_CHECK(_funcs_conv_batchnorm_scale_relu.init(ins, outs, - _param_conv_batchnorm_scale_relu, SPECIFY, impl_e, ctx)); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_batchnorm_scale_relu.pad_h, _param_conv_batchnorm_scale_relu.pad_w, _param_conv_batchnorm_scale_relu.dilation_h, _param_conv_batchnorm_scale_relu.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu.pad_h, _param_conv_batchnorm_scale_relu.pad_w, _param_conv_batchnorm_scale_relu.dilation_h, _param_conv_batchnorm_scale_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu.pad_h, _param_conv_batchnorm_scale_relu.pad_w, _param_conv_batchnorm_scale_relu.dilation_h, _param_conv_batchnorm_scale_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} - -//TODO!!! delete me when saber int8 is ready!!!! 
-#ifdef USE_CUDA -template<> -Status SassConvBatchnormScaleReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - saber::ImplEnum impl_e = VENDER_IMPL; - - SABER_CHECK(_funcs_conv_batchnorm_scale_relu.init(ins, outs, - _param_conv_batchnorm_scale_relu, SPECIFY, impl_e, ctx)); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_batchnorm_scale_relu.pad_h, _param_conv_batchnorm_scale_relu.pad_w, _param_conv_batchnorm_scale_relu.dilation_h, _param_conv_batchnorm_scale_relu.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu.pad_h, _param_conv_batchnorm_scale_relu.pad_w, _param_conv_batchnorm_scale_relu.dilation_h, _param_conv_batchnorm_scale_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu.pad_h, _param_conv_batchnorm_scale_relu.pad_w, _param_conv_batchnorm_scale_relu.dilation_h, _param_conv_batchnorm_scale_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -#endif -//TODO!!! 
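The NV-only Init specializations in these deleted files exist mainly to pick a different saber::ImplEnum (SABER_IMPL, the in-house kernels, versus VENDER_IMPL, the vendor library) until, per the TODO, the saber INT8 path is ready. A toy illustration of selecting a backend through template specialization, with stand-in enum values and device tags:

#include <iostream>

// Stand-ins for the saber implementation enum and Anakin device tags.
enum class ImplEnum { SABER_IMPL, VENDER_IMPL };
struct NV {};
struct X86 {};

template <typename Ttype>
struct ImplChoice {
    // Generic path: in-house saber kernels.
    static ImplEnum pick() { return ImplEnum::SABER_IMPL; }
};

// A full specialization overrides the choice for one device, which is all
// the NV-specific Init bodies in these files really change.
template <>
struct ImplChoice<NV> {
    static ImplEnum pick() { return ImplEnum::VENDER_IMPL; }
};

int main() {
    std::cout << (ImplChoice<X86>::pick() == ImplEnum::SABER_IMPL) << std::endl;  // 1
    std::cout << (ImplChoice<NV>::pick() == ImplEnum::VENDER_IMPL) << std::endl;  // 1
}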
end here - -template -Status SassConvBatchnormScaleReluHelper::InferShape( - const std::vector >& ins, - std::vector >& outs) { - _funcs_conv_batchnorm_scale_relu.compute_output_shape(ins, outs, - _param_conv_batchnorm_scale_relu); - return Status::OK(); -} - -#ifdef USE_CUDA -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; -#endif - -#ifdef USE_ARM_PLACE -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; -template class SassConvBatchnormScaleReluHelper; -#endif - -// register helper -#ifdef USE_CUDA -INSTANCE_SASSCONVBATCHNORMSCALERELU(NV, Precision::FP32); -INSTANCE_SASSCONVBATCHNORMSCALERELU(NV, Precision::INT8); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, NV, Precision::INT8); -#endif - -#ifdef USE_X86_PLACE -INSTANCE_SASSCONVBATCHNORMSCALERELU(X86, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, X86, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -INSTANCE_SASSCONVBATCHNORMSCALERELU(ARM, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleRelu, SassConvBatchnormScaleReluHelper, ARM, Precision::FP32); -#endif - -//! register op -ANAKIN_REGISTER_OP(SassConvBatchnormScaleRelu) -.Doc("SassConvBatchnormScaleRelu fusion operator") -#ifdef USE_CUDA -.__alias__("convolution3x3_batchnorm_scale_relu") -.__alias__("convolution3x3_batchnorm_scale_relu") -#endif -#ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale_relu") -#endif -.num_in(1) -.num_out(1) -.Args("group", " group of conv ") -.Args("bias_term", " whether conv weights have bias") -.Args>("padding", "padding of conv (x, y)") -.Args>("strides", "strides of conv (x)") -.Args>("dilation_rate", "dilation rate of conv (x)") -.Args("filter_num", "filter(kernel) number of weights") -.Args>("kernel_size", "kernel size of kernel (x, y)") -.Args("axis", "axis of conv") -.Args("relu_0_alpha", " alpha for relu") -.Args("scale_0_num_axes", " num axes for scale") -.Args("scale_0_bias_term", "whether scale has bias") -.Args("scale_0_axis", "axis for scale") -.Args("batchnorm_0_epsilon", "epsilon for batchnorm") -.Args("batchnorm_0_momentum", "momentum for batchnorm"); - -} /* namespace ops */ - -} /* namespace anakin */ - - diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp deleted file mode 100644 index 1518714bd..000000000 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.cpp +++ /dev/null @@ -1,319 +0,0 @@ -#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h" - -namespace anakin { - -namespace ops { - -#define INSTANCE_SASSCONVBATCHNORMSCALERELUPOOLING(Ttype, Ptype) \ -template<> \ -void SassConvBatchnormScaleReluPool::operator()(\ - OpContext& ctx,\ - const std::vector >& ins,\ - std::vector >& outs) {\ - auto* impl = static_cast*>\ - (this->_helper);\ - auto& param = static_cast*>\ - (this->_helper)->_param_conv_batchnorm_scale_relu_pooling;\ - SABER_CHECK(impl->_funcs_conv_batchnorm_scale_relu_pooling(ins, outs, param, ctx));\ -} - -/// TODO ... 
specialization other type of operator -/// set helper -template -SassConvBatchnormScaleReluPoolHelper::~SassConvBatchnormScaleReluPoolHelper() { -} - -template -Status SassConvBatchnormScaleReluPoolHelper::InitParam() { - DLOG(WARNING) << "Parsing SassConvBatchnormScaleReluPool op parameter."; - ConvParam conv_param_temp; - PoolingParam pooling_param_temp; - - // get conv param - auto group = GET_PARAMETER(int, group); - auto bias_term = GET_PARAMETER(bool, bias_term); - auto padding = GET_PARAMETER(PTuple, padding); - auto strides = GET_PARAMETER(PTuple, strides); - auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); - auto filter_num = GET_PARAMETER(int, filter_num); - auto kernel_size = GET_PARAMETER(PTuple, kernel_size); - auto axis = GET_PARAMETER(int, axis); - - - using pblock_type = PBlock; - auto weights = GET_PARAMETER(pblock_type, weight_1); - auto weights_shape = weights.shape(); - - // get batchnorm param - auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); - auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); - auto batch_norm_weight_1 = GET_PARAMETER(pblock_type, batchnorm_0_weight_1); - auto batch_norm_weight_1_vector = batch_norm_weight_1.vector(); - auto batch_norm_weight_2 = GET_PARAMETER(pblock_type, batchnorm_0_weight_2); - auto batch_norm_weight_2_vector = batch_norm_weight_2.vector(); - auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); - auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - - // get scale param - auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); - auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); - auto scale_axis = GET_PARAMETER(int, scale_0_axis); - auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); - auto scale_weight_1_vector = scale_weight_1.vector(); - auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); - auto scale_weight_2_vector = scale_weight_2.vector(); - - // get relu param - auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam active_param(Active_relu);//, alpha); // Temp - - // get pooling param - auto global_pooling = GET_PARAMETER(bool, pooling_0_global_pooling); - auto pool_padding = GET_PARAMETER(PTuple, pooling_0_padding); - auto pool_strides = GET_PARAMETER(PTuple, pooling_0_strides); - auto pool_size = GET_PARAMETER(PTuple, pooling_0_pool_size); - auto pool_method = GET_PARAMETER(std::string, pooling_0_method); - auto cmp_out_shape_floor_as_conv = GET_PARAMETER(bool, pooling_0_cmp_out_shape_floor_as_conv); - if (pool_method == "MAX") { - PoolingParam pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], pool_strides[0], pool_strides[1], - Pooling_max, global_pooling, cmp_out_shape_floor_as_conv); - pooling_param_temp = pooling_param; - } else if (pool_method == "AVG") { - PoolingParam pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], pool_strides[0], pool_strides[1], - Pooling_average_include_padding, global_pooling, - cmp_out_shape_floor_as_conv); - pooling_param_temp = pooling_param; - } else { - LOG(FATAL) << " SassConvBatchnormScaleReluPool fusion op doesn't support : " - << pool_method << " pooling."; - } - - // check if batchnorm parameters have been optimized - auto is_param_updated = CHECK_PARAMETER(is_param_updated); - if(!is_param_updated) { - SET_PARAMETER(is_param_updated, true, bool); - - if(bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - update_weights, weights,bias, - 
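The pooling half of the fused parameter parsed above maps the string method onto a saber pooling mode, with "AVG" meaning average-including-padding, and anything else treated as a fatal error. A small hedged sketch of that mapping, with an illustrative enum:

#include <stdexcept>
#include <string>

// Illustrative stand-in for saber's pooling mode enum.
enum class PoolingMode { Max, AvgIncludePadding };

PoolingMode parse_pooling_method(const std::string& method) {
    if (method == "MAX") return PoolingMode::Max;
    // "AVG" selects the average-including-padding variant in this fusion.
    if (method == "AVG") return PoolingMode::AvgIncludePadding;
    // Throwing stands in for LOG(FATAL).
    throw std::invalid_argument("fusion op doesn't support pooling method: " + method);
}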
weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); - - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor()), active_param); - - conv_param_temp = conv_param; - } else { - pblock_type* bias = new pblock_type(); - SET_PARAMETER(bias_term, true, bool); // set attr bias_term true - SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); - - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias->d_tensor()), active_param); - - conv_param_temp = conv_param; - } - } else { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor()), active_param); - conv_param_temp = conv_param; - - } - - ConvPoolingParam conv_act_pooling_param(conv_param_temp, pooling_param_temp); - _param_conv_batchnorm_scale_relu_pooling = conv_act_pooling_param; - return Status::OK(); -} - -template -Status SassConvBatchnormScaleReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, std::vector >& outs) { - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - saber::ImplEnum impl_e = SABER_IMPL; - _funcs_conv_batchnorm_scale_relu_pooling.init(ins, outs, - _param_conv_batchnorm_scale_relu_pooling, SPECIFY, impl_e, ctx); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_w, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_w, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - - } else { - 
PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_w, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} - -#ifdef USE_CUDA -template<> -Status SassConvBatchnormScaleReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, std::vector >& outs) { - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - saber::ImplEnum impl_e = VENDER_IMPL; - _funcs_conv_batchnorm_scale_relu_pooling.init(ins, outs, - _param_conv_batchnorm_scale_relu_pooling, SPECIFY, impl_e, ctx); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_w, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_w, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_w, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -#endif - -template -Status SassConvBatchnormScaleReluPoolHelper::InferShape( - const std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_conv_batchnorm_scale_relu_pooling.compute_output_shape(ins, outs, - _param_conv_batchnorm_scale_relu_pooling)); - return Status::OK(); -} - -#ifdef USE_CUDA -template class SassConvBatchnormScaleReluPoolHelper; -template class SassConvBatchnormScaleReluPoolHelper; -template class 
SassConvBatchnormScaleReluPoolHelper; -#endif - -#ifdef USE_ARM_PLACE -template class SassConvBatchnormScaleReluPoolHelper; -template class SassConvBatchnormScaleReluPoolHelper; -template class SassConvBatchnormScaleReluPoolHelper; -#endif - -// register helper -#ifdef USE_CUDA -INSTANCE_SASSCONVBATCHNORMSCALERELUPOOLING(NV, Precision::INT8) -INSTANCE_SASSCONVBATCHNORMSCALERELUPOOLING(NV, Precision::FP32) -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, NV, Precision::INT8); -#endif - -#ifdef USE_X86_PLACE -INSTANCE_SASSCONVBATCHNORMSCALERELUPOOLING(X86, Precision::FP32) -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, X86, Precision::FP32); - -#endif - -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SassConvBatchnormScaleReluPool, SassConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32); -#endif - -//! register op -ANAKIN_REGISTER_OP(SassConvBatchnormScaleReluPool) -.Doc("SassConvBatchnormScaleReluPool fusion operator") -#ifdef USE_CUDA -.__alias__("convolution_batchnorm_scale_relu_pooling") -#endif -#ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale_relu_pooling") -#endif -.num_in(1) -.num_out(1) -.Args("group", " group of conv ") -.Args("bias_term", " whether conv weights have bias") -.Args>("padding", "padding of conv (x, y)") -.Args>("strides", "strides of conv (x)") -.Args>("dilation_rate", "dilation rate of conv (x)") -.Args("filter_num", "filter(kernel) number of weights") -.Args>("kernel_size", "kernel size of kernel (x, y)") -.Args("axis", "axis of conv") -.Args("pooling_0_global_pooling", " whether use pooling for all input area.") -.Args>("pooling_0_padding", " paddding of pooling ") -.Args>("pooling_0_strides", " strides of pooling ") -.Args>("pooling_0_pool_size", "pooling size of pooling") -.Args("pooling_0_method", " pooling methods") -.Args("pooling_0_cmp_out_shape_floor_as_conv", "cmp_out_shape_floor_as_conv") -.Args("relu_0_alpha", " alpha for relu") -.Args("scale_0_num_axes", " num axes for scale") -.Args("scale_0_bias_term", "whether scale has bias") -.Args("scale_0_axis", "axis for scale") -.Args("batchnorm_0_epsilon", "epsilon for batchnorm") -.Args("batchnorm_0_momentum", "momentum for batchnorm"); - -} /* namespace ops */ - -} /* namespace anakin */ - - diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h b/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h deleted file mode 100644 index edcf981a9..000000000 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_RELU_POOL_H -#define ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_RELU_POOL_H - -#include "framework/core/base.h" -#include "framework/core/data_types.h" -#include "framework/core/operator/operator.h" -#include "utils/logger/logger.h" -#include "saber/funcs/conv_pooling.h" - -namespace anakin { - -namespace ops { - -template -class SassConvBatchnormScaleReluPoolHelper; - -/// pooling op -/** - * \brief SassConvBatchnormScaleReluPool implementation class - * public inherit Operator - */ -template -class SassConvBatchnormScaleReluPool : public Operator { -public: - SassConvBatchnormScaleReluPool() {} - - /// forward impl - virtual void operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator SassConvBatchnormScaleReluPool< Ttype(" - << target_name::value << "), Precision("<< Ptype <<") >"; - } - - friend class SassConvBatchnormScaleReluPoolHelper; -}; - -/** - * \brief SassConvBatchnormScaleReluPool helper class to implement it - * public inherit OperatorHelper - * including init resource and shape size in SassConvBatchnormScaleReluPool context - */ -template -class SassConvBatchnormScaleReluPoolHelper : public OperatorHelper { -public: - SassConvBatchnormScaleReluPoolHelper()=default; - - ~SassConvBatchnormScaleReluPoolHelper(); - - Status InitParam() override; - - /** - * \brief initial all the resource needed by pooling - * \param ctx stand for SassConvBatchnormScaleReluPool operation context - * \param ins stand for input tensor vector - * \param outs stand for output tensor vector - * \return status - */ - Status Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) override; - - /** - * \brief infer the shape of output and input. - * \param ins stand for input tensor vector - * \param outs stand for output tensor vector - * \return status - */ - Status InferShape(const std::vector >& ins, - std::vector >& outs) override; - -public: - ///< _param_conv_batchnorm_scale_relu_pooling stand for SassConvBatchnormScaleReluPool parameter - saber::ConvPoolingParam _param_conv_batchnorm_scale_relu_pooling; - ///< _funcs_conv_batchnorm_scale_relu_pooling stand for SassConvBatchnormScaleReluPool function - saber::ConvPooling::saber_type> _funcs_conv_batchnorm_scale_relu_pooling; - -private: - ///< _dims stand for SassConvBatchnormScaleReluPool size - PTuple _dims; -}; - - - -} /* namespace ops */ - -} /* namespace anakin */ - -#endif diff --git a/framework/operators/fusion_ops/conv_3x3_relu.cpp b/framework/operators/fusion_ops/conv_3x3_relu.cpp deleted file mode 100644 index 059c799a8..000000000 --- a/framework/operators/fusion_ops/conv_3x3_relu.cpp +++ /dev/null @@ -1,239 +0,0 @@ -#include "framework/operators/fusion_ops/conv_3x3_relu.h" - -namespace anakin { - -namespace ops { - -#define INSTANCE_SASSCONVRELU(Ttype, Ptype) \ -template<> \ -void SassConvRelu::operator()(\ - OpContext& ctx,\ - const std::vector >& ins,\ - std::vector >& outs) {\ - auto* impl =\ - static_cast*>(this->_helper);\ - auto& param = static_cast*>\ - (this->_helper)->_param_conv_relu;\ - impl->_funcs_conv_relu(ins, outs, param, ctx);\ -} -/// TODO ... 
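The deleted header above shows the general shape every fusion op follows: a thin Operator whose operator()() forwards to an OperatorHelper that owns the parsed parameter struct and the saber functor, with a "Not Impl Yet" fallback when no helper is wired in. A toy, non-Anakin version of that split:

#include <cstddef>
#include <iostream>
#include <vector>

// The helper owns the parameters and does the work; the operator only forwards.
struct ToyHelper {
    float scale = 2.f;   // stands in for the parsed _param_* struct
    void run(const std::vector<float>& in, std::vector<float>& out) const {
        out.resize(in.size());
        for (std::size_t i = 0; i < in.size(); ++i) out[i] = in[i] * scale;
    }
};

struct ToyOperator {
    ToyHelper* _helper = nullptr;   // wired up by the registry in the real code
    void operator()(const std::vector<float>& in, std::vector<float>& out) {
        if (_helper == nullptr) {   // mirrors the "Not Impl Yet Operator ..." fallback
            std::cerr << "Not Impl Yet" << std::endl;
            return;
        }
        _helper->run(in, out);
    }
};

int main() {
    ToyHelper helper;
    ToyOperator op;
    op._helper = &helper;
    std::vector<float> in{1.f, 2.f, 3.f}, out;
    op(in, out);
    std::cout << out[2] << std::endl;   // 6
}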
specialization other type of operator - -/// set helper -template -SassConvReluHelper::~SassConvReluHelper() { -} - -template -Status SassConvReluHelper::InitParam() { - DLOG(WARNING) << "Parsing SassConvRelu op parameter."; - - // get conv param - auto group = GET_PARAMETER(int, group); - auto bias_term = GET_PARAMETER(bool, bias_term); - auto padding = GET_PARAMETER(PTuple, padding); - auto strides = GET_PARAMETER(PTuple, strides); - auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); - auto filter_num = GET_PARAMETER(int, filter_num); - auto kernel_size = GET_PARAMETER(PTuple, kernel_size); - auto axis = GET_PARAMETER(int, axis); - - using pblock_type = PBlock; - auto weights = GET_PARAMETER(pblock_type, weight_1); - - // get relu param - auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam active_param(Active_relu, alpha); // TEMP - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor()), active_param); - _param_conv_relu = conv_param; - } else { - Tensor4d* bias = new Tensor4d();; - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias, active_param); - _param_conv_relu = conv_param; - } - - return Status::OK(); -} - -template -Status SassConvReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - //different device please change here!!! 
- saber::ImplEnum impl_e = SABER_IMPL; - - SABER_CHECK(_funcs_conv_relu.init(ins, outs, - _param_conv_relu, SPECIFY, impl_e, ctx)); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_relu.pad_h, _param_conv_relu.pad_w, _param_conv_relu.dilation_h, _param_conv_relu.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_relu.pad_h, _param_conv_relu.pad_w, _param_conv_relu.dilation_h, _param_conv_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_relu.pad_h, _param_conv_relu.pad_w, _param_conv_relu.dilation_h, _param_conv_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -// TODO -#ifdef USE_CUDA -template<> -Status SassConvReluHelper::Init(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - //different device please change here!!! 
- saber::ImplEnum impl_e = VENDER_IMPL; - - SABER_CHECK(_funcs_conv_relu.init(ins, outs, - _param_conv_relu, SPECIFY, impl_e, ctx)); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_relu.pad_h, _param_conv_relu.pad_w, _param_conv_relu.dilation_h, _param_conv_relu.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_relu.pad_h, _param_conv_relu.pad_w, _param_conv_relu.dilation_h, _param_conv_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&Conv::saber_type>::trans_weights, - &_funcs_conv_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_relu.pad_h, _param_conv_relu.pad_w, _param_conv_relu.dilation_h, _param_conv_relu.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -#endif - -template -Status SassConvReluHelper::InferShape(const - std::vector >& ins, - std::vector >& outs) { - _funcs_conv_relu.compute_output_shape(ins, outs, _param_conv_relu); - return Status::OK(); -} - -#ifdef USE_CUDA -template class SassConvReluHelper; -template class SassConvReluHelper; -template class SassConvReluHelper; -#endif - -#ifdef USE_ARM_PLACE -template class SassConvReluHelper; -template class SassConvReluHelper; -template class SassConvReluHelper; -#endif - -#ifdef AMD_GPU -template class SassConvReluHelper; -template class SassConvReluHelper; -template class SassConvReluHelper; -#endif - -// register helper -#ifdef USE_CUDA -INSTANCE_SASSCONVRELU(NV, Precision::FP32); -INSTANCE_SASSCONVRELU(NV, Precision::INT8); -ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, NV, Precision::INT8); -#endif - -#ifdef USE_X86_PLACE -INSTANCE_SASSCONVRELU(X86, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, X86, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -INSTANCE_SASSCONVRELU(ARM, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, ARM, Precision::FP32); -#endif - -#ifdef AMD_GPU -INSTANCE_SASSCONVRELU(AMD, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvRelu, SassConvReluHelper, AMD, Precision::FP32); -#endif -//! 
register op -ANAKIN_REGISTER_OP(SassConvRelu) -.Doc("SassConvRelu fusion operator") -#ifdef USE_CUDA -.__alias__("convolution3x3_relu") -.__alias__("convolution3x3_relu") -#endif -#ifdef USE_ARM_PLACE -.__alias__("convolution3x3_relu") -#endif -#ifdef AMD_GPU -.__alias__("convolution_batchnorm_scale_relu") -#endif -.num_in(1) -.num_out(1) -.Args("group", " group of conv ") -.Args("bias_term", " whether conv weights have bias") -.Args>("padding", "padding of conv (x, y)") -.Args>("strides", "strides of conv (x)") -.Args>("dilation_rate", "dilation rate of conv (x)") -.Args("filter_num", "filter(kernel) number of weights") -.Args>("kernel_size", "kernel size of kernel (x, y)") -.Args("axis", "axis of conv") -.Args("relu_0_alpha", " alpha for relu"); - -} /* namespace ops */ - -} /* namespace anakin */ - - diff --git a/framework/operators/fusion_ops/conv_3x3_relu_pool.cpp b/framework/operators/fusion_ops/conv_3x3_relu_pool.cpp deleted file mode 100644 index 70f57af09..000000000 --- a/framework/operators/fusion_ops/conv_3x3_relu_pool.cpp +++ /dev/null @@ -1,279 +0,0 @@ -#include "framework/operators/fusion_ops/conv_3x3_relu_pool.h" - -namespace anakin { - -namespace ops { - -#define INSTANCE_SASSCONVRELUPOOL(Ttype, Ptype) \ -template<> \ -void SassConvReluPool::operator()(\ - OpContext& ctx,\ - const std::vector >& ins,\ - std::vector >& outs) {\ - auto* impl =\ - static_cast*>(this->_helper);\ - auto& param = static_cast*>\ - (this->_helper)->_param_conv_relu_pooling;\ - impl->_funcs_conv_relu_pooling(ins, outs, param, ctx);\ -} - -/// set helper -template -SassConvReluPoolHelper::~SassConvReluPoolHelper() {} - -template -Status SassConvReluPoolHelper::InitParam() { - DLOG(WARNING) << "Parsing SassConvReluPool op parameter."; - - saber::ConvParam conv_param_temp; - PoolingParam pooling_param_temp; - // get conv param - auto group = GET_PARAMETER(int, group); - auto bias_term = GET_PARAMETER(bool, bias_term); - auto padding = GET_PARAMETER(PTuple, padding); - auto strides = GET_PARAMETER(PTuple, strides); - auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); - auto filter_num = GET_PARAMETER(int, filter_num); - auto kernel_size = GET_PARAMETER(PTuple, kernel_size); - auto axis = GET_PARAMETER(int, axis); - - using pblock_type = PBlock; - auto weights = GET_PARAMETER(pblock_type, weight_1); - auto weight_vec = weights.vector(); - - // get relu param - auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam active_param(Active_relu, alpha); // Temp - - // get pooling param - auto global_pooling = GET_PARAMETER(bool, pooling_0_global_pooling); - auto pool_padding = GET_PARAMETER(PTuple, pooling_0_padding); - auto pool_strides = GET_PARAMETER(PTuple, pooling_0_strides); - auto pool_size = GET_PARAMETER(PTuple, pooling_0_pool_size); - auto pool_method = GET_PARAMETER(std::string, pooling_0_method); - auto cmp_out_shape_floor_as_conv = GET_PARAMETER(bool, pooling_0_cmp_out_shape_floor_as_conv); - if (pool_method == "MAX") { - PoolingParam pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], pool_strides[0], pool_strides[1], - Pooling_max, global_pooling, cmp_out_shape_floor_as_conv); - - pooling_param_temp = pooling_param; - } else if (pool_method == "AVG") { - PoolingParam pooling_param(pool_size[0], pool_size[1], - pool_padding[0], pool_padding[1], pool_strides[0], pool_strides[1], - Pooling_average_include_padding, global_pooling, - cmp_out_shape_floor_as_conv); - - pooling_param_temp = pooling_param; - } else { - LOG(FATAL) << " SassConvReluPool fusion 
op doesn't support : " - << pool_method << " pooling."; - } - - if (bias_term) { - auto bias = GET_PARAMETER(pblock_type, weight_2); - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), &(bias.d_tensor()), - active_param); - conv_param_temp = conv_param; - } else { - Tensor4d* bias = new Tensor4d(); - saber::ConvParam conv_param(group, padding[0], padding[1], - strides[0], strides[1], dilation_rate[0], dilation_rate[1], - &(weights.d_tensor()), bias, active_param); - conv_param_temp = conv_param; - } - - ConvPoolingParam conv_act_pooling_param(conv_param_temp, pooling_param_temp); - _param_conv_relu_pooling = conv_act_pooling_param; - - return Status::OK(); -} - -template -Status SassConvReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - saber::ImplEnum impl_e = SABER_IMPL; - _funcs_conv_relu_pooling.init(ins, outs, _param_conv_relu_pooling, SPECIFY, - impl_e, ctx); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_relu_pooling.conv_param.pad_h, _param_conv_relu_pooling.conv_param.pad_w, _param_conv_relu_pooling.conv_param.dilation_h, _param_conv_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_relu_pooling.conv_param.pad_h, _param_conv_relu_pooling.conv_param.pad_w, _param_conv_relu_pooling.conv_param.dilation_h, _param_conv_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_relu_pooling.conv_param.pad_h, _param_conv_relu_pooling.conv_param.pad_w, _param_conv_relu_pooling.conv_param.dilation_h, _param_conv_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} - -template -Status SassConvReluPoolHelper::InferShape(const std::vector >& ins, - std::vector >& outs) { - SABER_CHECK(_funcs_conv_relu_pooling.compute_output_shape(ins, outs, _param_conv_relu_pooling)); - return Status::OK(); -} - -#ifdef USE_CUDA -template<> -Status SassConvReluPoolHelper::Init(OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - - auto group = GET_PARAMETER(int, group); - auto strides = GET_PARAMETER(PTuple, strides); - auto weights = GET_PARAMETER(PBlock, weight_1); - auto bias_term = GET_PARAMETER(bool, bias_term); - - saber::ImplEnum impl_e 
= VENDER_IMPL; - _funcs_conv_relu_pooling.init(ins, outs, _param_conv_relu_pooling, SPECIFY, - impl_e, ctx); - - // check if weights have been transposed - auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); - if (!is_weights_transed) { - SET_PARAMETER(is_weights_transed, true, bool); - if (bias_term) { - auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias.d_tensor(), _param_conv_relu_pooling.conv_param.pad_h, _param_conv_relu_pooling.conv_param.pad_w, _param_conv_relu_pooling.conv_param.dilation_h, _param_conv_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - bias.map_to_host(); - } else { - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weights.d_tensor(), bias_empty.d_tensor(), _param_conv_relu_pooling.conv_param.pad_h, _param_conv_relu_pooling.conv_param.pad_w, _param_conv_relu_pooling.conv_param.dilation_h, _param_conv_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - weights.map_to_host(); - - } else { - PBlock weight_empty; - PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvPooling::saber_type>::trans_weights, - &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), - weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_relu_pooling.conv_param.pad_h, _param_conv_relu_pooling.conv_param.pad_w, _param_conv_relu_pooling.conv_param.dilation_h, _param_conv_relu_pooling.conv_param.dilation_w, - strides[0], strides[1], group, impl_e); - } - return Status::OK(); -} -#endif - -#ifdef USE_CUDA -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -#endif - -#ifdef USE_ARM_PLACE -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -#endif - -#ifdef AMD_GPU -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -template class SassConvReluPoolHelper; -#endif - -// register helper -#ifdef USE_CUDA -INSTANCE_SASSCONVRELUPOOL(NV, Precision::FP32); -INSTANCE_SASSCONVRELUPOOL(NV, Precision::INT8); -ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, NV, Precision::INT8); -#endif - -#ifdef USE_X86_PLACE -INSTANCE_SASSCONVRELUPOOL(X86, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, X86, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -INSTANCE_SASSCONVRELUPOOL(ARM, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, ARM, Precision::FP32); -#endif - -#ifdef AMD_GPU -INSTANCE_SASSCONVRELUPOOL(AMD, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(SassConvReluPool, SassConvReluPoolHelper, AMD, Precision::FP32); -#endif - -//! 
register op -ANAKIN_REGISTER_OP(SassConvReluPool) -.Doc("SassConvReluPool fusion operator") -#ifdef USE_CUDA -.__alias__("convolution_relu_pooling") -.__alias__("convolution_relu_pooling") -#endif -#ifdef USE_ARM_PLACE -.__alias__("convolution_relu_pooling") -#endif -#ifdef AMD_GPU -.__alias__("convolution_relu_pooling") -#endif -.num_in(1) -.num_out(1) -.Args("group", " group of conv ") -.Args("bias_term", " whether conv weights have bias") -.Args>("padding", "padding of conv (x, y)") -.Args>("strides", "strides of conv (x)") -.Args>("dilation_rate", "dilation rate of conv (x)") -.Args("filter_num", "filter(kernel) number of weights") -.Args>("kernel_size", "kernel size of kernel (x, y)") -.Args("axis", "axis of conv") -.Args("pooling_0_global_pooling", " whether use pooling for all input area.") -.Args>("pooling_0_padding", " paddding of pooling ") -.Args>("pooling_0_strides", " strides of pooling ") -.Args>("pooling_0_pool_size", "pooling size of pooling") -.Args("pooling_0_method", " pooling methods") -.Args("pooling_0_cmp_out_shape_floor_as_conv", "cmp_out_shape_floor_as_conv") -.Args("relu_0_alpha", " alpha for relu"); - -} /* namespace ops */ - -} /* namespace anakin */ - - diff --git a/framework/operators/fusion_ops/conv_act.cpp b/framework/operators/fusion_ops/conv_act.cpp index f5431836d..fd569f745 100644 --- a/framework/operators/fusion_ops/conv_act.cpp +++ b/framework/operators/fusion_ops/conv_act.cpp @@ -34,6 +34,13 @@ Status ConvActHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get act param ActivationParam param_act; diff --git a/framework/operators/fusion_ops/conv_affine_channel.cpp b/framework/operators/fusion_ops/conv_affine_channel.cpp new file mode 100644 index 000000000..ccbb54f0d --- /dev/null +++ b/framework/operators/fusion_ops/conv_affine_channel.cpp @@ -0,0 +1,242 @@ +#include "framework/operators/fusion_ops/conv_affine_channel.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_CONVBATCHNORMAFFINE_CHANNEL(Ttype, Ptype) \ +template<> \ +void ConvAffineChannel::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>(this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_conv_affine_channel;\ + SABER_CHECK(impl->_funcs_conv_affine_channel(ins, outs, param, ctx));\ +} + +template +Status ConvAffineChannelHelper::InitParam() { + LOG(WARNING) << "Parsing ConvAffineChannel op parameter."; + + // get conv param + auto group = GET_PARAMETER(int, group); + auto bias_term = GET_PARAMETER(bool, bias_term); + auto padding = GET_PARAMETER(PTuple, padding); + auto strides = GET_PARAMETER(PTuple, strides); + auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); + auto filter_num = GET_PARAMETER(int, filter_num); + auto kernel_size = GET_PARAMETER(PTuple, kernel_size); + auto axis = GET_PARAMETER(int, axis); + + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } + // get affine_channel param + auto 
affine_channel_weight_1 = GET_PARAMETER(pblock_type, affine_channel_0_weight_1); + auto affine_channel_w = affine_channel_weight_1.vector(); + auto affine_channel_weight_2 = GET_PARAMETER(pblock_type, affine_channel_0_weight_2); + auto affine_channel_b = affine_channel_weight_2.vector(); + + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if (!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if (!bias_term) { + Shape4d tmp_shape({1, affine_channel_w.size(), 1, 1}); + pblock_type* bias = graph::GraphGlobalMem::Global().template new_block(tmp_shape); + void* new_bias_data = bias->h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias->h_tensor().size()); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + } + auto bias = GET_PARAMETER(pblock_type, weight_2); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_conv_affine_channel_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + affine_channel_w, affine_channel_b); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_conv_affine_channel_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + affine_channel_w, affine_channel_b); + } + + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + + _param_conv_affine_channel = conv_param; + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + + _param_conv_affine_channel = conv_param; + } + return Status::OK(); +} + +template +Status ConvAffineChannelHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + auto bias_term = GET_PARAMETER(bool, bias_term); + + //different device please change here!!! 
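// A minimal sketch of what the update_conv_affine_channel_weights fusion dispatched
// above (WeightsFusion in framework/utils/parameter_fusion.h) does conceptually for
// the float case: the per-channel transform y = ac_w[oc] * conv(x)[oc] + ac_b[oc]
// is folded into the convolution weights and bias. Assumes NCHW weights of shape
// [out_ch, in_ch, kh, kw]; names and layout here are illustrative only.
#include <vector>

static void fold_affine_channel(float* weights, float* bias,
                                int out_ch, int in_ch, int kh, int kw,
                                const std::vector<float>& ac_w,
                                const std::vector<float>& ac_b) {
    const int filter_size = in_ch * kh * kw;
    for (int oc = 0; oc < out_ch; ++oc) {
        for (int i = 0; i < filter_size; ++i) {
            weights[oc * filter_size + i] *= ac_w[oc];  // scale every tap of this filter
        }
        // the zero-filled bias created above when bias_term was false makes this uniform
        bias[oc] = bias[oc] * ac_w[oc] + ac_b[oc];
    }
}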
+ saber::ImplEnum impl_e = VENDER_IMPL; + if (std::is_same::value) { + impl_e = SABER_IMPL; + } + bool use_k1s1p0 = (Ptype == Precision::FP32); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.weight()->height() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.weight()->width() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.pad_h == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.pad_w == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.stride_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.stride_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.dilation_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.dilation_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.group == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel.bias()->valid_size() > 0); + bool use_k3s1d1 = (Ptype == Precision::FP32); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel.weight()->height() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel.weight()->width() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel.group == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel.stride_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel.stride_w == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel.dilation_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel.dilation_w == 1); + bool use_depthwise = (Ptype == Precision::FP32); + use_depthwise = use_depthwise && (_param_conv_affine_channel.group == ins[0]->channel()); + use_depthwise = use_depthwise && (_param_conv_affine_channel.group == outs[0]->channel()); + bool use_direct_k = (Ptype == Precision::FP32); + use_direct_k = use_direct_k && (_param_conv_affine_channel.weight()->channel() >= 16); + use_direct_k = use_direct_k && (_param_conv_affine_channel.group == 1); + if (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k) { + impl_e = SABER_IMPL; + } + + SABER_CHECK(_funcs_conv_affine_channel.init(ins, outs, \ + _param_conv_affine_channel, SPECIFY, impl_e, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if (!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + if (bias_term) { + auto bias = GET_PARAMETER(PBlock, weight_2); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_affine_channel, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias.d_tensor(), _param_conv_affine_channel.pad_h, _param_conv_affine_channel.pad_w, _param_conv_affine_channel.dilation_h, _param_conv_affine_channel.dilation_w, + strides[0], strides[1], group, impl_e); + bias.map_to_host(); + } else { + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_affine_channel, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias_empty.d_tensor(), _param_conv_affine_channel.pad_h, _param_conv_affine_channel.pad_w, _param_conv_affine_channel.dilation_h, _param_conv_affine_channel.dilation_w, + strides[0], strides[1], group, impl_e); + } + weights.map_to_host(); + } else { + PBlock weight_empty; + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_affine_channel, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weight_empty.d_tensor(), bias_empty.d_tensor(), 
_param_conv_affine_channel.pad_h, _param_conv_affine_channel.pad_w, _param_conv_affine_channel.dilation_h, _param_conv_affine_channel.dilation_w, + strides[0], strides[1], group, impl_e); + } + return Status::OK(); +} + +template +Status ConvAffineChannelHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_conv_affine_channel.compute_output_shape(ins, outs, \ + _param_conv_affine_channel)); + return Status::OK(); +} + +#ifdef USE_ARM_PLACE +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL(ARM, Precision::FP32); +template class ConvAffineChannelHelper; +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannel, ConvAffineChannelHelper, ARM, Precision::FP32); +#endif + +#ifdef USE_CUDA +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL(NV, Precision::FP32); +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL(NV, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannel, ConvAffineChannelHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannel, ConvAffineChannelHelper, NV, Precision::INT8); +#endif + +#ifdef USE_X86_PLACE +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL(X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannel, ConvAffineChannelHelper, X86, Precision::FP32); +#endif + +#if defined BUILD_LITE +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL(X86, Precision::FP32); +template class ConvAffineChannelHelper; +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannel, ConvAffineChannelHelper, X86, Precision::FP32); +#endif + +//#ifdef USE_X86_PLACE +//INSTANCE_CONVBATCHNORMAFFINE_CHANNEL(X86, Precision::FP32); +//template class ConvAffineChannelHelper; +//ANAKIN_REGISTER_OP_HELPER(ConvAffineChannel, ConvAffineChannelHelper, X86, +// Precision::FP32); +//#endif + +//! register op +ANAKIN_REGISTER_OP(ConvAffineChannel) +.Doc("ConvAffineChannel fusion operator") +#ifdef USE_CUDA +.__alias__("convolution_affine_channel") +.__alias__("convolution_affine_channel") +#endif +#ifdef USE_ARM_PLACE +.__alias__("convolution_affine_channel") +#endif +#if defined BUILD_LITE +.__alias__("convolution_affine_channel") +#endif +#ifdef AMD_GPU +//.__alias__("convolution_affine_channel") +//.__alias__("convolution_affine_channel") +#endif +.num_in(1) +.num_out(1) +.Args("group", " group of conv ") +.Args("bias_term", " whether conv weights have bias") +.Args>("padding", "padding of conv (x, y)") +.Args>("strides", "strides of conv (x)") +.Args>("dilation_rate", "dilation rate of conv (x)") +.Args("filter_num", "filter(kernel) number of weights") +.Args>("kernel_size", "kernel size of kernel (x, y)"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/fusion_ops/conv_3x3_relu.h b/framework/operators/fusion_ops/conv_affine_channel.h similarity index 69% rename from framework/operators/fusion_ops/conv_3x3_relu.h rename to framework/operators/fusion_ops/conv_affine_channel.h index fc266f116..01aabce5a 100644 --- a/framework/operators/fusion_ops/conv_3x3_relu.h +++ b/framework/operators/fusion_ops/conv_affine_channel.h @@ -13,60 +13,61 @@ limitations under the License. 
*/ -#ifndef ANAKIN_OPERATOR_CONV_SASS_RELU_H -#define ANAKIN_OPERATOR_CONV_SASS_RELU_H +#ifndef ANAKIN_OPERATOR_CONV_AFFINE_CHANNEL_H +#define ANAKIN_OPERATOR_CONV_AFFINE_CHANNEL_H #include "framework/core/base.h" #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" #include "saber/funcs/conv.h" +#include "framework/utils/parameter_fusion.h" namespace anakin { namespace ops { template -class SassConvReluHelper; +class ConvAffineChannelHelper; /// pooling op /** - * \brief SassConvRelu implementation class + * \brief ConvAffineChannelHelper implementation class * public inherit Operator */ template -class SassConvRelu : public Operator { +class ConvAffineChannel : public Operator { public: - SassConvRelu() {} + ConvAffineChannel() {} /// forward impl virtual void operator() (OpContext &ctx, const std::vector >& ins, std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator SassConvRelu< Ttype(" + LOG(ERROR) << "Not Impl Yet Operator ConvAffineChannel< Ttype(" << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvReluHelper; + friend class ConvAffineChannelHelper; }; /** - * \brief SassConvRelu helper class to implement SassConvRelu + * \brief ConvAffineChannel helper class to implement it * public inherit OperatorHelper - * including init resource and shape size in SassConvRelu context + * including init resource and shape size in ConvAffineChannelHelper context */ template -class SassConvReluHelper : public OperatorHelper { +class ConvAffineChannelHelper : public OperatorHelper { public: - SassConvReluHelper()=default; + ConvAffineChannelHelper()=default; - ~SassConvReluHelper(); + ~ConvAffineChannelHelper() {} Status InitParam() override; /** * \brief initial all the resource needed by pooling - * \param ctx stand for SassConvRelu operation context + * \param ctx stand for ConvAffineChannel operation context * \param ins stand for input tensor vector * \param outs stand for output tensor vector * \return status @@ -85,18 +86,12 @@ class SassConvReluHelper : public OperatorHelper { std::vector >& outs) override; public: - ///< _param_conv stand for SassConvRelu parameter - saber::ConvParam _param_conv_relu; - ///< _funcs_conv_relu stand for SassConvRelu function - saber::Conv::saber_type> _funcs_conv_relu; - -private: - ///< _dims stand for SassConvRelu size - PTuple _dims; + ///< _param_conv_affine_channel stand for ConvAffineChannel parameter + saber::ConvParam _param_conv_affine_channel; + ///< _funcs_conv stand for ConvAffineChannel function + saber::Conv::saber_type> _funcs_conv_affine_channel; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/fusion_ops/conv_affine_channel_relu.cpp b/framework/operators/fusion_ops/conv_affine_channel_relu.cpp new file mode 100644 index 000000000..404dfd72e --- /dev/null +++ b/framework/operators/fusion_ops/conv_affine_channel_relu.cpp @@ -0,0 +1,245 @@ +#include "framework/operators/fusion_ops/conv_affine_channel_relu.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_CONVBATCHNORMAFFINE_CHANNEL_RELU(Ttype, Ptype) \ +template<> \ +void ConvAffineChannelRelu::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>(this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_conv_affine_channel_relu;\ + SABER_CHECK(impl->_funcs_conv_affine_channel_relu(ins, outs, param, ctx));\ +} + +template +Status 
ConvAffineChannelReluHelper::InitParam() { + LOG(WARNING) << "Parsing ConvAffineChannelRelu op parameter."; + + // get conv param + auto group = GET_PARAMETER(int, group); + auto bias_term = GET_PARAMETER(bool, bias_term); + auto padding = GET_PARAMETER(PTuple, padding); + auto strides = GET_PARAMETER(PTuple, strides); + auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); + auto filter_num = GET_PARAMETER(int, filter_num); + auto kernel_size = GET_PARAMETER(PTuple, kernel_size); + + auto alpha = GET_PARAMETER(float, relu_0_alpha); + ActivationParam active_param(Active_relu, alpha); // TEMP + + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } + + // get affine_channel param + auto affine_channel_weight_1 = GET_PARAMETER(pblock_type, affine_channel_0_weight_1); + auto affine_channel_w = affine_channel_weight_1.vector(); + auto affine_channel_weight_2 = GET_PARAMETER(pblock_type, affine_channel_0_weight_2); + auto affine_channel_b = affine_channel_weight_2.vector(); + + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if (!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if (!bias_term) { + Shape4d shape_temp({1, affine_channel_w.size(), 1, 1}); + pblock_type* bias = graph::GraphGlobalMem::Global().template new_block(shape_temp); + void* new_bias_data = bias->h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias->h_tensor().size()); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + } + auto bias = GET_PARAMETER(pblock_type, weight_2); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_conv_affine_channel_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + affine_channel_w, affine_channel_b); + } else{ + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_conv_affine_channel_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + affine_channel_w, affine_channel_b); + } + + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), active_param); + + _param_conv_affine_channel_relu = conv_param; + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), active_param); + + _param_conv_affine_channel_relu = conv_param; + } + return Status::OK(); +} + +template +Status ConvAffineChannelReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + auto bias_term = GET_PARAMETER(bool, bias_term); + + //different device please change here!!! 
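// The "resize weights scale" blocks repeated through this patch broadcast a single
// per-tensor calibration scale to one scale per output filter, the layout the INT8
// paths expect downstream. A minimal sketch, assuming only the get_scale()/set_scale()
// interface used above:
#include <vector>

template <typename TensorT>
static void broadcast_weight_scale(TensorT& w, int filter_num) {
    if (w.get_scale().size() == 1) {                      // only a per-tensor scale present
        std::vector<float> per_channel(filter_num, w.get_scale()[0]);
        w.set_scale(per_channel);                         // now one scale per output channel
    }
}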
+ saber::ImplEnum impl_e = VENDER_IMPL; + if (std::is_same::value) { + impl_e = SABER_IMPL; + } + bool use_k1s1p0 = (Ptype == Precision::FP32); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.weight()->height() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.weight()->width() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.pad_h == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.pad_w == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.stride_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.stride_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.dilation_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.dilation_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.group == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_affine_channel_relu.bias()->valid_size() > 0); + bool use_k3s1d1 = (Ptype == Precision::FP32); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel_relu.weight()->height() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel_relu.weight()->width() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel_relu.group == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel_relu.stride_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel_relu.stride_w == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel_relu.dilation_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_affine_channel_relu.dilation_w == 1); + bool use_depthwise = (Ptype == Precision::FP32); + use_depthwise = use_depthwise && (_param_conv_affine_channel_relu.group == ins[0]->channel()); + use_depthwise = use_depthwise && (_param_conv_affine_channel_relu.group == outs[0]->channel()); + bool use_direct_k = (Ptype == Precision::FP32); + use_direct_k = use_direct_k && (_param_conv_affine_channel_relu.weight()->channel() >= 16); + use_direct_k = use_direct_k && (_param_conv_affine_channel_relu.group == 1); + if (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k) { + impl_e = SABER_IMPL; + } + + SABER_CHECK(_funcs_conv_affine_channel_relu.init(ins, outs, \ + _param_conv_affine_channel_relu, SPECIFY, impl_e, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if (!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + if (bias_term) { + auto bias = GET_PARAMETER(PBlock, weight_2); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_affine_channel_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias.d_tensor(), _param_conv_affine_channel_relu.pad_h, _param_conv_affine_channel_relu.pad_w, _param_conv_affine_channel_relu.dilation_h, _param_conv_affine_channel_relu.dilation_w, + strides[0], strides[1], group, impl_e); + bias.map_to_host(); + } else { + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_affine_channel_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias_empty.d_tensor(), _param_conv_affine_channel_relu.pad_h, _param_conv_affine_channel_relu.pad_w, _param_conv_affine_channel_relu.dilation_h, _param_conv_affine_channel_relu.dilation_w, + strides[0], strides[1], group, impl_e); + } + weights.map_to_host(); + } else { + PBlock weight_empty; + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + 
std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_affine_channel_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_affine_channel_relu.pad_h, _param_conv_affine_channel_relu.pad_w, _param_conv_affine_channel_relu.dilation_h, _param_conv_affine_channel_relu.dilation_w, + strides[0], strides[1], group, impl_e); + } + return Status::OK(); +} + +template +Status ConvAffineChannelReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_conv_affine_channel_relu.compute_output_shape(ins, outs, \ + _param_conv_affine_channel_relu)); + return Status::OK(); +} + +#ifdef USE_ARM_PLACE +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL_RELU(ARM, Precision::FP32); +template class ConvAffineChannelReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannelRelu, ConvAffineChannelReluHelper, ARM, Precision::FP32); +#endif + +#ifdef USE_CUDA +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL_RELU(NV, Precision::FP32); +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL_RELU(NV, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannelRelu, ConvAffineChannelReluHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannelRelu, ConvAffineChannelReluHelper, NV, Precision::INT8); +#endif + +#ifdef USE_X86_PLACE +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL_RELU(X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannelRelu, ConvAffineChannelReluHelper, X86, Precision::FP32); +#endif + +#if defined BUILD_LITE +INSTANCE_CONVBATCHNORMAFFINE_CHANNEL_RELU(X86, Precision::FP32); +template class ConvAffineChannelReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvAffineChannelRelu, ConvAffineChannelReluHelper, X86, Precision::FP32); +#endif + +//#ifdef USE_X86_PLACE +//INSTANCE_CONVBATCHNORMAFFINE_CHANNEL_RELU(X86, Precision::FP32); +//template class ConvAffineChannelReluHelper; +//ANAKIN_REGISTER_OP_HELPER(ConvAffineChannelRelu, ConvAffineChannelReluHelper, X86, +// Precision::FP32); +//#endif + +//! register op +ANAKIN_REGISTER_OP(ConvAffineChannelRelu) +.Doc("ConvAffineChannelRelu fusion operator") +#ifdef USE_CUDA +.__alias__("convolution_affine_channel") +.__alias__("convolution_affine_channel") +#endif +#ifdef USE_ARM_PLACE +.__alias__("convolution_affine_channel") +#endif +#if defined BUILD_LITE +.__alias__("convolution_affine_channel") +#endif +#ifdef AMD_GPU +//.__alias__("convolution_affine_channel") +//.__alias__("convolution_affine_channel") +#endif +.num_in(1) +.num_out(1) +.Args("group", " group of conv ") +.Args("bias_term", " whether conv weights have bias") +.Args>("padding", "padding of conv (x, y)") +.Args>("strides", "strides of conv (x)") +.Args>("dilation_rate", "dilation rate of conv (x)") +.Args("filter_num", "filter(kernel) number of weights") +.Args>("kernel_size", "kernel size of kernel (x, y)"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h b/framework/operators/fusion_ops/conv_affine_channel_relu.h similarity index 65% rename from framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h rename to framework/operators/fusion_ops/conv_affine_channel_relu.h index 49cffab07..c1679d827 100644 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale.h +++ b/framework/operators/fusion_ops/conv_affine_channel_relu.h @@ -13,64 +13,65 @@ limitations under the License. 
*/ -#ifndef ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_H -#define ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_H +#ifndef ANAKIN_OPERATOR_CONV_AFFINE_CHANNEL_RELU_H +#define ANAKIN_OPERATOR_CONV_AFFINE_CHANNEL_RELU_H #include "framework/core/base.h" #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" #include "saber/funcs/conv.h" +#include "framework/utils/parameter_fusion.h" namespace anakin { namespace ops { template -class SassConvBatchnormScaleHelper; +class ConvAffineChannelReluHelper; /// pooling op /** - * \brief SassConvBatchnormScale implementation class + * \brief ConvAffineChannelReluHelper implementation class * public inherit Operator */ template -class SassConvBatchnormScale : public Operator { +class ConvAffineChannelRelu : public Operator { public: - SassConvBatchnormScale() {} + ConvAffineChannelRelu() {} /// forward impl virtual void operator() (OpContext &ctx, const std::vector >& ins, std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator SassConvBatchnormScale< Ttype(" + LOG(ERROR) << "Not Impl Yet Operator ConvAffineChannelRelu< Ttype(" << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvBatchnormScaleHelper; + friend class ConvAffineChannelReluHelper; }; /** - * \brief SassConvBatchnormScale helper class to implement it + * \brief ConvAffineChannelRelu helper class to implement it * public inherit OperatorHelper - * including init resource and shape size in SassConvBatchnormScale context + * including init resource and shape size in ConvAffineChannelReluHelper context */ template -class SassConvBatchnormScaleHelper : public OperatorHelper { +class ConvAffineChannelReluHelper : public OperatorHelper { public: - SassConvBatchnormScaleHelper()=default; + ConvAffineChannelReluHelper()=default; - ~SassConvBatchnormScaleHelper(); + ~ConvAffineChannelReluHelper() {} Status InitParam() override; - + /** * \brief initial all the resource needed by pooling - * \param ctx stand for SassConvBatchnormScale operation context + * \param ctx stand for ConvAffineChannelRelu operation context * \param ins stand for input tensor vector * \param outs stand for output tensor vector * \return status - *///! 
initial all the resource needed by pooling + */ Status Init(OpContext &ctx, const std::vector >& ins, std::vector >& outs) override; @@ -85,18 +86,12 @@ class SassConvBatchnormScaleHelper : public OperatorHelper { std::vector >& outs) override; public: - ///< _param_conv_batchnorm_scale stand for SassConvBatchnormScale parameter - saber::ConvParam _param_conv_batchnorm_scale; - ///< _funcs_conv_batchnorm_scale stand for SassConvBatchnormScale function - saber::Conv::saber_type> _funcs_conv_batchnorm_scale; - -private: - ///< _dims stand for SassConvBatchnormScale size - PTuple _dims; + ///< _param_conv_affine_channel_relu stand for ConvAffineChannelRelu parameter + saber::ConvParam _param_conv_affine_channel_relu; + ///< _funcs_conv stand for ConvAffineChannelRelu function + saber::Conv::saber_type> _funcs_conv_affine_channel_relu; }; - - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/fusion_ops/conv_batchnorm.cpp b/framework/operators/fusion_ops/conv_batchnorm.cpp index d622b1ca6..896368b6b 100644 --- a/framework/operators/fusion_ops/conv_batchnorm.cpp +++ b/framework/operators/fusion_ops/conv_batchnorm.cpp @@ -19,7 +19,7 @@ void ConvBatchnorm::operator()(\ template Status ConvBatchnormHelper::InitParam() { LOG(WARNING) << "Parsing ConvBatchnorm op parameter."; - + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -33,6 +33,14 @@ Status ConvBatchnormHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -44,18 +52,26 @@ Status ConvBatchnormHelper::InitParam() { auto batch_norm_weight_3 = GET_PARAMETER(pblock_type, batchnorm_0_weight_3); auto batch_norm_weight_3_vector = batch_norm_weight_3.vector(); - // check if batchnorm parameters have been optimized + // check if batchnorm parameters have been optimized auto is_param_updated = CHECK_PARAMETER(is_param_updated); if (!is_param_updated) { SET_PARAMETER(is_param_updated, true, bool); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - update_weights_without_scale, weights,bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_without_scale, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_without_scale, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], @@ -66,13 +82,21 @@ Status ConvBatchnormHelper::InitParam() { pblock_type* bias = new pblock_type(); 
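// A minimal sketch of the update_weights_without_scale fusion dispatched above
// (WeightsFusion in framework/utils/parameter_fusion.h), assuming Caffe-style
// BatchNorm blobs: batchnorm_0_weight_1 = running mean, weight_2 = running variance,
// weight_3[0] = accumulation scale factor. Float-only and illustrative.
#include <cmath>
#include <vector>

static void fold_batchnorm(float* weights, float* bias,
                           int out_ch, int in_ch, int kh, int kw,
                           float scale_factor, float eps,
                           const std::vector<float>& mean,
                           const std::vector<float>& variance) {
    const int filter_size = in_ch * kh * kw;
    const float inv_sf = (scale_factor == 0.f) ? 0.f : 1.f / scale_factor;
    for (int oc = 0; oc < out_ch; ++oc) {
        const float alpha = 1.f / std::sqrt(variance[oc] * inv_sf + eps);
        const float beta = -mean[oc] * inv_sf * alpha;
        for (int i = 0; i < filter_size; ++i) {
            weights[oc * filter_size + i] *= alpha;       // scale the filter taps
        }
        bias[oc] = bias[oc] * alpha + beta;               // fold mean/variance into the bias
    }
}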
SET_PARAMETER(bias_term, true, bool); // set attr bias_term true SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights_without_scale, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, - batch_norm_weight_2_vector); + if (weights_dtype == AK_FLOAT){ + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_without_scale, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_without_scale, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -102,10 +126,10 @@ Status ConvBatchnormHelper::Init(OpContext& ctx, //different device please change here!!! saber::ImplEnum impl_e = VENDER_IMPL; - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { impl_e = SABER_IMPL; } - bool use_k1s1p0 = true; + bool use_k1s1p0 = (Ptype == Precision::FP32); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm.weight()->height() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm.weight()->width() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm.pad_h == 0); @@ -116,7 +140,7 @@ Status ConvBatchnormHelper::Init(OpContext& ctx, use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm.dilation_w == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm.group == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm.bias()->valid_size() > 0); - bool use_k3s1d1 = true; + bool use_k3s1d1 = (Ptype == Precision::FP32); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm.weight()->height() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm.weight()->width() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm.group == 1); @@ -124,16 +148,19 @@ Status ConvBatchnormHelper::Init(OpContext& ctx, use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm.stride_w == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm.dilation_h == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm.dilation_w == 1); - bool use_depthwise = true; + bool use_depthwise = (Ptype == Precision::FP32); use_depthwise = use_depthwise && (_param_conv_batchnorm.group == ins[0]->channel()); use_depthwise = use_depthwise && (_param_conv_batchnorm.group == outs[0]->channel()); - bool use_direct_k = true; + bool use_direct_k = (Ptype == Precision::FP32); use_direct_k = use_direct_k && (_param_conv_batchnorm.weight()->channel() >= 16); use_direct_k = use_direct_k && (_param_conv_batchnorm.group == 1); - if (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k) { + if (std::is_same::value + && (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k)) { + impl_e = SABER_IMPL; + } + if (std::is_same::value && Ptype == Precision::INT8) { impl_e = SABER_IMPL; } - SABER_CHECK(_funcs_conv_batchnorm.init(ins, outs, \ _param_conv_batchnorm, SPECIFY, impl_e, ctx)); @@ -181,8 +208,11 @@ Status ConvBatchnormHelper::InferShape(const #ifdef USE_ARM_PLACE INSTANCE_CONVBATCHNORM(ARM, Precision::FP32); 
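// The SABER_IMPL/VENDER_IMPL dispatch above repeats across the fusion ops in this
// patch: the shape heuristics that used to be unconditionally true now only fire for
// FP32, and NV INT8 always selects SABER_IMPL. A condensed restatement of those
// conditions, with illustrative parameter names (the real checks read the same
// quantities from the ConvParam members):
static bool prefer_saber_impl(bool is_fp32, int kh, int kw,
                              int pad_h, int pad_w, int stride_h, int stride_w,
                              int dil_h, int dil_w, int group,
                              int in_ch, int out_ch, int weight_ch, bool has_bias) {
    if (!is_fp32) return false;                           // heuristics only apply to FP32
    const bool k1s1p0 = kh == 1 && kw == 1 && pad_h == 0 && pad_w == 0 &&
                        stride_h == 1 && stride_w == 1 && dil_h == 1 && dil_w == 1 &&
                        group == 1 && has_bias;
    const bool k3s1d1 = kh == 3 && kw == 3 && group == 1 &&
                        stride_h == 1 && stride_w == 1 && dil_h == 1 && dil_w == 1;
    const bool depthwise = group == in_ch && group == out_ch;
    const bool direct_k = weight_ch >= 16 && group == 1;
    return k1s1p0 || k3s1d1 || depthwise || direct_k;
}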
+INSTANCE_CONVBATCHNORM(ARM, Precision::INT8); template class ConvBatchnormHelper; +template class ConvBatchnormHelper; ANAKIN_REGISTER_OP_HELPER(ConvBatchnorm, ConvBatchnormHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvBatchnorm, ConvBatchnormHelper, ARM, Precision::INT8); #endif #ifdef USE_CUDA @@ -213,10 +243,15 @@ ANAKIN_REGISTER_OP(ConvBatchnorm) #endif #ifdef USE_ARM_PLACE .__alias__("convolution_batchnorm") +.__alias__("convolution_batchnorm") #endif #if defined BUILD_LITE .__alias__("convolution_batchnorm") #endif +#ifdef AMD_GPU +//.__alias__("convolution_batchnorm") +//.__alias__("convolution_batchnorm") +#endif .num_in(1) .num_out(1) .Args("group", " group of conv ") diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale.cpp b/framework/operators/fusion_ops/conv_batchnorm_scale.cpp index c3ccf89ea..2e1e29d5d 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale.cpp +++ b/framework/operators/fusion_ops/conv_batchnorm_scale.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/fusion_ops/conv_batchnorm_scale.h" namespace anakin { @@ -19,7 +33,7 @@ void ConvBatchnormScale::operator()(\ template Status ConvBatchnormScaleHelper::InitParam() { LOG(WARNING) << "Parsing ConvBatchnormScale op parameter."; - + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -33,7 +47,14 @@ Status ConvBatchnormScaleHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weights_shape = weights.shape(); - + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); auto momentum = GET_PARAMETER(float, batchnorm_0_momentum); @@ -53,21 +74,30 @@ Status ConvBatchnormScaleHelper::InitParam() { auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); auto scale_weight_2_vector = scale_weight_2.vector(); - // check if batchnorm parameters have been optimized + // check if batchnorm parameters have been optimized auto is_param_updated = CHECK_PARAMETER(is_param_updated); if (!is_param_updated) { SET_PARAMETER(is_param_updated, true, bool); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - update_weights, weights,bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); - + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, bias, + weights_shape[0], 
weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } else{ + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -77,17 +107,27 @@ Status ConvBatchnormScaleHelper::InitParam() { pblock_type* bias = new pblock_type(); SET_PARAMETER(bias_term, true, bool); // set attr bias_term true SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - scale_weight_1_vector, - scale_weight_2_vector, - scale_bias_term); - + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], &(weights.d_tensor()), &(bias->d_tensor())); @@ -115,11 +155,17 @@ Status ConvBatchnormScaleHelper::Init(OpContext& ctx, auto bias_term = GET_PARAMETER(bool, bias_term); //different device please change here!!! 
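// With a trailing Scale layer, the fold also applies the per-channel scale
// (scale_0_weight_1) and optional shift (scale_0_weight_2). A minimal sketch of the
// combined update_weights fusion, reusing the alpha/beta terms from the plain
// batchnorm fold; float-only and illustrative:
#include <vector>

static void fold_batchnorm_scale(float* weights, float* bias,
                                 int out_ch, int filter_size,
                                 const std::vector<float>& alpha,  // 1/sqrt(var+eps) terms
                                 const std::vector<float>& beta,   // -mean*alpha terms
                                 const std::vector<float>& gamma,  // scale weights
                                 const std::vector<float>& shift,  // scale bias
                                 bool scale_bias_term) {
    for (int oc = 0; oc < out_ch; ++oc) {
        const float a = alpha[oc] * gamma[oc];
        for (int i = 0; i < filter_size; ++i) {
            weights[oc * filter_size + i] *= a;
        }
        bias[oc] = (bias[oc] * alpha[oc] + beta[oc]) * gamma[oc]
                 + (scale_bias_term ? shift[oc] : 0.f);
    }
}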
+#ifdef AMD_GPU + saber::ImplEnum impl_e = SABER_IMPL; +#else saber::ImplEnum impl_e = VENDER_IMPL; - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { + impl_e = SABER_IMPL; + } + if (std::is_same::value && Ptype == Precision::INT8) { impl_e = SABER_IMPL; } - bool use_k1s1p0 = true; + bool use_k1s1p0 = (Ptype == Precision::FP32); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale.weight()->height() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale.weight()->width() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale.pad_h == 0); @@ -130,7 +176,7 @@ Status ConvBatchnormScaleHelper::Init(OpContext& ctx, use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale.dilation_w == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale.group == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale.bias()->valid_size() > 0); - bool use_k3s1d1 = true; + bool use_k3s1d1 = (Ptype == Precision::FP32); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale.weight()->height() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale.weight()->width() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale.group == 1); @@ -138,15 +184,17 @@ Status ConvBatchnormScaleHelper::Init(OpContext& ctx, use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale.stride_w == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale.dilation_h == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale.dilation_w == 1); - bool use_depthwise = true; + bool use_depthwise = (Ptype == Precision::FP32); use_depthwise = use_depthwise && (_param_conv_batchnorm_scale.group == ins[0]->channel()); use_depthwise = use_depthwise && (_param_conv_batchnorm_scale.group == outs[0]->channel()); - bool use_direct_k = true; + bool use_direct_k = (Ptype == Precision::FP32); use_direct_k = use_direct_k && (_param_conv_batchnorm_scale.weight()->channel() >= 16); use_direct_k = use_direct_k && (_param_conv_batchnorm_scale.group == 1); - if (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k) { + if (std::is_same::value + && (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k)) { impl_e = SABER_IMPL; } +#endif SABER_CHECK(_funcs_conv_batchnorm_scale.init(ins, outs, \ _param_conv_batchnorm_scale, SPECIFY, impl_e, ctx)); @@ -195,8 +243,11 @@ Status ConvBatchnormScaleHelper::InferShape(const #ifdef USE_ARM_PLACE INSTANCE_CONVBATCHNORMSCALE(ARM, Precision::FP32); +INSTANCE_CONVBATCHNORMSCALE(ARM, Precision::INT8); template class ConvBatchnormScaleHelper; +template class ConvBatchnormScaleHelper; ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, ARM, Precision::INT8); #endif #ifdef USE_CUDA @@ -209,6 +260,8 @@ ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, NV, Prec #ifdef USE_X86_PLACE INSTANCE_CONVBATCHNORMSCALE(X86, Precision::FP32); ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, X86, Precision::FP32); +INSTANCE_CONVBATCHNORMSCALE(X86, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, X86, Precision::INT8); #endif #if defined BUILD_LITE @@ -220,10 +273,16 @@ ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, X86, Pre //#ifdef USE_X86_PLACE //INSTANCE_CONVBATCHNORMSCALE(X86, Precision::FP32); //template class ConvBatchnormScaleHelper; -//ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, X86, 
+//ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, X86, // Precision::FP32); //#endif +#ifdef AMD_GPU +INSTANCE_CONVBATCHNORMSCALE(AMD, Precision::FP32); +template class ConvBatchnormScaleHelper; +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScale, ConvBatchnormScaleHelper, AMD, Precision::FP32); +#endif + //! register op ANAKIN_REGISTER_OP(ConvBatchnormScale) .Doc("ConvBatchnormScale fusion operator") @@ -233,10 +292,14 @@ ANAKIN_REGISTER_OP(ConvBatchnormScale) #endif #ifdef USE_ARM_PLACE .__alias__("convolution_batchnorm_scale") +.__alias__("convolution_batchnorm_scale") #endif #if defined BUILD_LITE .__alias__("convolution_batchnorm_scale") #endif +#ifdef AMD_GPU +.__alias__("convolution_batchnorm_scale") +#endif .num_in(1) .num_out(1) .Args("group", " group of conv ") diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp b/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp index c50f4a478..223824652 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp +++ b/framework/operators/fusion_ops/conv_batchnorm_scale_relu.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/fusion_ops/conv_batchnorm_scale_relu.h" namespace anakin { @@ -20,7 +34,7 @@ void ConvBatchnormScaleRelu::operator()(\ template Status ConvBatchnormScaleReluHelper::InitParam() { DLOG(WARNING) << "Parsing ConvBatchnormScaleRelu op parameter."; - + // get conv param auto group = GET_PARAMETER(int, group); auto bias_term = GET_PARAMETER(bool, bias_term); @@ -34,6 +48,14 @@ Status ConvBatchnormScaleReluHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -52,26 +74,36 @@ Status ConvBatchnormScaleReluHelper::InitParam() { auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); auto scale_weight_1_vector = scale_weight_1.vector(); auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); - auto scale_weight_2_vector = scale_weight_2.vector(); + auto scale_weight_2_vector = scale_weight_2.vector(); // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); ActivationParam active_param(Active_relu, alpha); // TEMP - // check if batchnorm parameters have been optimized + // check if batchnorm parameters have been optimized auto is_param_updated = CHECK_PARAMETER(is_param_updated); if (!is_param_updated) { SET_PARAMETER(is_param_updated, true, bool); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - update_weights, - weights, bias, weights_shape[0], 
weights_shape[1], - weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, scale_bias_term); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, + weights, bias, weights_shape[0], weights_shape[1], + weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, + weights, bias, weights_shape[0], weights_shape[1], + weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], @@ -83,13 +115,21 @@ Status ConvBatchnormScaleReluHelper::InitParam() { pblock_type* bias = new pblock_type(); SET_PARAMETER(bias_term, true, bool); // set attr bias_term true SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, scale_bias_term); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, scale_bias_term); + } else{ + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], @@ -121,11 +161,17 @@ Status ConvBatchnormScaleReluHelper::Init(OpContext& ctx, auto bias_term = GET_PARAMETER(bool, bias_term); //different device please change here!!! 
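// The ActivationParam(Active_relu, alpha) attached to the ConvParam above lets the
// fused kernel apply the activation in its epilogue rather than as a separate op.
// Per output element it amounts to the following sketch, assuming alpha carries
// relu_0_alpha (0 gives plain ReLU; a non-zero value would act as a leaky slope):
static inline float relu_epilogue(float v, float alpha) {
    return v > 0.f ? v : alpha * v;
}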
+#ifdef AMD_GPU + saber::ImplEnum impl_e = SABER_IMPL; +#else saber::ImplEnum impl_e = VENDER_IMPL; - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { impl_e = SABER_IMPL; } - bool use_k1s1p0 = true; + if (std::is_same::value && Ptype == Precision::INT8) { + impl_e = SABER_IMPL; + } + bool use_k1s1p0 = (Ptype == Precision::FP32); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale_relu.weight()->height() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale_relu.weight()->width() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale_relu.pad_h == 0); @@ -136,7 +182,7 @@ Status ConvBatchnormScaleReluHelper::Init(OpContext& ctx, use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale_relu.dilation_w == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale_relu.group == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_batchnorm_scale_relu.bias()->valid_size() > 0); - bool use_k3s1d1 = true; + bool use_k3s1d1 = (Ptype == Precision::FP32); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale_relu.weight()->height() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale_relu.weight()->width() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale_relu.group == 1); @@ -144,15 +190,26 @@ Status ConvBatchnormScaleReluHelper::Init(OpContext& ctx, use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale_relu.stride_w == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale_relu.dilation_h == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_batchnorm_scale_relu.dilation_w == 1); - bool use_depthwise = true; + bool use_depthwise = (Ptype == Precision::FP32); use_depthwise = use_depthwise && (_param_conv_batchnorm_scale_relu.group == ins[0]->channel()); use_depthwise = use_depthwise && (_param_conv_batchnorm_scale_relu.group == outs[0]->channel()); - bool use_direct_k = true; + bool use_direct_k = (Ptype == Precision::FP32); use_direct_k = use_direct_k && (_param_conv_batchnorm_scale_relu.weight()->channel() >= 16); use_direct_k = use_direct_k && (_param_conv_batchnorm_scale_relu.group == 1); - if (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k) { + if (std::is_same::value + && (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k)) { impl_e = SABER_IMPL; } + /*auto valid_shape = ins[0]->valid_shape(); + if((valid_shape[2] <=4) && (valid_shape[3] <= 8) && \ + (_param_conv_batchnorm_scale_relu.weight()->height() == \ + _param_conv_batchnorm_scale_relu.weight()->width() == 3) && \ + (_param_conv_batchnorm_scale_relu.stride_h == _param_conv_batchnorm_scale_relu.stride_w == 1) &&\ + (_param_conv_batchnorm_scale_relu.dilation_h == _param_conv_batchnorm_scale_relu.dilation_w ==1) && \ + (_param_conv_batchnorm_scale_relu.group == 1)) { + impl_e = VENDER_IMPL; + } */ +#endif SABER_CHECK(_funcs_conv_batchnorm_scale_relu.init(ins, outs, _param_conv_batchnorm_scale_relu, SPECIFY, impl_e, ctx)); @@ -210,19 +267,30 @@ ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, #ifdef USE_X86_PLACE INSTANCE_CONVBATCHNORMSCALERELU(X86, Precision::FP32); ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, X86, Precision::FP32); +INSTANCE_CONVBATCHNORMSCALERELU(X86, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, X86, Precision::INT8); #endif //#ifdef USE_X86_PLACE //template class ConvBatchnormScaleReluHelper; //INSTANCE_CONVBATCHNORMSCALERELU(X86, Precision::FP32); -//ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, 
ConvBatchnormScaleReluHelper, X86, +//ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, X86, // Precision::FP32); //#endif #ifdef USE_ARM_PLACE INSTANCE_CONVBATCHNORMSCALERELU(ARM, Precision::FP32); +INSTANCE_CONVBATCHNORMSCALERELU(ARM, Precision::INT8); template class ConvBatchnormScaleReluHelper; +template class ConvBatchnormScaleReluHelper; ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, ARM, Precision::INT8); +#endif + +#ifdef AMD_GPU +INSTANCE_CONVBATCHNORMSCALERELU(AMD, Precision::FP32); +template class ConvBatchnormScaleReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleRelu, ConvBatchnormScaleReluHelper, AMD, Precision::FP32); #endif #if defined BUILD_LITE @@ -239,6 +307,10 @@ ANAKIN_REGISTER_OP(ConvBatchnormScaleRelu) #endif #ifdef USE_ARM_PLACE .__alias__("convolution_batchnorm_scale_relu") +.__alias__("convolution_batchnorm_scale_relu") +#endif +#ifdef AMD_GPU +.__alias__("convolution_batchnorm_scale_relu") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp b/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp index 0b4925e1c..d8e561451 100644 --- a/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp +++ b/framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ #include "framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h" namespace anakin { @@ -26,7 +40,7 @@ template Status ConvBatchnormScaleReluPoolHelper::InitParam() { DLOG(WARNING) << "Parsing ConvBatchnormScaleReluPool op parameter."; - ConvParam conv_param_temp; + ConvParam conv_param_temp; PoolingParam pooling_param_temp; // get conv param @@ -39,10 +53,18 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -81,7 +103,7 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { pooling_param_temp = pooling_param; } else if (pool_method == "AVG") { - PoolingParam pooling_param(pool_size[0], pool_size[1], + PoolingParam pooling_param(pool_size[0], pool_size[1], pool_padding[0], pool_padding[1], pool_strides[0], pool_strides[1], Pooling_average_include_padding, global_pooling, cmp_out_shape_floor_as_conv); @@ -90,20 +112,30 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { LOG(FATAL) << " SassConvBatchnormScaleReluPool fusion op doesn't support : " << pool_method << " pooling."; } - // check if batchnorm parameters have been optimized + // check if batchnorm parameters have been optimized auto is_param_updated = CHECK_PARAMETER(is_param_updated); if (!is_param_updated) { SET_PARAMETER(is_param_updated, true, bool); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - update_weights, weights,bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } else{ + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -114,14 +146,23 @@ Status ConvBatchnormScaleReluPoolHelper::InitParam() { pblock_type* bias = new pblock_type(); SET_PARAMETER(bias_term, true, bool); // set attr bias_term true SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - 
batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + }else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -152,11 +193,28 @@ Status ConvBatchnormScaleReluPoolHelper::Init(OpContext &ct auto strides = GET_PARAMETER(PTuple, strides); auto weights = GET_PARAMETER(PBlock, weight_1); auto bias_term = GET_PARAMETER(bool, bias_term); + auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); +#ifdef AMD_GPU saber::ImplEnum impl_e = SABER_IMPL; - if (std::is_same::value) { +#else + saber::ImplEnum impl_e = VENDER_IMPL; + if (std::is_same::value || std::is_same::value) { impl_e = SABER_IMPL; } + if (std::is_same::value && (Ptype == Precision::INT8)) { + impl_e = SABER_IMPL; + } + /*auto valid_shape = ins[0]->valid_shape(); + if((valid_shape[2] <=4) && (valid_shape[3] <= 8) && \ + (weights.d_tensor().height() == \ + weights.d_tensor().width() == 3) && \ + (strides[0] == strides[1] == 1) &&\ + (dilation_rate[0] == dilation_rate[1] == 1) && \ + (group == 1)) { + impl_e = VENDER_IMPL; + }*/ +#endif _funcs_conv_batchnorm_scale_relu_pooling.init(ins, outs, _param_conv_batchnorm_scale_relu_pooling, SPECIFY, impl_e, ctx); @@ -166,7 +224,7 @@ Status ConvBatchnormScaleReluPoolHelper::Init(OpContext &ct SET_PARAMETER(is_weights_transed, true, bool); if (bias_term) { auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( + graph::GraphGlobalMem::Global().template apply( std::bind(&ConvPooling::saber_type>::trans_weights, &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), @@ -175,7 +233,7 @@ Status ConvBatchnormScaleReluPoolHelper::Init(OpContext &ct bias.map_to_host(); } else { PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( + graph::GraphGlobalMem::Global().template apply( std::bind(&ConvPooling::saber_type>::trans_weights, &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), @@ -187,7 +245,7 @@ Status ConvBatchnormScaleReluPoolHelper::Init(OpContext &ct } else { PBlock weight_empty; PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( + graph::GraphGlobalMem::Global().template apply( std::bind(&ConvPooling::saber_type>::trans_weights, &_funcs_conv_batchnorm_scale_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.pad_w, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_h, _param_conv_batchnorm_scale_relu_pooling.conv_param.dilation_w, @@ -220,8 +278,11 @@ ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, 
ConvBatchnormScaleReluPool #ifdef USE_ARM_PLACE INSTANCE_CONVBATCHNORMSCALERELUPOOLING(ARM, Precision::FP32); +INSTANCE_CONVBATCHNORMSCALERELUPOOLING(ARM, Precision::INT8); template class ConvBatchnormScaleReluPoolHelper; +template class ConvBatchnormScaleReluPoolHelper; ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, ARM, Precision::INT8); #endif #ifdef BUILD_LITE @@ -230,6 +291,12 @@ template class ConvBatchnormScaleReluPoolHelper; ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, X86, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_CONVBATCHNORMSCALERELUPOOLING(AMD, Precision::FP32); +template class ConvBatchnormScaleReluPoolHelper; +ANAKIN_REGISTER_OP_HELPER(ConvBatchnormScaleReluPool, ConvBatchnormScaleReluPoolHelper, AMD, Precision::FP32); +#endif + //! register op ANAKIN_REGISTER_OP(ConvBatchnormScaleReluPool) .Doc("ConvBatchnormScaleReluPool fusion operator") @@ -239,10 +306,14 @@ ANAKIN_REGISTER_OP(ConvBatchnormScaleReluPool) #endif #ifdef USE_ARM_PLACE .__alias__("convolution_batchnorm_scale_relu_pooling") +.__alias__("convolution_batchnorm_scale_relu_pooling") #endif #ifdef BUILD_LITE .__alias__("convolution_batchnorm_scale_relu_pooling") #endif +#ifdef AMD_GPU +.__alias__("convolution_batchnorm_scale_relu_pooling") +#endif .num_in(1) .num_out(1) .Args("group", " group of conv ") diff --git a/framework/operators/fusion_ops/conv_eltwise.cpp b/framework/operators/fusion_ops/conv_eltwise.cpp index d3f326e5e..a053dc1a3 100644 --- a/framework/operators/fusion_ops/conv_eltwise.cpp +++ b/framework/operators/fusion_ops/conv_eltwise.cpp @@ -6,17 +6,17 @@ namespace ops { #define INSTANCE_CONVOLUTION(Ttype, Ptype) \ template<> \ -void ConEltwise::operator()(OpContext& ctx, \ +void ConvEltwise::operator()(OpContext& ctx, \ const std::vector >& ins, \ std::vector >& outs) { \ - auto* impl = static_cast*>(this->_helper); \ - auto& param = static_cast*> \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ (this->_helper)->_param_conv_eltwise; \ impl->_funcs_conv_eltwise(ins, outs, param, ctx); \ } template -Status ConEltwiseHelper::InitParam() { +Status ConvEltwiseHelper::InitParam() { DLOG(WARNING) << "Parsing Conv_eltwise op parameter."; saber::ConvParam tmp_conv_param; saber::EltwiseParam tmp_eltwise_param; @@ -34,6 +34,14 @@ Status ConEltwiseHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // check if this op has batchnorm parameters auto has_batchnorm = CHECK_PARAMETER(batchnorm_0_epsilon); @@ -57,20 +65,30 @@ Status ConEltwiseHelper::InitParam() { auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); auto scale_weight_2_vector = scale_weight_2.vector(); - // check if batchnorm parameters have been optimized + // check if batchnorm parameters have been optimized auto is_param_updated = CHECK_PARAMETER(is_param_updated); if (!is_param_updated) { SET_PARAMETER(is_param_updated, true, bool); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply( - 
update_weights, weights,bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, batch_norm_weight_2_vector, - scale_weight_1_vector, scale_weight_2_vector, - scale_bias_term); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, batch_norm_weight_2_vector, + scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], @@ -81,16 +99,27 @@ Status ConEltwiseHelper::InitParam() { pblock_type* bias = new pblock_type(); SET_PARAMETER(bias_term, true, bool); // set attr bias_term true SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias - - graph::GraphGlobalMem::Global().template apply( - update_weights, weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - false, batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - scale_weight_1_vector, - scale_weight_2_vector, - scale_bias_term); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -126,7 +155,7 @@ Status ConEltwiseHelper::InitParam() { if (has_merge_type) { auto type = GET_PARAMETER(std::string, merge_type); auto coeff = GET_PARAMETER(PTuple, merge_coeff); - + auto has_alpha = CHECK_PARAMETER(merge_relu_0_alpha); EltwiseType elt_type; @@ -148,14 +177,22 @@ Status ConEltwiseHelper::InitParam() { saber::ConvEltwiseParam conv_eltwise_param(tmp_conv_param, tmp_eltwise_param); _param_conv_eltwise = conv_eltwise_param; } else { - LOG(FATAL) << "ConEltwise Op must have been merged eltwise or eltwise + activation."; + LOG(FATAL) << "ConvEltwise Op must have been merged eltwise or eltwise + activation."; + } + if ((std::is_same::value || std::is_same::value)&& Ptype == Precision::INT8) { + auto scale_0 = GET_PARAMETER(float, scale_0); + auto scale_3 = GET_PARAMETER(float, scale_3); + auto be_eltwise_dtype = GET_PARAMETER(DataType, be_eltwise_dtype); + float beta = scale_0; + _param_conv_eltwise.conv_param.beta = beta; + _param_conv_eltwise.conv_param.beta_type = be_eltwise_dtype; } 
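The h_tensor().set_scale(...) blocks this patch adds to the various InitParam() bodies all do the same thing: promote a single per-tensor weight quantization scale to one scale per output filter, so the INT8 kernels can index scale[c] uniformly. A minimal standalone sketch of that behaviour (assumed semantics, simplified):

#include <vector>

std::vector<float> expand_weight_scale(const std::vector<float>& scale, int filter_num) {
    if (scale.size() == 1) {
        return std::vector<float>(filter_num, scale[0]);  // broadcast the per-tensor scale
    }
    return scale;  // already per-channel, leave as-is
}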
- +// LOG(ERROR) << "framework alpha: "<< _param_conv_eltwise.conv_param.alpha << " beta: " << _param_conv_eltwise.conv_param.beta; return Status::OK(); } template -Status ConEltwiseHelper::Init(OpContext& ctx, +Status ConvEltwiseHelper::Init(OpContext& ctx, const std::vector >& ins, std::vector >& outs) { auto group = GET_PARAMETER(int, group); @@ -165,6 +202,13 @@ Status ConEltwiseHelper::Init(OpContext& ctx, //different device pleace change here.. saber::ImplEnum impl_e = SABER_IMPL; + // TODO !! output scale is the eltwise_relu output scale!!! + // THIS IS NOT SUPPORT TO BE THIS WAY, the output scale is not the same with conv_eltwise output scale. + if ((std::is_same::value||std::is_same::value) && Ptype == Precision::INT8) { + auto scale_3 = GET_PARAMETER(float, scale_3); + outs[0]->set_scale({scale_3}); + } + SABER_CHECK(_funcs_conv_eltwise.init(ins, outs, _param_conv_eltwise, SPECIFY, impl_e, ctx)); // check if weights have been transposed @@ -173,16 +217,16 @@ Status ConEltwiseHelper::Init(OpContext& ctx, SET_PARAMETER(is_weights_transed, true, bool); if (bias_term) { auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvEltwise::saber_type>::trans_weights, + graph::GraphGlobalMem::Global().template apply( + std::bind(&saber::ConvEltwise::saber_type>::trans_weights, &_funcs_conv_eltwise, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), weights.d_tensor(), bias.d_tensor(), _param_conv_eltwise.conv_param.pad_h, _param_conv_eltwise.conv_param.pad_w, _param_conv_eltwise.conv_param.dilation_h, _param_conv_eltwise.conv_param.dilation_w, strides[0], strides[1], group, impl_e); bias.map_to_host(); } else { PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvEltwise::saber_type>::trans_weights, + graph::GraphGlobalMem::Global().template apply( + std::bind(&saber::ConvEltwise::saber_type>::trans_weights, &_funcs_conv_eltwise, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), weights.d_tensor(), bias_empty.d_tensor(), _param_conv_eltwise.conv_param.pad_h, _param_conv_eltwise.conv_param.pad_w, _param_conv_eltwise.conv_param.dilation_h, _param_conv_eltwise.conv_param.dilation_w, strides[0], strides[1], group, impl_e); @@ -192,16 +236,27 @@ Status ConEltwiseHelper::Init(OpContext& ctx, PBlock weight_empty; PBlock bias_empty; graph::GraphGlobalMem::Global().template apply( - std::bind(&ConvEltwise::saber_type>::trans_weights, + std::bind(&saber::ConvEltwise::saber_type>::trans_weights, &_funcs_conv_eltwise, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_eltwise.conv_param.pad_h, _param_conv_eltwise.conv_param.pad_w, _param_conv_eltwise.conv_param.dilation_h, _param_conv_eltwise.conv_param.dilation_w, strides[0], strides[1], group, impl_e); } + // TODO beta need some more data to compute!!! this part perhapes will lead some bugs... 
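The TODOs around beta concern how the eltwise operand reaches the INT8 accumulator's numeric domain. A reasoning sketch, under the usual symmetric-quantization convention x ≈ x_q * in_scale and w ≈ w_q * weight_scale (my reading of the surrounding code, not part of the patch): the int32 accumulator holds roughly conv(x, w) / (in_scale * weight_scale), so an operand quantized with scale_0 must be rescaled before it is accumulated, which is what the beta / in_scale / weight_scale division below amounts to.

// Net effect of setting beta = scale_0 in InitParam() and dividing it in Init():
float requant_beta(float scale_0, float in_scale, float weight_scale) {
    return scale_0 / (in_scale * weight_scale);
}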
+ // TODO at least check for scale + // FIXME don`t add other device for this + if (std::is_same::value && Ptype == Precision::INT8) { + float beta = _param_conv_eltwise.conv_param.beta; + float in_scale = ins[0]->get_scale()[0]; + float weight_scale = _param_conv_eltwise.conv_param.weight()->get_scale()[0]; + beta = beta / in_scale / weight_scale; +// LOG(ERROR) << " beta = " << beta ; + _param_conv_eltwise.conv_param.beta = beta; + } return Status::OK(); } template -Status ConEltwiseHelper::InferShape(const +Status ConvEltwiseHelper::InferShape(const std::vector >& ins, std::vector >& outs) { SABER_CHECK(_funcs_conv_eltwise.compute_output_shape(ins, outs, _param_conv_eltwise)); @@ -209,50 +264,51 @@ Status ConEltwiseHelper::InferShape(const } #ifdef USE_CUDA -template class ConEltwiseHelper; -template class ConEltwiseHelper; -template class ConEltwiseHelper; +template class ConvEltwiseHelper; +template class ConvEltwiseHelper; +template class ConvEltwiseHelper; INSTANCE_CONVOLUTION(NV, Precision::FP32); INSTANCE_CONVOLUTION(NV, Precision::INT8); -ANAKIN_REGISTER_OP_HELPER(ConEltwise, ConEltwiseHelper, NV, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(ConEltwise, ConEltwiseHelper, NV, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvEltwise, ConvEltwiseHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvEltwise, ConvEltwiseHelper, NV, Precision::INT8); #endif #ifdef USE_X86_PLACE -template class ConEltwiseHelper; -template class ConEltwiseHelper; -template class ConEltwiseHelper; +template class ConvEltwiseHelper; +template class ConvEltwiseHelper; +template class ConvEltwiseHelper; INSTANCE_CONVOLUTION(X86, Precision::FP32); INSTANCE_CONVOLUTION(X86, Precision::INT8); -ANAKIN_REGISTER_OP_HELPER(ConEltwise, ConEltwiseHelper, X86, Precision::FP32); -ANAKIN_REGISTER_OP_HELPER(ConEltwise, ConEltwiseHelper, X86, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvEltwise, ConvEltwiseHelper, X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvEltwise, ConvEltwiseHelper, X86, Precision::INT8); #endif #ifdef USE_ARM_PLACE INSTANCE_CONVOLUTION(ARM, Precision::FP32); -template class ConEltwiseHelper; -ANAKIN_REGISTER_OP_HELPER(ConEltwise, ConEltwiseHelper, ARM, Precision::FP32); +template class ConvEltwiseHelper; +ANAKIN_REGISTER_OP_HELPER(ConvEltwise, ConvEltwiseHelper, ARM, Precision::FP32); #endif #ifdef AMD_GPU INSTANCE_CONVOLUTION(AMD, Precision::FP32); -template class ConEltwiseHelper; -template class ConEltwiseHelper; -template class ConEltwiseHelper; -ANAKIN_REGISTER_OP_HELPER(ConEltwise, ConEltwiseHelper, AMD, Precision::FP32); +template class ConvEltwiseHelper; +template class ConvEltwiseHelper; +template class ConvEltwiseHelper; +ANAKIN_REGISTER_OP_HELPER(ConvEltwise, ConvEltwiseHelper, AMD, Precision::FP32); #endif //! 
register op -ANAKIN_REGISTER_OP(ConEltwise) +ANAKIN_REGISTER_OP(ConvEltwise) .Doc("ConvEltwise operator") #ifdef USE_X86_PLACE .__alias__("ConvEltwise") #endif #ifdef USE_CUDA .__alias__("ConvEltwise") +.__alias__("ConvEltwise") #endif #ifdef AMD_GPU .__alias__("ConvEltwise") diff --git a/framework/operators/fusion_ops/conv_eltwise.h b/framework/operators/fusion_ops/conv_eltwise.h index cca8377d6..a167e9b65 100644 --- a/framework/operators/fusion_ops/conv_eltwise.h +++ b/framework/operators/fusion_ops/conv_eltwise.h @@ -27,7 +27,7 @@ namespace anakin { namespace ops { template -class ConEltwiseHelper; +class ConvEltwiseHelper; /// pooling op /** @@ -35,19 +35,19 @@ class ConEltwiseHelper; * public inheritance Operator */ template -class ConEltwise : public Operator { +class ConvEltwise : public Operator { public: - ConEltwise() {} + ConvEltwise() {} /// forward impl virtual void operator() (OpContext &ctx, const std::vector >& ins, std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator ConEltwise< Ttype(" + LOG(ERROR) << "Not Impl Yet Operator ConvEltwise< Ttype(" << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class ConEltwiseHelper; + friend class ConvEltwiseHelper; }; /** @@ -56,11 +56,11 @@ class ConEltwise : public Operator { * including init resource and shape size in convolution context */ template -class ConEltwiseHelper : public OperatorHelper { +class ConvEltwiseHelper : public OperatorHelper { public: - ConEltwiseHelper()=default; + ConvEltwiseHelper()=default; - ~ConEltwiseHelper(){} + ~ConvEltwiseHelper(){} Status InitParam() override; @@ -91,7 +91,7 @@ class ConEltwiseHelper : public OperatorHelper { saber::ConvEltwise::saber_type> _funcs_conv_eltwise; private: - ///< _dims stand for ConEltwise size + ///< _dims stand for ConvEltwise size PTuple _dims; }; diff --git a/framework/operators/fusion_ops/conv_relu.cpp b/framework/operators/fusion_ops/conv_relu.cpp index f0038c08a..17c9c57ef 100644 --- a/framework/operators/fusion_ops/conv_relu.cpp +++ b/framework/operators/fusion_ops/conv_relu.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/fusion_ops/conv_relu.h" namespace anakin { @@ -30,10 +44,16 @@ Status ConvReluHelper::InitParam() { auto filter_num = GET_PARAMETER(int, filter_num); auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); ActivationParam active_param(Active_relu, alpha); // TEMP @@ -67,11 +87,17 @@ Status ConvReluHelper::Init(OpContext& ctx, auto bias_term = GET_PARAMETER(bool, bias_term); //different device please change here!!! 
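The implementation-selection block that follows mirrors the one added to conv_batchnorm_scale_relu above. Because the template arguments are elided in this copy of the diff, the predicates below are my reading of the intent rather than a literal transcription; as a condensed, self-contained sketch:

// Prefer the vendor (cuDNN) path on NV by default, and fall back to Anakin's own
// SABER kernels when no vendor library applies or a hand-tuned kernel exists.
enum Impl { SABER_IMPL, VENDER_IMPL };
Impl pick_impl(bool is_amd, bool is_arm_or_x86, bool is_nv, bool int8,
               bool shape_has_tuned_kernel /* 1x1 s1 p0, 3x3 s1 d1, depthwise, direct */) {
    if (is_amd)                          return SABER_IMPL;  // AMD builds only ship SABER kernels
    if (is_arm_or_x86)                   return SABER_IMPL;  // no vendor conv library on these targets
    if (is_nv && int8)                   return SABER_IMPL;  // INT8 on NV uses Anakin's kernels
    if (is_nv && shape_has_tuned_kernel) return SABER_IMPL;  // FP32 shapes with tuned SABER kernels
    return VENDER_IMPL;
}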
+#ifdef AMD_GPU + saber::ImplEnum impl_e = SABER_IMPL; +#else saber::ImplEnum impl_e = VENDER_IMPL; - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { impl_e = SABER_IMPL; } - bool use_k1s1p0 = true; + if (std::is_same::value && Ptype == Precision::INT8) { + impl_e = SABER_IMPL; + } + bool use_k1s1p0 = (Ptype == Precision::FP32); use_k1s1p0 = use_k1s1p0 && (_param_conv_relu.weight()->height() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_relu.weight()->width() == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_relu.pad_h == 0); @@ -82,7 +108,7 @@ Status ConvReluHelper::Init(OpContext& ctx, use_k1s1p0 = use_k1s1p0 && (_param_conv_relu.dilation_w == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_relu.group == 1); use_k1s1p0 = use_k1s1p0 && (_param_conv_relu.bias()->valid_size() > 0); - bool use_k3s1d1 = true; + bool use_k3s1d1 = (Ptype == Precision::FP32); use_k3s1d1 = use_k3s1d1 && (_param_conv_relu.weight()->height() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_relu.weight()->width() == 3); use_k3s1d1 = use_k3s1d1 && (_param_conv_relu.group == 1); @@ -90,15 +116,28 @@ Status ConvReluHelper::Init(OpContext& ctx, use_k3s1d1 = use_k3s1d1 && (_param_conv_relu.stride_w == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_relu.dilation_h == 1); use_k3s1d1 = use_k3s1d1 && (_param_conv_relu.dilation_w == 1); - bool use_depthwise = true; + bool use_depthwise = (Ptype == Precision::FP32); use_depthwise = use_depthwise && (_param_conv_relu.group == ins[0]->channel()); use_depthwise = use_depthwise && (_param_conv_relu.group == outs[0]->channel()); - bool use_direct_k = true; + bool use_direct_k = (Ptype == Precision::FP32); use_direct_k = use_direct_k && (_param_conv_relu.weight()->channel() >= 16); use_direct_k = use_direct_k && (_param_conv_relu.group == 1); - if (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k) { + if (std::is_same::value + && (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k)) { impl_e = SABER_IMPL; } + + /*auto valid_shape = ins[0]->valid_shape(); + if((valid_shape[2] <=4) && (valid_shape[3] <= 8) && \ + (_param_conv_relu.weight()->height() == \ + _param_conv_relu.weight()->width() == 3) && \ + (_param_conv_relu.stride_h == _param_conv_relu.stride_w == 1) &&\ + (_param_conv_relu.dilation_h == _param_conv_relu.dilation_w ==1) && \ + (_param_conv_relu.group == 1)) { + impl_e = VENDER_IMPL; + }*/ +#endif + SABER_CHECK(_funcs_conv_relu.init(ins, outs, _param_conv_relu, SPECIFY, impl_e, ctx)); @@ -152,15 +191,19 @@ ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, NV, Precision::INT8); #ifdef USE_X86_PLACE INSTANCE_CONVRELU(X86, Precision::FP32); -//template class ConvReluHelper; +INSTANCE_CONVRELU(X86, Precision::INT8); ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, X86, Precision::INT8); #endif #ifdef USE_ARM_PLACE INSTANCE_CONVRELU(ARM, Precision::FP32); +INSTANCE_CONVRELU(ARM, Precision::INT8); template class ConvReluHelper; +template class ConvReluHelper; ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, ARM, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvRelu, ConvReluHelper, ARM, Precision::INT8); #endif #ifdef AMD_GPU @@ -183,6 +226,7 @@ ANAKIN_REGISTER_OP(ConvRelu) #endif #ifdef USE_ARM_PLACE .__alias__("conv_relu") +.__alias__("conv_relu") #endif #ifdef AMD_GPU .__alias__("conv_relu") @@ -190,9 +234,9 @@ ANAKIN_REGISTER_OP(ConvRelu) #if defined BUILD_LITE .__alias__("power") #endif -//#ifdef USE_X86_PLACE -//.__alias__("power") -//#endif +#ifdef 
USE_X86_PLACE +.__alias__("conv_relu") +#endif .num_in(1) .num_out(1) .Args("group", " group of conv ") diff --git a/framework/operators/fusion_ops/conv_relu_pool.cpp b/framework/operators/fusion_ops/conv_relu_pool.cpp index 4537a1c46..029625191 100644 --- a/framework/operators/fusion_ops/conv_relu_pool.cpp +++ b/framework/operators/fusion_ops/conv_relu_pool.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/fusion_ops/conv_relu_pool.h" namespace anakin { @@ -37,8 +51,13 @@ Status ConvReluPoolHelper::InitParam() { using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - auto weight_vec = weights.vector(); - + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); ActivationParam active_param(Active_relu, alpha); // Temp @@ -87,7 +106,7 @@ Status ConvReluPoolHelper::InitParam() { } template -Status ConvReluPoolHelper::Init(OpContext &ctx, +Status ConvReluPoolHelper::Init(OpContext &ctx, const std::vector >& ins, std::vector >& outs) { @@ -96,10 +115,15 @@ Status ConvReluPoolHelper::Init(OpContext &ctx, auto weights = GET_PARAMETER(PBlock, weight_1); auto bias_term = GET_PARAMETER(bool, bias_term); +#ifdef AMD_GPU + saber::ImplEnum impl_e = SABER_IMPL; +#else saber::ImplEnum impl_e = SABER_IMPL; - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { impl_e = SABER_IMPL; } +#endif + _funcs_conv_relu_pooling.init(ins, outs, _param_conv_relu_pooling, SPECIFY, impl_e, ctx); @@ -109,7 +133,7 @@ Status ConvReluPoolHelper::Init(OpContext &ctx, SET_PARAMETER(is_weights_transed, true, bool); if (bias_term) { auto bias = GET_PARAMETER(PBlock, weight_2); - graph::GraphGlobalMem::Global().template apply( + graph::GraphGlobalMem::Global().template apply( std::bind(&ConvPooling::saber_type>::trans_weights, &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), @@ -118,7 +142,7 @@ Status ConvReluPoolHelper::Init(OpContext &ctx, bias.map_to_host(); } else { PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( + graph::GraphGlobalMem::Global().template apply( std::bind(&ConvPooling::saber_type>::trans_weights, &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), @@ -130,7 +154,7 @@ Status ConvReluPoolHelper::Init(OpContext &ctx, } else { PBlock weight_empty; PBlock bias_empty; - graph::GraphGlobalMem::Global().template apply( + graph::GraphGlobalMem::Global().template apply( std::bind(&ConvPooling::saber_type>::trans_weights, &_funcs_conv_relu_pooling, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_relu_pooling.conv_param.pad_h, _param_conv_relu_pooling.conv_param.pad_w, _param_conv_relu_pooling.conv_param.dilation_h, 
_param_conv_relu_pooling.conv_param.dilation_w, @@ -156,6 +180,8 @@ ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, NV, Precision::INT8) #ifdef USE_X86_PLACE INSTANCE_CONVRELUPOOLING(X86, Precision::FP32); ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, X86, Precision::FP32); +INSTANCE_CONVRELUPOOLING(X86, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvReluPool, ConvReluPoolHelper, X86, Precision::INT8); #endif #ifdef USE_ARM_PLACE diff --git a/framework/operators/fusion_ops/conv_scale.cpp b/framework/operators/fusion_ops/conv_scale.cpp new file mode 100644 index 000000000..38cf387b7 --- /dev/null +++ b/framework/operators/fusion_ops/conv_scale.cpp @@ -0,0 +1,261 @@ +#include "framework/operators/fusion_ops/conv_scale.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_CONVSCALE(Ttype, Ptype) \ +template<> \ +void ConvScale::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>(this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_conv_scale;\ + SABER_CHECK(impl->_funcs_conv_scale(ins, outs, param, ctx));\ +} + +template +Status ConvScaleHelper::InitParam() { + LOG(WARNING) << "Parsing ConvScale op parameter."; + + // get conv param + auto group = GET_PARAMETER(int, group); + auto bias_term = GET_PARAMETER(bool, bias_term); + auto padding = GET_PARAMETER(PTuple, padding); + auto strides = GET_PARAMETER(PTuple, strides); + auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); + auto filter_num = GET_PARAMETER(int, filter_num); + auto kernel_size = GET_PARAMETER(PTuple, kernel_size); + auto axis = GET_PARAMETER(int, axis); + + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } + + // get scale param + auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); + auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); + auto scale_axis = GET_PARAMETER(int, scale_0_axis); + auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); + auto scale_weight_1_vector = scale_weight_1.vector(); + auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); + auto scale_weight_2_vector = scale_weight_2.vector(); + + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if (!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if (bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } + + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + _param_conv_scale = conv_param; + } 
else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + if (weights_dtype == AK_FLOAT){ + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, scale_weight_1_vector, scale_weight_2_vector,scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, scale_weight_1_vector, scale_weight_2_vector,scale_bias_term); + } + + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor())); + + _param_conv_scale = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor())); + + _param_conv_scale = conv_param; + } + return Status::OK(); +} + +template +Status ConvScaleHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + auto bias_term = GET_PARAMETER(bool, bias_term); + + //different device please change here!!! + saber::ImplEnum impl_e = VENDER_IMPL; + if (std::is_same::value) { + impl_e = SABER_IMPL; + } + bool use_k1s1p0 = (Ptype == Precision::FP32); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.weight()->height() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.weight()->width() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.pad_h == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.pad_w == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.stride_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.stride_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.dilation_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.dilation_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.group == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale.bias()->valid_size() > 0); + bool use_k3s1d1 = (Ptype == Precision::FP32); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale.weight()->height() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale.weight()->width() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale.group == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale.stride_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale.stride_w == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale.dilation_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale.dilation_w == 1); + bool use_depthwise = (Ptype == Precision::FP32); + use_depthwise = use_depthwise && (_param_conv_scale.group == ins[0]->channel()); + use_depthwise = use_depthwise && (_param_conv_scale.group == outs[0]->channel()); + bool use_direct_k = (Ptype == Precision::FP32); + use_direct_k = use_direct_k && (_param_conv_scale.weight()->channel() >= 16); + use_direct_k = use_direct_k && (_param_conv_scale.group == 1); + if (std::is_same::value + && (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k)) { + impl_e = SABER_IMPL; + } + if (std::is_same::value && Ptype == Precision::INT8) { + impl_e = SABER_IMPL; + } + 
SABER_CHECK(_funcs_conv_scale.init(ins, outs, \ + _param_conv_scale, SPECIFY, impl_e, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if (!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + if (bias_term) { + auto bias = GET_PARAMETER(PBlock, weight_2); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias.d_tensor(), _param_conv_scale.pad_h, _param_conv_scale.pad_w, _param_conv_scale.dilation_h, _param_conv_scale.dilation_w, + strides[0], strides[1], group, impl_e); + bias.map_to_host(); + } else { + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias_empty.d_tensor(), _param_conv_scale.pad_h, _param_conv_scale.pad_w, _param_conv_scale.dilation_h, _param_conv_scale.dilation_w, + strides[0], strides[1], group, impl_e); + } + weights.map_to_host(); + } else { + PBlock weight_empty; + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_scale, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_scale.pad_h, _param_conv_scale.pad_w, _param_conv_scale.dilation_h, _param_conv_scale.dilation_w, + strides[0], strides[1], group, impl_e); + } + return Status::OK(); +} + +template +Status ConvScaleHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_conv_scale.compute_output_shape(ins, outs, \ + _param_conv_scale)); + return Status::OK(); +} + +#ifdef USE_ARM_PLACE +INSTANCE_CONVSCALE(ARM, Precision::FP32); +template class ConvScaleHelper; +ANAKIN_REGISTER_OP_HELPER(ConvScale, ConvScaleHelper, ARM, Precision::FP32); +#endif + +#ifdef USE_CUDA +INSTANCE_CONVSCALE(NV, Precision::FP32); +INSTANCE_CONVSCALE(NV, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvScale, ConvScaleHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvScale, ConvScaleHelper, NV, Precision::INT8); +#endif + +#ifdef USE_X86_PLACE +INSTANCE_CONVSCALE(X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvScale, ConvScaleHelper, X86, Precision::FP32); +#endif + +#if defined BUILD_LITE +INSTANCE_CONVSCALE(X86, Precision::FP32); +template class ConvScaleHelper; +ANAKIN_REGISTER_OP_HELPER(ConvScale, ConvScaleHelper, X86, Precision::FP32); +#endif + + +//! 
register op +ANAKIN_REGISTER_OP(ConvScale) +.Doc("ConvScale fusion operator") +#ifdef USE_CUDA +.__alias__("convolution_scale") +.__alias__("convolution_scale") +#endif +#ifdef USE_ARM_PLACE +.__alias__("convolution_scale") +#endif +#if defined BUILD_LITE +.__alias__("convolution_scale") +#endif +#ifdef AMD_GPU +//.__alias__("convolution_scale") +//.__alias__("convolution_scale") +#endif +.num_in(1) +.num_out(1) +.Args("group", " group of conv ") +.Args("bias_term", " whether conv weights have bias") +.Args>("padding", "padding of conv (x, y)") +.Args>("strides", "strides of conv (x)") +.Args>("dilation_rate", "dilation rate of conv (x)") +.Args("filter_num", "filter(kernel) number of weights") +.Args>("kernel_size", "kernel size of kernel (x, y)") +.Args("axis", "axis of conv"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/conv_3x3.h b/framework/operators/fusion_ops/conv_scale.h similarity index 71% rename from framework/operators/conv_3x3.h rename to framework/operators/fusion_ops/conv_scale.h index acc69ea34..052949889 100644 --- a/framework/operators/conv_3x3.h +++ b/framework/operators/fusion_ops/conv_scale.h @@ -13,8 +13,8 @@ limitations under the License. */ -#ifndef ANAKIN_OPERATOR_CONV_SASS_H -#define ANAKIN_OPERATOR_CONV_SASS_H +#ifndef ANAKIN_OPERATOR_CONV_SCALE_H +#define ANAKIN_OPERATOR_CONV_SCALE_H #include "framework/core/base.h" #include "framework/core/data_types.h" @@ -27,46 +27,46 @@ namespace anakin { namespace ops { template -class SassConvolutionHelper; +class ConvScaleHelper; /// pooling op /** - * \brief conv_3X3 implementation class + * \brief ConvScaleHelper implementation class * public inherit Operator */ template -class SassConvolution : public Operator { +class ConvScale : public Operator { public: - SassConvolution() {} + ConvScale() {} /// forward impl virtual void operator() (OpContext &ctx, const std::vector >& ins, std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator SassConvolution< Ttype(" + LOG(ERROR) << "Not Impl Yet Operator ConvScale< Ttype(" << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvolutionHelper; + friend class ConvScaleHelper; }; /** - * \brief conv_3X3 helper class to implement conv3X3 + * \brief ConvScale helper class to implement it * public inherit OperatorHelper - * including init resource and shape size in conv3X3 context + * including init resource and shape size in ConvScaleHelper context */ template -class SassConvolutionHelper : public OperatorHelper { +class ConvScaleHelper : public OperatorHelper { public: - SassConvolutionHelper()=default; + ConvScaleHelper()=default; - ~SassConvolutionHelper(); + ~ConvScaleHelper() {} Status InitParam() override; /** * \brief initial all the resource needed by pooling - * \param ctx stand for conv_3X3 operation context + * \param ctx stand for ConvScale operation context * \param ins stand for input tensor vector * \param outs stand for output tensor vector * \return status @@ -85,18 +85,12 @@ class SassConvolutionHelper : public OperatorHelper { std::vector >& outs) override; public: - ///< _param_conv stand for conv_3X3 parameter - saber::ConvParam _param_conv; - ///< _funcs_conv stand for convolution function - saber::Conv::saber_type> _funcs_conv; - -private: - ///< _dims stand for conv_3X3 size - PTuple _dims; + ///< _param_conv_batchnorm stand for ConvScale parameter + saber::ConvParam _param_conv_scale; + ///< _funcs_conv stand for ConvScale function + saber::Conv::saber_type> _funcs_conv_scale; }; 
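The new ConvScale helper (and ConvScaleRelu below) follows the same three-step OperatorHelper contract as the existing fused ops. A schematic skeleton of that contract (simplified and standalone; the real interface in framework/core/operator/operator.h is templated on Ttype/Ptype):

struct FusedConvHelperSketch {
    // 1. Parse graph attributes and fold the scale layer into the conv weights/bias.
    void InitParam();
    // 2. Choose SABER_IMPL vs VENDER_IMPL, init the saber functor, and transpose
    //    the weights exactly once (guarded by the is_weights_transed attribute).
    void Init();
    // 3. Delegate output-shape computation to the functor's compute_output_shape().
    void InferShape();
};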
- - } /* namespace ops */ } /* namespace anakin */ diff --git a/framework/operators/fusion_ops/conv_scale_relu.cpp b/framework/operators/fusion_ops/conv_scale_relu.cpp new file mode 100644 index 000000000..675107b3f --- /dev/null +++ b/framework/operators/fusion_ops/conv_scale_relu.cpp @@ -0,0 +1,266 @@ +#include "framework/operators/fusion_ops/conv_scale_relu.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_CONVSCALERELU(Ttype, Ptype) \ +template<> \ +void ConvScaleRelu::operator()(\ + OpContext& ctx,\ + const std::vector >& ins,\ + std::vector >& outs) {\ + auto* impl = static_cast*>(this->_helper);\ + auto& param = static_cast*>\ + (this->_helper)->_param_conv_scale_relu;\ + SABER_CHECK(impl->_funcs_conv_scale_relu(ins, outs, param, ctx));\ +} + +template +Status ConvScaleReluHelper::InitParam() { + LOG(WARNING) << "Parsing ConvScaleRelu op parameter."; + + // get conv param + auto group = GET_PARAMETER(int, group); + auto bias_term = GET_PARAMETER(bool, bias_term); + auto padding = GET_PARAMETER(PTuple, padding); + auto strides = GET_PARAMETER(PTuple, strides); + auto dilation_rate = GET_PARAMETER(PTuple, dilation_rate); + auto filter_num = GET_PARAMETER(int, filter_num); + auto kernel_size = GET_PARAMETER(PTuple, kernel_size); + auto axis = GET_PARAMETER(int, axis); + + using pblock_type = PBlock; + auto weights = GET_PARAMETER(pblock_type, weight_1); + auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } + + // get scale param + auto scale_num_axes = GET_PARAMETER(int, scale_0_num_axes); + auto scale_bias_term = GET_PARAMETER(bool, scale_0_bias_term); + auto scale_axis = GET_PARAMETER(int, scale_0_axis); + auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); + auto scale_weight_1_vector = scale_weight_1.vector(); + auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); + auto scale_weight_2_vector = scale_weight_2.vector(); + + // get relu param + auto alpha = GET_PARAMETER(float, relu_0_alpha); + ActivationParam active_param(Active_relu, alpha); // TEMP + + // check if batchnorm parameters have been optimized + auto is_param_updated = CHECK_PARAMETER(is_param_updated); + if (!is_param_updated) { + SET_PARAMETER(is_param_updated, true, bool); + + if (bias_term) { + auto bias = GET_PARAMETER(pblock_type, weight_2); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, scale_weight_1_vector, scale_weight_2_vector, + scale_bias_term); + } + + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], + dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), + active_param); + _param_conv_scale_relu = conv_param; + } else { + pblock_type* bias = new pblock_type(); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + if (weights_dtype == AK_FLOAT){ 
+ graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, scale_weight_1_vector, scale_weight_2_vector,scale_bias_term); + } else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_weights_conv_scale, weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, scale_weight_1_vector, scale_weight_2_vector,scale_bias_term); + } + + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias->d_tensor()), active_param); + + _param_conv_scale_relu = conv_param; + } + } else { + auto bias = GET_PARAMETER(pblock_type, weight_2); + saber::ConvParam conv_param(group, padding[0], padding[1], + strides[0], strides[1], dilation_rate[0], dilation_rate[1], + &(weights.d_tensor()), &(bias.d_tensor()), active_param); + + _param_conv_scale_relu = conv_param; + } + return Status::OK(); +} + +template +Status ConvScaleReluHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto group = GET_PARAMETER(int, group); + auto strides = GET_PARAMETER(PTuple, strides); + auto weights = GET_PARAMETER(PBlock, weight_1); + auto bias_term = GET_PARAMETER(bool, bias_term); + + //different device please change here!!! + saber::ImplEnum impl_e = VENDER_IMPL; + if (std::is_same::value) { + impl_e = SABER_IMPL; + } + bool use_k1s1p0 = (Ptype == Precision::FP32); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.weight()->height() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.weight()->width() == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.pad_h == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.pad_w == 0); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.stride_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.stride_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.dilation_h == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.dilation_w == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.group == 1); + use_k1s1p0 = use_k1s1p0 && (_param_conv_scale_relu.bias()->valid_size() > 0); + bool use_k3s1d1 = (Ptype == Precision::FP32); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale_relu.weight()->height() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale_relu.weight()->width() == 3); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale_relu.group == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale_relu.stride_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale_relu.stride_w == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale_relu.dilation_h == 1); + use_k3s1d1 = use_k3s1d1 && (_param_conv_scale_relu.dilation_w == 1); + bool use_depthwise = (Ptype == Precision::FP32); + use_depthwise = use_depthwise && (_param_conv_scale_relu.group == ins[0]->channel()); + use_depthwise = use_depthwise && (_param_conv_scale_relu.group == outs[0]->channel()); + bool use_direct_k = (Ptype == Precision::FP32); + use_direct_k = use_direct_k && (_param_conv_scale_relu.weight()->channel() >= 16); + use_direct_k = use_direct_k && (_param_conv_scale_relu.group == 1); + if (std::is_same::value + && (use_k1s1p0 || use_k3s1d1 || use_depthwise || use_direct_k)) { + impl_e = SABER_IMPL; + } + if (std::is_same::value && Ptype == Precision::INT8) { + impl_e = SABER_IMPL; + } + SABER_CHECK(_funcs_conv_scale_relu.init(ins, outs, \ + 
_param_conv_scale_relu, SPECIFY, impl_e, ctx)); + + // check if weights have been transposed + auto is_weights_transed = CHECK_PARAMETER(is_weights_transed); + if (!is_weights_transed) { + SET_PARAMETER(is_weights_transed, true, bool); + if (bias_term) { + auto bias = GET_PARAMETER(PBlock, weight_2); + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias.d_tensor(), _param_conv_scale_relu.pad_h, _param_conv_scale_relu.pad_w, _param_conv_scale_relu.dilation_h, _param_conv_scale_relu.dilation_w, + strides[0], strides[1], group, impl_e); + bias.map_to_host(); + } else { + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weights.d_tensor(), bias_empty.d_tensor(), _param_conv_scale_relu.pad_h, _param_conv_scale_relu.pad_w, _param_conv_scale_relu.dilation_h, _param_conv_scale_relu.dilation_w, + strides[0], strides[1], group, impl_e); + } + weights.map_to_host(); + } else { + PBlock weight_empty; + PBlock bias_empty; + graph::GraphGlobalMem::Global().template apply( + std::bind(&Conv::saber_type>::trans_weights, + &_funcs_conv_scale_relu, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10), + weight_empty.d_tensor(), bias_empty.d_tensor(), _param_conv_scale_relu.pad_h, _param_conv_scale_relu.pad_w, _param_conv_scale_relu.dilation_h, _param_conv_scale_relu.dilation_w, + strides[0], strides[1], group, impl_e); + } + return Status::OK(); +} + +template +Status ConvScaleReluHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_conv_scale_relu.compute_output_shape(ins, outs, \ + _param_conv_scale_relu)); + return Status::OK(); +} + +#ifdef USE_ARM_PLACE +INSTANCE_CONVSCALERELU(ARM, Precision::FP32); +template class ConvScaleReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvScaleRelu, ConvScaleReluHelper, ARM, Precision::FP32); +#endif + +#ifdef USE_CUDA +INSTANCE_CONVSCALERELU(NV, Precision::FP32); +INSTANCE_CONVSCALERELU(NV, Precision::INT8); +ANAKIN_REGISTER_OP_HELPER(ConvScaleRelu, ConvScaleReluHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvScaleRelu, ConvScaleReluHelper, NV, Precision::INT8); +#endif + +#ifdef USE_X86_PLACE +INSTANCE_CONVSCALERELU(X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(ConvScaleRelu, ConvScaleReluHelper, X86, Precision::FP32); +#endif + +#if defined BUILD_LITE +INSTANCE_CONVSCALERELU(X86, Precision::FP32); +template class ConvScaleReluHelper; +ANAKIN_REGISTER_OP_HELPER(ConvScaleRelu, ConvScaleReluHelper, X86, Precision::FP32); +#endif + + +//! 
register op +ANAKIN_REGISTER_OP(ConvScaleRelu) +.Doc("ConvScaleRelu fusion operator") +#ifdef USE_CUDA +.__alias__("convolution_scale") +.__alias__("convolution_scale") +#endif +#ifdef USE_ARM_PLACE +.__alias__("convolution_scale") +#endif +#if defined BUILD_LITE +.__alias__("convolution_scale") +#endif +#ifdef AMD_GPU +//.__alias__("convolution_scale") +//.__alias__("convolution_scale") +#endif +.num_in(1) +.num_out(1) +.Args("group", " group of conv ") +.Args("bias_term", " whether conv weights have bias") +.Args>("padding", "padding of conv (x, y)") +.Args>("strides", "strides of conv (x)") +.Args>("dilation_rate", "dilation rate of conv (x)") +.Args("filter_num", "filter(kernel) number of weights") +.Args>("kernel_size", "kernel size of kernel (x, y)") +.Args("axis", "axis of conv"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/fusion_ops/conv_scale_relu.h b/framework/operators/fusion_ops/conv_scale_relu.h new file mode 100644 index 000000000..a61f55fce --- /dev/null +++ b/framework/operators/fusion_ops/conv_scale_relu.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_CONV_SCALE_RELU_H +#define ANAKIN_OPERATOR_CONV_SCALE_RELU_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/conv.h" + +namespace anakin { + +namespace ops { + +template +class ConvScaleReluHelper; + +/// pooling op +/** + * \brief ConvScaleReluHelper implementation class + * public inherit Operator + */ +template +class ConvScaleRelu : public Operator { +public: + ConvScaleRelu() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ConvScaleRelu< Ttype(" << + target_name::value << "), Precision(" << Ptype << ") >"; + } + + friend class ConvScaleReluHelper; +}; + +/** + * \brief ConvScaleRelu helper class to implement it + * public inherit OperatorHelper + * including init resource and shape size in ConvScaleReluHelper context + */ +template +class ConvScaleReluHelper : public OperatorHelper { +public: + ConvScaleReluHelper()=default; + + ~ConvScaleReluHelper() {} + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for ConvScaleRelu operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_conv_batchnorm stand for ConvScaleRelu parameter + saber::ConvParam _param_conv_scale_relu; + ///< _funcs_conv stand for ConvScaleRelu function + saber::Conv::saber_type> _funcs_conv_scale_relu; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp index 781288e4c..c0af4c23a 100644 --- a/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp +++ b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.cpp @@ -38,10 +38,17 @@ Status DeconvBatchnormScaleReluHelper::InitParam() { auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); auto weights_shape = weights.shape(); + auto weights_dtype = weights.h_tensor().get_dtype(); + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } // get batchnorm param auto epsilon = GET_PARAMETER(float, batchnorm_0_epsilon); @@ -60,7 +67,7 @@ Status DeconvBatchnormScaleReluHelper::InitParam() { auto scale_weight_1 = GET_PARAMETER(pblock_type, scale_0_weight_1); auto scale_weight_1_vector = scale_weight_1.vector(); auto scale_weight_2 = GET_PARAMETER(pblock_type, scale_0_weight_2); - auto scale_weight_2_vector = scale_weight_2.vector(); + auto scale_weight_2_vector = scale_weight_2.vector(); // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); @@ -68,16 +75,31 @@ Status DeconvBatchnormScaleReluHelper::InitParam() { if(bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); - graph::GraphGlobalMem::Global().template apply(update_weights, - weights,bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], - true, - batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - scale_weight_1_vector, - scale_weight_2_vector, - scale_bias_term); + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_deconv_weights, + weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + }else { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_deconv_weights, + weights, bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + true, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -86,16 +108,33 @@ Status DeconvBatchnormScaleReluHelper::InitParam() { _param_deconv_batchnorm_scale_relu = conv_param; } else { pblock_type* bias = new pblock_type(); - graph::GraphGlobalMem::Global().template apply(update_weights, - weights, *bias, - weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], 
- false, - batch_norm_weight_3_vector[0], epsilon, - batch_norm_weight_1_vector, - batch_norm_weight_2_vector, - scale_weight_1_vector, - scale_weight_2_vector, - scale_bias_term); + SET_PARAMETER(bias_term, true, bool); // set attr bias_term true + SET_PARAMETER(weight_2, *bias, pblock_type); // gen new bias + if (weights_dtype == AK_FLOAT) { + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_deconv_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + } else{ + graph::GraphGlobalMem::Global().template apply( + WeightsFusion::update_deconv_weights, + weights, *bias, + weights_shape[0], weights_shape[1], weights_shape[2], weights_shape[3], + false, + batch_norm_weight_3_vector[0], epsilon, + batch_norm_weight_1_vector, + batch_norm_weight_2_vector, + scale_weight_1_vector, + scale_weight_2_vector, + scale_bias_term); + } saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -103,7 +142,7 @@ Status DeconvBatchnormScaleReluHelper::InitParam() { active_param); _param_deconv_batchnorm_scale_relu = conv_param; } - + return Status::OK(); } @@ -117,7 +156,7 @@ Status DeconvBatchnormScaleReluHelper::Init(OpContext& ctx, SABER_IMPL, ctx); } else { _funcs_deconv_batchnorm_scale_relu.init(ins, outs, _param_deconv_batchnorm_scale_relu, SPECIFY, - VENDER_IMPL, ctx); + SABER_IMPL, ctx); } //_funcs_deconv_batchnorm_scale_relu.init(ins, outs, _param_deconv_batchnorm_scale_relu, SPECIFY, VENDER_IMPL, ctx); @@ -143,7 +182,14 @@ template class DeconvBatchnormScaleReluHelper; template class DeconvBatchnormScaleReluHelper; template class DeconvBatchnormScaleReluHelper; #endif -#ifdef USE_X86_PLACE + +#if defined USE_X86_PLACE || defined BUILD_LITE +template class DeconvBatchnormScaleReluHelper; +template class DeconvBatchnormScaleReluHelper; +template class DeconvBatchnormScaleReluHelper; +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_DECONVBATCHNORMSCALERELU(X86, Precision::FP32); ANAKIN_REGISTER_OP_HELPER(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, X86, Precision::FP32); #endif @@ -155,6 +201,7 @@ ANAKIN_REGISTER_OP_HELPER(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelp #endif #ifdef USE_ARM_PLACE +INSTANCE_DECONVBATCHNORMSCALERELU(ARM, Precision::FP32); ANAKIN_REGISTER_OP_HELPER(DeconvBatchnormScaleRelu, DeconvBatchnormScaleReluHelper, ARM, Precision::FP32); #endif @@ -165,7 +212,10 @@ ANAKIN_REGISTER_OP(DeconvBatchnormScaleRelu) .__alias__("convolution_batchnorm_scale_relu") #endif #ifdef USE_ARM_PLACE -.__alias__("convolution_batchnorm_scale_relu") +.__alias__("deconvolution_batchnorm_scale_relu") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("deconvolution_batchnorm_scale_relu") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h index 12ba4ec3c..7c7605419 100644 --- a/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h +++ b/framework/operators/fusion_ops/deconv_batchnorm_scale_relu.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_OPERATOR_DECONV_BATCHNORM_SCALE_RELU_H @@ -40,11 +40,11 @@ class DeconvBatchnormScaleRelu : public Operator { DeconvBatchnormScaleRelu() {} /// forward impl - virtual void operator() (OpContext &ctx, - const std::vector >& ins, + virtual void operator() (OpContext &ctx, + const std::vector >& ins, std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator DeconvBatchnormScaleRelu< Ttype(" - << target_name::value << "), Precision("<< Ptype <<") >"; + LOG(ERROR) << "Not Impl Yet Operator DeconvBatchnormScaleRelu< Ttype(" + << target_name::value << "), Precision("<< (int)Ptype << ") >"; } friend class DeconvBatchnormScaleReluHelper; @@ -72,7 +72,7 @@ class DeconvBatchnormScaleReluHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, + const std::vector >& ins, std::vector >& outs) override; /** @@ -92,7 +92,7 @@ class DeconvBatchnormScaleReluHelper : public OperatorHelper { private: ///< _dims stand for DeconvBatchnormScaleRelu size - PTuple _dims; + PTuple _dims; }; diff --git a/framework/operators/fusion_ops/deconv_relu.cpp b/framework/operators/fusion_ops/deconv_relu.cpp index 3ee5d611d..aa4aa68c1 100644 --- a/framework/operators/fusion_ops/deconv_relu.cpp +++ b/framework/operators/fusion_ops/deconv_relu.cpp @@ -38,13 +38,20 @@ Status DeconvReluHelper::InitParam() { auto filter_num = GET_PARAMETER(int, filter_num); auto kernel_size = GET_PARAMETER(PTuple, kernel_size); auto axis = GET_PARAMETER(int, axis); - + using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - + // resize weights scale + auto& w = weights.h_tensor(); + if (w.get_scale().size() == 1){ + float scale_tmp = w.get_scale()[0]; + std::vector w_scale(filter_num, scale_tmp); + w.set_scale(w_scale); + } + // get relu param auto alpha = GET_PARAMETER(float, relu_0_alpha); - ActivationParam active_param(Active_relu);//, alpha); // TEMP + ActivationParam active_param(Active_relu); if (bias_term) { auto bias = GET_PARAMETER(pblock_type, weight_2); @@ -55,7 +62,7 @@ Status DeconvReluHelper::InitParam() { active_param); _param_deconv_relu = conv_param; } else { - Tensor4d* bias = new Tensor4d();; + Tensor4d* bias = new Tensor4d(); saber::ConvParam conv_param(group, padding[0], padding[1], strides[0], strides[1], dilation_rate[0], dilation_rate[1], @@ -83,7 +90,7 @@ Status DeconvReluHelper::Init(OpContext& ctx, p = p || ((ins[0]->channel() == _param_deconv_relu.group) && (ins[0]->channel() == outs[0]->channel())); - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { p = true; } diff --git a/framework/operators/fusion_ops/deconv_relu.h b/framework/operators/fusion_ops/deconv_relu.h index be6daedf4..c56fbb085 100644 --- a/framework/operators/fusion_ops/deconv_relu.h +++ b/framework/operators/fusion_ops/deconv_relu.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
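// Both deconvolution fusion hunks above expand a single quantisation scale into
// a per-output-channel vector before the weights are fused (w.set_scale(w_scale)
// with filter_num copies). The same broadcast in a self-contained form, using
// plain std::vector instead of the tensor API:
#include <vector>

std::vector<float> broadcast_scale(const std::vector<float>& scale, int filter_num) {
    if (scale.size() == 1) {
        // the converter stored one global scale -> repeat it for every filter
        return std::vector<float>(filter_num, scale[0]);
    }
    return scale;                  // already per-channel, keep as-is
}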
*/ #ifndef ANAKIN_OPERATOR_DECONV_RELU_H @@ -40,11 +40,11 @@ class DeconvRelu : public Operator { DeconvRelu() {} /// forward impl - virtual void operator() (OpContext &ctx, - const std::vector >& ins, + virtual void operator() (OpContext &ctx, + const std::vector >& ins, std::vector >& outs) { LOG(ERROR) << "Not Impl Yet Operator DeconvRelu< Ttype(" - << target_name::value << "), Precision("<< Ptype <<") >"; + << target_name::value << "), Precision("<< (int)Ptype <<") >"; } friend class DeconvReluHelper; @@ -72,7 +72,7 @@ class DeconvReluHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, + const std::vector >& ins, std::vector >& outs) override; /** @@ -87,12 +87,12 @@ class DeconvReluHelper : public OperatorHelper { public: ///< _param_deconv_relu stand for DeconvRelu parameter saber::ConvParam _param_deconv_relu; - ///< _funcs_deconv_relu stand for DeconvRelu function + ///< _funcs_deconv_relu stand for DeconvRelu function saber::Deconv::saber_type> _funcs_deconv_relu; private: ///< _dims stand for DeconvRelu size - PTuple _dims; + PTuple _dims; }; diff --git a/framework/operators/fusion_ops/eltwise_prelu.cpp b/framework/operators/fusion_ops/eltwise_prelu.cpp index 8369e717d..f89571e7a 100644 --- a/framework/operators/fusion_ops/eltwise_prelu.cpp +++ b/framework/operators/fusion_ops/eltwise_prelu.cpp @@ -4,44 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void EltwiseActivation::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_eltwise_prelu; - impl->_funcs_eltwise_prelu(ins, outs, param, ctx); -} -#endif -#ifdef USE_ARM_PLACE -template<> -void EltwiseActivation::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_eltwise_prelu; - impl->_funcs_eltwise_prelu(ins, outs, param, ctx); -} -#endif -#if defined USE_X86_PLACE || defined BUILD_LITE -template<> -void EltwiseActivation::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_eltwise_prelu; - impl->_funcs_eltwise_prelu(ins, outs, param, ctx); +#define INSTANCE_ELTWISE_PRELU(Ttype, Ptype) \ +template<> \ +void EltwiseActivation::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_eltwise_prelu; \ + impl->_funcs_eltwise_prelu(ins, outs, param, ctx); \ } -#endif -/// TODO ... 
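// The eltwise_prelu.cpp change above folds three hand-written operator()
// specialisations (NV, ARM, X86) into one INSTANCE_ELTWISE_PRELU(Ttype, Ptype)
// macro that is expanded inside each backend's #ifdef block. The same technique
// in a minimal form; Op, CPU and GPU are stand-ins, not Anakin types:
#include <cstdio>

template <typename Backend>
struct Op { void run(); };

struct CPU {};
struct GPU {};

#define INSTANCE_OP(Backend, label)            \
    template <> void Op<Backend>::run() {      \
        std::printf("running on %s\n", label); \
    }

INSTANCE_OP(CPU, "cpu")   // expands to: template<> void Op<CPU>::run() { ... }
INSTANCE_OP(GPU, "gpu")
/// TODO ...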
specialization other type of operator - /// set helper template @@ -62,7 +34,7 @@ Status EltwiseActivationHelper::InitParam() { auto weights = GET_PARAMETER(pblock_type, prelu_0_weight_1); PreluParam prelu_param(channel_shared, &(weights.d_tensor())); - + ActivationParam activation_param(Active_prelu, 0, 0, prelu_param); EltwiseType elt_type; @@ -108,18 +80,21 @@ Status EltwiseActivationHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_ELTWISE_PRELU(NV, Precision::FP32); template class EltwiseActivationHelper; template class EltwiseActivationHelper; template class EltwiseActivationHelper; #endif #ifdef USE_ARM_PLACE +INSTANCE_ELTWISE_PRELU(ARM, Precision::FP32); template class EltwiseActivationHelper; template class EltwiseActivationHelper; template class EltwiseActivationHelper; #endif #if defined(USE_X86_PLACE) || defined(BUILD_LITE) +INSTANCE_ELTWISE_PRELU(X86, Precision::FP32); template class EltwiseActivationHelper; #endif @@ -148,6 +123,9 @@ ANAKIN_REGISTER_OP(EltwiseActivation) #if defined(USE_X86_PLACE) || defined(BUILD_LITE) .__alias__("eltwise_prelu") #endif +#ifdef AMD_GPU +//.__alias__("eltwise_prelu") +#endif .num_in(1) .num_out(1) .Args("type", " eltwise type( string )") diff --git a/framework/operators/fusion_ops/eltwise_relu.cpp b/framework/operators/fusion_ops/eltwise_relu.cpp index 5e4ad7774..1a35b5930 100644 --- a/framework/operators/fusion_ops/eltwise_relu.cpp +++ b/framework/operators/fusion_ops/eltwise_relu.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
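// The ANAKIN_REGISTER_OP chains in this patch (.Doc, .__alias__, .num_in,
// .num_out, .Args) follow a builder-style registry: each setter returns the
// entry itself so per-backend aliases can be appended under their own #ifdef.
// A compact illustration of that pattern; OpRegistry here is invented for the
// example and is not the framework class:
#include <string>
#include <utility>
#include <vector>

class OpRegistry {
public:
    OpRegistry& Doc(std::string d)   { doc_ = std::move(d); return *this; }
    OpRegistry& alias(std::string a) { aliases_.push_back(std::move(a)); return *this; }
    OpRegistry& num_in(int n)        { num_in_ = n; return *this; }
    OpRegistry& num_out(int n)       { num_out_ = n; return *this; }
private:
    std::string doc_;
    std::vector<std::string> aliases_;
    int num_in_ = 1;
    int num_out_ = 1;
};

// usage sketch:
//   OpRegistry().Doc("EltwiseRelu operator").alias("eltwise").num_in(1).num_out(1);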
+*/ #include "framework/operators/fusion_ops/eltwise_relu.h" namespace anakin { @@ -10,8 +24,8 @@ void EltwiseRelu::operator()(\ OpContext& ctx,\ const std::vector >& ins,\ std::vector >& outs) { \ - auto* impl = static_cast*>(this->_helper); \ - auto& param = static_cast*> \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ (this->_helper)->_param_eltwise_relu; \ impl->_funcs_eltwise_relu(ins, outs, param, ctx); \ } @@ -77,15 +91,20 @@ Status EltwiseReluHelper::InferShape(const #ifdef USE_CUDA INSTANCE_ELTWISERELU(NV, Precision::FP32) +INSTANCE_ELTWISERELU(NV, Precision::INT8) template class EltwiseReluHelper; template class EltwiseReluHelper; template class EltwiseReluHelper; +ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, NV, Precision::INT8); #endif #ifdef USE_ARM_PLACE +INSTANCE_ELTWISERELU(ARM, Precision::FP32) template class EltwiseReluHelper; template class EltwiseReluHelper; template class EltwiseReluHelper; +ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, ARM, Precision::FP32); #endif #ifdef BUILD_LITE @@ -93,7 +112,9 @@ INSTANCE_ELTWISERELU(X86, Precision::FP32) template class EltwiseReluHelper; template class EltwiseReluHelper; template class EltwiseReluHelper; +ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, X86, Precision::FP32); #endif + // register helper #ifdef USE_X86_PLACE @@ -101,22 +122,20 @@ INSTANCE_ELTWISERELU(X86, Precision::FP32); ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, X86, Precision::FP32); #endif -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, NV, Precision::FP32); +#ifdef AMD_GPU +INSTANCE_ELTWISERELU(AMD, Precision::FP32) +template class EltwiseReluHelper; +template class EltwiseReluHelper; +template class EltwiseReluHelper; +ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, AMD, Precision::FP32); #endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, ARM, Precision::FP32); -#endif - -#ifdef BUILD_LITE -ANAKIN_REGISTER_OP_HELPER(EltwiseRelu, EltwiseReluHelper, X86, Precision::FP32); -#endif //! register op ANAKIN_REGISTER_OP(EltwiseRelu) .Doc("EltwiseRelu operator") #ifdef USE_CUDA .__alias__("eltwise") +.__alias__("eltwise") #endif #ifdef USE_ARM_PLACE .__alias__("eltwise") @@ -124,6 +143,9 @@ ANAKIN_REGISTER_OP(EltwiseRelu) #ifdef BUILD_LITE .__alias__("eltwise") #endif +#ifdef AMD_GPU +.__alias__("eltwise") +#endif .num_in(1) .num_out(1) .Args("type", " eltwise type( string )") diff --git a/framework/operators/fusion_ops/eltwise_relu.h b/framework/operators/fusion_ops/eltwise_relu.h index 6a5dee117..a211bf565 100644 --- a/framework/operators/fusion_ops/eltwise_relu.h +++ b/framework/operators/fusion_ops/eltwise_relu.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_OPERATOR_ELTWISE_RELU_H @@ -40,11 +40,11 @@ class EltwiseRelu : public Operator { EltwiseRelu() {} /// forward impl - virtual void operator() (OpContext &ctx, - const std::vector >& ins, + virtual void operator() (OpContext &ctx, + const std::vector >& ins, std::vector >& outs) { LOG(ERROR) << "Not Impl Yet Operator EltwiseRelu< Ttype(" - << target_name::value << "), Precision("<< Ptype <<") >"; + << target_name::value << "), Precision("<< (int)Ptype <<") >"; } friend class EltwiseReluHelper; @@ -72,7 +72,7 @@ class EltwiseReluHelper : public OperatorHelper { * \return status */ Status Init(OpContext &ctx, - const std::vector >& ins, + const std::vector >& ins, std::vector >& outs) override; /** @@ -92,7 +92,7 @@ class EltwiseReluHelper : public OperatorHelper { private: ///< _dims stand for EltwiseRelu size - PTuple _dims; + PTuple _dims; }; diff --git a/framework/operators/fusion_ops/permute_power.cpp b/framework/operators/fusion_ops/permute_power.cpp index f4df67af0..fde366416 100644 --- a/framework/operators/fusion_ops/permute_power.cpp +++ b/framework/operators/fusion_ops/permute_power.cpp @@ -1,36 +1,33 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/fusion_ops/permute_power.h" namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void PermutePower::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_permute_power; - impl->_funcs_permute_power(ins, outs, param, ctx); +#define INSTANCE_PERMUTE_POWER(Ttype, Ptype) \ +template<> \ +void PermutePower::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_permute_power; \ + impl->_funcs_permute_power(ins, outs, param, ctx); \ } -#endif -#ifdef USE_X86_PLACE -template<> -void PermutePower::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param_permute_power; - impl->_funcs_permute_power(ins, outs, param, ctx); -} -#endif - -/// TODO ... 
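// Several LOG lines in these headers change "<< Ptype <<" to "<< (int)Ptype <<",
// presumably because Precision is a scoped enumeration: a scoped enum has no
// implicit conversion to an integral type and no stream operator, so logging it
// needs an explicit cast. A small illustration (the enum here is a stand-in):
#include <iostream>

enum class Precision { FP32, FP16, INT8 };

int main() {
    Precision p = Precision::FP32;
    // std::cout << p;                         // ill-formed without an operator<< overload
    std::cout << static_cast<int>(p) << "\n";  // prints 0
    return 0;
}
/// TODO ...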
specialization other type of operator - /// set helper template @@ -70,31 +67,34 @@ Status PermutePowerHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_PERMUTE_POWER(NV, Precision::FP32); template class PermutePowerHelper; template class PermutePowerHelper; template class PermutePowerHelper; +ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -template class PermutePowerHelper; -template class PermutePowerHelper; -template class PermutePowerHelper; +INSTANCE_PERMUTE_POWER(ARM, Precision::FP32); + +ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_PERMUTE_POWER(X86, Precision::FP32); template class PermutePowerHelper; ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, X86, Precision::FP32); #endif - -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, ARM, Precision::FP32); +#ifdef AMD_GPU +INSTANCE_PERMUTE_POWER(AMD, Precision::FP32); +template class PermutePowerHelper; +template class PermutePowerHelper; +template class PermutePowerHelper; +ANAKIN_REGISTER_OP_HELPER(PermutePower, PermutePowerHelper, AMD, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(PermutePower) .Doc("PermutePower fusion operator") @@ -104,6 +104,9 @@ ANAKIN_REGISTER_OP(PermutePower) #ifdef USE_ARM_PLACE .__alias__("permute_power") #endif +#ifdef AMD_GPU +.__alias__("permute_power") +#endif .num_in(1) .num_out(1) .Args("power_0_scale", " scale of param for pawer") diff --git a/framework/operators/fusion_ops/seq_concat_seq_pool_soft_sign.cpp b/framework/operators/fusion_ops/seq_concat_seq_pool_soft_sign.cpp new file mode 100644 index 000000000..6afb19a7d --- /dev/null +++ b/framework/operators/fusion_ops/seq_concat_seq_pool_soft_sign.cpp @@ -0,0 +1,108 @@ +#include "framework/operators/fusion_ops/seq_concat_seq_pool_soft_sign.h" + +namespace anakin { + +namespace ops { +#define INSTANCE_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN(Ttype, Ptype) \ +template<> \ +void SeqConcatSeqPoolSoftSign::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_seq_concat_seq_pool_soft_sign; \ + impl->_funcs_seq_concat_seq_pool_soft_sign(ins, outs, param, ctx); \ +} + + +/// TODO ... 
specialization other type of operator + + +/// set helper +template +SeqConcatSeqPoolSoftSignHelper::~SeqConcatSeqPoolSoftSignHelper() { + LOG(INFO) << "Decons permute_cpu_float"; +} + +template +Status SeqConcatSeqPoolSoftSignHelper::InitParam() { + DLOG(WARNING) << "Parsing SeqConcatSeqPoolSoftSign op parameter."; + auto pooltype = GET_PARAMETER(std::string, seq_pool_0_pooltype); + std::unordered_map type_map; + type_map.insert(std::make_pair("null", anakin::saber::Sequence_pool_unknow)); + type_map.insert(std::make_pair("AVERAGE", anakin::saber::Sequence_pool_average)); + type_map.insert(std::make_pair("SUM", anakin::saber::Sequence_pool_sum)); + type_map.insert(std::make_pair("SQRT", anakin::saber::Sequence_pool_sqrt)); + type_map.insert(std::make_pair("LAST", anakin::saber::Sequence_pool_last)); + type_map.insert(std::make_pair("FIRST", anakin::saber::Sequence_pool_first)); + type_map.insert(std::make_pair("MAX", anakin::saber::Sequence_pool_max)); + + saber::SequenceConcatParam seq_concat_param; + saber::SequencePoolParam seq_pool_param(type_map[pooltype]); + saber::SoftSignParam soft_sign_param; + + saber::SeqConcatSeqPoolSoftSignParam seq_concat_seq_pool_soft_sign_param(seq_concat_param, seq_pool_param, soft_sign_param); + _param_seq_concat_seq_pool_soft_sign = seq_concat_seq_pool_soft_sign_param; + return Status::OK(); +} + +template +Status SeqConcatSeqPoolSoftSignHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + _funcs_seq_concat_seq_pool_soft_sign.init(ins, outs, _param_seq_concat_seq_pool_soft_sign, SPECIFY, SABER_IMPL, ctx); + return Status::OK(); +} + +template +Status SeqConcatSeqPoolSoftSignHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + _funcs_seq_concat_seq_pool_soft_sign.compute_output_shape(ins, outs, _param_seq_concat_seq_pool_soft_sign); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN(NV, Precision::FP32); +template class SeqConcatSeqPoolSoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SeqConcatSeqPoolSoftSign, SeqConcatSeqPoolSoftSignHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN(X86, Precision::FP32); +template class SeqConcatSeqPoolSoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SeqConcatSeqPoolSoftSign, SeqConcatSeqPoolSoftSignHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN(ARM, Precision::FP32); +template class SeqConcatSeqPoolSoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SeqConcatSeqPoolSoftSign, SeqConcatSeqPoolSoftSignHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN(AMD, Precision::FP32); +template class SeqConcatSeqPoolSoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SeqConcatSeqPoolSoftSign, SeqConcatSeqPoolSoftSignHelper, AMD, Precision::FP32); +#endif + + +//! 
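// SeqConcatSeqPoolSoftSignHelper::InitParam() above resolves the textual
// seq_pool_0_pooltype attribute through an unordered_map into the saber pooling
// enum. The same lookup in standalone form, with a defensive fallback for
// unknown strings; the enum and function names are invented for the example:
#include <string>
#include <unordered_map>

enum class SeqPool { Unknown, Average, Sum, Sqrt, Last, First, Max };

SeqPool parse_pool_type(const std::string& name) {
    static const std::unordered_map<std::string, SeqPool> table = {
        {"AVERAGE", SeqPool::Average}, {"SUM", SeqPool::Sum},
        {"SQRT", SeqPool::Sqrt},       {"LAST", SeqPool::Last},
        {"FIRST", SeqPool::First},     {"MAX", SeqPool::Max},
    };
    auto it = table.find(name);
    return it == table.end() ? SeqPool::Unknown : it->second;  // "null" or typo -> Unknown
}
//!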
register op +ANAKIN_REGISTER_OP(SeqConcatSeqPoolSoftSign) +.Doc("SeqConcatSeqPoolSoftSign fusion operator") +#ifdef USE_CUDA +.__alias__("seq_concat_seq_pool_soft_sign") +#endif +#ifdef USE_ARM_PLACE +.__alias__("seq_concat_seq_pool_soft_sign") +#endif +.num_in(1) +.num_out(1) +.Args("pooltype", " sequence pool type"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/fusion_ops/seq_concat_seq_pool_soft_sign.h b/framework/operators/fusion_ops/seq_concat_seq_pool_soft_sign.h new file mode 100644 index 000000000..5839d678c --- /dev/null +++ b/framework/operators/fusion_ops/seq_concat_seq_pool_soft_sign.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H +#define ANAKIN_OPERATOR_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/seq_concat_seq_pool_soft_sign.h" + +namespace anakin { + +namespace ops { + +template +class SeqConcatSeqPoolSoftSignHelper; + +/// pooling op +/** + * \brief SeqConcatSeqPoolSoftSign implementation class + * public inherit Operator + */ +template +class SeqConcatSeqPoolSoftSign : public Operator { +public: + SeqConcatSeqPoolSoftSign() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SeqConcatSeqPoolSoftSign< Ttype(" + << target_name::value << "), Precision("; + } + + friend class SeqConcatSeqPoolSoftSignHelper; +}; + +/** + * \brief SeqConcatSeqPoolSoftSign helper class to implement it + * public inherit OperatorHelper + * including init resource and shape size in SeqConcatSeqPoolSoftSign context + */ +template +class SeqConcatSeqPoolSoftSignHelper : public OperatorHelper { +public: + SeqConcatSeqPoolSoftSignHelper()=default; + + ~SeqConcatSeqPoolSoftSignHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for SeqConcatSeqPoolSoftSign operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_seq_concat_seq_pool_soft_sign stand for SeqConcatSeqPoolSoftSign parameter + saber::SeqConcatSeqPoolSoftSignParam _param_seq_concat_seq_pool_soft_sign; + ///< _funcs_seq_concat_seq_pool_soft_sign stand for SeqConcatSeqPoolSoftSign function + saber::SeqConcatSeqPoolSoftSign::saber_type> _funcs_seq_concat_seq_pool_soft_sign; + +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/gather.cpp b/framework/operators/gather.cpp index 353fb6aa5..13658197c 100644 --- a/framework/operators/gather.cpp +++ b/framework/operators/gather.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/gather.h" namespace anakin { @@ -18,6 +32,13 @@ void Gather::operator()(OpContext& ctx, std::vector>& outs) { } #endif +#ifdef AMD_GPU +template<> +void Gather::operator()(OpContext& ctx, + const std::vector>& ins, + std::vector>& outs) { +} +#endif /// TODO ... specialization other type of operator @@ -65,6 +86,11 @@ template class GatherHelper; template class GatherHelper; template class GatherHelper; #endif +#ifdef AMD_GPU +template class GatherHelper; +template class GatherHelper; +template class GatherHelper; +#endif // register help #ifdef USE_CUDA @@ -85,6 +111,12 @@ ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::FP16); ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, X86, Precision::INT8); #endif +#ifdef AMD_GPU +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, AMD, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, AMD, Precision::FP16); +ANAKIN_REGISTER_OP_HELPER(Gather, GatherHelper, AMD, Precision::INT8); +#endif + //! 
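// Every helper declared in these headers follows the same three-phase
// lifecycle: InitParam() parses attributes out of the graph, InferShape() sizes
// the outputs without computing, and Init() binds the backend functor to an
// execution context. A skeletal version of that contract; Status, Context and
// Tensor are simplified stand-ins for the framework types:
#include <vector>

struct Status { static Status OK() { return Status(); } };
struct Context {};
struct Tensor { std::vector<int> shape; };

class OperatorHelperSketch {
public:
    virtual ~OperatorHelperSketch() = default;
    virtual Status InitParam() = 0;                            // read graph attributes once
    virtual Status Init(Context& ctx,
                        const std::vector<Tensor*>& ins,
                        std::vector<Tensor*>& outs) = 0;       // choose impl, allocate workspace
    virtual Status InferShape(const std::vector<Tensor*>& ins,
                              std::vector<Tensor*>& outs) = 0; // shape-only pass
};
//!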
register op ANAKIN_REGISTER_OP(Gather) #ifdef USE_CUDA @@ -95,6 +127,9 @@ ANAKIN_REGISTER_OP(Gather) #endif #ifdef USE_X86_PLACE .__alias__("gather") +#endif +#ifdef AMD_GPU + .__alias__("gather") #endif .Doc("Gather operator [ only a middle data holder and reshape ] "); diff --git a/framework/operators/generate_proposals.cpp b/framework/operators/generate_proposals.cpp new file mode 100644 index 000000000..78ce8b6d8 --- /dev/null +++ b/framework/operators/generate_proposals.cpp @@ -0,0 +1,113 @@ +#include "framework/operators/generate_proposals.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_GENERATE_PROPOSALS(Ttype, Ptype) \ +template<> \ +void GenerateProposals::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_generate_proposals; \ + impl->_funcs_generate_proposals(ins, outs, param, ctx); \ +} + +/// set helper +template +GenerateProposalsHelper::~GenerateProposalsHelper() { +} + +template +Status GenerateProposalsHelper::InitParam() { + DLOG(WARNING) << "Parsing GenerateProposals op parameter."; + auto pre_nms_top_n = GET_PARAMETER(int, pre_nms_top_n); + auto post_nms_top_n = GET_PARAMETER(int, post_nms_top_n); + auto nms_thresh = GET_PARAMETER(float, nms_thresh); + auto min_size = GET_PARAMETER(float, min_size); + auto eta = GET_PARAMETER(float, eta); + GenerateProposalsParam param_generate_proposals(pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); + _param_generate_proposals = param_generate_proposals; + + return Status::OK(); +} + +template +Status GenerateProposalsHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_generate_proposals.init(ins, outs, _param_generate_proposals, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status GenerateProposalsHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_generate_proposals.compute_output_shape(ins, outs, _param_generate_proposals)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_GENERATE_PROPOSALS(NV, Precision::FP32); + +template<> +Status GenerateProposalsHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_generate_proposals.init(ins, outs, _param_generate_proposals, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(GenerateProposals, GenerateProposalsHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_GENERATE_PROPOSALS(X86, Precision::FP32); +INSTANCE_GENERATE_PROPOSALS(X86, Precision::FP16); +INSTANCE_GENERATE_PROPOSALS(X86, Precision::INT8); +template class GenerateProposalsHelper; +ANAKIN_REGISTER_OP_HELPER(GenerateProposals, GenerateProposalsHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_GENERATE_PROPOSALS(ARM, Precision::FP32); +template class GenerateProposalsHelper; +ANAKIN_REGISTER_OP_HELPER(GenerateProposals, GenerateProposalsHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_GENERATE_PROPOSALS(AMD, Precision::FP32); +template class GenerateProposalsHelper; +template class GenerateProposalsHelper; +template class GenerateProposalsHelper; +ANAKIN_REGISTER_OP_HELPER(GenerateProposals, GenerateProposalsHelper, AMD, Precision::FP32); +#endif +//! 
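// GenerateProposalsHelper::InitParam() above pulls five scalar attributes out of
// the graph and packs them into one saber parameter object. A plain-struct view
// of what travels together; the struct and its default values are illustrative
// only, the real values come from the model:
struct GenerateProposalsConfig {
    int   pre_nms_top_n  = 6000;   // boxes kept before NMS (placeholder default)
    int   post_nms_top_n = 1000;   // boxes kept after NMS (placeholder default)
    float nms_thresh     = 0.7f;   // IoU threshold used during NMS
    float min_size       = 0.0f;   // drop proposals smaller than this
    float eta            = 1.0f;   // adaptive-NMS threshold decay factor
};
//!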
register op +ANAKIN_REGISTER_OP(GenerateProposals) +.Doc("GenerateProposals operator") +#ifdef USE_CUDA +.__alias__("generate_proposals") +#endif +#ifdef USE_ARM_PLACE +.__alias__("generate_proposals") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("generate_proposals") +#endif +#ifdef AMD_GPU +.__alias__("generate_proposals") +#endif +.num_in(1) +.num_out(1) +.Args("pre_nms_top_n", "prelu channel is shared or not ") +.Args("post_nms_top_n", "post_nms_top_n") +.Args("nms_thresh", "nms_thresh") +.Args("min_size", "min_size ") +.Args("eta", "eta"); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/generate_proposals.h b/framework/operators/generate_proposals.h new file mode 100644 index 000000000..eb5d164fc --- /dev/null +++ b/framework/operators/generate_proposals.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_GENERATE_PROPOSALS_H +#define ANAKIN_OPERATOR_GENERATE_PROPOSALS_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/generate_proposals.h" + +namespace anakin { + +namespace ops { + +template +class GenerateProposalsHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class GenerateProposals : public Operator { +public: + GenerateProposals() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator GenerateProposals< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class GenerateProposalsHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class GenerateProposalsHelper : public OperatorHelper { +public: + GenerateProposalsHelper()=default; + + ~GenerateProposalsHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_generate_proposals stand for generate_proposals parameter + saber::GenerateProposalsParam _param_generate_proposals; + ///< _funcs_generate_proposals stand for generate_proposals function + saber::GenerateProposals::saber_type> _funcs_generate_proposals; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/group_norm.cpp b/framework/operators/group_norm.cpp new file mode 100644 index 000000000..7b4a3a08b --- /dev/null +++ b/framework/operators/group_norm.cpp @@ -0,0 +1,141 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "framework/operators/group_norm.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_GROUP_NORMAL(Ttype, Ptype) \ +template<> \ +void GroupNormal::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_group_normal; \ + impl->_funcs_group_normal(ins, outs, param, ctx); \ +} + +/// TODO ... 
specialization other type of operator +/// set helper +template +GroupNormalHelper::~GroupNormalHelper() { +} + +template +Status GroupNormalHelper::InitParam() { + //DLOG(WARNING) << "Parsing GroupNormal op parameter."; + auto eps = GET_PARAMETER(float, eps); + auto p = GET_PARAMETER_WITH_DEFAULT(int, p, 1); + auto group = GET_PARAMETER_WITH_DEFAULT(int, group, 0); + auto has_bias = GET_PARAMETER_WITH_DEFAULT(bool, has_bias, false); + auto has_scale = GET_PARAMETER_WITH_DEFAULT(bool, has_scale, false); + CHECK_GE(group, 1) << "group normal group must > 1"; + PBlock bias; + PBlock scale; + if (has_scale){ + scale = GET_PARAMETER(PBlock, scale); + } + if (has_bias){ + bias = GET_PARAMETER(PBlock, bias); + } + saber::NormalizeParam group_normal_param(has_scale, &(scale.d_tensor()), + has_bias, &(bias.d_tensor()), group, eps); + _param_group_normal = group_normal_param; + + + return Status::OK(); +} + +template +Status GroupNormalHelper::Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_group_normal.init(ins, outs, _param_group_normal, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status GroupNormalHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_group_normal.compute_output_shape(ins, outs, _param_group_normal)); + return Status::OK(); +} + +#ifdef AMD_GPU +INSTANCE_GROUP_NORMAL(AMD, Precision::FP32); +template class GroupNormalHelper; +ANAKIN_REGISTER_OP_HELPER(GroupNormal, GroupNormalHelper, AMD, Precision::FP32); +#endif + +#ifdef USE_CUDA +INSTANCE_GROUP_NORMAL(NV, Precision::FP32); +template class GroupNormalHelper; +template class GroupNormalHelper; +template class GroupNormalHelper; +#endif + +#ifdef USE_X86_PLACE +INSTANCE_GROUP_NORMAL(X86, Precision::FP32); +template class GroupNormalHelper; +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_GROUP_NORMAL(ARM, Precision::FP32); +template class GroupNormalHelper; +template class GroupNormalHelper; +template class GroupNormalHelper; +#endif + +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(GroupNormal, GroupNormalHelper, NV, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(GroupNormal, GroupNormalHelper, ARM, Precision::FP32); +#endif + +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +ANAKIN_REGISTER_OP_HELPER(GroupNormal, GroupNormalHelper, X86, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(GroupNormal) + .Doc("GroupNormal operator") +#ifdef USE_CUDA + .__alias__("group_normal") +#endif +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) + .__alias__("group_normal") +#endif +#ifdef USE_ARM_PLACE + .__alias__("group_normal") +#endif +#ifdef AMD_GPU + .__alias__("group_normal") +#endif + .num_in(1) + .num_out(1) + .Args("is_across_spatial", "") + .Args("is_shared_channel", "") + .Args("eps", "") + .Args("p", ""); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/group_norm.h b/framework/operators/group_norm.h new file mode 100644 index 000000000..1af91dd77 --- /dev/null +++ b/framework/operators/group_norm.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
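// GroupNormalHelper::InitParam() above reads optional attributes through
// GET_PARAMETER_WITH_DEFAULT and only fetches the scale/bias blocks when the
// matching has_scale / has_bias flag is set. The same optional-lookup idea in a
// standalone form; the attribute map and helper are invented for the example:
#include <map>
#include <string>

template <typename T>
T get_with_default(const std::map<std::string, T>& attrs,
                   const std::string& key, T fallback) {
    auto it = attrs.find(key);
    return it == attrs.end() ? fallback : it->second;
}

// usage sketch:
//   std::map<std::string, int> attrs{{"group", 32}};
//   int group = get_with_default(attrs, "group", 0);   // 32
//   int p     = get_with_default(attrs, "p", 1);       // falls back to 1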
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_GROUP_NORMAL_H +#define ANAKIN_OPERATOR_GROUP_NORMAL_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/normalize.h" + +namespace anakin { + +namespace ops { + +template +class GroupNormalHelper; + +/// pooling op +/** + * \brief GroupNormal operation class + * public inheritance Operator + */ +template +class GroupNormal : public Operator { +public: + GroupNormal() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator GroupNormal< Ttype(" + // << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class GroupNormalHelper; +}; + +/** + * \brief GroupNormal helper class + * public inherit OperatorHelper + * including init resource and shape size in group_normal context + */ +template +class GroupNormalHelper : public OperatorHelper { +public: + GroupNormalHelper()=default; + + ~GroupNormalHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for GroupNormal operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_group_normal stand for GroupNormal parameter + saber::NormalizeParam _param_group_normal; + ///< _funcs_group_normal stand for GroupNormal function + saber::Normalize::saber_type> _funcs_group_normal; + +private: + ///< _dims stand for GroupNormal size + PTuple _dims; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/gru.cpp b/framework/operators/gru.cpp index 8a433c416..1285694ee 100644 --- a/framework/operators/gru.cpp +++ b/framework/operators/gru.cpp @@ -4,26 +4,17 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Gru::operator()(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_gru; - impl->_funcs_gru(ins, outs, param, ctx); -} -#endif -#ifdef USE_X86_PLACE -template<> -void Gru::operator()(OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_gru; - impl->_funcs_gru(ins, outs, param, ctx); +#define INSTANCE_GRU(Ttype, Ptype) \ +template<> \ +void Gru::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_gru; \ + impl->_funcs_gru(ins, outs, param, ctx); \ } -#endif /// TODO ... specialization other type of operator /// set helper @@ -90,6 +81,7 @@ Status GruHelper::InferShape(const std::vector } #ifdef USE_CUDA +INSTANCE_GRU(NV, Precision::FP32); template class GruHelper; template class GruHelper; template class GruHelper; @@ -97,6 +89,7 @@ ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_GRU(ARM, Precision::FP32); template class GruHelper; template class GruHelper; template class GruHelper; @@ -104,12 +97,18 @@ ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_GRU(X86, Precision::FP32); template class GruHelper; template class GruHelper; template class GruHelper; ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, X86, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_GRU(AMD, Precision::FP32); +template class GruHelper; +ANAKIN_REGISTER_OP_HELPER(Gru, GruHelper, AMD, Precision::FP32); +#endif //! register op ANAKIN_REGISTER_OP(Gru) @@ -123,6 +122,9 @@ ANAKIN_REGISTER_OP(Gru) #ifdef USE_X86_PLACE .__alias__("gru") #endif +#ifdef AMD_GPU +.__alias__("gru") +#endif .num_in(1) .num_out(1) .Args("is_reverse", " is_reverse for gru.") diff --git a/framework/operators/im2sequence.cpp b/framework/operators/im2sequence.cpp index 67b69c263..58e6b3719 100644 --- a/framework/operators/im2sequence.cpp +++ b/framework/operators/im2sequence.cpp @@ -1,19 +1,34 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/im2sequence.h" namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Im2Sequence::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_im2sequence; - impl->_funcs_im2sequence(ins, outs, param, ctx); +#define INSTANCE_IM2SEQUENCE(Ttype, Ptype) \ +template<> \ +void Im2Sequence::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_im2sequence; \ + impl->_funcs_im2sequence(ins, outs, param, ctx); \ } -#endif /// TODO ... specialization other type of operator @@ -58,28 +73,30 @@ Status Im2SequenceHelper::InferShape(const std::vector; template class Im2SequenceHelper; template class Im2SequenceHelper; +ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_IM2SEQUENCE(ARM, Precision::FP32); template class Im2SequenceHelper; template class Im2SequenceHelper; template class Im2SequenceHelper; +ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, ARM, Precision::FP32); #endif -//template class Im2SequenceHelper; -//template class Im2SequenceHelper; -//template class Im2SequenceHelper; -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, ARM, Precision::FP32); +#ifdef AMD_GPU +INSTANCE_IM2SEQUENCE(AMD, Precision::FP32); +template class Im2SequenceHelper; +template class Im2SequenceHelper; +template class Im2SequenceHelper; +ANAKIN_REGISTER_OP_HELPER(Im2Sequence, Im2SequenceHelper, AMD, Precision::FP32); #endif + //! 
register op ANAKIN_REGISTER_OP(Im2Sequence) .Doc("Im2Sequence operator") @@ -88,6 +105,9 @@ ANAKIN_REGISTER_OP(Im2Sequence) #endif #ifdef USE_ARM_PLACE .__alias__("im2sequence") +#endif +#ifdef AMD_GPU + .__alias__("im2sequence") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/input.cpp b/framework/operators/input.cpp index 0bf2e3b27..e62dfbb0c 100644 --- a/framework/operators/input.cpp +++ b/framework/operators/input.cpp @@ -65,6 +65,9 @@ Status InputHelper::InferShape(const std::vector; ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, NV, Precision::FP32); +INSTANCE_INPUT(NV, Precision::INT8); +template class InputHelper; +ANAKIN_REGISTER_OP_HELPER(Input, InputHelper, NV, Precision::INT8); #endif #ifdef USE_ARM_PLACE diff --git a/framework/operators/interp.cpp b/framework/operators/interp.cpp index d9a709178..19e61abaf 100644 --- a/framework/operators/interp.cpp +++ b/framework/operators/interp.cpp @@ -48,7 +48,7 @@ Status InterpHelper::InferShape(const std::vector::InferShape(const std::vector"< resize_param(width_scale, height_scale); + ResizeParam resize_param(RESIZE_CUSTOM, width_scale, height_scale); _param_resize = resize_param; SABER_CHECK(_funcs_resize.compute_output_shape(ins, outs, _param_resize)); @@ -105,7 +105,13 @@ ANAKIN_REGISTER_OP_HELPER(Interp, InterpHelper, X86, Precision::FP32); INSTANCE_INTERP(ARM, Precision::FP32); template class InterpHelper; ANAKIN_REGISTER_OP_HELPER(Interp, InterpHelper, ARM, Precision::FP32); -#endif//arm +#endif + +#ifdef AMD_GPU +INSTANCE_INTERP(AMD, Precision::FP32); +template class InterpHelper; +ANAKIN_REGISTER_OP_HELPER(Interp, InterpHelper, AMD, Precision::FP32); +#endif //! register op ANAKIN_REGISTER_OP(Interp) @@ -119,6 +125,9 @@ ANAKIN_REGISTER_OP(Interp) #if defined USE_X86_PLACE || defined(BUILD_LITE) .__alias__("Interp") #endif +#ifdef AMD_GPU +.__alias__("Interp") +#endif .num_in(1) .num_out(1) .Args("height_scale", " height scale for resize") diff --git a/framework/operators/layer_norm.cpp b/framework/operators/layer_norm.cpp index 6faef4f5a..1487306ce 100644 --- a/framework/operators/layer_norm.cpp +++ b/framework/operators/layer_norm.cpp @@ -4,7 +4,7 @@ namespace anakin{ namespace ops{ -#define INSTANCE_LAYERNORM(Ttype, Ptype) \ +#define INSTANCE_LAYER_NORM(Ttype, Ptype) \ template<> \ void LayerNorm::operator()(OpContext& ctx, \ const std::vector >& ins, \ @@ -47,19 +47,25 @@ Status LayerNormHelper::InferShape(const std::vector; ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_LAYER_NORM(AMD, Precision::FP32); +template class LayerNormHelper; +ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, AMD, Precision::FP32); +#endif + #ifdef USE_X86_PLACE -INSTANCE_LAYERNORM(X86, Precision::FP32); +INSTANCE_LAYER_NORM(X86, Precision::FP32); template class LayerNormHelper; ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -INSTANCE_LAYERNORM(ARM, Precision::FP32); +INSTANCE_LAYER_NORM(ARM, Precision::FP32); template class LayerNormHelper; ANAKIN_REGISTER_OP_HELPER(LayerNorm, LayerNormHelper, ARM, Precision::FP32); #endif @@ -76,6 +82,9 @@ ANAKIN_REGISTER_OP(LayerNorm) #ifdef USE_X86_PLACE .__alias__("layernorm") #endif +#ifdef AMD_GPU +.__alias__("layernorm") +#endif .num_in(1) .num_out(1) .Args("begin_norm_axis", " begin norm axis") diff --git a/framework/operators/lrn.cpp b/framework/operators/lrn.cpp index 309db250b..ea0f497a9 100644 --- a/framework/operators/lrn.cpp +++ b/framework/operators/lrn.cpp @@ 
-4,18 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Lrn::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = impl->_param_lrn; - impl->_funcs_lrn(ins, outs, param, ctx); +#define INSTANCE_LRN(Ttype, Ptype) \ +template<> \ +void Lrn::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_lrn; \ + impl->_funcs_lrn(ins, outs, param, ctx); \ } -#endif /// TODO ... specialization other type of operator @@ -52,7 +50,12 @@ template Status LrnHelper::Init(OpContext& ctx, const std::vector >& ins, std::vector >& outs) { - SABER_CHECK(_funcs_lrn.init(ins, outs, _param_lrn, SPECIFY, VENDER_IMPL, ctx)); + + saber::ImplEnum impl_e = VENDER_IMPL; + if (std::is_same::value) { + impl_e = SABER_IMPL; + } + SABER_CHECK(_funcs_lrn.init(ins, outs, _param_lrn, SPECIFY, impl_e, ctx)); return Status::OK(); } @@ -64,23 +67,30 @@ Status LrnHelper::InferShape(const std::vector return Status::OK(); } +#ifdef AMD_GPU +INSTANCE_LRN(AMD, Precision::FP32); +template class LrnHelper; +ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, AMD, Precision::FP32); +#endif + #ifdef USE_CUDA +INSTANCE_LRN(NV, Precision::FP32); template class LrnHelper; template class LrnHelper; template class LrnHelper; +ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, NV, Precision::FP32); #endif +#if defined USE_X86_PLACE || defined(BUILD_LITE) +INSTANCE_LRN(X86, Precision::FP32); +template class LrnHelper; +ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, X86, Precision::FP32); +#endif #ifdef USE_ARM_PLACE +INSTANCE_LRN(ARM, Precision::FP32); template class LrnHelper; template class LrnHelper; template class LrnHelper; -#endif - -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE ANAKIN_REGISTER_OP_HELPER(Lrn, LrnHelper, ARM, Precision::FP32); #endif @@ -90,9 +100,15 @@ ANAKIN_REGISTER_OP(Lrn) #ifdef USE_CUDA .__alias__("LRN") #endif +#if defined USE_X86_PLACE || defined(BUILD_LITE) +.__alias__("LRN") +#endif #ifdef USE_ARM_PLACE .__alias__("LRN") #endif +#ifdef AMD_GPU +.__alias__("LRN") +#endif .num_in(3) .num_out(1); diff --git a/framework/operators/lstm.cpp b/framework/operators/lstm.cpp index a98ef7ef5..5084da750 100644 --- a/framework/operators/lstm.cpp +++ b/framework/operators/lstm.cpp @@ -4,26 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Lstm::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_lstm; - impl->_funcs_lstm(ins, outs, param, ctx); +#define INSTANCE_LSTM(Ttype, Ptype) \ +template<> \ +void Lstm::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_lstm; \ + impl->_funcs_lstm(ins, outs, param, ctx); \ } -#endif -#ifdef USE_X86_PLACE -template<> -void Lstm::operator() (OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_lstm; - impl->_funcs_lstm(ins, outs, param, ctx); -} -#endif /// TODO ... 
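// LrnHelper::Init() above switches from the vendor implementation to the saber
// one for a specific target type (X86, judging by the X86 registrations added
// in the same hunk), using std::is_same at compile time. The dispatch idiom in
// isolation; the target tags and enum are stand-ins for the framework types:
#include <type_traits>

struct X86 {};
struct NV {};
enum ImplEnum { VENDER_IMPL, SABER_IMPL };

template <typename Ttype>
ImplEnum pick_impl() {
    // the condition is a compile-time constant, so each instantiation simply
    // returns the matching enumerator
    return std::is_same<Ttype, X86>::value ? SABER_IMPL : VENDER_IMPL;
}

// pick_impl<X86>() == SABER_IMPL, pick_impl<NV>() == VENDER_IMPL
/// TODO ...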
specialization other type of operator /// set helper @@ -90,33 +80,36 @@ Status LstmHelper::InferShape(const std::vector return Status::OK(); } +#ifdef AMD_GPU +INSTANCE_LSTM(AMD, Precision::FP32); +template class LstmHelper; +ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, AMD, Precision::FP32); +#endif + #ifdef USE_CUDA +INSTANCE_LSTM(NV, Precision::FP32); template class LstmHelper; template class LstmHelper; template class LstmHelper; +ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_LSTM(ARM, Precision::FP32); template class LstmHelper; template class LstmHelper; template class LstmHelper; +ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_LSTM(X86, Precision::FP32); template class LstmHelper; template class LstmHelper; template class LstmHelper; -#endif - -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, ARM, Precision::FP32); -#endif -#ifdef USE_X86_PLACE ANAKIN_REGISTER_OP_HELPER(Lstm, LstmHelper, X86, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(Lstm) .Doc("Lstm operator") @@ -125,11 +118,15 @@ ANAKIN_REGISTER_OP(Lstm) .__alias__("LSTM") #endif #ifdef USE_ARM_PLACE - .__alias__("Lstm") + // .__alias__("Lstm") #endif #ifdef USE_X86_PLACE .__alias__("Lstm") .__alias__("LSTM") +#endif +#ifdef AMD_GPU + .__alias__("Lstm") + .__alias__("LSTM") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/lstmp.cpp b/framework/operators/lstmp.cpp new file mode 100644 index 000000000..f41b393bc --- /dev/null +++ b/framework/operators/lstmp.cpp @@ -0,0 +1,148 @@ +#include "framework/operators/lstmp.h" +#include +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void Lstmp::operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_lstm; + impl->_funcs_lstm(ins, outs, param, ctx); +} +#endif +#ifdef USE_X86_PLACE +template<> +void Lstmp::operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_lstm; + impl->_funcs_lstm(ins, outs, param, ctx); +} +template<> +void Lstmp::operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*>(this->_helper)->_param_lstm; + impl->_funcs_lstm(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator +/// set helper +template +LstmpHelper::~LstmpHelper() { +} + +template +Status LstmpHelper::InitParam() { + DLOG(WARNING) << "Parsing Lstm op parameter."; + + auto cell_dim = GET_PARAMETER(int, cellDim); + auto skip_num = GET_PARAMETER(int, skipNum); + auto out_dim = GET_PARAMETER(int, outDim); + auto rec_act_type = GET_PARAMETER(std::string, recActType); + + + using pblock_type = PBlock; + auto weight_wu = GET_PARAMETER(pblock_type, weight_1); + auto bias = GET_PARAMETER(pblock_type, weight_2); + + + LOG(INFO) << "lstmp args = [" << cell_dim << "," << out_dim << "," << skip_num + << "," << rec_act_type << "]"; + + const bool use_peepholes= true; + bool with_peephole_in = true; + bool skip_input_in = false; + bool is_reverse_in = false; + float dropout_param_in = 1.f; + int num_direction_in = 1; + int numLayers_in = 1; + LstmParam lstm_param(&(weight_wu.d_tensor()), &(bias.d_tensor()), nullptr, + Active_unknow, Active_sigmoid, + Active_tanh, Active_tanh, + with_peephole_in, skip_input_in, is_reverse_in, dropout_param_in, + num_direction_in, numLayers_in,skip_num,out_dim,cell_dim); + _param_lstm = lstm_param; + + return Status::OK(); +} + +template +Status LstmpHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + DLOG(INFO) << "inti lstm in op.cpp"; + SABER_CHECK(_funcs_lstm.init(ins, outs, _param_lstm, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status LstmpHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_lstm.compute_output_shape(ins, outs, _param_lstm)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class LstmpHelper; +template class LstmpHelper; +template class LstmpHelper; +#endif + +#ifdef USE_ARM_PLACE +template class LstmpHelper; +template class LstmpHelper; +template class LstmpHelper; +#endif + +#ifdef USE_X86_PLACE +template class LstmpHelper; +template class LstmpHelper; +template class LstmpHelper; +#endif + +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Lstmp, LstmpHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Lstmp, LstmpHelper, ARM, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(Lstmp, LstmpHelper, X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Lstmp, LstmpHelper, X86, Precision::INT8); +#endif +//! register op +ANAKIN_REGISTER_OP(Lstmp) +.Doc("Lstmp operator") +#ifdef USE_CUDA +.__alias__("Lstmp") +.__alias__("LSTMP") +#endif +#ifdef USE_ARM_PLACE +.__alias__("Lstmp") +#endif +#ifdef USE_X86_PLACE +.__alias__("Lstmp") +.__alias__("LSTMP") +.__alias__("Lstmp") +.__alias__("LSTMP") +#endif +.num_in(1) +.num_out(1) +.Args("cellDim", " is_reverse for lstm.") +.Args("skipNum", "some descp") +.Args("outDim", "some descp") +.Args("recActType", "some descp"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/lstmp.h b/framework/operators/lstmp.h new file mode 100644 index 000000000..4d44d8148 --- /dev/null +++ b/framework/operators/lstmp.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_FRAMEWORK_OPERATORS_LSTMP_H +#define ANAKIN_FRAMEWORK_OPERATORS_LSTMP_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/lstmp.h" + +namespace anakin { + +namespace ops { + +template +class LstmpHelper; + + +/// lstm op +/** + * \brief Lstm implementation class + * public inherit Operator + */ +template +class Lstmp : public Operator { +public: + Lstmp() {} + + /// forward impl + virtual void operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Lstm< Ttype(" + << target_name::value << "), Precision(" << (int)Ptype << ") >"; + } + + friend class LstmpHelper; +}; + +/** + * \brief Lstm helper class to implement Lstm + * public inherit OperatorHelper + * including init resource and shape size in Lstm context + */ +template +class LstmpHelper : public OperatorHelper { +public: + LstmpHelper() = default; + + ~LstmpHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by lstm + * \param ctx stand for Lstm operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_lstm stand for Lstm parameter + saber::LstmParam _param_lstm; + ///< _funcs_lstm stand for Lstm function + saber::Lstmp::saber_type> _funcs_lstm; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif //ANAKIN_LSTMP_H diff --git a/framework/operators/mat_mul.cpp b/framework/operators/mat_mul.cpp new file mode 100644 index 000000000..decb29d88 --- /dev/null +++ b/framework/operators/mat_mul.cpp @@ -0,0 +1,109 @@ +#include "framework/operators/mat_mul.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_MAT_MUL(Ttype, Ptype) \ +template<> \ +void MatMul::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_mat_mul; \ + impl->_funcs_mat_mul(ins, outs, param, ctx); \ +} + +/// set helper +template +MatMulHelper::~MatMulHelper() { +} + +template +Status MatMulHelper::InitParam() { + LOG(WARNING) << "Parsing MatMul op parameter."; + auto transpose_x = GET_PARAMETER(bool, transpose_x); + auto transpose_y = GET_PARAMETER(bool, transpose_y); + auto scale = GET_PARAMETER(float, coeff); + LOG(INFO) <<"mat mul coeff" << scale; + MatMulParam param_mat_mul(transpose_x, transpose_y, scale); + _param_mat_mul = param_mat_mul; + + return Status::OK(); +} + +template +Status MatMulHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_mat_mul.init(ins, outs, _param_mat_mul, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status MatMulHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_mat_mul.compute_output_shape(ins, outs, _param_mat_mul)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_MAT_MUL(NV, Precision::FP32); + +template<> +Status MatMulHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_mat_mul.init(ins, outs, _param_mat_mul, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(MatMul, MatMulHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_MAT_MUL(X86, Precision::FP32); +INSTANCE_MAT_MUL(X86, Precision::FP16); +INSTANCE_MAT_MUL(X86, Precision::INT8); +template class MatMulHelper; +ANAKIN_REGISTER_OP_HELPER(MatMul, MatMulHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_MAT_MUL(ARM, Precision::FP32); +template class MatMulHelper; +ANAKIN_REGISTER_OP_HELPER(MatMul, MatMulHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_MAT_MUL(AMD, Precision::FP32); +template class MatMulHelper; +template class MatMulHelper; +template class MatMulHelper; +ANAKIN_REGISTER_OP_HELPER(MatMul, MatMulHelper, AMD, Precision::FP32); +#endif +//! 
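
MatMulHelper::InitParam above only forwards transpose_x, transpose_y and coeff into MatMulParam, which suggests the usual scaled-GEMM semantics out = coeff * op(X) * op(Y), with op() an optional transpose. A self-contained reference sketch of that reading (an assumption about the parameter meaning, not Anakin's kernel):

#include <cassert>
#include <iostream>
#include <vector>

// Reference semantics assumed for MatMulParam(transpose_x, transpose_y, coeff):
// out = coeff * op(X) * op(Y), where op() optionally transposes its argument.
// Matrices are stored row-major.
std::vector<float> mat_mul_ref(const std::vector<float>& x, int xr, int xc,
                               const std::vector<float>& y, int yr, int yc,
                               bool transpose_x, bool transpose_y, float coeff) {
    int m  = transpose_x ? xc : xr;
    int k  = transpose_x ? xr : xc;
    int k2 = transpose_y ? yc : yr;
    int n  = transpose_y ? yr : yc;
    assert(k == k2 && "inner dimensions must match");
    std::vector<float> out(m * n, 0.f);
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            float acc = 0.f;
            for (int p = 0; p < k; ++p) {
                float xv = transpose_x ? x[p * xc + i] : x[i * xc + p];
                float yv = transpose_y ? y[j * yc + p] : y[p * yc + j];
                acc += xv * yv;
            }
            out[i * n + j] = coeff * acc;
        }
    }
    return out;
}

int main() {
    std::vector<float> x = {1, 2, 3, 4};   // 2x2
    std::vector<float> y = {1, 0, 0, 1};   // 2x2 identity
    auto out = mat_mul_ref(x, 2, 2, y, 2, 2, false, false, 0.5f);
    for (float v : out) std::cout << v << " ";  // 0.5 1 1.5 2
    std::cout << "\n";
    return 0;
}
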
register op +ANAKIN_REGISTER_OP(MatMul) +.Doc("MatMul operator") +#ifdef USE_CUDA +.__alias__("mat_mul") +#endif +#ifdef USE_ARM_PLACE +.__alias__("mat_mul") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("mat_mul") +#endif +#ifdef AMD_GPU +.__alias__("mat_mul") +#endif +.num_in(1) +.num_out(1) +.Args("type", " type of MatMul ") +.Args("channel_shared", "prelu channel is shared or not "); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/mat_mul.h b/framework/operators/mat_mul.h new file mode 100644 index 000000000..6e5f1c0f7 --- /dev/null +++ b/framework/operators/mat_mul.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_MAT_MUL_H +#define ANAKIN_OPERATOR_MAT_MUL_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/mat_mul.h" + +namespace anakin { + +namespace ops { + +template +class MatMulHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class MatMul : public Operator { +public: + MatMul() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator MatMul< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class MatMulHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class MatMulHelper : public OperatorHelper { +public: + MatMulHelper()=default; + + ~MatMulHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_mat_mul stand for mat_mul parameter + saber::MatMulParam _param_mat_mul; + ///< _funcs_mat_mul stand for mat_mul function + saber::MatMul::saber_type> _funcs_mat_mul; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/match_matrix.cpp b/framework/operators/match_matrix.cpp index acddba631..dd357c1e2 100644 --- a/framework/operators/match_matrix.cpp +++ b/framework/operators/match_matrix.cpp @@ -4,33 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void MatchMatrix::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_match_matrix; - impl->_funcs_match_matrix(ins, outs, param, ctx); +#define INSTANCE_MATCH_MATRIX(Ttype, Ptype) \ +template<> \ +void MatchMatrix::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_match_matrix; \ + impl->_funcs_match_matrix(ins, outs, param, ctx); \ } -#endif - -#ifdef USE_X86_PLACE -template<> -void MatchMatrix::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_match_matrix; - impl->_funcs_match_matrix(ins, outs, param, ctx); -} -#endif /// TODO ... specialization other type of operator @@ -47,10 +30,16 @@ Status MatchMatrixHelper::InitParam() { auto dim_t = GET_PARAMETER(int, dim_t); auto linear_term = GET_PARAMETER(bool, linear_term); auto bias_term = GET_PARAMETER(bool, bias_term); + bool is_l_same = true; + bool found_is_l_same = CHECK_PARAMETER(is_l_same); + if (found_is_l_same) { + is_l_same = GET_PARAMETER(bool, is_l_same); + } using pblock_type = PBlock; auto weights = GET_PARAMETER(pblock_type, weight_1); - MatchMatrixParam param_match_matrix(dim_in, dim_t, linear_term, bias_term, &(weights.d_tensor())); + MatchMatrixParam param_match_matrix(dim_in, dim_t, + linear_term, bias_term, is_l_same, &(weights.d_tensor())); _param_match_matrix = param_match_matrix; return Status::OK(); @@ -73,30 +62,31 @@ Status MatchMatrixHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_MATCH_MATRIX(NV, Precision::FP32); template class MatchMatrixHelper; template class MatchMatrixHelper; template class MatchMatrixHelper; +ANAKIN_REGISTER_OP_HELPER(MatchMatrix, MatchMatrixHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_MATCH_MATRIX(ARM, Precision::FP32); template class MatchMatrixHelper; template class MatchMatrixHelper; template class MatchMatrixHelper; +ANAKIN_REGISTER_OP_HELPER(MatchMatrix, MatchMatrixHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_MATCH_MATRIX(X86, Precision::FP32); template class MatchMatrixHelper; template class MatchMatrixHelper; template class MatchMatrixHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(MatchMatrix, MatchMatrixHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(MatchMatrix, MatchMatrixHelper, ARM, Precision::FP32); -#endif -#ifdef USE_X86_PLACE ANAKIN_REGISTER_OP_HELPER(MatchMatrix, MatchMatrixHelper, X86, Precision::FP32); #endif 
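
The MatchMatrix hunk above follows the same refactor applied to Lrn, Lstm, MaxOut and the other operators in this patch: the copy-pasted per-device operator() specializations are replaced by a single INSTANCE_* macro that stamps out one specialization per (target, precision) pair, and each ANAKIN_REGISTER_OP_HELPER call moves next to its instantiation. A stripped-down sketch of how such a macro works, with dummy types standing in for Anakin's Operator/helper machinery and the precision parameter dropped for brevity:

#include <iostream>

// Dummy stand-ins for Anakin's target tags, for illustration only.
struct NV  { static const char* name() { return "NV";  } };
struct X86 { static const char* name() { return "X86"; } };

template <typename Ttype>
struct DemoOp {
    // Generic fallback: not implemented for this target.
    void run() { std::cout << "DemoOp<" << Ttype::name() << ">: not implemented\n"; }
};

// The macro stamps out a full specialization per target, so adding a new backend
// (e.g. AMD in this patch) is one extra INSTANCE_DEMO(...) line instead of another
// hand-copied function body.
#define INSTANCE_DEMO(Ttype)                                                        \
template <>                                                                         \
void DemoOp<Ttype>::run() {                                                         \
    std::cout << "DemoOp<" << Ttype::name() << ">: dispatching to saber func\n";    \
}

INSTANCE_DEMO(NV)
INSTANCE_DEMO(X86)

int main() {
    DemoOp<NV>().run();
    DemoOp<X86>().run();
    return 0;
}
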
+#ifdef AMD_GPU +INSTANCE_MATCH_MATRIX(AMD, Precision::FP32); +template class MatchMatrixHelper; +ANAKIN_REGISTER_OP_HELPER(MatchMatrix, MatchMatrixHelper, AMD, Precision::FP32); +#endif //! register op ANAKIN_REGISTER_OP(MatchMatrix) .Doc("MatchMatrix operator") @@ -109,6 +99,9 @@ ANAKIN_REGISTER_OP(MatchMatrix) #ifdef USE_X86_PLACE .__alias__("match_matrix") #endif +#ifdef AMD_GPU +.__alias__("match_matrix") +#endif .num_in(2) .num_out(1) .Args("dim_in", " dims of input embedding ") diff --git a/framework/operators/maxout.cpp b/framework/operators/maxout.cpp index f53ee2410..a24ef3f5a 100644 --- a/framework/operators/maxout.cpp +++ b/framework/operators/maxout.cpp @@ -4,36 +4,16 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void MaxOut::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_maxout; - impl->_funcs_maxout(ins, outs, param, ctx); -} -#endif - -#ifdef USE_X86_PLACE -template<> -void MaxOut::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_maxout; - impl->_funcs_maxout(ins, outs, param, ctx); +#define INSTANCE_MAXOUT(Ttype, Ptype) \ +template<> \ +void MaxOut::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_maxout; \ + impl->_funcs_maxout(ins, outs, param, ctx); \ } -#endif - -/// TODO ... specialization other type of operator - /// set helper template @@ -67,30 +47,32 @@ Status MaxOutHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_MAXOUT(NV, Precision::FP32); template class MaxOutHelper; template class MaxOutHelper; template class MaxOutHelper; +ANAKIN_REGISTER_OP_HELPER(MaxOut, MaxOutHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_MAXOUT(ARM, Precision::FP32); template class MaxOutHelper; template class MaxOutHelper; template class MaxOutHelper; +ANAKIN_REGISTER_OP_HELPER(MaxOut, MaxOutHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_MAXOUT(X86, Precision::FP32); template class MaxOutHelper; template class MaxOutHelper; template class MaxOutHelper; -#endif -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(MaxOut, MaxOutHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(MaxOut, MaxOutHelper, ARM, Precision::FP32); -#endif -#ifdef USE_X86_PLACE ANAKIN_REGISTER_OP_HELPER(MaxOut, MaxOutHelper, X86, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_MAXOUT(AMD, Precision::FP32); +template class MaxOutHelper; +ANAKIN_REGISTER_OP_HELPER(MaxOut, MaxOutHelper, AMD, Precision::FP32); +#endif + //! register op ANAKIN_REGISTER_OP(MaxOut) .Doc("MaxOut operator") @@ -103,6 +85,9 @@ ANAKIN_REGISTER_OP(MaxOut) #ifdef USE_X86_PLACE .__alias__("maxout") #endif +#ifdef AMD_GPU +.__alias__("maxout") +#endif .num_in(1) .num_out(1) .Args("groups", " split tensor's channel by size groups. 
"); diff --git a/framework/operators/mean.cpp b/framework/operators/mean.cpp new file mode 100644 index 000000000..93a5f1375 --- /dev/null +++ b/framework/operators/mean.cpp @@ -0,0 +1,98 @@ +#include "framework/operators/mean.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_MEAN(Ttype, Ptype) \ +template<> \ +void Mean::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_mean; \ + impl->_funcs_mean(ins, outs, param, ctx); \ +} + +/// set helper +template +MeanHelper::~MeanHelper() { +} + +template +Status MeanHelper::InitParam() { + DLOG(WARNING) << "Parsing Mean op parameter."; + MeanParam param_mean; + _param_mean = param_mean; + + return Status::OK(); +} + +template +Status MeanHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_mean.init(ins, outs, _param_mean, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status MeanHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_mean.compute_output_shape(ins, outs, _param_mean)); + return Status::OK(); +} + +#ifdef AMD_GPU +INSTANCE_MEAN(AMD, Precision::FP32); +template class MeanHelper; +ANAKIN_REGISTER_OP_HELPER(Mean, MeanHelper, AMD, Precision::FP32); +#endif +#ifdef USE_CUDA +INSTANCE_MEAN(NV, Precision::FP32); +template class MeanHelper; +template class MeanHelper; +template class MeanHelper; +ANAKIN_REGISTER_OP_HELPER(Mean, MeanHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +INSTANCE_MEAN(ARM, Precision::FP32); +template class MeanHelper; +template class MeanHelper; +template class MeanHelper; +ANAKIN_REGISTER_OP_HELPER(Mean, MeanHelper, ARM, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +INSTANCE_MEAN(X86, Precision::FP32); +template class MeanHelper; +template class MeanHelper; +template class MeanHelper; +ANAKIN_REGISTER_OP_HELPER(Mean, MeanHelper, X86, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(Mean) +.Doc("Mean operator") +#ifdef USE_CUDA +.__alias__("mean") +#endif +#ifdef USE_ARM_PLACE +.__alias__("mean") +#endif +#ifdef USE_X86_PLACE +.__alias__("mean") +#endif +#ifdef AMD_GPU +.__alias__("mean") +#endif +.num_in(1) +.num_out(1) +.Args("groups", " split tensor's channel by size groups. "); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/mean.h b/framework/operators/mean.h new file mode 100644 index 000000000..2c5f53371 --- /dev/null +++ b/framework/operators/mean.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_MEAN_H +#define ANAKIN_OPERATOR_MEAN_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/mean.h" + +namespace anakin { + +namespace ops { + +template +class MeanHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class Mean : public Operator { +public: + Mean() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Mean< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class MeanHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class MeanHelper : public OperatorHelper { +public: + MeanHelper()=default; + + ~MeanHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_match_matrix stand for match_matrix parameter + saber::MeanParam _param_mean; + ///< _funcs_match_matrix stand for match_matrix function + saber::Mean::saber_type> _funcs_mean; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/normalize.cpp b/framework/operators/normalize.cpp index cb5170c2c..d74ecfa2b 100644 --- a/framework/operators/normalize.cpp +++ b/framework/operators/normalize.cpp @@ -1,24 +1,35 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/normalize.h" namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Normalize::operator() ( - OpContext &ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_normalize; - impl->_funcs_normalize(ins, outs, param, ctx); +#define INSTANCE_NORMALIZE(Ttype, Ptype) \ +template<> \ +void Normalize::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_normalize; \ + impl->_funcs_normalize(ins, outs, param, ctx); \ } -#endif /// TODO ... 
specialization other type of operator - - /// set helper template NormalizeHelper::~NormalizeHelper() { @@ -27,22 +38,26 @@ NormalizeHelper::~NormalizeHelper() { template Status NormalizeHelper::InitParam() { //DLOG(WARNING) << "Parsing Normalize op parameter."; - auto is_across_spatial = GET_PARAMETER(bool, is_across_spatial); - auto is_shared_channel = GET_PARAMETER(bool, is_shared_channel); + auto is_across_spatial = GET_PARAMETER_WITH_DEFAULT(bool, is_across_spatial, false); + auto is_shared_channel = GET_PARAMETER_WITH_DEFAULT(bool, is_shared_channel, false); auto eps = GET_PARAMETER(float, eps); - auto p = GET_PARAMETER(int, p); + auto p = GET_PARAMETER_WITH_DEFAULT(int, p, 1); + if (FIND_PARAMETER(weight_1)) { + using pblock_type = PBlock; + auto input_scale = GET_PARAMETER(pblock_type, weight_1); + saber::NormalizeParam normalize_param(is_across_spatial, is_shared_channel, \ + &(input_scale.d_tensor()), eps, p); + _param_normalize = normalize_param; + } else { + saber::NormalizeParam normalize_param(is_across_spatial, is_shared_channel, eps, p); + _param_normalize = normalize_param; + } - using pblock_type = PBlock; - auto input_scale = GET_PARAMETER(pblock_type, weight_1); - - saber::NormalizeParam normalize_param(is_across_spatial, is_shared_channel, \ - &(input_scale.d_tensor()), eps, p); - _param_normalize = normalize_param; return Status::OK(); } template -Status NormalizeHelper::Init(OpContext &ctx, +Status NormalizeHelper::Init(OpContext &ctx, const std::vector >& ins, std::vector >& outs) { SABER_CHECK(_funcs_normalize.init(ins, outs, _param_normalize, SPECIFY, SABER_IMPL, ctx)); @@ -56,32 +71,40 @@ Status NormalizeHelper::InferShape(const std::vector; +ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, AMD, Precision::FP32); +#endif + #ifdef USE_CUDA +INSTANCE_NORMALIZE(NV, Precision::FP32); template class NormalizeHelper; template class NormalizeHelper; template class NormalizeHelper; #endif #ifdef USE_X86_PLACE +INSTANCE_NORMALIZE(X86, Precision::FP32); template class NormalizeHelper; #endif #ifdef USE_ARM_PLACE +INSTANCE_NORMALIZE(ARM, Precision::FP32); template class NormalizeHelper; template class NormalizeHelper; template class NormalizeHelper; #endif -// register helper #ifdef USE_CUDA ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, NV, Precision::FP32); -#endif +#endif #ifdef USE_ARM_PLACE ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, ARM, Precision::FP32); #endif -#ifdef USE_X86_PLACE +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) ANAKIN_REGISTER_OP_HELPER(Normalize, NormalizeHelper, X86, Precision::FP32); #endif @@ -91,11 +114,14 @@ ANAKIN_REGISTER_OP(Normalize) #ifdef USE_CUDA .__alias__("normalize") #endif -#ifdef USE_X86_PLACE +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) .__alias__("normalize") #endif #ifdef USE_ARM_PLACE .__alias__("normalize") +#endif +#ifdef AMD_GPU + .__alias__("normalize") #endif .num_in(1) .num_out(1) diff --git a/framework/operators/one_hot.cpp b/framework/operators/one_hot.cpp new file mode 100644 index 000000000..7fb6f925f --- /dev/null +++ b/framework/operators/one_hot.cpp @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
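
The NormalizeHelper::InitParam hunk above switches to GET_PARAMETER_WITH_DEFAULT for the optional attributes and uses FIND_PARAMETER(weight_1) to decide whether the NormalizeParam carries a scale tensor. A standalone analogue of that optional-parameter pattern (the AttrMap below is a stand-in, not Anakin's attribute store):

#include <iostream>
#include <map>
#include <string>

// Minimal attribute store mimicking FIND_PARAMETER / GET_PARAMETER_WITH_DEFAULT.
struct AttrMap {
    std::map<std::string, float> attrs;
    bool find(const std::string& key) const { return attrs.count(key) != 0; }
    float get_with_default(const std::string& key, float dflt) const {
        auto it = attrs.find(key);
        return it == attrs.end() ? dflt : it->second;
    }
};

int main() {
    AttrMap node;
    node.attrs["eps"] = 1e-6f;   // present; "p" and "weight_1" intentionally absent

    float eps = node.get_with_default("eps", 1e-6f);
    int   p   = static_cast<int>(node.get_with_default("p", 1));  // falls back to the default norm order

    if (node.find("weight_1")) {
        std::cout << "build the param with a per-channel scale tensor\n";
    } else {
        std::cout << "build the param without scale (eps=" << eps << ", p=" << p << ")\n";
    }
    return 0;
}
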
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "framework/operators/one_hot.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ONE_HOT(Ttype, Ptype) \ +template<> \ +void OneHot::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_one_hot; \ + impl->_funcs_one_hot(ins, outs, param, ctx); \ +} + +template +Status OneHotHelper::InitParam() { + + DLOG(WARNING) << "Parsing OneHot op parameter."; + auto depth = GET_PARAMETER(int, depth); + saber::OneHotParam one_hot_param(depth); + _param_one_hot = one_hot_param; + return Status::OK(); +} + +template +Status OneHotHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + + //different device pleace change here.. + saber::ImplEnum impl_e = SABER_IMPL; + SABER_CHECK(_funcs_one_hot.init(ins, outs, _param_one_hot, SPECIFY, impl_e, ctx)); + + // check if weights have been transposed + return Status::OK(); +} + +template +Status OneHotHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_one_hot.compute_output_shape(ins, outs, _param_one_hot)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class OneHotHelper; +INSTANCE_ONE_HOT(NV, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(OneHot, OneHotHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ONE_HOT(X86, Precision::FP32); +template class OneHotHelper; +ANAKIN_REGISTER_OP_HELPER(OneHot, OneHotHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_ONE_HOT(ARM, Precision::FP32); +template class OneHotHelper; +ANAKIN_REGISTER_OP_HELPER(OneHot, OneHotHelper, ARM, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(OneHot) +.Doc("OneHot operator") +#ifdef USE_CUDA +.__alias__("one_hot") +#endif +#ifdef USE_ARM_PLACE +.__alias__("one_hot") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("one_hot") +#endif +.num_in(1) +.num_out(1) +.Args("depth", " depth of one_hot "); +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/one_hot.h b/framework/operators/one_hot.h new file mode 100644 index 000000000..7e99e1739 --- /dev/null +++ b/framework/operators/one_hot.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
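
OneHotHelper above parses a single depth attribute. Under the conventional one-hot reading (an assumption, since the saber kernel itself is not shown here), every input index expands to a depth-length vector with a single 1 at that position:

#include <iostream>
#include <vector>

// Conventional one_hot semantics assumed for the "depth" attribute parsed above.
std::vector<float> one_hot_ref(const std::vector<int>& ids, int depth) {
    std::vector<float> out(ids.size() * depth, 0.f);
    for (size_t i = 0; i < ids.size(); ++i) {
        if (ids[i] >= 0 && ids[i] < depth) {
            out[i * depth + ids[i]] = 1.f;
        }
    }
    return out;
}

int main() {
    std::vector<int> ids = {2, 0, 3};
    auto out = one_hot_ref(ids, 4);
    for (size_t i = 0; i < ids.size(); ++i) {
        for (int d = 0; d < 4; ++d) std::cout << out[i * 4 + d] << " ";
        std::cout << "\n";   // 0 0 1 0 / 1 0 0 0 / 0 0 0 1
    }
    return 0;
}
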
+*/ + +#ifndef ANAKIN_OPERATOR_ONE_HOT_H +#define ANAKIN_OPERATOR_ONE_HOT_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/one_hot.h" + +namespace anakin { + +namespace ops { + +template +class OneHotHelper; + +/// pooling op +/** + * \brief operation class + * public inheritance Operator + */ +template +class OneHot : public Operator { +public: + OneHot() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + + LOG(ERROR) << "Not Impl Yet Operator OneHot< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class OneHotHelper; +}; + +/** + * \brief helper class + * public inherit OperatorHelper + * including init resource and shape size in one_hot context + */ +template +class OneHotHelper : public OperatorHelper { +public: + OneHotHelper() = default; + + ~OneHotHelper() = default; + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for one_hot operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_one_hot stand for one_hot parameter + saber::OneHotParam _param_one_hot; + ///< _funcs_one_hot stand for one_hot function + saber::OneHot::saber_type> _funcs_one_hot; + +private: + ///< _dims stand for OneHot size + PTuple _dims; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/ops.h b/framework/operators/ops.h index 4cb8a0b1f..5a5b2577f 100644 --- a/framework/operators/ops.h +++ b/framework/operators/ops.h @@ -22,7 +22,6 @@ #include "framework/operators/axpy.h" #include "framework/operators/batch_norm.h" #include "framework/operators/concat.h" -#include "framework/operators/conv_3x3.h" #include "framework/operators/convolution.h" #include "framework/operators/crf_decoding.h" #include "framework/operators/crop.h" @@ -63,10 +62,6 @@ #include "framework/operators/split.h" #include "framework/operators/standard_rnn.h" -#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h" -#include "framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu_pool.h" -#include "framework/operators/fusion_ops/conv_3x3_relu.h" -#include "framework/operators/fusion_ops/conv_3x3_relu_pool.h" #include "framework/operators/fusion_ops/conv_batchnorm_scale.h" #include "framework/operators/fusion_ops/conv_batchnorm_scale_relu.h" #include "framework/operators/fusion_ops/conv_batchnorm_scale_relu_pool.h" diff --git a/framework/operators/output.cpp b/framework/operators/output.cpp index e5e1f8ee4..b5fb43c31 100644 --- a/framework/operators/output.cpp +++ b/framework/operators/output.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/output.h" namespace anakin { @@ -47,6 +61,12 @@ template class OutputHelper; ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, ARM, Precision::FP32); #endif //arm +#ifdef AMD_GPU +INSTANCE_OUTPUT(AMD, Precision::FP32); +template class OutputHelper; +ANAKIN_REGISTER_OP_HELPER(Output, OutputHelper, AMD, Precision::FP32); +#endif + //! register op ANAKIN_REGISTER_OP(Output) #ifdef USE_CUDA @@ -58,6 +78,9 @@ ANAKIN_REGISTER_OP(Output) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("output") #endif +#ifdef AMD_GPU +.__alias__("output") +#endif .Doc("Output operator [ only a input data holder and reshape ] "); } /* namespace ops */ diff --git a/framework/operators/pad.cpp b/framework/operators/pad.cpp new file mode 100644 index 000000000..5bc77ea00 --- /dev/null +++ b/framework/operators/pad.cpp @@ -0,0 +1,91 @@ +#include "framework/operators/pad.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_PAD(Ttype, Ptype) \ +template<> \ +void Pad::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_pad; \ + impl->_funcs_pad(ins, outs, param, ctx); \ +} + +template +Status PadHelper::InitParam() { + LOG(WARNING) << "!!!!!!!! Parsing Pad op parameter."; + auto pad_c = GET_PARAMETER(PTuple, pad_c); + auto pad_h = GET_PARAMETER(PTuple, pad_h); + auto pad_w = GET_PARAMETER(PTuple, pad_w); + + + saber::PadParam Pad_param(pad_c.vector(),pad_h.vector(),pad_w.vector()); + _param_pad = Pad_param; + return Status::OK(); +} + +template +Status PadHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pad.init(ins, outs, _param_pad, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status PadHelper::InferShape(const std::vector >&ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pad.compute_output_shape(ins, outs, _param_pad)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_PAD(NV, Precision::FP32); +template class PadHelper; +ANAKIN_REGISTER_OP_HELPER(Pad, PadHelper, NV, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_PAD(AMD, Precision::FP32); +template class PadHelper; +ANAKIN_REGISTER_OP_HELPER(Pad, PadHelper, AMD, Precision::FP32); +#endif + +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +INSTANCE_PAD(X86, Precision::FP32); +template class PadHelper; +ANAKIN_REGISTER_OP_HELPER(Pad, PadHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_PAD(ARM, Precision::FP32); +template class PadHelper; +ANAKIN_REGISTER_OP_HELPER(Pad, PadHelper, ARM, Precision::FP32); +#endif + +//! 
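
PadHelper::InitParam above packs three integer tuples (pad_c, pad_h, pad_w) into PadParam. Assuming each tuple holds the amount added before and after the corresponding NCHW axis, the output shape works out as in this small sketch (shape arithmetic only, no data movement):

#include <array>
#include <iostream>

// Assumed semantics of PadParam(pad_c, pad_h, pad_w): each pair is the padding added
// before and after that NCHW axis, so only the output shape is computed here.
std::array<int, 4> padded_shape(std::array<int, 4> nchw,
                                std::array<int, 2> pad_c,
                                std::array<int, 2> pad_h,
                                std::array<int, 2> pad_w) {
    nchw[1] += pad_c[0] + pad_c[1];
    nchw[2] += pad_h[0] + pad_h[1];
    nchw[3] += pad_w[0] + pad_w[1];
    return nchw;
}

int main() {
    auto out = padded_shape({1, 3, 8, 8}, {0, 0}, {1, 1}, {2, 2});
    std::cout << out[0] << "x" << out[1] << "x" << out[2] << "x" << out[3] << "\n";  // 1x3x10x12
    return 0;
}
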
register op +ANAKIN_REGISTER_OP(Pad) +.Doc("Pad operator") +#ifdef USE_CUDA +.__alias__("Pad") +#endif +#ifdef USE_ARM_PLACE +.__alias__("Pad") +#endif +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +.__alias__("Pad") +#endif +#ifdef AMD_GPU +.__alias__("Pad") +#endif +.num_in(1) +.num_out(1) +.Args>("dims", " dims for permuting the order of input "); + +} /* namespace ops */ + +} /* namespace anakin */ diff --git a/framework/operators/pad.h b/framework/operators/pad.h new file mode 100644 index 000000000..ab9851f88 --- /dev/null +++ b/framework/operators/pad.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_PAD_H +#define ANAKIN_OPERATOR_PAD_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/pad.h" + +namespace anakin { + +namespace ops { + +template +class PadHelper; + +/// pooling op +/** + * \brief Pad implementation class + * public inherit Operator + */ +template +class Pad : public Operator { +public: + Pad() {} + + /// forward impl + virtual void operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator Pad< Ttype(" << target_name::value << "), Precision("<< Ptype <<") >"; + LOG(ERROR) << "Not Impl Yet Operator Pad"; + } + + friend class PadHelper; +}; + +/** + * \brief Permut helper class to implement conv 3X3 + * public inherit OperatorHelper + * including init resource and shape size in Permut context + */ +template +class PadHelper : public OperatorHelper { +public: + PadHelper() = default; + + ~PadHelper() {} + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for Permut operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_Pad stand for Pad parameter + saber::PadParam _param_pad; + ///< _funcs_Pad stand for Pad function + saber::Pad::saber_type> _funcs_pad; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/pad2d.cpp b/framework/operators/pad2d.cpp new file mode 100644 index 000000000..08759faa8 --- /dev/null +++ b/framework/operators/pad2d.cpp @@ -0,0 +1,104 @@ +#include "framework/operators/pad2d.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_PAD2D(Ttype, Ptype) \ +template<> \ +void Pad2D::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_pad2d; \ + impl->_funcs_pad2d(ins, outs, param, ctx); \ +} + +template +Status Pad2DHelper::InitParam() { + DLOG(WARNING) << "Parsing Pad2D op parameter."; + auto mode = GET_PARAMETER(std::string, mode); + auto pad_value = GET_PARAMETER_WITH_DEFAULT(float, value, 0.f); + auto pad_h = GET_PARAMETER(PTuple, pad_h); + auto pad_w = GET_PARAMETER(PTuple, pad_w); + + PadMode pad_mode; + if (mode == "constant"){ + pad_mode = PAD_CONSTANT; + } else if (mode == "edge"){ + pad_mode = PAD_EDGE; + } else if (mode == "reflect"){ + pad_mode = PAD_REFLECT; + } else { + pad_mode = PAD_CONSTANT; + } + saber::Pad2DParam pad2d_param(pad_h.vector(), pad_w.vector(), pad_value, pad_mode); + _param_pad2d = pad2d_param; + return Status::OK(); +} + +template +Status Pad2DHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pad2d.init(ins, outs, _param_pad2d, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status Pad2DHelper::InferShape(const std::vector >&ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pad2d.compute_output_shape(ins, outs, _param_pad2d)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_PAD2D(NV, Precision::FP32); +template class Pad2DHelper; +ANAKIN_REGISTER_OP_HELPER(Pad2D, Pad2DHelper, NV, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_PAD2D(AMD, Precision::FP32); +template class Pad2DHelper; +ANAKIN_REGISTER_OP_HELPER(Pad2D, Pad2DHelper, AMD, Precision::FP32); +#endif + +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +INSTANCE_PAD2D(X86, Precision::FP32); +template class Pad2DHelper; +ANAKIN_REGISTER_OP_HELPER(Pad2D, Pad2DHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_PAD2D(ARM, Precision::FP32); +template class Pad2DHelper; +ANAKIN_REGISTER_OP_HELPER(Pad2D, Pad2DHelper, ARM, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(Pad2D) +.Doc("Pad2D operator") +#ifdef USE_CUDA +.__alias__("Pad2D") +#endif +#ifdef USE_ARM_PLACE +.__alias__("Pad2D") +#endif +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +.__alias__("Pad2D") +#endif +#ifdef AMD_GPU +.__alias__("Pad2D") +#endif +.num_in(1) +.num_out(1) +.Args("mode", "pad mode") +.Args("pad_value", "pad value") +.Args>("pad_h", "pad left and right value") +.Args>("pad_w", "pad top and bottom value"); + +} /* namespace ops */ + +} /* namespace anakin */ diff --git a/framework/operators/pad2d.h b/framework/operators/pad2d.h new file mode 100644 index 000000000..cc18d4679 --- /dev/null +++ b/framework/operators/pad2d.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. 
All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_PAD2D_H +#define ANAKIN_OPERATOR_PAD2D_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/pad2d.h" + +namespace anakin { + +namespace ops { + +template +class Pad2DHelper; + +/// pad2d op +/** +* \brief Pad implementation class +* public inherit Operator +*/ +template +class Pad2D : public Operator { +public: + Pad2D() {} + + /// forward impl + virtual void operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Pad2D< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class Pad2DHelper; +}; + +/** +* \brief Pad2D helper class to implement conv 3X3 +* public inherit OperatorHelper +* including init resource and shape size in Permut context +*/ +template +class Pad2DHelper : public OperatorHelper { +public: + Pad2DHelper() = default; + + ~Pad2DHelper() {} + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for Pad2D operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_Pad2D stand for Pad2D parameter + saber::Pad2DParam _param_pad2d; + ///< _funcs_Pad2D stand for Pad2D function + saber::Pad2D::saber_type> _funcs_pad2d; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/permute.cpp b/framework/operators/permute.cpp index c80771333..b04787221 100644 --- a/framework/operators/permute.cpp +++ b/framework/operators/permute.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
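
In the Pad2DHelper::InitParam hunk above, the mode string is mapped onto a PadMode enum with constant padding as the fallback for unknown values. A standalone version of that mapping (the enum values here are stand-ins for the saber ones):

#include <iostream>
#include <string>

// Stand-in enum mirroring the string -> PadMode mapping in Pad2DHelper::InitParam.
enum PadMode { PAD_CONSTANT, PAD_EDGE, PAD_REFLECT };

PadMode parse_pad_mode(const std::string& mode) {
    if (mode == "edge")    return PAD_EDGE;
    if (mode == "reflect") return PAD_REFLECT;
    return PAD_CONSTANT;   // "constant" and any unrecognized string fall back to constant padding
}

int main() {
    std::cout << parse_pad_mode("reflect") << " "    // 2
              << parse_pad_mode("bogus")   << "\n";  // 0
    return 0;
}
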
+*/ #include "framework/operators/permute.h" namespace anakin { @@ -51,6 +65,12 @@ template class PermuteHelper; ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_PERMUTE(AMD, Precision::FP32); +template class PermuteHelper; +ANAKIN_REGISTER_OP_HELPER(Permute, PermuteHelper, AMD, Precision::FP32); +#endif + #if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_PERMUTE(X86, Precision::FP32); template class PermuteHelper; @@ -75,6 +95,9 @@ ANAKIN_REGISTER_OP(Permute) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("permute") #endif +#ifdef AMD_GPU +.__alias__("permute") +#endif .num_in(1) .num_out(1) .Args>("dims", " dims for permuting the order of input "); diff --git a/framework/operators/pixel_shuffle.cpp b/framework/operators/pixel_shuffle.cpp new file mode 100644 index 000000000..85a6de2b0 --- /dev/null +++ b/framework/operators/pixel_shuffle.cpp @@ -0,0 +1,79 @@ +#include "framework/operators/pixel_shuffle.h" + +namespace anakin { + +namespace ops { + +template +void PixelShuffle::operator()(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> + (this->_helper)->_param_pixel_shuffle; + impl->_funcs_pixel_shuffle(ins, outs, param, ctx); +} + +template +Status PixelShuffleHelper::InitParam() { + DLOG(WARNING) << " Parsing PixelShuffle op parameter."; + auto rw = GET_PARAMETER(int, rw); + auto rh = GET_PARAMETER(int, rh); + auto channel_first = GET_PARAMETER(bool, channel_first); + + saber::PixelShuffleParam pixel_shuffle_param(rh, rw, channel_first); + _param_pixel_shuffle = pixel_shuffle_param; + return Status::OK(); +} + +template +Status PixelShuffleHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pixel_shuffle.init(ins, outs, _param_pixel_shuffle, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status PixelShuffleHelper::InferShape(const std::vector >& + ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pixel_shuffle.compute_output_shape(ins, outs, _param_pixel_shuffle)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class PixelShuffleHelper; +ANAKIN_REGISTER_OP_HELPER(PixelShuffle, PixelShuffleHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +template class PixelShuffleHelper; +ANAKIN_REGISTER_OP_HELPER(PixelShuffle, PixelShuffleHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +template class PixelShuffleHelper; +ANAKIN_REGISTER_OP_HELPER(PixelShuffle, PixelShuffleHelper, ARM, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(PixelShuffle) +.Doc("PixelShuffle operator") +#ifdef USE_CUDA +.__alias__("PixelShuffle") +#endif +#ifdef USE_ARM_PLACE +.__alias__("PixelShuffle") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("PixelShuffle") +#endif +.num_in(1) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/pixel_shuffle.h b/framework/operators/pixel_shuffle.h new file mode 100644 index 000000000..7d2723de1 --- /dev/null +++ b/framework/operators/pixel_shuffle.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_PIXEL_SHUFFLE_H +#define ANAKIN_OPERATOR_PIXEL_SHUFFLE_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/pixel_shuffle.h" + +namespace anakin { + +namespace ops { + +template +class PixelShuffleHelper; + +/// pooling op +/** + * \brief PixelShuffle implementation class + * public inherit Operator + */ +template +class PixelShuffle : public Operator { +public: + PixelShuffle() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs); + + + friend class PixelShuffleHelper; +}; + +/** + * \brief Permut helper class to implement conv 3X3 + * public inherit OperatorHelper + * including init resource and shape size in Permut context + */ +template +class PixelShuffleHelper : public OperatorHelper { +public: + PixelShuffleHelper()=default; + + ~PixelShuffleHelper() {} + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for Permut operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_PixelShuffle stand for PixelShuffle parameter + saber::PixelShuffleParam _param_pixel_shuffle; + ///< _funcs_PixelShuffle stand for PixelShuffle function + saber::PixelShuffle::saber_type> _funcs_pixel_shuffle; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/pooling.cpp b/framework/operators/pooling.cpp index 5ad7d29a7..b7107557c 100644 --- a/framework/operators/pooling.cpp +++ b/framework/operators/pooling.cpp @@ -37,6 +37,12 @@ Status PoolingHelper::InitParam() { pool_strides[0], pool_strides[1], Pooling_average_include_padding, global_pooling, cmp_out_shape_floor_as_conv); _param_pooling = pooling_param; + } else if (pool_method == "AVGEXC") { + PoolingParam pooling_param(pool_size[0], pool_size[1], + pool_padding[0], pool_padding[1], + pool_strides[0], pool_strides[1], + Pooling_average_exclude_padding, global_pooling, cmp_out_shape_floor_as_conv); + _param_pooling = pooling_param; } else { LOG(FATAL) << " Pooling op doesn't support : " << pool_method << " pooling."; } @@ -68,6 +74,15 @@ Status PoolingHelper::Init(OpContext &ctx, \ return Status::OK(); } ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, NV, Precision::FP32); +INSTANCE_POOLING(NV, Precision::INT8); +template <> +Status PoolingHelper::Init(OpContext &ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_pooling.init(ins, outs, _param_pooling, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, NV, Precision::INT8); #endif #ifdef USE_ARM_PLACE @@ -78,8 +93,10 @@ ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, ARM, Precision::FP32); #if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_POOLING(X86, Precision::FP32); +INSTANCE_POOLING(X86, Precision::INT8); template class PoolingHelper; ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(Pooling, PoolingHelper, X86, Precision::INT8); #endif #ifdef AMD_GPU @@ -93,6 +110,8 @@ ANAKIN_REGISTER_OP(Pooling) #ifdef USE_CUDA .__alias__("pooling") .__alias__("pool") +.__alias__("pooling") +.__alias__("pool") #endif #ifdef USE_ARM_PLACE .__alias__("pooling") diff --git a/framework/operators/power.cpp b/framework/operators/power.cpp index 226fabb70..c4b3fc783 100644 --- a/framework/operators/power.cpp +++ b/framework/operators/power.cpp @@ -1,36 +1,33 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
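
The new AVGEXC branch in PoolingHelper::InitParam above selects Pooling_average_exclude_padding. The difference only shows up where a window overlaps the padding: the exclude variant divides by the number of real input elements covered, while the include variant always divides by the full window size. A toy 1-D illustration of that distinction (not saber code):

#include <iostream>
#include <vector>

// Average pooling at the right border, with the window hanging one tap past the input.
// "exclude" divides by the count of in-bounds elements; "include" divides by the window size.
float avg_pool_at_edge(const std::vector<float>& x, int win, bool exclude_padding) {
    float sum = x.back();                       // only one real element is covered by the window
    int   cnt = exclude_padding ? 1 : win;      // padded zeros contribute 0 to the sum either way
    return sum / cnt;
}

int main() {
    std::vector<float> x = {4.f, 8.f};
    std::cout << "include padding: " << avg_pool_at_edge(x, 2, false) << "\n";  // 4
    std::cout << "exclude padding: " << avg_pool_at_edge(x, 2, true)  << "\n";  // 8
    return 0;
}
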
+*/ #include "framework/operators/power.h" namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void Power::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = impl->_param_power; - impl->_funcs_power(ins, outs, param, ctx); +#define INSTANCE_POWER(Ttype, Ptype) \ +template<> \ +void Power::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*> \ + (this->_helper)->_param_power; \ + impl->_funcs_power(ins, outs, param, ctx); \ } -#endif - -#if defined USE_X86_PLACE || defined BUILD_LITE - template<> - void Power::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = impl->_param_power; - impl->_funcs_power(ins, outs, param, ctx); - } -#endif -/// TODO ... specialization other type of operator - /// set helper template @@ -66,23 +63,31 @@ Status PowerHelper::InferShape(const std::vector; template class PowerHelper; template class PowerHelper; #endif #ifdef USE_ARM_PLACE +INSTANCE_POWER(ARM, Precision::FP32); template class PowerHelper; template class PowerHelper; template class PowerHelper; #endif #if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_POWER(X86, Precision::FP32); template class PowerHelper; template class PowerHelper; template class PowerHelper; #endif +#ifdef AMD_GPU +INSTANCE_POWER(AMD, Precision::FP32); +template class PowerHelper; +ANAKIN_REGISTER_OP_HELPER(Power, PowerHelper, AMD, Precision::FP32); +#endif // register helper #ifdef USE_CUDA @@ -106,6 +111,9 @@ ANAKIN_REGISTER_OP(Power) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("power") #endif +#ifdef AMD_GPU +.__alias__("power") +#endif .num_in(1) .num_out(1) .Args("scale", " scale of param for pawer") diff --git a/framework/operators/priorbox.cpp b/framework/operators/priorbox.cpp index e02377dbd..1a3b7314a 100644 --- a/framework/operators/priorbox.cpp +++ b/framework/operators/priorbox.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
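The per-target operator() specializations of Power are folded into a single INSTANCE_POWER(Ttype, Ptype) macro above; only the dispatch boilerplate changes, not the math. For reference, a Caffe-style power layer computes y = (shift + scale * x)^power element-wise; the registered scale argument suggests saber::PowerParam follows the same convention, which is assumed (not verified) in this sketch:

#include <cmath>
#include <cstddef>

// Element-wise reference for a power layer: y = (shift + scale * x)^power.
// Standalone loop, not the saber kernel.
static void power_ref(const float* x, float* y, std::size_t n,
                      float power, float scale, float shift) {
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = std::pow(shift + scale * x[i], power);
    }
}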
+*/ #include "framework/operators/priorbox.h" namespace anakin { @@ -93,6 +107,12 @@ template class PriorBoxHelper; ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_PRIORBOX(AMD, Precision::FP32); +template class PriorBoxHelper; +ANAKIN_REGISTER_OP_HELPER(PriorBox, PriorBoxHelper, AMD, Precision::FP32); +#endif + #ifdef USE_ARM_PLACE INSTANCE_PRIORBOX(ARM, Precision::FP32); template class PriorBoxHelper; @@ -117,6 +137,9 @@ ANAKIN_REGISTER_OP(PriorBox) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("priorbox") #endif +#ifdef AMD_GPU +.__alias__("priorbox") +#endif .num_in(1) .num_out(1) .Args>("min_size", " min_size of bbox ") diff --git a/framework/operators/product_quant_embedding_with_vsum.cpp b/framework/operators/product_quant_embedding_with_vsum.cpp new file mode 100644 index 000000000..2e8ebdac4 --- /dev/null +++ b/framework/operators/product_quant_embedding_with_vsum.cpp @@ -0,0 +1,156 @@ +#include "framework/operators/product_quant_embedding_with_vsum.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void ProductQuantEmbeddingWithVsum::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = + static_cast*>(this->_helper); + auto& param = + static_cast*>(this->_helper)->_param_product_quant_embedding_with_vsum; + impl->_funcs_product_quant_embedding_with_vsum(ins, outs, param, ctx); +} +#endif + +#ifdef USE_X86_PLACE +template<> +void ProductQuantEmbeddingWithVsum::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = + static_cast*>(this->_helper); + auto& param = + static_cast*>(this->_helper)->_param_product_quant_embedding_with_vsum; + impl->_funcs_product_quant_embedding_with_vsum(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator + + +/// set helper +template +ProductQuantEmbeddingWithVsumHelper::~ProductQuantEmbeddingWithVsumHelper() { +} + +template +Status ProductQuantEmbeddingWithVsumHelper::InitParam() { + DLOG(WARNING) << "Parsing ProductQuantEmbeddingWithVsum op parameter."; + auto word_voc = GET_PARAMETER(int, word_voc); + auto word_emb = GET_PARAMETER(int, word_emb); + auto max_seq_len = GET_PARAMETER(int, max_seq_len); + auto top_unigram = GET_PARAMETER(int, top_unigram); + auto sec_unigram = GET_PARAMETER(int, sec_unigram); + auto thd_unigram = GET_PARAMETER(int, thd_unigram); + auto top_bigram = GET_PARAMETER(int, top_bigram); + auto sec_bigram = GET_PARAMETER(int, sec_bigram); + auto thd_bigram = GET_PARAMETER(int, thd_bigram); + auto top_collocation = GET_PARAMETER(int, top_collocation); + auto sec_collocation = GET_PARAMETER(int, sec_collocation); + auto thd_collocation = GET_PARAMETER(int, thd_collocation); + + + using pblock_type = PBlock; + auto embedding_0 = GET_PARAMETER(pblock_type, weight_3); + auto embedding_1 = GET_PARAMETER(pblock_type, weight_6); + auto embedding_2 = GET_PARAMETER(pblock_type, weight_9); + auto quant_dict_0 = GET_PARAMETER(pblock_type, weight_2); + auto quant_dict_1 = GET_PARAMETER(pblock_type, weight_5); + auto quant_dict_2 = GET_PARAMETER(pblock_type, weight_8); + + ProductQuantEmbeddingWithVsumParam param_product_quant_embedding_with_vsum(word_emb, + word_voc, + top_unigram, + top_bigram, + top_collocation, + sec_unigram, + sec_bigram, + sec_collocation, + thd_unigram, + thd_bigram, + thd_collocation, + max_seq_len, + &(embedding_0.d_tensor()), + &(embedding_1.d_tensor()), + &(embedding_2.d_tensor()), + &(quant_dict_0.d_tensor()), + &(quant_dict_1.d_tensor()), + &(quant_dict_2.d_tensor())); + + _param_product_quant_embedding_with_vsum = param_product_quant_embedding_with_vsum; + + return Status::OK(); +} + +template +Status ProductQuantEmbeddingWithVsumHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_product_quant_embedding_with_vsum.init(ins, outs, _param_product_quant_embedding_with_vsum, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status ProductQuantEmbeddingWithVsumHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_product_quant_embedding_with_vsum.compute_output_shape(ins, outs, _param_product_quant_embedding_with_vsum)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class ProductQuantEmbeddingWithVsumHelper; +template class ProductQuantEmbeddingWithVsumHelper; +template class ProductQuantEmbeddingWithVsumHelper; +#endif +#ifdef USE_ARM_PLACE +template class ProductQuantEmbeddingWithVsumHelper; +template class ProductQuantEmbeddingWithVsumHelper; +template class ProductQuantEmbeddingWithVsumHelper; +#endif +#ifdef USE_X86_PLACE +template class ProductQuantEmbeddingWithVsumHelper; +template class ProductQuantEmbeddingWithVsumHelper; +template class ProductQuantEmbeddingWithVsumHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(ProductQuantEmbeddingWithVsum, ProductQuantEmbeddingWithVsumHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +//ANAKIN_REGISTER_OP_HELPER(ProductQuantEmbeddingWithVsum, ProductQuantEmbeddingWithVsumHelper, ARM, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(ProductQuantEmbeddingWithVsum, ProductQuantEmbeddingWithVsumHelper, X86, Precision::FP32); +#endif +//! 
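To make the parameter list above easier to follow: a product-quantized embedding stores each word vector as a short code (one codebook index per sub-vector) and reconstructs it from a shared quantization dictionary at lookup time; the "with_vsum" suffix means the reconstructed vectors are summed over the sequence. The sketch below shows a single quantization tier with 8-bit codes; the real op keeps three tiers (top/sec/thd) per n-gram type, and the names, code width and layouts here are assumptions rather than the saber implementation.

#include <cstddef>
#include <cstdint>
#include <vector>

// Single-tier product-quantized embedding lookup with vector sum.
// codes      : word_voc x num_subvec codebook indices (one byte per sub-vector)
// quant_dict : num_subvec x 256 x sub_dim reconstruction dictionary
static std::vector<float> pq_embedding_vsum(const std::vector<int>& word_ids,
                                            const std::vector<std::uint8_t>& codes,
                                            const std::vector<float>& quant_dict,
                                            int num_subvec, int sub_dim) {
    std::vector<float> out(static_cast<std::size_t>(num_subvec) * sub_dim, 0.f);
    for (int w : word_ids) {
        for (int s = 0; s < num_subvec; ++s) {
            std::uint8_t code = codes[static_cast<std::size_t>(w) * num_subvec + s];
            const float* sub = &quant_dict[(static_cast<std::size_t>(s) * 256 + code) * sub_dim];
            for (int d = 0; d < sub_dim; ++d) {
                out[static_cast<std::size_t>(s) * sub_dim + d] += sub[d];  // "vsum": accumulate over the sequence
            }
        }
    }
    return out;
}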
register op +ANAKIN_REGISTER_OP(ProductQuantEmbeddingWithVsum) +.Doc("ProductQuantEmbeddingWithVsum operator") +#ifdef USE_CUDA +.__alias__("product_quant_embedding_with_vsum") +#endif +#ifdef USE_ARM_PLACE +//.__alias__("product_quant_embedding_with_vsum") +#endif +#ifdef USE_X86_PLACE +.__alias__("product_quant_embedding_with_vsum") +#endif +.num_in(1) +.num_out(1) +.Args("word_num", "word_num") +.Args("emb_dim", " emb_dim ") +.Args("padding_idx", " padding idx ") +.Args("num_direct", " num direct 1 or 2"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/product_quant_embedding_with_vsum.h b/framework/operators/product_quant_embedding_with_vsum.h new file mode 100644 index 000000000..31770d9d1 --- /dev/null +++ b/framework/operators/product_quant_embedding_with_vsum.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_OPERATOR_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/product_quant_embedding_with_vsum.h" + +namespace anakin { + +namespace ops { + +template +class ProductQuantEmbeddingWithVsumHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class ProductQuantEmbeddingWithVsum : public Operator { +public: + ProductQuantEmbeddingWithVsum() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ProductQuantEmbeddingWithVsum< Ttype(" + << target_name::value << "), Precision("<< (int)Ptype <<") >"; + } + + friend class ProductQuantEmbeddingWithVsumHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class ProductQuantEmbeddingWithVsumHelper : public OperatorHelper { +public: + ProductQuantEmbeddingWithVsumHelper()=default; + + ~ProductQuantEmbeddingWithVsumHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_product_quant_embedding_with_vsum stand for product_quant_embedding_with_vsum parameter + saber::ProductQuantEmbeddingWithVsumParam _param_product_quant_embedding_with_vsum; + ///< _funcs_product_quant_embedding_with_vsum stand for product_quant_embedding_with_vsum function + saber::ProductQuantEmbeddingWithVsum::saber_type> _funcs_product_quant_embedding_with_vsum; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/proposal_img_scale_to_cam_coords.cpp b/framework/operators/proposal_img_scale_to_cam_coords.cpp index fa31fbe61..7a3982c67 100644 --- a/framework/operators/proposal_img_scale_to_cam_coords.cpp +++ b/framework/operators/proposal_img_scale_to_cam_coords.cpp @@ -163,8 +163,8 @@ ANAKIN_REGISTER_OP_HELPER(ProposalImgScaleToCamCoords, ProposalImgScaleToCamCoordsHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(ProposalImgScaleToCamCoords, - ProposalImgScaleToCamCoordsHelper, ARM, Precision::FP32); +//ANAKIN_REGISTER_OP_HELPER(ProposalImgScaleToCamCoords, + //ProposalImgScaleToCamCoordsHelper, ARM, Precision::FP32); #endif //! register op ANAKIN_REGISTER_OP(ProposalImgScaleToCamCoords) @@ -173,7 +173,7 @@ ANAKIN_REGISTER_OP(ProposalImgScaleToCamCoords) .__alias__("proposal_img_scal_to_cam_coords") #endif #ifdef USE_ARM_PLACE -.__alias__("proposal_img_scal_to_cam_coords") +//.__alias__("proposal_img_scal_to_cam_coords") #endif .num_in(1) .num_out(1) @@ -224,4 +224,4 @@ ANAKIN_REGISTER_OP(ProposalImgScaleToCamCoords) .Args("with_trunc_ratio", "with_trunc_ratio of proposal_img_scale_to_cam_coords_param") .Args("regress_ph_rh_as_whole", "regress_ph_rh_as_whole of proposal_img_scale_to_cam_coords_param"); } /* namespace ops */ -} /* namespace anakin */ \ No newline at end of file +} /* namespace anakin */ diff --git a/framework/operators/proposal_img_scale_to_cam_coords.h b/framework/operators/proposal_img_scale_to_cam_coords.h index 2d89cedc9..3d2ba58ea 100644 --- a/framework/operators/proposal_img_scale_to_cam_coords.h +++ b/framework/operators/proposal_img_scale_to_cam_coords.h @@ -39,7 +39,7 @@ class ProposalImgScaleToCamCoords : public Operator { const std::vector >& ins, std::vector >& outs) { LOG(ERROR) << "Not Impl Yet Operator Proposal_img_scale_to_cam_coords::value << "), Precision(" << Ptype << ") >"; + target_name::value << "), Precision(" << (int)Ptype << ") >"; } friend class ProposalImgScaleToCamCoordsHelper; }; @@ -81,4 +81,4 @@ class ProposalImgScaleToCamCoordsHelper : public OperatorHelper { }; } /* namespace ops */ } /* namespace anakin */ -#endif \ No newline at end of file +#endif diff --git a/framework/operators/ps_roi_pooling.cpp b/framework/operators/ps_roi_pooling.cpp new file mode 100644 index 000000000..d8b04bd5a --- /dev/null +++ b/framework/operators/ps_roi_pooling.cpp @@ -0,0 +1,116 @@ +#include "framework/operators/ps_roi_pooling.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_PSROIPOOLING(Ttype, Ptype) \ +template<> \ +void PsRoiPooling::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = static_cast*>(this->_helper); \ + auto& param = static_cast*>\ + (this->_helper)->_param_ps_roi_pooling; \ + impl->_funcs_ps_roi_pooling(ins, outs, param, ctx); \ +} + +template +Status 
PsRoiPoolingHelper::InitParam() { + DLOG(WARNING) << "Parsing PsRoiPooling op parameter."; + + auto pooled_width = GET_PARAMETER(int, pooled_width); + auto pooled_height = GET_PARAMETER(int, pooled_height); + auto crop_width = GET_PARAMETER(int, crop_width); + auto crop_height = GET_PARAMETER(int, crop_height); + auto global_pooling = GET_PARAMETER_WITH_DEFAULT(bool, global_pooling, true); + auto extra_value = GET_PARAMETER_WITH_DEFAULT(float, extra_value, 0); + auto method = GET_PARAMETER_WITH_DEFAULT(int, method, 0); + + auto spatial_scale = GET_PARAMETER(float, spatial_scale); + + PsRoiPoolParam ps_roi_pooling_param(pooled_height, + pooled_width, crop_height, crop_width, method, extra_value, + global_pooling,spatial_scale); + + _param_ps_roi_pooling = ps_roi_pooling_param; + + return Status::OK(); +} + +template +Status PsRoiPoolingHelper::Init(OpContext &ctx, const std::vector> &ins, + std::vector> &outs) { + SABER_CHECK(_funcs_ps_roi_pooling.init(ins, outs, _param_ps_roi_pooling, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status PsRoiPoolingHelper::InferShape(const std::vector> &ins, + std::vector> &outs) { + SABER_CHECK(_funcs_ps_roi_pooling.compute_output_shape(ins, outs, _param_ps_roi_pooling)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_PSROIPOOLING(NV, Precision::FP32); +template <> +Status PsRoiPoolingHelper::Init(OpContext &ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { + SABER_CHECK(_funcs_ps_roi_pooling.init(ins, outs, _param_ps_roi_pooling, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(PsRoiPooling, PsRoiPoolingHelper, NV, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_PSROIPOOLING(ARM, Precision::FP32); +template class PsRoiPoolingHelper; +ANAKIN_REGISTER_OP_HELPER(PsRoiPooling, PsRoiPoolingHelper, ARM, Precision::FP32); +#endif //arm + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_PSROIPOOLING(X86, Precision::FP32); +template class PsRoiPoolingHelper; +ANAKIN_REGISTER_OP_HELPER(PsRoiPooling, PsRoiPoolingHelper, X86, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_PSROIPOOLING(AMD, Precision::FP32); +template class PsRoiPoolingHelper; +ANAKIN_REGISTER_OP_HELPER(PsRoiPooling, PsRoiPoolingHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(PsRoiPooling) +.Doc("PsRoiPooling operator") +#ifdef USE_CUDA +.__alias__("PsRoiPooling") +.__alias__("pool") +#endif +#ifdef USE_ARM_PLACE +.__alias__("PsRoiPooling") +.__alias__("pool") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("PsRoiPooling") +.__alias__("pool") +#endif +#ifdef AMD_GPU +.__alias__("PsRoiPooling") +.__alias__("pool") +#endif +.num_in(1) +.num_out(1) +.Args("method", "PsRoiPooling type to be applied (MAX, SUM, AVG).") +.Args("cmp_out_shape_floor_as_conv cmp_out_shape_floor_as_conv of PsRoiPooling for adu novel approach") +.Args("global_PsRoiPooling", "whether execute global PsRoiPooling on input") +.Args>("pool_size", " kernel size for PsRoiPooling (x, y) or (x, y, z).") +.Args>("strides", "stride for PsRoiPooling (x, y) or (x, y, z).") +.Args>("padding", "pad for PsRoiPooling: (x, y) or (x, y, z)."); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/ps_roi_pooling.h b/framework/operators/ps_roi_pooling.h new file mode 100644 index 000000000..195e95941 --- /dev/null +++ b/framework/operators/ps_roi_pooling.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
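PsRoiPooling is the position-sensitive ROI pooling used by R-FCN-style detectors: the input carries pooled_h * pooled_w score maps per output channel, and output bin (ph, pw) reads only the map dedicated to that bin position, averaged over the bin's spatial extent after the ROI is scaled into feature-map coordinates by spatial_scale. A standalone sketch of one output value; the layout and clamping details are simplified and this is not the saber kernel.

#include <algorithm>
#include <cmath>
#include <cstddef>

// One output value of position-sensitive ROI pooling.
// feat layout: (pooled_h * pooled_w * out_c, H, W), row-major CHW.
// (x1, y1, x2, y2) is the ROI in input-image coordinates.
static float ps_roi_pool_bin(const float* feat, int H, int W, int out_c,
                             int pooled_h, int pooled_w, float spatial_scale,
                             float x1, float y1, float x2, float y2,
                             int c, int ph, int pw) {
    float rx = x1 * spatial_scale, ry = y1 * spatial_scale;
    float rw = std::max((x2 - x1) * spatial_scale, 1.f);
    float rh = std::max((y2 - y1) * spatial_scale, 1.f);
    int hstart = std::max(static_cast<int>(std::floor(ry + ph * rh / pooled_h)), 0);
    int hend   = std::min(static_cast<int>(std::ceil(ry + (ph + 1) * rh / pooled_h)), H);
    int wstart = std::max(static_cast<int>(std::floor(rx + pw * rw / pooled_w)), 0);
    int wend   = std::min(static_cast<int>(std::ceil(rx + (pw + 1) * rw / pooled_w)), W);

    // the bin position (ph, pw) selects which group of channels to read
    const float* map = feat
        + (static_cast<std::size_t>(ph * pooled_w + pw) * out_c + c) * H * W;
    float sum = 0.f;
    int count = 0;
    for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
            sum += map[h * W + w];
            ++count;
        }
    }
    return count > 0 ? sum / count : 0.f;
}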
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_PS_ROI_POOLING_H +#define ANAKIN_OPERATOR_PS_ROI_POOLING_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/ps_roi_pooling.h" + +namespace anakin { + +namespace ops { + +template +class PsRoiPoolingHelper; + +/// PsRoiPooling op +/** + * \brief PsRoiPooling implementation class + * public inherit Operator + */ +template +class PsRoiPooling : public Operator { +public: + PsRoiPooling() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator PsRoiPooling< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + + } + + friend class PsRoiPoolingHelper; +}; + +/** + * \brief PsRoiPooling helper class to implement PsRoiPooling + * public inherit OperatorHelper + * including init resource and shape size in PsRoiPooling context + */ +template +class PsRoiPoolingHelper : public OperatorHelper { +public: + PsRoiPoolingHelper()=default; + + ~PsRoiPoolingHelper() {} + + Status InitParam() override; + + /** + * \brief initial all the resource needed by PsRoiPooling + * \param ctx stand for PsRoiPooling operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_PsRoiPooling stand for PsRoiPooling parameter + saber::PsRoiPoolParam _param_ps_roi_pooling; + ///< _funcs_PsRoiPooling stand for PsRoiPooling function + saber::PsRoiPool::saber_type> _funcs_ps_roi_pooling; +}; + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/pyramid_hash_quant_embedding_with_vsum.cpp b/framework/operators/pyramid_hash_quant_embedding_with_vsum.cpp new file mode 100644 index 000000000..2c70251ed --- /dev/null +++ b/framework/operators/pyramid_hash_quant_embedding_with_vsum.cpp @@ -0,0 +1,124 @@ +#include "framework/operators/pyramid_hash_quant_embedding_with_vsum.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM(Ttype, Ptype) \ +template<> \ +void PyramidHashQuantEmbeddingWithVsum::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_pyramid_hash_quant_embedding_with_vsum; \ + impl->_funcs_pyramid_hash_quant_embedding_with_vsum(ins, outs, param, ctx); \ +} + +/// set helper +template +PyramidHashQuantEmbeddingWithVsumHelper::~PyramidHashQuantEmbeddingWithVsumHelper() { +} + +template +Status PyramidHashQuantEmbeddingWithVsumHelper::InitParam() { + DLOG(WARNING) << "Parsing PyramidHashQuantEmbeddingWithVsum op parameter."; + auto space_size = GET_PARAMETER(int, space_size); + auto emb_size = GET_PARAMETER(int, emb_size); + auto pyramid_layer = GET_PARAMETER(int, pyramid_layer); + auto rand_len = GET_PARAMETER(int, rand_len); + auto white_list_len = GET_PARAMETER(int, white_list_len); + auto black_list_len = GET_PARAMETER(int, black_list_len); + auto dropout_percent = GET_PARAMETER(float, dropout_percent); + using pblock_type = PBlock; + auto quant_dict = GET_PARAMETER(pblock_type, weight_2); + auto hash_space = GET_PARAMETER(pblock_type, weight_3); + auto white_filter = GET_PARAMETER(pblock_type, weight_4); + auto black_filter = GET_PARAMETER(pblock_type, weight_5); + Tensor* white_filter_tensor = NULL; + Tensor* black_filter_tensor = NULL; + if (white_list_len) { + white_filter_tensor = &(white_filter.d_tensor()); + } + if (black_list_len) { + black_filter_tensor = &(black_filter.d_tensor()); + } + + PyramidHashQuantEmbeddingParam param_pyramid_hash_quant_embedding_with_vsum(space_size, + emb_size, + pyramid_layer, + rand_len, + white_list_len, + black_list_len, + dropout_percent, + &(quant_dict.d_tensor()), + &(hash_space.d_tensor()), + white_filter_tensor, + black_filter_tensor); + + _param_pyramid_hash_quant_embedding_with_vsum = param_pyramid_hash_quant_embedding_with_vsum; + + return Status::OK(); +} + +template +Status PyramidHashQuantEmbeddingWithVsumHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pyramid_hash_quant_embedding_with_vsum.init(ins, outs, _param_pyramid_hash_quant_embedding_with_vsum, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status PyramidHashQuantEmbeddingWithVsumHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_pyramid_hash_quant_embedding_with_vsum.compute_output_shape(ins, outs, _param_pyramid_hash_quant_embedding_with_vsum)); + return Status::OK(); +} + +#ifdef USE_CUDA 
+INSTANCE_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM(NV, Precision::FP32); +template class PyramidHashQuantEmbeddingWithVsumHelper; +ANAKIN_REGISTER_OP_HELPER(PyramidHashQuantEmbeddingWithVsum, PyramidHashQuantEmbeddingWithVsumHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM(X86, Precision::FP32); +template class PyramidHashQuantEmbeddingWithVsumHelper; +ANAKIN_REGISTER_OP_HELPER(PyramidHashQuantEmbeddingWithVsum, PyramidHashQuantEmbeddingWithVsumHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM(ARM, Precision::FP32); +template class PyramidHashQuantEmbeddingWithVsumHelper; +ANAKIN_REGISTER_OP_HELPER(PyramidHashQuantEmbeddingWithVsum, PyramidHashQuantEmbeddingWithVsumHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM(AMD, Precision::FP32); +template class PyramidHashQuantEmbeddingWithVsumHelper; +ANAKIN_REGISTER_OP_HELPER(PyramidHashQuantEmbeddingWithVsum, PyramidHashQuantEmbeddingWithVsumHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(PyramidHashQuantEmbeddingWithVsum) +.Doc("PyramidHashQuantEmbeddingWithVsum operator") +#ifdef USE_CUDA +.__alias__("pyramid_hash_quant_embedding_with_vsum") +#endif +#ifdef USE_ARM_PLACE +.__alias__("pyramid_hash_quant_embedding_with_vsum") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("pyramid_hash_quant_embedding_with_vsum") +#endif +#ifdef AMD_GPU +.__alias__("pyramid_hash_quant_embedding_with_vsum") +#endif +.num_in(1) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h b/framework/operators/pyramid_hash_quant_embedding_with_vsum.h similarity index 56% rename from framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h rename to framework/operators/pyramid_hash_quant_embedding_with_vsum.h index c2d401cb3..f256ca20d 100644 --- a/framework/operators/fusion_ops/conv_3x3_batchnorm_scale_relu.h +++ b/framework/operators/pyramid_hash_quant_embedding_with_vsum.h @@ -13,64 +13,64 @@ limitations under the License. 
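The parameters parsed above (space_size, emb_size, pyramid_layer, the hash_space weight and the white/black list filters) describe a pyramid-hash embedding: every n-gram window of token ids, for window lengths up to pyramid_layer, is hashed into a fixed table of emb_size-dimensional rows and the selected rows are accumulated, which is what the trailing "with_vsum" refers to. The sketch below only shows that hash-and-accumulate shape; the actual hash function and the handling of rand_len, the quantization dictionary, dropout and the white/black lists are specific to the saber implementation and are simplified away here.

#include <cstddef>
#include <cstdint>
#include <vector>

// Hash every n-gram window into hash_space and sum the selected rows.
static std::vector<float> pyramid_hash_vsum(const std::vector<std::uint32_t>& tokens,
                                            const std::vector<float>& hash_space,
                                            int space_size, int emb_size, int pyramid_layer) {
    std::vector<float> out(emb_size, 0.f);
    for (int win = 2; win <= pyramid_layer; ++win) {
        for (std::size_t start = 0; start + win <= tokens.size(); ++start) {
            std::uint32_t h = 2166136261u;                 // FNV-1a as a stand-in hash
            for (int i = 0; i < win; ++i) {
                h = (h ^ tokens[start + i]) * 16777619u;
            }
            std::size_t row = (h % static_cast<std::uint32_t>(space_size))
                              * static_cast<std::size_t>(emb_size);
            for (int d = 0; d < emb_size; ++d) {
                out[d] += hash_space[row + d];
            }
        }
    }
    return out;
}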
*/ -#ifndef ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_RELU_H -#define ANAKIN_OPERATOR_CONV_SASS_BATCHNORM_SCALE_RELU_H +#ifndef ANAKIN_OPERATOR_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_OPERATOR_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H #include "framework/core/base.h" #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv.h" +#include "saber/funcs/pyramid_hash_quant_embedding_with_vsum.h" namespace anakin { namespace ops { template -class SassConvBatchnormScaleReluHelper; +class PyramidHashQuantEmbeddingWithVsumHelper; /// pooling op /** - * \brief SassConvBatchnormScaleRelu implementation class - * public inherit Operator + * \brief operation of ops class + * public inheritance Operator */ template -class SassConvBatchnormScaleRelu : public Operator { +class PyramidHashQuantEmbeddingWithVsum : public Operator { public: - SassConvBatchnormScaleRelu() {} + PyramidHashQuantEmbeddingWithVsum() {} /// forward impl virtual void operator() (OpContext &ctx, const std::vector >& ins, std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator SassConvBatchnormScaleRelu< Ttype(" - << target_name::value << "), Precision("<< Ptype <<") >"; + LOG(ERROR) << "Not Impl Yet Operator PyramidHashQuantEmbeddingWithVsum< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvBatchnormScaleReluHelper; + friend class PyramidHashQuantEmbeddingWithVsumHelper; }; /** - * \brief SassConvBatchnormScaleRelu helper class to implement it - * public inherit OperatorHelper - * including init resource and shape size in SassConvBatchnormScaleRelu context + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape */ template -class SassConvBatchnormScaleReluHelper : public OperatorHelper { +class PyramidHashQuantEmbeddingWithVsumHelper : public OperatorHelper { public: - SassConvBatchnormScaleReluHelper()=default; + PyramidHashQuantEmbeddingWithVsumHelper()=default; - ~SassConvBatchnormScaleReluHelper(); + ~PyramidHashQuantEmbeddingWithVsumHelper(); Status InitParam() override; - + /** * \brief initial all the resource needed by pooling - * \param ctx stand for SassConvBatchnormScaleRelu operation context + * \param ctx stand for operation context * \param ins stand for input tensor vector * \param outs stand for output tensor vector * \return status - *///! 
initial all the resource needed by pooling + */ Status Init(OpContext &ctx, const std::vector >& ins, std::vector >& outs) override; @@ -85,14 +85,10 @@ class SassConvBatchnormScaleReluHelper : public OperatorHelper { std::vector >& outs) override; public: - ///< _param_conv_batchnorm_scale_relu stand for SassConvBatchnormScaleRelu parameter - saber::ConvParam _param_conv_batchnorm_scale_relu; - ///< _funcs_conv_batchnorm_scale_relu stand for SassConvBatchnormScaleRelu function - saber::Conv::saber_type> _funcs_conv_batchnorm_scale_relu; - -private: - ///< _dims stand for SassConvBatchnormScaleRelu size - PTuple _dims; + ///< _param_pyramid_hash_quant_embedding_with_vsum stand for pyramid_hash_quant_embedding_with_vsum parameter + saber::PyramidHashQuantEmbeddingParam _param_pyramid_hash_quant_embedding_with_vsum; + ///< _funcs_pyramid_hash_quant_embedding_with_vsum stand for pyramid_hash_quant_embedding_with_vsum function + saber::PyramidHashQuantEmbeddingWithVsum::saber_type> _funcs_pyramid_hash_quant_embedding_with_vsum; }; diff --git a/framework/operators/reduce.cpp b/framework/operators/reduce.cpp new file mode 100644 index 000000000..6fa1ad512 --- /dev/null +++ b/framework/operators/reduce.cpp @@ -0,0 +1,139 @@ + +#include "framework/operators/reduce.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void Reduce::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>( + this->_helper); + auto& param = static_cast*>( + this->_helper)->_param_reduce; + impl->_funcs_reduce(ins, outs, param, ctx); +} +#endif + +#ifdef USE_X86_PLACE +template<> +void Reduce::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>( + this->_helper); + auto& param = static_cast*>( + this->_helper)->_param_reduce; + impl->_funcs_reduce(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator + +/// set helper +template +ReduceHelper::~ReduceHelper() { +} + +template +Status ReduceHelper::InitParam() { + DLOG(WARNING) << "Parsing Reduce op parameter."; + auto type_str = GET_PARAMETER(std::string, reduce_type); + ReduceType type = Reduce_unknow; + if (type_str == "Reduce_min") { + type = Reduce_min; + } else if (type_str == "Reduce_max") { + type = Reduce_max; + } else if (type_str == "Reduce_sum") { + type = Reduce_sum; + } else if (type_str == "Reduce_avg") { + type = Reduce_avg; + } else if (type_str == "Reduce_prod") { + type = Reduce_prod; + } + auto keep_dim = GET_PARAMETER(bool, keep_dim); + auto reduce_all = GET_PARAMETER(bool, reduce_all); + auto reduce_dim = GET_PARAMETER(PTuple, reduce_dim); + auto coeff = GET_PARAMETER(float, coeff); + ReduceParam param_reduce(reduce_dim.vector(), + type, keep_dim, reduce_all, coeff); + + _param_reduce = param_reduce; + return Status::OK(); +} + +template +Status ReduceHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_reduce.init(ins, outs, _param_reduce, + SPECIFY, SABER_IMPL, ctx)); + + return Status::OK(); +} + +template +Status ReduceHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + + SABER_CHECK(_funcs_reduce.compute_output_shape(ins, outs, _param_reduce)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class ReduceHelper; +template class ReduceHelper; +template class ReduceHelper; +#endif +#ifdef USE_ARM_PLACE +// template class ReduceHelper; +// template class ReduceHelper; +// template class ReduceHelper; +#endif +#ifdef USE_X86_PLACE +template class ReduceHelper; +template class ReduceHelper; +template class ReduceHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(Reduce, ReduceHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(Reduce, ReduceHelper, ARM, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(Reduce, ReduceHelper, X86, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(Reduce) +.Doc("Reduce operator") +#ifdef USE_CUDA +.__alias__("reduce") +#endif +#ifdef USE_ARM_PLACE +.__alias__("reduce") +#endif +#ifdef USE_X86_PLACE +.__alias__("reduce") +#endif +.num_in(1) +.num_out(1) +.Args>("reduce_dim", "ratios of gen_anchor_param") +.Args("keep_dim", "ratios of gen_anchor_param") +.Args("reduce_type", "ratios of gen_anchor_param") +.Args("reduce_all", "ratios of gen_anchor_param") +.Args("coeff", "ratios of gen_anchor_param"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/reduce.h b/framework/operators/reduce.h new file mode 100644 index 000000000..2b03dca2d --- /dev/null +++ b/framework/operators/reduce.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
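The reduce_type strings parsed above select the reduction kernel, while reduce_dim, keep_dim, reduce_all and coeff control its output shape and scaling. As a concrete reference for the most common configuration, reducing an NCHW tensor over dims {2, 3} with keep_dim produces an (N, C, 1, 1) output, and coeff is a final multiplier (for instance 1/(H*W) turns a sum into an average). A minimal sketch of that case only, not the general saber kernel:

#include <cstddef>
#include <vector>

// Reduce_sum over dims {2, 3} of an N x C x H x W tensor, keep_dim = true,
// with coeff applied to the result. Output shape is (N, C, 1, 1).
static std::vector<float> reduce_sum_hw(const std::vector<float>& in,
                                        int N, int C, int H, int W, float coeff) {
    std::vector<float> out(static_cast<std::size_t>(N) * C, 0.f);
    for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
            float sum = 0.f;
            for (int h = 0; h < H; ++h) {
                for (int w = 0; w < W; ++w) {
                    sum += in[((static_cast<std::size_t>(n) * C + c) * H + h) * W + w];
                }
            }
            out[static_cast<std::size_t>(n) * C + c] = coeff * sum;
        }
    }
    return out;
}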
+*/ + +#ifndef ANAKIN_OPERATOR_REDUCE_H +#define ANAKIN_OPERATOR_REDUCE_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/reduce.h" + +namespace anakin { + +namespace ops { + +template +class ReduceHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class Reduce : public Operator { +public: + Reduce() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator Reduce< Ttype(" + << target_name::value << "), Precision("<< (int)Ptype <<") >"; + } + + friend class ReduceHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class ReduceHelper : public OperatorHelper { +public: + ReduceHelper() = default; + + ~ReduceHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_match_matrix stand for reduce parameter + saber::ReduceParam _param_reduce; + ///< _funcs_match_matrix stand for reduce function + saber::Reduce::saber_type> _funcs_reduce; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/reduce_min.cpp b/framework/operators/reduce_min.cpp new file mode 100644 index 000000000..41b4f0a4e --- /dev/null +++ b/framework/operators/reduce_min.cpp @@ -0,0 +1,115 @@ +#include "framework/operators/reduce_min.h" + +namespace anakin { + +namespace ops { + +#ifdef USE_CUDA +template<> +void ReduceMin::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = + static_cast*>(this->_helper); + auto& param = + static_cast*>(this->_helper)->_param_reduce_min; + impl->_funcs_reduce_min(ins, outs, param, ctx); +} +#endif + +#ifdef USE_X86_PLACE +template<> +void ReduceMin::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = + static_cast*>(this->_helper); + auto& param = + static_cast*>(this->_helper)->_param_reduce_min; + impl->_funcs_reduce_min(ins, outs, param, ctx); +} +#endif + +/// TODO ... 
specialization other type of operator + + +/// set helper +template +ReduceMinHelper::~ReduceMinHelper() { +} + +template +Status ReduceMinHelper::InitParam() { + DLOG(WARNING) << "Parsing ReduceMin op parameter."; + auto keep_dim = GET_PARAMETER(bool, keep_dim); + auto reduce_dim = GET_PARAMETER(PTuple, reduce_dim); + ReduceMinParam param_reduce_min(reduce_dim.vector(), keep_dim); + _param_reduce_min = param_reduce_min; + + return Status::OK(); +} + +template +Status ReduceMinHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_reduce_min.init(ins, outs, _param_reduce_min, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status ReduceMinHelper::InferShape(const + std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_reduce_min.compute_output_shape(ins, outs, _param_reduce_min)); + return Status::OK(); +} + +#ifdef USE_CUDA +template class ReduceMinHelper; +template class ReduceMinHelper; +template class ReduceMinHelper; +#endif +#ifdef USE_ARM_PLACE +template class ReduceMinHelper; +template class ReduceMinHelper; +template class ReduceMinHelper; +#endif +#ifdef USE_X86_PLACE +template class ReduceMinHelper; +template class ReduceMinHelper; +template class ReduceMinHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(ReduceMin, ReduceMinHelper, NV, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(ReduceMin, ReduceMinHelper, ARM, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(ReduceMin, ReduceMinHelper, X86, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(ReduceMin) +.Doc("ReduceMin operator") +#ifdef USE_CUDA +.__alias__("reduce_min") +#endif +#ifdef USE_ARM_PLACE +.__alias__("reduce_min") +#endif +#ifdef USE_X86_PLACE +.__alias__("reduce_min") +#endif +.num_in(1) +.num_out(1) +.Args("groups", " split tensor's channel by size groups. "); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/reduce_min.h b/framework/operators/reduce_min.h new file mode 100644 index 000000000..447ac64a8 --- /dev/null +++ b/framework/operators/reduce_min.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_MEAN_H +#define ANAKIN_OPERATOR_MEAN_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/reduce_min.h" + +namespace anakin { + +namespace ops { + +template +class ReduceMinHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class ReduceMin : public Operator { +public: + ReduceMin() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator ReduceMin< Ttype(" + << target_name::value << "), Precision("<< (int)Ptype <<") >"; + } + + friend class ReduceMinHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class ReduceMinHelper : public OperatorHelper { +public: + ReduceMinHelper()=default; + + ~ReduceMinHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_match_matrix stand for reduce_min parameter + saber::ReduceMinParam _param_reduce_min; + ///< _funcs_match_matrix stand for reduce_min function + saber::ReduceMin::saber_type> _funcs_reduce_min; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/relu.cpp b/framework/operators/relu.cpp index 4fdeeb681..fca9fc76b 100644 --- a/framework/operators/relu.cpp +++ b/framework/operators/relu.cpp @@ -57,6 +57,9 @@ ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, NV, Precision::FP32); INSTANCE_RELU(X86, Precision::FP32); template class ReLUHelper; ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, X86, Precision::FP32); +INSTANCE_RELU(X86, Precision::INT8); +template class ReLUHelper; +ANAKIN_REGISTER_OP_HELPER(ReLU, ReLUHelper, X86, Precision::INT8); #endif #ifdef USE_ARM_PLACE diff --git a/framework/operators/reshape.cpp b/framework/operators/reshape.cpp index 59f57c5fb..872e6edea 100644 --- a/framework/operators/reshape.cpp +++ b/framework/operators/reshape.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ #include "framework/operators/reshape.h" namespace anakin { @@ -74,6 +88,12 @@ template class ReshapeHelper; ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, ARM, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_RESHAPE(AMD, Precision::FP32); +template class ReshapeHelper; +ANAKIN_REGISTER_OP_HELPER(Reshape, ReshapeHelper, AMD, Precision::FP32); +#endif + //! register op ANAKIN_REGISTER_OP(Reshape) .Doc("Reshape operator") @@ -86,6 +106,9 @@ ANAKIN_REGISTER_OP(Reshape) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("reshape") #endif +#ifdef AMD_GPU +.__alias__("reshape") +#endif .num_in(1) .num_out(1) .Args>("dims", " dims of redhape target"); diff --git a/framework/operators/resize.cpp b/framework/operators/resize.cpp index 6e7b4b6f3..d9c3636e5 100644 --- a/framework/operators/resize.cpp +++ b/framework/operators/resize.cpp @@ -20,11 +20,27 @@ Status ResizeHelper::InitParam() { DLOG(WARNING) << "Parsing Resize op parameter."; // get resize param - auto width_scale = GET_PARAMETER(float, width_scale); - auto height_scale = GET_PARAMETER(float, height_scale); - - ResizeParam resize_param(height_scale, width_scale); - _param_resize = resize_param; + auto resize_method = GET_PARAMETER_WITH_DEFAULT(std::string, method,"RESIZE_CUSTOM"); + auto width_scale = GET_PARAMETER_WITH_DEFAULT(float, width_scale, 0.f); + auto height_scale = GET_PARAMETER_WITH_DEFAULT(float, height_scale, 0.f); + auto out_width = GET_PARAMETER_WITH_DEFAULT(int, out_width, -1); + auto out_height = GET_PARAMETER_WITH_DEFAULT(int, out_height, -1); + if (resize_method == "BILINEAR_ALIGN"){ + ResizeParam resize_param(BILINEAR_ALIGN, height_scale, width_scale, out_width, out_height); + _param_resize = resize_param; + } else if (resize_method == "BILINEAR_NO_ALIGN"){ + ResizeParam resize_param(BILINEAR_NO_ALIGN, height_scale, width_scale, out_width, out_height); + _param_resize = resize_param; + } else if (resize_method == "RESIZE_CUSTOM"){ + ResizeParam resize_param(RESIZE_CUSTOM, height_scale, width_scale, out_width, out_height); + _param_resize = resize_param; + } else if (resize_method == "NEAREST_ALIGN"){ + ResizeParam resize_param(NEAREST_ALIGN, height_scale, width_scale, out_width, out_height); + _param_resize = resize_param; + } else { + LOG(FATAL) << "Resize op doesn't support : " << resize_method << " resize method."; + } + return Status::OK(); } @@ -38,6 +54,27 @@ Status ResizeHelper::Init(OpContext &ctx, const std::vector template Status ResizeHelper::InferShape(const std::vector> &ins, std::vector> &outs) { + + auto min_dim = GET_PARAMETER_WITH_DEFAULT(int, min_dim, -1); + auto max_dim = GET_PARAMETER_WITH_DEFAULT(int, max_dim, -1); + if (min_dim != -1 && max_dim != -1){ + CHECK_LE(min_dim, max_dim) << "min_dim must less than max_dim"; + int in_h = ins[0] -> height(); + int in_w = ins[0] -> width(); + float in_min = fmin(in_h, in_w); + float scale = min_dim / in_min; + int resized_h = int(round(in_h * scale)); + int resized_w = int(round(in_w * scale)); + if (fmax(resized_h, resized_w) > max_dim){ + float in_max = fmax(in_h, in_w); + scale = max_dim / in_max; + resized_h = int(round(in_h * scale)); + resized_w = int(round(in_w * scale)); + } + ResizeParam resize_param(RESIZE_CUSTOM, scale, scale, resized_w, resized_h); + _param_resize = resize_param; + } + SABER_CHECK(_funcs_resize.compute_output_shape(ins, outs, _param_resize)); return Status::OK(); } @@ -64,7 +101,13 @@ ANAKIN_REGISTER_OP_HELPER(Resize, ResizeHelper, X86, Precision::FP32); INSTANCE_RESIZE(ARM, Precision::FP32); template class 
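The new min_dim / max_dim branch in ResizeHelper::InferShape implements the usual "scale the shorter side to min_dim, but never let the longer side exceed max_dim" rule before rebuilding the RESIZE_CUSTOM param. A standalone restatement of that arithmetic, with a worked example in the comment (the function name is illustrative):

#include <algorithm>
#include <cmath>

// With min_dim = 600, max_dim = 1000, a 400 x 1200 input first gets
// scale = 600 / 400 = 1.5, but 1200 * 1.5 = 1800 > 1000, so the scale is
// recomputed as 1000 / 1200 and the output becomes 333 x 1000.
static void shortest_side_scale(int in_h, int in_w, int min_dim, int max_dim,
                                int* out_h, int* out_w) {
    float scale = static_cast<float>(min_dim) / std::min(in_h, in_w);
    if (std::max(in_h, in_w) * scale > max_dim) {
        scale = static_cast<float>(max_dim) / std::max(in_h, in_w);
    }
    *out_h = static_cast<int>(std::round(in_h * scale));
    *out_w = static_cast<int>(std::round(in_w * scale));
}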
ResizeHelper; ANAKIN_REGISTER_OP_HELPER(Resize, ResizeHelper, ARM, Precision::FP32); -#endif//arm +#endif + +#ifdef AMD_GPU +INSTANCE_RESIZE(AMD, Precision::FP32); +template class ResizeHelper; +ANAKIN_REGISTER_OP_HELPER(Resize, ResizeHelper, AMD, Precision::FP32); +#endif //! register op ANAKIN_REGISTER_OP(Resize) @@ -78,11 +121,15 @@ ANAKIN_REGISTER_OP(Resize) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("Resize") #endif +#ifdef AMD_GPU +.__alias__("Resize") +#endif .num_in(1) .num_out(1) +.Args("method", "resize type to be applied (BILINEAR_ALIGN, BILINEAR_NO_ALIGN, RESIZE_CUSTOM).") .Args("height_scale", " height scale for resize") .Args("width_scale", " width scale for resize"); } /* namespace ops */ -} /* namespace anakin */ \ No newline at end of file +} /* namespace anakin */ diff --git a/framework/operators/reverse_input.cpp b/framework/operators/reverse_input.cpp index 09b1d0c10..25a287103 100644 --- a/framework/operators/reverse_input.cpp +++ b/framework/operators/reverse_input.cpp @@ -30,7 +30,7 @@ Status ReverseInputHelper::InferShape(const std::vector \ void ReverseInput::operator()(OpContext& ctx, \ const std::vector >& ins, \ @@ -42,19 +42,25 @@ void ReverseInput::operator()(OpContext& ctx, \ } #ifdef USE_CUDA -INSTANCE_CONCAT(NV, Precision::FP32); +INSTANCE_REVERSE_INPUT(NV, Precision::FP32); template class ReverseInputHelper; ANAKIN_REGISTER_OP_HELPER(ReverseInput, ReverseInputHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_REVERSE_INPUT(AMD, Precision::FP32); +template class ReverseInputHelper; +ANAKIN_REGISTER_OP_HELPER(ReverseInput, ReverseInputHelper, AMD, Precision::FP32); +#endif + #ifdef USE_ARM_PLACE -INSTANCE_CONCAT(ARM, Precision::FP32); +INSTANCE_REVERSE_INPUT(ARM, Precision::FP32); template class ReverseInputHelper; ANAKIN_REGISTER_OP_HELPER(ReverseInput, ReverseInputHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE -INSTANCE_CONCAT(X86, Precision::FP32); +INSTANCE_REVERSE_INPUT(X86, Precision::FP32); template class ReverseInputHelper; ANAKIN_REGISTER_OP_HELPER(ReverseInput, ReverseInputHelper, X86, Precision::FP32); #endif @@ -71,6 +77,9 @@ ANAKIN_REGISTER_OP(ReverseInput) #ifdef USE_X86_PLACE .__alias__("reverse_input") #endif +#ifdef USE_GPU +.__alias__("reverse_input") +#endif .num_in(1) .num_out(1); diff --git a/framework/operators/reverse_sequence.cpp b/framework/operators/reverse_sequence.cpp index 9c221b109..1205f5ede 100644 --- a/framework/operators/reverse_sequence.cpp +++ b/framework/operators/reverse_sequence.cpp @@ -30,7 +30,7 @@ Status ReverseSequenceHelper::InferShape(const std::vector \ void ReverseSequence::operator()(OpContext& ctx, \ const std::vector >& ins, \ @@ -42,19 +42,25 @@ void ReverseSequence::operator()(OpContext& ctx, \ } #ifdef USE_CUDA -INSTANCE_CONCAT(NV, Precision::FP32); +INSTANCE_REVERSE_SEQUENCE(NV, Precision::FP32); template class ReverseSequenceHelper; ANAKIN_REGISTER_OP_HELPER(ReverseSequence, ReverseSequenceHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_REVERSE_SEQUENCE(AMD, Precision::FP32); +template class ReverseSequenceHelper; +ANAKIN_REGISTER_OP_HELPER(ReverseSequence, ReverseSequenceHelper, AMD, Precision::FP32); +#endif + #ifdef USE_ARM_PLACE -INSTANCE_CONCAT(ARM, Precision::FP32); +INSTANCE_REVERSE_SEQUENCE(ARM, Precision::FP32); template class ReverseSequenceHelper; ANAKIN_REGISTER_OP_HELPER(ReverseSequence, ReverseSequenceHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE -INSTANCE_CONCAT(X86, Precision::FP32); +INSTANCE_REVERSE_SEQUENCE(X86, 
Precision::FP32); template class ReverseSequenceHelper; ANAKIN_REGISTER_OP_HELPER(ReverseSequence, ReverseSequenceHelper, X86, Precision::FP32); #endif @@ -71,6 +77,9 @@ ANAKIN_REGISTER_OP(ReverseSequence) #ifdef USE_X86_PLACE .__alias__("reverse_sequence") #endif +#ifdef AMD_GPU +.__alias__("reverse_sequence") +#endif .num_in(1) .num_out(1); diff --git a/framework/operators/roi_align.cpp b/framework/operators/roi_align.cpp new file mode 100644 index 000000000..ab3bd5af8 --- /dev/null +++ b/framework/operators/roi_align.cpp @@ -0,0 +1,109 @@ +#include "framework/operators/roi_align.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ROI_ALIGN(Ttype, Ptype) \ +template<> \ +void RoiAlign::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_roi_align; \ + impl->_funcs_roi_align(ins, outs, param, ctx); \ +} + +/// set helper +template +RoiAlignHelper::~RoiAlignHelper() { +} + +template +Status RoiAlignHelper::InitParam() { + DLOG(WARNING) << "Parsing RoiAlign op parameter."; + auto pooled_height = GET_PARAMETER(int, pooled_height); + auto pooled_width = GET_PARAMETER(int, pooled_width); + auto spatial_scale = GET_PARAMETER(float, spatial_scale); + auto sampling_ratio = GET_PARAMETER(int, sampling_ratio); + RoiAlignParam param_roi_align(pooled_height, pooled_width, spatial_scale, sampling_ratio); + _param_roi_align = param_roi_align; + + return Status::OK(); +} + +template +Status RoiAlignHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_roi_align.init(ins, outs, _param_roi_align, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status RoiAlignHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_roi_align.compute_output_shape(ins, outs, _param_roi_align)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_ROI_ALIGN(NV, Precision::FP32); + +template<> +Status RoiAlignHelper::Init(OpContext& ctx, + const std::vector< Tensor4dPtr > & ins, + std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_roi_align.init(ins, outs, _param_roi_align, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(RoiAlign, RoiAlignHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ROI_ALIGN(X86, Precision::FP32); +INSTANCE_ROI_ALIGN(X86, Precision::FP16); +INSTANCE_ROI_ALIGN(X86, Precision::INT8); +template class RoiAlignHelper; +ANAKIN_REGISTER_OP_HELPER(RoiAlign, RoiAlignHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_ROI_ALIGN(ARM, Precision::FP32); +template class RoiAlignHelper; +ANAKIN_REGISTER_OP_HELPER(RoiAlign, RoiAlignHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ROI_ALIGN(AMD, Precision::FP32); +template class RoiAlignHelper; +template class RoiAlignHelper; +template class RoiAlignHelper; +ANAKIN_REGISTER_OP_HELPER(RoiAlign, RoiAlignHelper, AMD, Precision::FP32); +#endif +//! 
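RoiAlign differs from plain ROI pooling in that no coordinate is rounded to the integer grid: each of the pooled_height x pooled_width bins averages roughly sampling_ratio^2 samples taken at fractional positions via bilinear interpolation, after the ROI has been scaled by spatial_scale. The helper below sketches the bilinear sample on one H x W plane; it follows the widely used reference formulation rather than the saber kernel.

#include <algorithm>

// Bilinearly interpolated value of one channel plane at fractional (y, x).
static float bilinear_sample(const float* feat, int H, int W, float y, float x) {
    if (y < -1.f || y > H || x < -1.f || x > W) return 0.f;   // sample outside the map
    y = std::max(y, 0.f);
    x = std::max(x, 0.f);
    int y0 = static_cast<int>(y), x0 = static_cast<int>(x);
    int y1 = std::min(y0 + 1, H - 1), x1 = std::min(x0 + 1, W - 1);
    y0 = std::min(y0, H - 1);
    x0 = std::min(x0, W - 1);
    float ly = y - y0, lx = x - x0;
    float hy = 1.f - ly, hx = 1.f - lx;
    return hy * hx * feat[y0 * W + x0] + hy * lx * feat[y0 * W + x1]
         + ly * hx * feat[y1 * W + x0] + ly * lx * feat[y1 * W + x1];
}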
register op +ANAKIN_REGISTER_OP(RoiAlign) +.Doc("RoiAlign operator") +#ifdef USE_CUDA +.__alias__("roi_align") +#endif +#ifdef USE_ARM_PLACE +.__alias__("roi_align") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("roi_align") +#endif +#ifdef AMD_GPU +.__alias__("roi_align") +#endif +.num_in(1) +.num_out(1) +.Args("type", " type of RoiAlign ") +.Args("channel_shared", "prelu channel is shared or not "); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/roi_align.h b/framework/operators/roi_align.h new file mode 100644 index 000000000..5334a4bce --- /dev/null +++ b/framework/operators/roi_align.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_ROI_ALIGN_H +#define ANAKIN_OPERATOR_ROI_ALIGN_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/roi_align.h" + +namespace anakin { + +namespace ops { + +template +class RoiAlignHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class RoiAlign : public Operator { +public: + RoiAlign() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator RoiAlign< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class RoiAlignHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class RoiAlignHelper : public OperatorHelper { +public: + RoiAlignHelper()=default; + + ~RoiAlignHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_roi_align stand for roi_align parameter + saber::RoiAlignParam _param_roi_align; + ///< _funcs_roi_align stand for roi_align function + saber::RoiAlign::saber_type> _funcs_roi_align; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/roi_pool.cpp b/framework/operators/roi_pool.cpp new file mode 100644 index 000000000..fb866c325 --- /dev/null +++ b/framework/operators/roi_pool.cpp @@ -0,0 +1,110 @@ +#include "framework/operators/roi_pool.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_ROI_POOL(Ttype, Ptype) \ +template<> \ +void RoiPool::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_roi_pool; \ + impl->_funcs_roi_pool(ins, outs, param, ctx); \ +} + +/// set helper +template +RoiPoolHelper::~RoiPoolHelper() { +} + +template +Status RoiPoolHelper::InitParam() { + DLOG(WARNING) << "Parsing RoiPool op parameter."; + auto pooled_height = GET_PARAMETER(int, pooled_h); + auto pooled_width = GET_PARAMETER(int, pooled_w); + auto spatial_scale = GET_PARAMETER(float, spatial_scale); + RoiPoolParam param_roi_pool(pooled_height, pooled_width, spatial_scale); + _param_roi_pool = param_roi_pool; + + return Status::OK(); +} + +template +Status RoiPoolHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_roi_pool.init(ins, outs, _param_roi_pool, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status RoiPoolHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_roi_pool.compute_output_shape(ins, outs, _param_roi_pool)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_ROI_POOL(NV, Precision::FP32); +template<> +Status RoiPoolHelper::Init(OpContext& ctx, \ + const std::vector< Tensor4dPtr > & ins, std::vector< Tensor4dPtr >& outs) { + SABER_CHECK(_funcs_roi_pool.init(ins, outs, _param_roi_pool, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} +ANAKIN_REGISTER_OP_HELPER(RoiPool, RoiPoolHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_ROI_POOL(X86, Precision::FP32); +INSTANCE_ROI_POOL(X86, Precision::FP16); +INSTANCE_ROI_POOL(X86, Precision::INT8); +template class RoiPoolHelper; +ANAKIN_REGISTER_OP_HELPER(RoiPool, RoiPoolHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_ROI_POOL(ARM, Precision::FP32); +template class RoiPoolHelper; +ANAKIN_REGISTER_OP_HELPER(RoiPool, RoiPoolHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_ROI_POOL(AMD, Precision::FP32); +template class RoiPoolHelper; +template class RoiPoolHelper; +template class RoiPoolHelper; +ANAKIN_REGISTER_OP_HELPER(RoiPool, RoiPoolHelper, AMD, Precision::FP32); +#endif +//! 
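For contrast with RoiAlign above, classic RoiPool snaps the scaled ROI and every bin boundary to integer feature-map coordinates and then max-pools (or averages) each bin. A short sketch of the bin-boundary arithmetic for bin (ph, pw), assuming the ROI has already been scaled by spatial_scale and rounded to a top-left corner (y1, x1) with height roi_h and width roi_w; names are illustrative, not the saber kernel.

#include <algorithm>
#include <cmath>

// Integer bin boundaries for output bin (ph, pw), clamped to the H x W map.
static void roi_pool_bin_bounds(int y1, int x1, int roi_h, int roi_w,
                                int pooled_h, int pooled_w, int ph, int pw,
                                int H, int W,
                                int* hstart, int* hend, int* wstart, int* wend) {
    float bin_h = static_cast<float>(roi_h) / pooled_h;
    float bin_w = static_cast<float>(roi_w) / pooled_w;
    *hstart = std::min(std::max(y1 + static_cast<int>(std::floor(ph * bin_h)), 0), H);
    *hend   = std::min(std::max(y1 + static_cast<int>(std::ceil((ph + 1) * bin_h)), 0), H);
    *wstart = std::min(std::max(x1 + static_cast<int>(std::floor(pw * bin_w)), 0), W);
    *wend   = std::min(std::max(x1 + static_cast<int>(std::ceil((pw + 1) * bin_w)), 0), W);
}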
register op +ANAKIN_REGISTER_OP(RoiPool) + .Doc("RoiPool operator") +#ifdef USE_CUDA +.__alias__("roi_pool") +#endif +#ifdef USE_ARM_PLACE +.__alias__("roi_pool") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("roi_pool") +#endif +#ifdef AMD_GPU +.__alias__("roi_pool") +#endif +.num_in(1) +.num_out(1) +.Args("type", " type of RoiPool ") +.Args("pooled_h", "roi pool height") +.Args("pooled_w", "roi pool width") +.Args("spatial_scale", "roi pool spatial_scale"); + +} /* namespace ops */ + +} /* namespace anakin */ + + + diff --git a/framework/operators/roi_pool.h b/framework/operators/roi_pool.h new file mode 100644 index 000000000..def26b974 --- /dev/null +++ b/framework/operators/roi_pool.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_OPERATOR_ROI_POOLING_H +#define ANAKIN_OPERATOR_ROI_POOLING_H +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/roi_pooling.h" + +namespace anakin { + +namespace ops { + +template +class RoiPoolHelper; + +/** +* \brief RoiPool implementation class +* public inherit Operator +*/ +template +class RoiPool : public Operator { +public: + RoiPool() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator RoiPooling< Ttype(" + << target_name::value << "), Precision(" << Ptype << ") >"; + } + + friend class RoiPoolHelper; +}; + +/** +* \brief RoiPool helper class to implement RoiPool +* public inherit OperatorHelper +* including init resource and shape size in RoiPool context +*/ +template +class RoiPoolHelper : public OperatorHelper { +public: + RoiPoolHelper()=default; + + ~RoiPoolHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for RoiPool operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_roi_pool stand for RoiPool parameter + saber::RoiPoolParam _param_roi_pool; + ///< _funcs_roi_pool stand for RoiPool function + saber::RoiPool::saber_type> _funcs_roi_pool; + +}; + +} /* namespace ops */ + +} /* namespace anakin */ +#endif //ANAKIN_OPERATOR_ROI_POOLING_H diff --git a/framework/operators/rois_anchor_feature.cpp b/framework/operators/rois_anchor_feature.cpp index cfe2987a7..820cf5997 100644 --- a/framework/operators/rois_anchor_feature.cpp +++ b/framework/operators/rois_anchor_feature.cpp @@ -103,4 +103,4 @@ ANAKIN_REGISTER_OP(RoisAnchorFeature) .Args("ft_log_ratio_w", " param of rois_anchor_feature_param") .Args("bbox_size_add_one", " param of rois_anchor_feature_param"); } /* namespace ops */ -} /* namespace anakin */ \ No newline at end of file +} /* namespace anakin */ diff --git a/framework/operators/rois_anchor_feature.h b/framework/operators/rois_anchor_feature.h index 5c93a2f1e..e31e0607f 100644 --- a/framework/operators/rois_anchor_feature.h +++ b/framework/operators/rois_anchor_feature.h @@ -38,7 +38,7 @@ class RoisAnchorFeature : public Operator { const std::vector >& ins, std::vector >& outs) { LOG(ERROR) << "Not Impl Yet Operator convolution::value << "), Precision(" << Ptype << ") >"; + target_name::value << "), Precision(" << (int)Ptype << ") >"; } friend class RoisAnchorFeatureHelper; }; @@ -79,4 +79,4 @@ class RoisAnchorFeatureHelper : public OperatorHelper { }; } /* namespace ops */ } /* namespace anakin */ -#endif \ No newline at end of file +#endif diff --git a/framework/operators/rpn_proposal_ssd.cpp b/framework/operators/rpn_proposal_ssd.cpp index be1804379..62cccbeab 100644 --- a/framework/operators/rpn_proposal_ssd.cpp +++ b/framework/operators/rpn_proposal_ssd.cpp @@ -13,6 +13,18 @@ void RPNProposalSSD::operator()( impl->_funcs_rpn_prop_ssd(ins, outs, param, ctx); } #endif +#ifdef USE_ARM_PLACE +template<> +void RPNProposalSSD::operator()( + OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + auto* impl = static_cast*>(this->_helper); + auto& param = static_cast*> + (this->_helper)->_param_rpn_prop_ssd; + impl->_funcs_rpn_prop_ssd(ins, outs, param, ctx); +} +#endif /// TODO ... specialization other type of operator /// set helper template diff --git a/framework/operators/scale.cpp b/framework/operators/scale.cpp index 7b7dd04d8..d534be958 100644 --- a/framework/operators/scale.cpp +++ b/framework/operators/scale.cpp @@ -1,3 +1,19 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * +*/ + #include "framework/operators/scale.h" namespace anakin { @@ -60,6 +76,12 @@ template class ScaleHelper; ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, NV, Precision::FP32); #endif +#ifdef AMD_GPU +INSTANCE_SCALE(AMD, Precision::FP32); +template class ScaleHelper; +ANAKIN_REGISTER_OP_HELPER(Scale, ScaleHelper, AMD, Precision::FP32); +#endif + #if defined USE_X86_PLACE || defined BUILD_LITE INSTANCE_SCALE(X86, Precision::FP32); template class ScaleHelper; @@ -84,6 +106,9 @@ ANAKIN_REGISTER_OP(Scale) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("Scale") #endif +#ifdef AMD_GPU +.__alias__("Scale") +#endif .num_in(1) .num_out(1) .Args("type", " type of Scale "); diff --git a/framework/operators/sequence_concat.cpp b/framework/operators/sequence_concat.cpp new file mode 100644 index 000000000..579bedf7d --- /dev/null +++ b/framework/operators/sequence_concat.cpp @@ -0,0 +1,92 @@ +#include "framework/operators/sequence_concat.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SEQUENCE_CONCAT(Ttype, Ptype) \ +template<> \ +void SequenceConcat::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_sequence_concat; \ + impl->_funcs_sequence_concat(ins, outs, param, ctx); \ +} + +/// set helper +template +SequenceConcatHelper::~SequenceConcatHelper() { +} + +template +Status SequenceConcatHelper::InitParam() { + DLOG(WARNING) << "Parsing SequenceConcat op parameter."; + SequenceConcatParam param_sequence_concat; + _param_sequence_concat = param_sequence_concat; + + return Status::OK(); +} + +template +Status SequenceConcatHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_concat.init(ins, outs, _param_sequence_concat, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SequenceConcatHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_concat.compute_output_shape(ins, outs, _param_sequence_concat)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_SEQUENCE_CONCAT(NV, Precision::FP32); +template class SequenceConcatHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceConcat, SequenceConcatHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_SEQUENCE_CONCAT(X86, Precision::FP32); +template class SequenceConcatHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceConcat, SequenceConcatHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SEQUENCE_CONCAT(ARM, Precision::FP32); +template class SequenceConcatHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceConcat, SequenceConcatHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_SEQUENCE_CONCAT(AMD, Precision::FP32); +template class SequenceConcatHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceConcat, SequenceConcatHelper, AMD, Precision::FP32); +#endif +//! 
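The INSTANCE_SEQUENCE_CONCAT macro defined above (like its siblings for the other new ops) only forwards the virtual operator() to the saber functor owned by the helper. Expanded for one target it looks roughly like the sketch below, slightly condensed (the macro casts this->_helper twice); the template arguments and the Tensor4dPtr spelling follow the framework's usual convention and are reconstructions, not literal patch text.

    template<>
    void SequenceConcat<X86, Precision::FP32>::operator()(
            OpContext<X86>& ctx,
            const std::vector<Tensor4dPtr<X86> >& ins,
            std::vector<Tensor4dPtr<X86> >& outs) {
        // the helper holds both the parsed parameter and the saber functor
        auto* impl = static_cast<SequenceConcatHelper<X86, Precision::FP32>*>(this->_helper);
        auto& param = impl->_param_sequence_concat;
        // run the saber implementation selected in Init()
        impl->_funcs_sequence_concat(ins, outs, param, ctx);
    }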
register op +ANAKIN_REGISTER_OP(SequenceConcat) +.Doc("SequenceConcat operator") +#ifdef USE_CUDA +.__alias__("sequence_concat") +#endif +#ifdef USE_ARM_PLACE +.__alias__("sequence_concat") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("sequence_concat") +#endif +#ifdef AMD_GPU +.__alias__("sequence_concat") +#endif +.num_in(1) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/sequence_concat.h b/framework/operators/sequence_concat.h new file mode 100644 index 000000000..8dc895808 --- /dev/null +++ b/framework/operators/sequence_concat.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_SEQUENCE_CONCAT_H +#define ANAKIN_OPERATOR_SEQUENCE_CONCAT_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/sequence_concat.h" + +namespace anakin { + +namespace ops { + +template +class SequenceConcatHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class SequenceConcat : public Operator { +public: + SequenceConcat() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SequenceConcat< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class SequenceConcatHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class SequenceConcatHelper : public OperatorHelper { +public: + SequenceConcatHelper()=default; + + ~SequenceConcatHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_sequence_concat stand for sequence_concat parameter + saber::SequenceConcatParam _param_sequence_concat; + ///< _funcs_sequence_concat stand for sequence_concat function + saber::SequenceConcat::saber_type> _funcs_sequence_concat; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/sequence_conv.cpp b/framework/operators/sequence_conv.cpp index c57277a5c..364a73d1f 100644 --- a/framework/operators/sequence_conv.cpp +++ b/framework/operators/sequence_conv.cpp @@ -4,31 +4,17 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void SequenceConv::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param; - impl->_funcs(ins, outs, param, ctx); +#define INSTANCE_SEQUENCE_CONV(Ttype, Ptype) \ +template<> \ +void SequenceConv::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param; \ + impl->_funcs(ins, outs, param, ctx); \ } -#endif - -#ifdef USE_X86_PLACE -template<> -void SequenceConv::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*> - (this->_helper)->_param; - impl->_funcs(ins, outs, param, ctx); -} -#endif /// TODO ... specialization other type of operator @@ -70,7 +56,6 @@ template<> Status SequenceConvHelper::Init(OpContext& ctx, const std::vector >& ins, std::vector >& outs) { - LOG(INFO) << "are you ok"; SABER_CHECK(_funcs.init(ins, outs, _param, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } @@ -78,7 +63,6 @@ template<> Status SequenceConvHelper::Init(OpContext& ctx, const std::vector >& ins, std::vector >& outs) { - LOG(INFO) << "are you ok"; SABER_CHECK(_funcs.init(ins, outs, _param, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } @@ -87,7 +71,6 @@ template<> Status SequenceConvHelper::Init(OpContext& ctx, const std::vector >& ins, std::vector >& outs) { - LOG(INFO) << "are you ok"; SABER_CHECK(_funcs.init(ins, outs, _param, SPECIFY, SABER_IMPL, ctx)); return Status::OK(); } @@ -108,32 +91,33 @@ Status SequenceConvHelper::InferShape(const SABER_CHECK(_funcs.compute_output_shape(ins, outs, _param)); return Status::OK(); } +#ifdef AMD_GPU +INSTANCE_SEQUENCE_CONV(AMD, Precision::FP32); +template class SequenceConvHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, AMD, Precision::FP32); +#endif #ifdef USE_X86_PLACE +INSTANCE_SEQUENCE_CONV(X86, Precision::FP32); template class SequenceConvHelper; template class SequenceConvHelper; template class SequenceConvHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, X86, Precision::FP32); #endif #ifdef USE_CUDA +INSTANCE_SEQUENCE_CONV(NV, Precision::FP32); template class SequenceConvHelper; template class SequenceConvHelper; template class SequenceConvHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, NV, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_SEQUENCE_CONV(ARM, Precision::FP32); template class SequenceConvHelper; template class SequenceConvHelper; template class SequenceConvHelper; -#endif -// register helper -#ifdef 
USE_X86_PLACE -ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, X86, Precision::FP32); -#endif - -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, NV, Precision::FP32); -#endif -#ifdef USE_ARM_PLACE ANAKIN_REGISTER_OP_HELPER(SequenceConv, SequenceConvHelper, ARM, Precision::FP32); #endif + //! register op ANAKIN_REGISTER_OP(SequenceConv) .Doc("SequenceConv operator") @@ -146,6 +130,9 @@ ANAKIN_REGISTER_OP(SequenceConv) #ifdef USE_ARM_PLACE .__alias__("SequenceConv") #endif +#ifdef AMD_GPU +.__alias__("SequenceConv") +#endif .num_in(1) .num_out(1) .Args("axis", " axis "); diff --git a/framework/operators/sequence_depadding.cpp b/framework/operators/sequence_depadding.cpp new file mode 100644 index 000000000..7182106a3 --- /dev/null +++ b/framework/operators/sequence_depadding.cpp @@ -0,0 +1,96 @@ +#include "framework/operators/sequence_depadding.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SEQUENCE_DEPADDING(Ttype, Ptype) \ +template<> \ +void SequenceDePadding::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_sequence_depadding; \ + impl->_funcs_sequence_depadding(ins, outs, param, ctx); \ +} + +/// set helper +template +SequenceDePaddingHelper::~SequenceDePaddingHelper() { +} + +template +Status SequenceDePaddingHelper::InitParam() { + LOG(WARNING) << "Parsing SequenceDePadding op parameter."; + SequenceDePaddingParam param_sequence_depadding; + _param_sequence_depadding = param_sequence_depadding; + + return Status::OK(); +} + +template +Status SequenceDePaddingHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_depadding.init(ins, outs, _param_sequence_depadding, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SequenceDePaddingHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_depadding.compute_output_shape(ins, outs, _param_sequence_depadding)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_SEQUENCE_DEPADDING(NV, Precision::FP32); +template class SequenceDePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceDePadding, SequenceDePaddingHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_SEQUENCE_DEPADDING(X86, Precision::FP32); +INSTANCE_SEQUENCE_DEPADDING(X86, Precision::FP16); +INSTANCE_SEQUENCE_DEPADDING(X86, Precision::INT8); +template class SequenceDePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceDePadding, SequenceDePaddingHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SEQUENCE_DEPADDING(ARM, Precision::FP32); +template class SequenceDePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceDePadding, SequenceDePaddingHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_SEQUENCE_DEPADDING(AMD, Precision::FP32); +template class SequenceDePaddingHelper; +template class SequenceDePaddingHelper; +template class SequenceDePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceDePadding, SequenceDePaddingHelper, AMD, Precision::FP32); +#endif +//! 
register op +ANAKIN_REGISTER_OP(SequenceDePadding) +.Doc("SequenceDePadding operator") +#ifdef USE_CUDA +.__alias__("sequence_depadding") +#endif +#ifdef USE_ARM_PLACE +.__alias__("sequence_depadding") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("sequence_depadding") +#endif +#ifdef AMD_GPU +.__alias__("sequence_depadding") +#endif +.num_in(2) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/sequence_depadding.h b/framework/operators/sequence_depadding.h new file mode 100644 index 000000000..2e4e1bccf --- /dev/null +++ b/framework/operators/sequence_depadding.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_SEQUENCE_DEPADDING_H +#define ANAKIN_OPERATOR_SEQUENCE_DEPADDING_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/sequence_depadding.h" + +namespace anakin { + +namespace ops { + +template +class SequenceDePaddingHelper; + +/// pooling op +/** + * \brief SequenceDePadding operation class + * public inheritance Operator + */ +template +class SequenceDePadding : public Operator { +public: + SequenceDePadding() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator SequenceDePadding< Ttype(" + //<< target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class SequenceDePaddingHelper; +}; + +/** + * \brief SequenceDePadding helper class + * public inherit OperatorHelper + * including init resource and shape size in sequence_depadding context + */ +template +class SequenceDePaddingHelper : public OperatorHelper { +public: + SequenceDePaddingHelper()=default; + + ~SequenceDePaddingHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for SequenceDePadding operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_sequence_depadding stand for SequenceDePadding parameter + saber::SequenceDePaddingParam _param_sequence_depadding; + ///< _funcs_sequence_depadding stand for SequenceDePadding function + saber::SequenceDePadding::saber_type> _funcs_sequence_depadding; + +private: +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/sequence_expand.cpp b/framework/operators/sequence_expand.cpp index 3e64cc6e8..74ddfccd9 100644 --- a/framework/operators/sequence_expand.cpp +++ b/framework/operators/sequence_expand.cpp @@ -64,18 +64,24 @@ ANAKIN_REGISTER_OP_HELPER(SequenceExpand, SequenceExpandHelper, NV, Precision::F #endif #ifdef USE_X86_PLACE -INSTANCE_SEQUENCE_EXPAND(X86, Precision::FP32); -INSTANCE_SEQUENCE_EXPAND(X86, Precision::FP16); -INSTANCE_SEQUENCE_EXPAND(X86, Precision::INT8); +INSTANCE_SEQUENCE_EXPAND(X86, Precision::FP32); +INSTANCE_SEQUENCE_EXPAND(X86, Precision::FP16); +INSTANCE_SEQUENCE_EXPAND(X86, Precision::INT8); template class SequenceExpandHelper; -ANAKIN_REGISTER_OP_HELPER(SequenceExpand, SequenceExpandHelper, X86, Precision::FP32); +ANAKIN_REGISTER_OP_HELPER(SequenceExpand, SequenceExpandHelper, X86, Precision::FP32); #endif #ifdef USE_ARM_PLACE -INSTANCE_SEQUENCE_EXPAND(ARM, Precision::FP32); +INSTANCE_SEQUENCE_EXPAND(ARM, Precision::FP32); template class SequenceExpandHelper; -ANAKIN_REGISTER_OP_HELPER(SequenceExpand, SequenceExpandHelper, ARM, Precision::FP32); -#endif//arm +ANAKIN_REGISTER_OP_HELPER(SequenceExpand, SequenceExpandHelper, ARM, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_SEQUENCE_EXPAND(AMD, Precision::FP32); +template class SequenceExpandHelper; +ANAKIN_REGISTER_OP_HELPER(SequenceExpand, SequenceExpandHelper, AMD, Precision::FP32); +#endif //! 
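The ANAKIN_REGISTER_OP chains in this patch (such as the SequenceExpand registration that follows) describe each op for the graph parser: a doc string, one alias per enabled target, the input/output arity, and the named attributes that InitParam later reads back with GET_PARAMETER. A hypothetical registration showing the shape of the builder chain is sketched below; MyOp, my_op, and my_attr are illustrative names only, and the Args template argument is an assumed spelling.

    ANAKIN_REGISTER_OP(MyOp)
    .Doc("MyOp operator")
    #ifdef USE_X86_PLACE
    .__alias__("my_op")    // graph-node type name this op answers to on X86
    #endif
    .num_in(1)             // arity checked when the model graph is parsed
    .num_out(1)
    .Args<int>("my_attr", " attribute later read in InitParam via GET_PARAMETER(int, my_attr) ");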
register op ANAKIN_REGISTER_OP(SequenceExpand) @@ -89,6 +95,9 @@ ANAKIN_REGISTER_OP(SequenceExpand) #ifdef USE_X86_PLACE .__alias__("sequence_expand") #endif +#ifdef AMD_GPU +.__alias__("sequence_expand") +#endif .num_in(2) .num_out(1) .Args("ref_level", "ref level must be 0"); diff --git a/framework/operators/sequence_padding.cpp b/framework/operators/sequence_padding.cpp new file mode 100644 index 000000000..a761a498b --- /dev/null +++ b/framework/operators/sequence_padding.cpp @@ -0,0 +1,96 @@ +#include "framework/operators/sequence_padding.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SEQUENCE_PADDING(Ttype, Ptype) \ +template<> \ +void SequencePadding::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_sequence_padding; \ + impl->_funcs_sequence_padding(ins, outs, param, ctx); \ +} + +/// set helper +template +SequencePaddingHelper::~SequencePaddingHelper() { +} + +template +Status SequencePaddingHelper::InitParam() { + LOG(WARNING) << "Parsing SequencePadding op parameter."; + SequencePaddingParam param_sequence_padding; + _param_sequence_padding = param_sequence_padding; + + return Status::OK(); +} + +template +Status SequencePaddingHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_padding.init(ins, outs, _param_sequence_padding, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SequencePaddingHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_padding.compute_output_shape(ins, outs, _param_sequence_padding)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_SEQUENCE_PADDING(NV, Precision::FP32); +template class SequencePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePadding, SequencePaddingHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_SEQUENCE_PADDING(X86, Precision::FP32); +INSTANCE_SEQUENCE_PADDING(X86, Precision::FP16); +INSTANCE_SEQUENCE_PADDING(X86, Precision::INT8); +template class SequencePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePadding, SequencePaddingHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SEQUENCE_PADDING(ARM, Precision::FP32); +template class SequencePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePadding, SequencePaddingHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_SEQUENCE_PADDING(AMD, Precision::FP32); +template class SequencePaddingHelper; +template class SequencePaddingHelper; +template class SequencePaddingHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePadding, SequencePaddingHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(SequencePadding) +.Doc("SequencePadding operator") +#ifdef USE_CUDA +.__alias__("sequence_padding") +#endif +#ifdef USE_ARM_PLACE +.__alias__("sequence_padding") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("sequence_padding") +#endif +#ifdef AMD_GPU +.__alias__("sequence_padding") +#endif +.num_in(1) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/sequence_padding.h b/framework/operators/sequence_padding.h new file mode 100644 index 000000000..882e0e0f4 --- /dev/null +++ b/framework/operators/sequence_padding.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_SEQUENCE_PADDING_H +#define ANAKIN_OPERATOR_SEQUENCE_PADDING_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/sequence_padding.h" + +namespace anakin { + +namespace ops { + +template +class SequencePaddingHelper; + +/// pooling op +/** + * \brief SequencePadding operation class + * public inheritance Operator + */ +template +class SequencePadding : public Operator { +public: + SequencePadding() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator SequencePadding< Ttype(" + //<< target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class SequencePaddingHelper; +}; + +/** + * \brief SequencePadding helper class + * public inherit OperatorHelper + * including init resource and shape size in sequence_padding context + */ +template +class SequencePaddingHelper : public OperatorHelper { +public: + SequencePaddingHelper()=default; + + ~SequencePaddingHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for SequencePadding operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_sequence_padding stand for SequencePadding parameter + saber::SequencePaddingParam _param_sequence_padding; + ///< _funcs_sequence_padding stand for SequencePadding function + saber::SequencePadding::saber_type> _funcs_sequence_padding; + +private: +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/sequence_pool.cpp b/framework/operators/sequence_pool.cpp index c8b81befe..ac216e182 100644 --- a/framework/operators/sequence_pool.cpp +++ b/framework/operators/sequence_pool.cpp @@ -4,32 +4,17 @@ namespace anakin { namespace ops { -#ifdef USE_X86_PLACE -template<> -void SequencePool::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_sequence_pool; - impl->_funcs_sequence_pool(ins, outs, param, ctx); -} -#endif - - -#ifdef USE_CUDA -template<> -void SequencePool::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = static_cast*>(this->_helper); - auto& param = static_cast*>(this->_helper)->_param_sequence_pool; - impl->_funcs_sequence_pool(ins, outs, param, ctx); +#define INSTANCE_SEQUENCE_POOL(Ttype, Ptype) \ +template<> \ +void SequencePool::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_sequence_pool; \ + impl->_funcs_sequence_pool(ins, outs, param, ctx); \ } -#endif -/// TODO ... 
specialization other type of operator - /// set helper template @@ -70,33 +55,32 @@ Status SequencePoolHelper::InferShape(const std::vector; template class SequencePoolHelper; template class SequencePoolHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, NV, Precision::FP32); +#endif + +#ifdef AMD_GPU +INSTANCE_SEQUENCE_POOL(AMD, Precision::FP32); +template class SequencePoolHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, AMD, Precision::FP32); #endif #ifdef USE_ARM_PLACE +INSTANCE_SEQUENCE_POOL(ARM, Precision::FP32); template class SequencePoolHelper; template class SequencePoolHelper; template class SequencePoolHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, ARM, Precision::FP32); #endif #ifdef USE_X86_PLACE +INSTANCE_SEQUENCE_POOL(X86, Precision::FP32); template class SequencePoolHelper; template class SequencePoolHelper; template class SequencePoolHelper; -#endif - -// register helper -#ifdef USE_CUDA -ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, NV, Precision::FP32); -#endif - -#ifdef USE_ARM_PLACE -ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, ARM, Precision::FP32); -#endif - -#ifdef USE_X86_PLACE ANAKIN_REGISTER_OP_HELPER(SequencePool, SequencePoolHelper, X86, Precision::FP32); #endif @@ -112,6 +96,9 @@ ANAKIN_REGISTER_OP(SequencePool) #ifdef USE_X86_PLACE .__alias__("SequencePool") #endif +#ifdef AMD_GPU +.__alias__("SequencePool") +#endif .num_in(1) .num_out(1) .Args("pooltype", " pooltype to compute "); diff --git a/framework/operators/sequence_pool_concat.cpp b/framework/operators/sequence_pool_concat.cpp new file mode 100644 index 000000000..3e24e7965 --- /dev/null +++ b/framework/operators/sequence_pool_concat.cpp @@ -0,0 +1,118 @@ +#include "framework/operators/sequence_pool_concat.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SEQUENCE_POOL_CONCAT(Ttype, Ptype) \ +template<> \ +void SequencePoolConcat::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_sequence_pool; \ + impl->_funcs_sequence_pool(ins, outs, param, ctx); \ +} + +/// set helper +template +SequencePoolConcatHelper::~SequencePoolConcatHelper() { +} + +template +Status SequencePoolConcatHelper::InitParam() { + DLOG(WARNING) << "Parsing SequencePoolConcat op parameter."; + auto pooltype = GET_PARAMETER(std::string, pooltype); + std::unordered_map type_map; + type_map.insert(std::make_pair("null", anakin::saber::Sequence_pool_unknow)); + type_map.insert(std::make_pair("AVERAGE", anakin::saber::Sequence_pool_average)); + type_map.insert(std::make_pair("SUM", anakin::saber::Sequence_pool_sum)); + type_map.insert(std::make_pair("SQRT", anakin::saber::Sequence_pool_sqrt)); + type_map.insert(std::make_pair("LAST", anakin::saber::Sequence_pool_last)); + type_map.insert(std::make_pair("FIRST", anakin::saber::Sequence_pool_first)); + type_map.insert(std::make_pair("MAX", anakin::saber::Sequence_pool_max)); + int slot_num = 1; + if (CHECK_PARAMETER(slot_num)) { + slot_num = GET_PARAMETER(int, slot_num); + } else { + LOG(FATAL) << "not found slot num param!!!!"; + } + saber::SequencePoolParam seq_param(type_map[pooltype]); + saber::ConcatParam concat_param(0); + saber::SequencePoolConcatParam sequence_pool_param(seq_param, concat_param, slot_num); + _param_sequence_pool = sequence_pool_param; + return Status::OK(); +} + +template +Status SequencePoolConcatHelper::Init(OpContext& 
ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_pool.init(ins, outs, _param_sequence_pool, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SequencePoolConcatHelper::InferShape(const std::vector >& +ins, + std::vector >& outs) { + SABER_CHECK(_funcs_sequence_pool.compute_output_shape(ins, outs, _param_sequence_pool)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_SEQUENCE_POOL_CONCAT(NV, Precision::FP32); +template class SequencePoolConcatHelper; +template class SequencePoolConcatHelper; +template class SequencePoolConcatHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePoolConcat, SequencePoolConcatHelper, NV, Precision::FP32); +#endif + +//#ifdef AMD_GPU +//INSTANCE_SEQUENCE_POOL_CONCAT(AMD, Precision::FP32); +//template class SequencePoolConcatHelper; +//ANAKIN_REGISTER_OP_HELPER(SequencePoolConcat, SequencePoolConcatHelper, AMD, Precision::FP32); +//#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SEQUENCE_POOL_CONCAT(ARM, Precision::FP32); +template class SequencePoolConcatHelper; +template class SequencePoolConcatHelper; +template class SequencePoolConcatHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePoolConcat, SequencePoolConcatHelper, ARM, Precision::FP32); +#endif + +#ifdef USE_X86_PLACE +INSTANCE_SEQUENCE_POOL_CONCAT(X86, Precision::FP32); +template class SequencePoolConcatHelper; +template class SequencePoolConcatHelper; +template class SequencePoolConcatHelper; +ANAKIN_REGISTER_OP_HELPER(SequencePoolConcat, SequencePoolConcatHelper, X86, Precision::FP32); +#endif + +//! register op +ANAKIN_REGISTER_OP(SequencePoolConcat) +.Doc("SequencePoolConcat operator") +#ifdef USE_CUDA +.__alias__("SequencePoolConcat") +#endif +#ifdef USE_ARM_PLACE +.__alias__("SequencePoolConcat") +#endif +#ifdef USE_X86_PLACE +.__alias__("SequencePoolConcat") +#endif +#ifdef AMD_GPU +.__alias__("SequencePoolConcat") +#endif +.num_in(1) +.num_out(1) +.Args("pooltype", " pooltype to compute "); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/sequence_pool_concat.h b/framework/operators/sequence_pool_concat.h new file mode 100644 index 000000000..5940bc4f4 --- /dev/null +++ b/framework/operators/sequence_pool_concat.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_SEQUENCE_POOL_CONCAT_H +#define ANAKIN_OPERATOR_SEQUENCE_POOL_CONCAT_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/sequence_pool_concat.h" + +namespace anakin { + +namespace ops { + +template +class SequencePoolConcatHelper; + +/// pooling op +/** + * \brief SequencePoolConcat operation class + * public inheritance Operator + */ +template +class SequencePoolConcat : public Operator { +public: + SequencePoolConcat() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + //LOG(ERROR) << "Not Impl Yet Operator SequencePoolConcat< Ttype(" + //<< target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class SequencePoolConcatHelper; +}; + +/** + * \brief SequencePoolConcat helper class + * public inherit OperatorHelper + * including init resource and shape size in sequence_pool context + */ +template +class SequencePoolConcatHelper : public OperatorHelper { +public: + SequencePoolConcatHelper()=default; + + ~SequencePoolConcatHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for SequencePoolConcat operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_sequence_pool stand for SequencePoolConcat parameter + saber::SequencePoolConcatParam _param_sequence_pool; + ///< _funcs_sequence_pool stand for SequencePoolConcat function + saber::SequencePoolConcat::saber_type> _funcs_sequence_pool; + +private: + ///< _dims stand for SequencePoolConcat size + PTuple _dims; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/shuffle_channel.cpp b/framework/operators/shuffle_channel.cpp index 5cddf2bab..628445e97 100644 --- a/framework/operators/shuffle_channel.cpp +++ b/framework/operators/shuffle_channel.cpp @@ -38,6 +38,12 @@ Status ShuffleChannelHelper::InferShape(const std::vector; +ANAKIN_REGISTER_OP_HELPER(ShuffleChannel, ShuffleChannelHelper, AMD, Precision::FP32); +#endif + #ifdef USE_CUDA INSTANCE_SHUFFLE_CHANNEL(NV, Precision::FP32); INSTANCE_SHUFFLE_CHANNEL(NV, Precision::INT8); @@ -80,6 +86,9 @@ ANAKIN_REGISTER_OP(ShuffleChannel) .__alias__("shufflechannel") .__alias__("shufflechannel") #endif +#ifdef AMD_GPU +.__alias__("shufflechannel") +#endif .num_in(1) .num_out(1) .Args("group", " group number for shuffle "); diff --git a/framework/operators/slice.cpp b/framework/operators/slice.cpp index 049259ad7..60ac41d56 100644 --- a/framework/operators/slice.cpp +++ b/framework/operators/slice.cpp @@ -1,3 +1,19 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * +*/ + #include "framework/operators/slice.h" namespace anakin { @@ -72,6 +88,14 @@ template class SliceHelper; template class SliceHelper; #endif +#ifdef AMD_GPU +INSTANCE_SLICE(AMD, Precision::FP32); +template class SliceHelper; +ANAKIN_REGISTER_OP_HELPER(Slice, SliceHelper, AMD, Precision::FP32); +template class SliceHelper; +template class SliceHelper; +#endif + #if defined USE_X86_PLACE || defined(BUILD_LITE) INSTANCE_SLICE(X86, Precision::FP32); template class SliceHelper; @@ -96,6 +120,9 @@ ANAKIN_REGISTER_OP(Slice) #if defined(USE_X86_PLACE) || defined(BUILD_LITE) .__alias__("slice") #endif +#ifdef AMD_GPU +.__alias__("slice") +#endif .num_in(1) .num_out(1) .Args("slice_dim", " slice dim at input ") diff --git a/framework/operators/slice_v2.cpp b/framework/operators/slice_v2.cpp new file mode 100644 index 000000000..8bb04462f --- /dev/null +++ b/framework/operators/slice_v2.cpp @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * +*/ + +#include "framework/operators/slice_v2.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SLICE_V2(Ttype, Ptype) \ +template<> \ +void SliceV2::operator()( \ + OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_slice_v2; \ + impl->_funcs_slice_v2(ins, outs, param, ctx); \ +} + +template +Status SliceV2Helper::InitParam() { + DLOG(WARNING) << "Parsing SliceV2 op parameter."; + auto starts = GET_PARAMETER(PTuple, starts); + auto ends = GET_PARAMETER(PTuple, ends); + PTuple axes; + bool found_axes = CHECK_PARAMETER(axes); + if (found_axes) { + axes = GET_PARAMETER(PTuple, axes); + } + DLOG(INFO) << " slice_v2 starts size(" << starts.size() << ")."; + DLOG(INFO) << " slice_v2 ends size(" << ends.size() << ")."; + DLOG(INFO) << " slice_v2 axes size(" << axes.size() << ")."; + std::vector real_axes; + if (axes.size() == 0) { + real_axes.resize(starts.size()); + for (int i = 0; i < starts.size(); i++) { + real_axes[i] = i; + } + SliceV2Param param_slice_v2(real_axes, starts.vector(), ends.vector()); + _param_slice_v2 = param_slice_v2; + } else { + int min_axes = axes.data()[0]; + int max_axes = axes.data()[axes.size() - 1]; + int axes_num = max_axes - min_axes + 1; + std::vector real_starts(axes_num, 0); + std::vector real_ends(axes_num, -1); + std::vector real_axes = axes.vector(); + if (axes_num == real_axes.size()) { + real_starts = starts.vector(); + real_ends = ends.vector(); + } else { + for (int i = 0; i < starts.size(); i++) { + real_starts[axes.data()[i] - min_axes] = starts.data()[i]; + real_ends[axes.data()[i] - min_axes] = ends.data()[i]; + } + real_axes.clear(); + for (int i = min_axes; i < max_axes; i++) { + real_axes.push_back(i); + } + } + SliceV2Param param_slice_v2(real_axes, real_starts, real_ends); + _param_slice_v2 = param_slice_v2; + } + + return Status::OK(); +} + +template +Status SliceV2Helper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_slice_v2.init(ins, outs, _param_slice_v2, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SliceV2Helper::InferShape(const std::vector >& + ins, + std::vector >& outs) { + SABER_CHECK(_funcs_slice_v2.compute_output_shape(ins, outs, _param_slice_v2)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_SLICE_V2(NV, Precision::FP32); +template class SliceV2Helper; +ANAKIN_REGISTER_OP_HELPER(SliceV2, SliceV2Helper, NV, Precision::FP32); +template class SliceV2Helper; +template class SliceV2Helper; +#endif + +#ifdef AMD_GPU +INSTANCE_SLICE_V2(AMD, Precision::FP32); +template class SliceV2Helper; +ANAKIN_REGISTER_OP_HELPER(SliceV2, SliceV2Helper, AMD, Precision::FP32); +template class SliceV2Helper; +template class SliceV2Helper; +#endif + +#if defined USE_X86_PLACE || defined(BUILD_LITE) +INSTANCE_SLICE_V2(X86, Precision::FP32); +template class SliceV2Helper; +ANAKIN_REGISTER_OP_HELPER(SliceV2, SliceV2Helper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SLICE_V2(ARM, Precision::FP32); +template class SliceV2Helper; +ANAKIN_REGISTER_OP_HELPER(SliceV2, SliceV2Helper, ARM, Precision::FP32); +#endif + +//! 
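SliceV2Helper::InitParam above treats axes as an optional attribute: it probes the node with CHECK_PARAMETER before calling GET_PARAMETER, and falls back to slicing the leading dimensions 0, 1, ..., starts.size() - 1 when the attribute is absent. A condensed sketch of that pattern is shown below; the PTuple element type is an assumption based on the surrounding integer attributes.

    auto starts = GET_PARAMETER(PTuple<int>, starts);
    auto ends = GET_PARAMETER(PTuple<int>, ends);
    PTuple<int> axes;
    if (CHECK_PARAMETER(axes)) {
        // optional attribute: only read it when the graph node actually carries it
        axes = GET_PARAMETER(PTuple<int>, axes);
    }
    std::vector<int> real_axes;
    if (axes.size() == 0) {
        // default: slice the leading dimensions in order
        real_axes.resize(starts.size());
        for (int i = 0; i < starts.size(); ++i) {
            real_axes[i] = i;
        }
    }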
register op +ANAKIN_REGISTER_OP(SliceV2) +.Doc("SliceV2 operator") +#ifdef USE_CUDA +.__alias__("slice_v2") +#endif +#ifdef USE_ARM_PLACE +.__alias__("slice_v2") +#endif +#if defined(USE_X86_PLACE) || defined(BUILD_LITE) +.__alias__("slice_v2") +#endif +#ifdef AMD_GPU +.__alias__("slice_v2") +#endif +.num_in(1) +.num_out(1) +.Args>("starts", " slice_v2 start position ") +.Args>("ends", " slice_v2 end position ") +.Args>("axes", " slice_v2 axes position "); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/fusion_ops/conv_3x3_relu_pool.h b/framework/operators/slice_v2.h similarity index 65% rename from framework/operators/fusion_ops/conv_3x3_relu_pool.h rename to framework/operators/slice_v2.h index fc71ac12c..84f63090f 100644 --- a/framework/operators/fusion_ops/conv_3x3_relu_pool.h +++ b/framework/operators/slice_v2.h @@ -13,60 +13,60 @@ limitations under the License. */ -#ifndef ANAKIN_OPERATOR_CONV_SASS_RELU_POOL_H -#define ANAKIN_OPERATOR_CONV_SASS_RELU_POOL_H +#ifndef ANAKIN_OPERATOR_SLICE_V2_H +#define ANAKIN_OPERATOR_SLICE_V2_H #include "framework/core/base.h" #include "framework/core/data_types.h" #include "framework/core/operator/operator.h" #include "utils/logger/logger.h" -#include "saber/funcs/conv_pooling.h" +#include "saber/funcs/slice_v2.h" namespace anakin { namespace ops { template -class SassConvReluPoolHelper; +class SliceV2Helper; /// pooling op /** - * \brief SassConvReluPool implementation class + * \brief SliceV2 implementation class * public inherit Operator */ template -class SassConvReluPool : public Operator { +class SliceV2 : public Operator { public: - SassConvReluPool() {} + SliceV2() {} /// forward impl virtual void operator() (OpContext &ctx, const std::vector >& ins, std::vector >& outs) { - LOG(ERROR) << "Not Impl Yet Operator SassConvReluPool< Ttype(" + LOG(ERROR) << "Not Impl Yet Operator SliceV2< Ttype(" << target_name::value << "), Precision("<< Ptype <<") >"; } - friend class SassConvReluPoolHelper; + friend class SliceV2Helper; }; /** - * \brief SassConvReluPool helper class to implement it + * \brief SliceV2 helper class to implement SliceV2 * public inherit OperatorHelper - * including init resource and shape size in SassConvReluPool context + * including init resource and shape size in SliceV2 context */ template -class SassConvReluPoolHelper : public OperatorHelper { +class SliceV2Helper : public OperatorHelper { public: - SassConvReluPoolHelper()=default; + SliceV2Helper()=default; - ~SassConvReluPoolHelper(); + ~SliceV2Helper() {} Status InitParam() override; /** * \brief initial all the resource needed by pooling - * \param ctx stand for SassConvReluPool operation context + * \param ctx stand for SliceV2 operation context * \param ins stand for input tensor vector * \param outs stand for output tensor vector * \return status @@ -85,17 +85,12 @@ class SassConvReluPoolHelper : public OperatorHelper { std::vector >& outs) override; public: - ///< _param_conv_relu_pooling stand for SassConvReluPool parameter - saber::ConvPoolingParam _param_conv_relu_pooling; - ///< _funcs_conv_relu_pooling stand for SassConvReluPool function - saber::ConvPooling::saber_type> _funcs_conv_relu_pooling; - -private: - ///< _dims stand for SassConvReluPool size - PTuple _dims; -}; - + ///< _param_slice_v2 stand for slice_v2 parameter + saber::SliceV2Param _param_slice_v2; + ///< _funcs_slice_v2 stand for slice_v2 function + saber::SliceV2::saber_type> _funcs_slice_v2; +}; } /* namespace ops */ diff --git 
a/framework/operators/soft_sign.cpp b/framework/operators/soft_sign.cpp new file mode 100644 index 000000000..c0b3b9c55 --- /dev/null +++ b/framework/operators/soft_sign.cpp @@ -0,0 +1,92 @@ +#include "framework/operators/soft_sign.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SOFT_SIGN(Ttype, Ptype) \ +template<> \ +void SoftSign::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_soft_sign; \ + impl->_funcs_soft_sign(ins, outs, param, ctx); \ +} + +/// set helper +template +SoftSignHelper::~SoftSignHelper() { +} + +template +Status SoftSignHelper::InitParam() { + DLOG(WARNING) << "Parsing SoftSign op parameter."; + SoftSignParam param_soft_sign; + _param_soft_sign = param_soft_sign; + + return Status::OK(); +} + +template +Status SoftSignHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_soft_sign.init(ins, outs, _param_soft_sign, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status SoftSignHelper::InferShape(const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_soft_sign.compute_output_shape(ins, outs, _param_soft_sign)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_SOFT_SIGN(NV, Precision::FP32); +template class SoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SoftSign, SoftSignHelper, NV, Precision::FP32); +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_SOFT_SIGN(X86, Precision::FP32); +template class SoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SoftSign, SoftSignHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SOFT_SIGN(ARM, Precision::FP32); +template class SoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SoftSign, SoftSignHelper, ARM, Precision::FP32); +#endif//arm + +#ifdef AMD_GPU +INSTANCE_SOFT_SIGN(AMD, Precision::FP32); +template class SoftSignHelper; +ANAKIN_REGISTER_OP_HELPER(SoftSign, SoftSignHelper, AMD, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(SoftSign) +.Doc("SoftSign operator") +#ifdef USE_CUDA +.__alias__("soft_sign") +#endif +#ifdef USE_ARM_PLACE +.__alias__("soft_sign") +#endif +#if defined USE_X86_PLACE || defined BUILD_LITE +.__alias__("soft_sign") +#endif +#ifdef AMD_GPU +.__alias__("soft_sign") +#endif +.num_in(1) +.num_out(1); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/soft_sign.h b/framework/operators/soft_sign.h new file mode 100644 index 000000000..cbf5685ef --- /dev/null +++ b/framework/operators/soft_sign.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_OPERATOR_SOFT_SIGN_H +#define ANAKIN_OPERATOR_SOFT_SIGN_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/soft_sign.h" + +namespace anakin { + +namespace ops { + +template +class SoftSignHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class SoftSign : public Operator { +public: + SoftSign() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SoftSign< Ttype(" + << target_name::value << "), Precision("<< Ptype <<") >"; + } + + friend class SoftSignHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class SoftSignHelper : public OperatorHelper { +public: + SoftSignHelper()=default; + + ~SoftSignHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_soft_sign stand for soft_sign parameter + saber::SoftSignParam _param_soft_sign; + ///< _funcs_soft_sign stand for soft_sign function + saber::SoftSign::saber_type> _funcs_soft_sign; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/softmax.cpp b/framework/operators/softmax.cpp index c6efed7bf..b44c77ceb 100644 --- a/framework/operators/softmax.cpp +++ b/framework/operators/softmax.cpp @@ -1,3 +1,18 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "framework/operators/softmax.h" namespace anakin { @@ -95,7 +110,7 @@ template <> Status SoftmaxHelper::Init(OpContext &ctx, \ const std::vector >& ins, \ std::vector >& outs) { - SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, SPECIFY, SABER_IMPL, ctx)); + SABER_CHECK(_funcs_softmax.init(ins, outs, _param_softmax, SPECIFY, VENDER_IMPL, ctx)); return Status::OK(); } ANAKIN_REGISTER_OP_HELPER(Softmax, SoftmaxHelper, AMD, Precision::FP32); diff --git a/framework/operators/split.cpp b/framework/operators/split.cpp index e1769b45a..b553688d1 100644 --- a/framework/operators/split.cpp +++ b/framework/operators/split.cpp @@ -1,3 +1,17 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "framework/operators/split.h" namespace anakin { @@ -38,6 +52,9 @@ Status SplitHelper::InferShape(const std::vector; ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, NV, Precision::FP32); +INSTANCE_SPLIT(NV, Precision::INT8); +template class SplitHelper; +ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, NV, Precision::INT8); #endif #ifdef USE_ARM_PLACE @@ -50,6 +67,15 @@ ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, ARM, Precision::FP32); INSTANCE_SPLIT(X86, Precision::FP32); template class SplitHelper; ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, X86, Precision::FP32); +INSTANCE_SPLIT(X86, Precision::INT8); +template class SplitHelper; +ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, X86, Precision::INT8); +#endif + +#ifdef AMD_GPU +INSTANCE_SPLIT(AMD, Precision::FP32); +template class SplitHelper; +ANAKIN_REGISTER_OP_HELPER(Split, SplitHelper, AMD, Precision::FP32); #endif //! register op @@ -64,6 +90,9 @@ ANAKIN_REGISTER_OP(Split) #if defined USE_X86_PLACE || defined BUILD_LITE .__alias__("split") #endif +#ifdef AMD_GPU +.__alias__("split") +#endif .num_in(1) .num_out(1) .Args("split_num", " split output number. "); diff --git a/framework/operators/sproposal.cpp b/framework/operators/sproposal.cpp new file mode 100644 index 000000000..f930a7d2a --- /dev/null +++ b/framework/operators/sproposal.cpp @@ -0,0 +1,101 @@ +#include "framework/operators/sproposal.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SPROPOSAL(Ttype, Ptype) \ +template<> \ +void SProposal::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_sproposal; \ + impl->_funcs_sproposal(ins, outs, param, ctx); \ +} + +/// set helper +template +SProposalHelper::~SProposalHelper() {} + +template +Status SProposalHelper::InitParam() { + + DLOG(WARNING) << "Parsing SProposal op parameter."; + + auto scale = GET_PARAMETER(PTuple, scale); + auto ratio = GET_PARAMETER(PTuple, ratio); + + auto feat_stride = GET_PARAMETER(int, feat_stride); + auto basesize = GET_PARAMETER(int, basesize); + auto boxminsize = GET_PARAMETER(int, boxminsize); + auto pre_nms_topn = GET_PARAMETER(int, pre_nms_topn); + auto post_nms_topn = GET_PARAMETER(int, post_nms_topn); + auto nms_thresh = GET_PARAMETER(float, nms_thresh); + SProposalParam param_sproposal(scale.vector(), ratio.vector(), + feat_stride, basesize, boxminsize, pre_nms_topn, post_nms_topn, nms_thresh); + _param_sproposal = param_sproposal; + + return Status::OK(); +} + +template +Status SProposalHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + saber::ImplEnum impl_e = SABER_IMPL; + if (std::is_same::value) { + impl_e = SABER_IMPL; + } + SABER_CHECK(_funcs_sproposal.init(ins, outs, _param_sproposal, SPECIFY, impl_e, ctx)); + return Status::OK(); +} + +template +Status SProposalHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + 
SABER_CHECK(_funcs_sproposal.compute_output_shape(ins, outs, _param_sproposal)); + return Status::OK(); +} + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_SPROPOSAL(X86, Precision::FP32); +INSTANCE_SPROPOSAL(X86, Precision::FP16); +INSTANCE_SPROPOSAL(X86, Precision::INT8); +template class SProposalHelper; +ANAKIN_REGISTER_OP_HELPER(SProposal, SProposalHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SPROPOSAL(ARM, Precision::FP32); +template class SProposalHelper; +ANAKIN_REGISTER_OP_HELPER(SProposal, SProposalHelper, ARM, Precision::FP32); +#endif//arm + +//! register op +ANAKIN_REGISTER_OP(SProposal) +.Doc("SProposal operator") +#if defined USE_X86_PLACE || defined(BUILD_LITE) +.__alias__("sproposal") +#endif +#ifdef USE_ARM_PLACE +.__alias__("sproposal") +#endif +.num_in(1) +.num_out(1) +.Args>("scale", "scale of sproposal") +.Args>("ratio", "ratio of sproposal") +.Args("feat_stride", "feat_stride of sproposal") +.Args("basesize", "basesize of sproposal") +.Args("boxminsize", "boxminsize of sproposal") +.Args("pre_nms_topn", "pre_nms_topn of sproposal") +.Args("post_nms_topn", "post_nms_topn of sproposal") +.Args("nms_thresh", "nms_thresh of sproposal"); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/sproposal.h b/framework/operators/sproposal.h new file mode 100644 index 000000000..8eea506bb --- /dev/null +++ b/framework/operators/sproposal.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
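The new sproposal.cpp above follows the same pattern as the other operators touched by this patch (and as the INSTANCE_TOPK_* macros later in the diff): a per-target macro specializes operator() to forward into the helper's saber functor, the helper parses its parameters with GET_PARAMETER, and the op is exposed through ANAKIN_REGISTER_OP. The angle-bracket template arguments are elided in the listing above; the sketch below restores them on a best-effort basis, and the Tensor4dPtr/OpContext spellings are assumptions taken from the sibling operator sources rather than from this diff.

    // Best-effort reconstruction of INSTANCE_SPROPOSAL with its template arguments
    // written out; Tensor4dPtr<Ttype> and OpContext<Ttype> are assumed spellings.
    #define INSTANCE_SPROPOSAL(Ttype, Ptype) \
    template<> \
    void SProposal<Ttype, Ptype>::operator()(OpContext<Ttype>& ctx, \
            const std::vector<Tensor4dPtr<Ttype>>& ins, \
            std::vector<Tensor4dPtr<Ttype>>& outs) { \
        auto* impl = \
            static_cast<SProposalHelper<Ttype, Ptype>*>(this->_helper); \
        auto& param = \
            static_cast<SProposalHelper<Ttype, Ptype>*>(this->_helper)->_param_sproposal; \
        impl->_funcs_sproposal(ins, outs, param, ctx); \
    }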
+*/ + +#ifndef ANAKIN_FRAMEWORK_OPERATOR_SPROPOSAL_H +#define ANAKIN_FRAMEWORK_OPERATOR_SPROPOSAL_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/sproposal.h" + +namespace anakin { + +namespace ops { + +template +class SProposalHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class SProposal : public Operator { +public: + SProposal() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SProposal< Ttype(" + << target_name::value << "), Precision(" << Ptype << ") >"; + } + + friend class SProposalHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class SProposalHelper : public OperatorHelper { +public: + SProposalHelper() = default; + + ~SProposalHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_sproposal stand for sproposal parameter + saber::SProposalParam _param_sproposal; + ///< _funcs_sproposal stand for sproposal function + saber::SProposal::saber_type> _funcs_sproposal; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif //ANAKIN_FRAMEWORK_OPERATOR_SPROPOSAL_H diff --git a/framework/operators/sroi_align.cpp b/framework/operators/sroi_align.cpp new file mode 100644 index 000000000..3c9911084 --- /dev/null +++ b/framework/operators/sroi_align.cpp @@ -0,0 +1,87 @@ +#include "framework/operators/sroi_align.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_SROI_ALIGN(Ttype, Ptype) \ +template<> \ +void SRoiAlign::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_sroi_align; \ + impl->_funcs_sroi_align(ins, outs, param, ctx); \ +} + +/// set helper +template +SRoiAlignHelper::~SRoiAlignHelper() {} + +template +Status SRoiAlignHelper::InitParam() { + DLOG(WARNING) << "Parsing SRoiAlign op parameter."; + auto pooled_h = GET_PARAMETER(int, pooled_h); + auto pooled_w = GET_PARAMETER(int, pooled_w); + auto spatial_scale = GET_PARAMETER(float, spatial_scale); + SRoiAlignParam param_sroi_align(pooled_h, pooled_w, spatial_scale); + _param_sroi_align = param_sroi_align; + + return Status::OK(); +} + +template +Status SRoiAlignHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + saber::ImplEnum impl_e = SABER_IMPL; + if (std::is_same::value) { + impl_e = SABER_IMPL; + } + SABER_CHECK(_funcs_sroi_align.init(ins, outs, _param_sroi_align, SPECIFY, impl_e, ctx)); + return Status::OK(); +} + +template +Status SRoiAlignHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + 
SABER_CHECK(_funcs_sroi_align.compute_output_shape(ins, outs, _param_sroi_align)); + return Status::OK(); +} + +#if defined USE_X86_PLACE || defined BUILD_LITE +INSTANCE_SROI_ALIGN(X86, Precision::FP32); +INSTANCE_SROI_ALIGN(X86, Precision::FP16); +INSTANCE_SROI_ALIGN(X86, Precision::INT8); +template class SRoiAlignHelper; +ANAKIN_REGISTER_OP_HELPER(SRoiAlign, SRoiAlignHelper, X86, Precision::FP32); +#endif + +#ifdef USE_ARM_PLACE +INSTANCE_SROI_ALIGN(ARM, Precision::FP32); +template class SRoiAlignHelper; +ANAKIN_REGISTER_OP_HELPER(SRoiAlign, SRoiAlignHelper, ARM, Precision::FP32); +#endif//arm + +//! register op +ANAKIN_REGISTER_OP(SRoiAlign) +.Doc("SRoiAlign operator") +#if defined USE_X86_PLACE || defined(BUILD_LITE) +.__alias__("sroi_align") +#endif +#ifdef USE_ARM_PLACE +.__alias__("sroi_align") +#endif +.num_in(1) +.num_out(1) +.Args("pooled_h", "pooled_h of SRoiAlign") +.Args("pooled_w", "pooled_w of SRoiAlign") +.Args("spatial_scale", "spatial_scale of SRoiAlign"); + +} /* namespace ops */ + +} /* namespace anakin */ + diff --git a/framework/operators/sroi_align.h b/framework/operators/sroi_align.h new file mode 100644 index 000000000..d6b69fad4 --- /dev/null +++ b/framework/operators/sroi_align.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_FRAMEWORK_OPERATOR_SROI_ALIGN_H +#define ANAKIN_FRAMEWORK_OPERATOR_SROI_ALIGN_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/sroi_align.h" + +namespace anakin { + +namespace ops { + +template +class SRoiAlignHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class SRoiAlign : public Operator { +public: + SRoiAlign() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator SRoiAlign< Ttype(" + << target_name::value << "), Precision(" << Ptype << ") >"; + } + + friend class SRoiAlignHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class SRoiAlignHelper : public OperatorHelper { +public: + SRoiAlignHelper()=default; + + ~SRoiAlignHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
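For readers unfamiliar with the parameters parsed in sroi_align.cpp above: pooled_h and pooled_w give the fixed output resolution per region, and spatial_scale maps ROI coordinates given in input-image pixels onto the downsampled feature map. The saber kernel itself is not part of this diff, so the snippet below only illustrates the conventional coordinate mapping and is not copied from the implementation.

    // Illustrative only: conventional ROI-to-feature-map mapping implied by
    // pooled_h / pooled_w / spatial_scale; not taken from the saber kernel.
    struct RoiBin {
        float start_w, start_h, bin_w, bin_h;
    };
    inline RoiBin roi_bin(float roi_x1, float roi_y1, float roi_x2, float roi_y2,
                          float spatial_scale, int pooled_h, int pooled_w) {
        const float x1 = roi_x1 * spatial_scale;   // image pixels -> feature-map units
        const float y1 = roi_y1 * spatial_scale;
        const float x2 = roi_x2 * spatial_scale;
        const float y2 = roi_y2 * spatial_scale;
        RoiBin b;
        b.start_w = x1;
        b.start_h = y1;
        b.bin_w = (x2 - x1) / pooled_w;            // each output cell covers bin_w x bin_h
        b.bin_h = (y2 - y1) / pooled_h;
        return b;
    }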
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_roi_align stand for roi_align parameter + saber::SRoiAlignParam _param_sroi_align; + ///< _funcs_roi_align stand for roi_align function + saber::SRoiAlign::saber_type> _funcs_sroi_align; +}; + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/operators/topk_avg_pooling.cpp b/framework/operators/topk_avg_pooling.cpp index 23c232c19..5a99ab1e8 100644 --- a/framework/operators/topk_avg_pooling.cpp +++ b/framework/operators/topk_avg_pooling.cpp @@ -4,36 +4,17 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void TopKAvgPooling::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_topk_avg_pooling; - impl->_funcs_topk_avg_pooling(ins, outs, param, ctx); +#define INSTANCE_TOPK_AVG_POOLING(Ttype, Ptype) \ +template<> \ +void TopKAvgPooling::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_topk_avg_pooling; \ + impl->_funcs_topk_avg_pooling(ins, outs, param, ctx); \ } -#endif - -#ifdef USE_X86_PLACE -template<> -void TopKAvgPooling::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_topk_avg_pooling; - impl->_funcs_topk_avg_pooling(ins, outs, param, ctx); -} -#endif - -/// TODO ... specialization other type of operator - /// set helper template @@ -71,16 +52,19 @@ Status TopKAvgPoolingHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_TOPK_AVG_POOLING(NV, Precision::FP32); template class TopKAvgPoolingHelper; template class TopKAvgPoolingHelper; template class TopKAvgPoolingHelper; #endif #ifdef USE_ARM_PLACE +INSTANCE_TOPK_AVG_POOLING(ARM, Precision::FP32); template class TopKAvgPoolingHelper; template class TopKAvgPoolingHelper; template class TopKAvgPoolingHelper; #endif #ifdef USE_X86_PLACE +INSTANCE_TOPK_AVG_POOLING(X86, Precision::FP32); template class TopKAvgPoolingHelper; template class TopKAvgPoolingHelper; template class TopKAvgPoolingHelper; @@ -107,6 +91,9 @@ ANAKIN_REGISTER_OP(TopKAvgPooling) #ifdef USE_X86_PLACE .__alias__("topk_avg_pooling") #endif +#ifdef AMD_GPU +//.__alias__("topk_avg_pooling") +#endif .num_in(1) .num_out(1) .Args("feat_map_num", "feat map nums") diff --git a/framework/operators/topk_pooling.cpp b/framework/operators/topk_pooling.cpp index afca0c2dc..17390ecf8 100644 --- a/framework/operators/topk_pooling.cpp +++ b/framework/operators/topk_pooling.cpp @@ -4,36 +4,17 @@ namespace anakin { namespace ops { -#ifdef USE_CUDA -template<> -void TopKPooling::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_topk_pooling; - impl->_funcs_topk_pooling(ins, outs, param, ctx); +#define INSTANCE_TOPK_POOLING(Ttype, Ptype) \ +template<> \ +void TopKPooling::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_topk_pooling; \ + 
impl->_funcs_topk_pooling(ins, outs, param, ctx); \ } -#endif - -#ifdef USE_X86_PLACE -template<> -void TopKPooling::operator()( - OpContext& ctx, - const std::vector >& ins, - std::vector >& outs) { - auto* impl = - static_cast*>(this->_helper); - auto& param = - static_cast*>(this->_helper)->_param_topk_pooling; - impl->_funcs_topk_pooling(ins, outs, param, ctx); -} -#endif - -/// TODO ... specialization other type of operator - /// set helper template @@ -69,16 +50,19 @@ Status TopKPoolingHelper::InferShape(const } #ifdef USE_CUDA +INSTANCE_TOPK_POOLING(NV, Precision::FP32); template class TopKPoolingHelper; template class TopKPoolingHelper; template class TopKPoolingHelper; #endif #ifdef USE_ARM_PLACE +INSTANCE_TOPK_POOLING(ARM, Precision::FP32); template class TopKPoolingHelper; template class TopKPoolingHelper; template class TopKPoolingHelper; #endif #ifdef USE_X86_PLACE +INSTANCE_TOPK_POOLING(X86, Precision::FP32); template class TopKPoolingHelper; template class TopKPoolingHelper; template class TopKPoolingHelper; @@ -105,6 +89,9 @@ ANAKIN_REGISTER_OP(TopKPooling) #ifdef USE_X86_PLACE .__alias__("topk_pooling") #endif +#ifdef AMD_GPU +//.__alias__("topk_pooling") +#endif .num_in(1) .num_out(1) .Args("top_k", "get top k max data of each feature map") diff --git a/framework/operators/yolo_box.cpp b/framework/operators/yolo_box.cpp new file mode 100644 index 000000000..44747806e --- /dev/null +++ b/framework/operators/yolo_box.cpp @@ -0,0 +1,104 @@ +#include "framework/operators/yolo_box.h" + +namespace anakin { + +namespace ops { + +#define INSTANCE_YOLO_BOX(Ttype, Ptype) \ +template<> \ +void YoloBox::operator()(OpContext& ctx, \ + const std::vector >& ins, \ + std::vector >& outs) { \ + auto* impl = \ + static_cast*>(this->_helper); \ + auto& param = \ + static_cast*>(this->_helper)->_param_yolo_box; \ + impl->_funcs_yolo_box(ins, outs, param, ctx); \ +} + +/// set helper +template +YoloBoxHelper::~YoloBoxHelper() { +} + +template +Status YoloBoxHelper::InitParam() { + DLOG(WARNING) << "Parsing YoloBox op parameter."; + auto anchors = GET_PARAMETER(PTuple, anchors); + auto class_num = GET_PARAMETER(int, class_num); + auto conf_thresh = GET_PARAMETER(float, conf_thresh); + auto downsample_ratio = GET_PARAMETER(int, downsample_ratio); + YoloBoxParam param_yolo_box(anchors.vector(), class_num, conf_thresh, downsample_ratio); + _param_yolo_box = param_yolo_box; + + return Status::OK(); +} + +template +Status YoloBoxHelper::Init(OpContext& ctx, + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_yolo_box.init(ins, outs, _param_yolo_box, SPECIFY, SABER_IMPL, ctx)); + return Status::OK(); +} + +template +Status YoloBoxHelper::InferShape( + const std::vector >& ins, + std::vector >& outs) { + SABER_CHECK(_funcs_yolo_box.compute_output_shape(ins, outs, _param_yolo_box)); + return Status::OK(); +} + +#ifdef USE_CUDA +INSTANCE_YOLO_BOX(NV, Precision::FP32); +template class YoloBoxHelper; +template class YoloBoxHelper; +template class YoloBoxHelper; +#endif +#ifdef USE_X86_PLACE +INSTANCE_YOLO_BOX(X86, Precision::FP32); +template class YoloBoxHelper; +template class YoloBoxHelper; +template class YoloBoxHelper; +#endif +#ifdef USE_ARM_PLACE +INSTANCE_YOLO_BOX(ARM, Precision::FP32); +template class YoloBoxHelper; +template class YoloBoxHelper; +template class YoloBoxHelper; +#endif +// register helper +#ifdef USE_CUDA +ANAKIN_REGISTER_OP_HELPER(YoloBox, YoloBoxHelper, NV, Precision::FP32); +#endif +#ifdef USE_X86_PLACE +ANAKIN_REGISTER_OP_HELPER(YoloBox, YoloBoxHelper, 
X86, Precision::FP32); +#endif +#ifdef USE_ARM_PLACE +ANAKIN_REGISTER_OP_HELPER(YoloBox, YoloBoxHelper, ARM, Precision::FP32); +#endif +//! register op +ANAKIN_REGISTER_OP(YoloBox) +.Doc("YoloBox operator") +#ifdef USE_CUDA +.__alias__("yolo_box") +#endif +#ifdef USE_X86_PLACE +.__alias__("yolo_box") +#endif +#ifdef USE_ARM_PLACE +.__alias__("yolo_box") +#endif +.num_in(2) +.num_out(2) +.Args>("anchors", "anchor of yolo_box_param") +.Args("class_num", "get class_num") +.Args("conf_thresh", "conf_thresh map num") +.Args("downsample_ratio", "get downsample_ratio"); + +} /* namespace ops */ + +} /* namespace anakin */ + + diff --git a/framework/operators/yolo_box.h b/framework/operators/yolo_box.h new file mode 100644 index 000000000..dc6c713e1 --- /dev/null +++ b/framework/operators/yolo_box.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_OPERATOR_YOLO_BOX_H +#define ANAKIN_OPERATOR_YOLO_BOX_H + +#include "framework/core/base.h" +#include "framework/core/data_types.h" +#include "framework/core/operator/operator.h" +#include "utils/logger/logger.h" +#include "saber/funcs/yolo_box.h" + +namespace anakin { + +namespace ops { + +template +class YoloBoxHelper; + +/// pooling op +/** + * \brief operation of ops class + * public inheritance Operator + */ +template +class YoloBox : public Operator { +public: + YoloBox() {} + + /// forward impl + virtual void operator() (OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) { + LOG(ERROR) << "Not Impl Yet Operator YoloBox< Ttype(" + << target_name::value << "), Precision(" + << Ptype << ") >"; + } + + friend class YoloBoxHelper; +}; + +/** + * \breif provide defined help for some operation + * public inheritance OperatorHelper + * including init operation context and the size of shape + */ +template +class YoloBoxHelper : public OperatorHelper { +public: + YoloBoxHelper()=default; + + ~YoloBoxHelper(); + + Status InitParam() override; + + /** + * \brief initial all the resource needed by pooling + * \param ctx stand for operation context + * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status Init(OpContext &ctx, + const std::vector >& ins, + std::vector >& outs) override; + + /** + * \brief infer the shape of output and input. 
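yolo_box.cpp above only wires the op into the framework (two inputs: the raw prediction tensor and the image-size tensor; two outputs: decoded boxes and per-class scores). The decoding itself lives in saber::YoloBox, which is not shown in this diff; the sketch below is the standard YOLOv3-style decode that such an op conventionally applies per grid cell and per anchor, included purely for orientation, and the actual kernel may differ in normalization details.

    #include <cmath>

    // Illustrative only: standard YOLOv3-style decoding of one prediction
    // (tx, ty, tw, th) at grid cell (gx, gy) with one anchor.
    inline void decode_yolo_box(float tx, float ty, float tw, float th,
                                int gx, int gy, int grid_w, int grid_h,
                                float anchor_w, float anchor_h,
                                int img_w, int img_h, float* box /* x, y, w, h */) {
        auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
        box[0] = (gx + sigmoid(tx)) / grid_w * img_w;  // box center x in image pixels
        box[1] = (gy + sigmoid(ty)) / grid_h * img_h;  // box center y in image pixels
        box[2] = std::exp(tw) * anchor_w;              // box width
        box[3] = std::exp(th) * anchor_h;              // box height
        // conf_thresh is applied to the objectness/class confidence to suppress
        // low-scoring boxes; downsample_ratio relates the grid size to the
        // network input size (input_w = grid_w * downsample_ratio).
    }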
+ * \param ins stand for input tensor vector + * \param outs stand for output tensor vector + * \return status + */ + Status InferShape(const std::vector >& ins, + std::vector >& outs) override; + +public: + ///< _param_yolo_box stand for yolo_box parameter + saber::YoloBoxParam _param_yolo_box; + ///< _funcs_yolo_box stand for yolo_box function + saber::YoloBox::saber_type> _funcs_yolo_box; +}; + + + +} /* namespace ops */ + +} /* namespace anakin */ + +#endif diff --git a/framework/service/device_info.h b/framework/service/device_info.h index 9be0ef0ab..c11b7f6d3 100644 --- a/framework/service/device_info.h +++ b/framework/service/device_info.h @@ -24,6 +24,9 @@ #include #include #include +#ifdef USE_SGX +#include +#endif #include #ifdef USE_CUDA diff --git a/framework/utils/csv.h b/framework/utils/csv.h new file mode 100644 index 000000000..a3c9b2b7e --- /dev/null +++ b/framework/utils/csv.h @@ -0,0 +1,111 @@ +#ifndef ANAKIN_FRAMEWORK_UTILS_CSV_H +#define ANAKIN_FRAMEWORK_UTILS_CSV_H + +#include +#include + +#ifdef ENABLE_OP_TIMER + +namespace anakin { + +class Csvfile; + +inline static Csvfile& endrow(Csvfile& file); +inline static Csvfile& flush(Csvfile& file); + +class Csvfile { + +public: + Csvfile(std::string const& file, bool app_mode = false, \ + std::string const& sep = ",") + : _fs() + , _is_first(true) + , _sep(sep) + , _esc("\"") + , _special_chars("\"") { + _fs.exceptions(std::ios::failbit | std::ios::badbit); + if (app_mode) { + _fs.open(file, std::ofstream::app); + } else { + _fs.open(file); + } + } + + ~Csvfile() { + flush(); + _fs.close(); + } + + void flush() { + _fs.flush(); + } + + void endrow() { + _fs << std::endl; + _is_first = true; + } + + Csvfile& operator << (Csvfile& (*func)(Csvfile&)) { + return func(*this); + } + + template + Csvfile& operator << (const T& val) { + return write(val); + } + + Csvfile& operator << (const char* val) { + return write(escape(val)); + } + + Csvfile& operator << (const std::string& val) { + return write(escape(val)); + } + +private: + std::ofstream _fs; + bool _is_first; + const std::string _sep; + const std::string _esc; + const std::string _special_chars; + + template + Csvfile& write(const T& val) { + if (!_is_first) { + _fs << _sep; + } else { + _is_first = false; + } + _fs << val; + return *this; + } + + std::string escape(const std::string & val) { + std::ostringstream result; + result << '"'; + std::string::size_type to, from = 0u, len = val.length(); + while (from < len && \ + std::string::npos != (to = val.find_first_of(_special_chars, from))) { + result << val.substr(from, to - from) << _esc << val[to]; + from = to + 1; + } + result << val.substr(from) << '"'; + return result.str(); + } +}; + +inline static Csvfile& endrow(Csvfile& file) { + file.endrow(); + return file; +} + +inline static Csvfile& flush(Csvfile& file) { + file.flush(); + return file; +} + +} + +#endif /* ENABLE_OP_TIMER */ + +#endif /* ANAKIN_FRAMEWORK_UTILS_CSV_H */ diff --git a/framework/utils/layout_common.h b/framework/utils/layout_common.h index a4060d1be..d08c70b88 100644 --- a/framework/utils/layout_common.h +++ b/framework/utils/layout_common.h @@ -78,6 +78,6 @@ int dims_from_layout(const LayoutType layouttype) { } } -#endif - } /* namespace anakin */ + +#endif diff --git a/framework/utils/parameter_fusion.cpp b/framework/utils/parameter_fusion.cpp new file mode 100644 index 000000000..83c3b8e33 --- /dev/null +++ b/framework/utils/parameter_fusion.cpp @@ -0,0 +1,559 @@ +#include "framework/utils/parameter_fusion.h" +namespace anakin { +/** + * 
\brief update fp32 conv weights with batchnorm and scale parameters. + */ +template +void WeightsFusion::update_weights( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term) { + float* weights_p = (float*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; + int chw = c * h * w; + for (int i = 0; i < n; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + + // insert scale parameters + alpha = scale_w[i] * alpha; + if (scale_bias_term) { + beta = beta * scale_w[i] + scale_b[i]; + } else { + beta = beta * scale_w[i]; + } + int start_index = i * chw; + for (int j = 0; j < chw; j++) { + weights_p[start_index + j] *= alpha; + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** + * \brief update fp32 conv weights with affine channel parameters. + */ +template +void WeightsFusion::update_conv_affine_channel_weights( + PBlock weights, PBlock bias, + int n, int c, int h, int w, + std::vector affine_channel_w, + std::vector affine_channel_b) { + float* weights_p = (float*)(weights.h_tensor().mutable_data()); + float* bias_p = (float* )(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + int chw = c * h * w; + for (int i = 0; i < n; i++) { + for (int j = 0; j < chw; j++) { + weights_p[i * chw + j] *= affine_channel_w[i]; + } + bias_p[i] = bias_p[i] * affine_channel_w[i] + affine_channel_b[i]; + } + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** + * \brief update fp32 conv weights with batchnorm. + */ +template +void WeightsFusion::update_weights_without_scale( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance) { + float* weights_p = (float* )(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + + batchnorm_scale = (batchnorm_scale == 0) ? 
1.f : 1.f / batchnorm_scale; + int chw = c * h * w; + for (int i = 0; i < n; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + int start_index = i * chw; + for (int j = 0; j < chw; j++) { + weights_p[start_index + j] *= alpha; + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +template +void WeightsFusion::update_weights_conv_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term){ + float* weights_p = (float*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, scale_w.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + + int chw = c * h * w; + for (int i = 0; i < n; i++) { + float alpha = scale_w[i]; + float beta = 0.f; + if (scale_bias_term) { + beta = scale_b[i]; + } + int start_index = i * chw; + for (int j = 0; j < chw; j++) { + weights_p[start_index + j] *= alpha; + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** + * \brief update fp32 deconv weights with batchnorm and scale parameters. + */ +template +void WeightsFusion::update_deconv_weights( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term) { + float* weights_p = (float*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + + batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; + //swap n and c + int tn = c; + c = n; + n = tn; + + int chw = c * h * w; + int hw = h * w; + for (int i = 0; i < c; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + + // insert scale parameters + alpha = scale_w[i] * alpha; + if (scale_bias_term) { + beta = beta * scale_w[i] + scale_b[i]; + } else { + beta = beta * scale_w[i]; + } + for (int ni = 0; ni < n; ++ni){ + for (int j=0; j < hw; j++) { + weights_p[ni * chw + i * hw + j] *= alpha; + } + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** + * \brief update fp32 deconv weights with batchnorm. 
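The fp32 routines above all reduce to the same per-channel affine fold. The helper below restates the arithmetic of the fp32 update_weights as standalone code (a reading of the patch, not part of it), which makes it easier to verify that conv(x, W') + b' reproduces scale(batchnorm(conv(x, W) + b)).

    #include <cmath>

    // Restatement of the per-channel fold performed by the fp32 update_weights above
    // (not part of the patch): alpha/beta are the factors multiplied into one output
    // channel of the conv weights and bias.
    inline void fold_bn_scale_channel(float variance, float mean, float scale_factor,
                                      float eps, float scale_w, float scale_b,
                                      float& alpha, float& beta) {
        const float inv = (scale_factor == 0.f) ? 1.f : 1.f / scale_factor;
        alpha = 1.f / std::sqrt(variance * inv + eps);  // batchnorm gain
        beta  = -mean * inv * alpha;                    // batchnorm shift
        alpha *= scale_w;                               // fold the scale layer
        beta   = beta * scale_w + scale_b;              // scale_b only if scale_bias_term
        // The patch then applies W'[i,...] = alpha * W[i,...] and b'[i] = alpha * b[i] + beta;
        // for deconv the same fold runs over the channel axis after swapping n and c.
    }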
+ */ +template +void WeightsFusion::update_deconv_weights_without_scale( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance) { + float* weights_p = (float*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + + batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; + //swap n and c + int tn = c; + c = n; + n = tn; + + int chw = c * h * w; + int hw = h * w; + for (int i = 0; i < c; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + for (int ni = 0; ni < n; ++ni){ + for (int j=0; j < hw; j++){ + weights_p[ni * chw + i * hw + j] *= alpha; + } + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** + * \brief update int8 conv weights with batchnorm and scale parameters. + */ +template +void WeightsFusion::update_weights( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term) { + char* weights_p = (char*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + batchnorm_scale = (batchnorm_scale == 0) ? 
1.f : 1.f / batchnorm_scale; + int chw = c * h * w; + for (int i = 0; i < n; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + + // insert scale parameters + alpha = scale_w[i] * alpha; + if (scale_bias_term) { + beta = beta * scale_w[i] + scale_b[i]; + } else { + beta = beta * scale_w[i]; + } + // change weights scale + w_scale[i] *= alpha; + if (w_scale[i] < 0){ + w_scale[i] = fabs(w_scale[i]); + for (int j = 0; j < chw; ++j){ + weights_p[i * chw + j] *= -1; + } + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.h_tensor().set_scale(w_scale); + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +template +void WeightsFusion::update_weights_conv_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term){ + char* weights_p = (char*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, scale_w.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + + int chw = c * h * w; + for (int i = 0; i < n; i++) { + float alpha = scale_w[i]; + float beta = 0.f; + // insert scale parameters + if (scale_bias_term) { + beta = scale_b[i]; + } + int start_index = i * chw; + for (int j = 0; j < chw; j++) { + weights_p[start_index + j] *= alpha; + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.h_tensor().set_scale(w_scale); + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** + * \brief update int8 conv weights with affine channel parameters. + */ +template +void WeightsFusion::update_conv_affine_channel_weights( + PBlock weights, PBlock bias, + int n, int c, int h, int w, + std::vector affine_channel_w, + std::vector affine_channel_b) { + char* weights_p = (char*)(weights.h_tensor().mutable_data()); + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + int chw = c * h * w; + for (int i = 0; i < n; i++) { + // change weights scale + w_scale[i] *= affine_channel_w[i]; + if (w_scale[i] < 0){ + w_scale[i] = fabs(w_scale[i]); + for (int j = 0; j < chw; ++j){ + weights_p[i * chw + j] *= -1; + } + } + bias_p[i] = bias_p[i] * affine_channel_w[i] + affine_channel_b[i]; + } + weights.h_tensor().set_scale(w_scale); + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** + * \brief update int8 conv weights with batchnorm. 
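The int8 specializations above follow the same algebra as the fp32 path, but they cannot multiply alpha into weights that are already quantized to int8. Instead the per-channel dequantization scale w_scale[i] absorbs alpha, and when the folded scale would become negative the sign is pushed back into the stored int8 values so the scale can stay positive. A standalone restatement of that step (a reading of the patch, not part of it) follows.

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Reading of the int8 fold above: alpha is absorbed into the per-channel
    // dequantization scale; a negative folded scale flips the stored weights instead.
    inline void fold_alpha_int8_channel(std::vector<float>& w_scale, int8_t* weights,
                                        int channel, int chw, float alpha) {
        w_scale[channel] *= alpha;
        if (w_scale[channel] < 0.f) {
            w_scale[channel] = std::fabs(w_scale[channel]);
            for (int j = 0; j < chw; ++j) {
                weights[channel * chw + j] =
                    static_cast<int8_t>(-weights[channel * chw + j]);
            }
        }
        // The bias stays in fp32, so it takes alpha and beta directly,
        // exactly as in the fp32 path.
    }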
+ */ +template +void WeightsFusion::update_weights_without_scale( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance) { + char* weights_p = (char*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + std::vector w_scale = weights.h_tensor().get_scale(); + batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; + int chw = c * h * w; + for (int i = 0; i < n; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + + // change weights scale + w_scale[i] *= alpha; + if (w_scale[i] < 0){ + w_scale[i] = fabs(w_scale[i]); + for (int j = 0; j < chw; ++j){ + int start_index = i * chw; + weights_p[start_index + j] *= -1; + } + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.h_tensor().set_scale(w_scale); + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} +/** + * \brief update int8 deconv weights with batchnorm and scale parameters. + */ +template +void WeightsFusion::update_deconv_weights( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term) { + char* weights_p = (char*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + + batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; + std::vector w_scale = weights.h_tensor().get_scale(); + //swap n and c + int tn = c; + c = n; + n = tn; + + int chw = c * h * w; + int hw = h * w; + for (int i = 0; i < c; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + + // insert scale parameters + alpha = scale_w[i] * alpha; + if (scale_bias_term) { + beta = beta * scale_w[i] + scale_b[i]; + } else { + beta = beta * scale_w[i]; + } + // change weights scale + w_scale[i] *= alpha; + if (w_scale[i] < 0){ + w_scale[i] = fabs(w_scale[i]); + for (int ni = 0; ni < n; ++ni){ + for (int j = 0; j < hw; j++) { + weights_p[ni * chw + i * hw + j] *= -1; + } + } + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.h_tensor().set_scale(w_scale); + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} + +/** +* \brief update int8 deconv weights with batchnorm. 
+*/ +template +void WeightsFusion::update_deconv_weights_without_scale( + PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance) { + char* weights_p = (char*)(weights.h_tensor().mutable_data()); + if (!conv_bias_term) { + bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); + void* new_bias_data = bias.h_tensor().mutable_data(); + memset(new_bias_data, 0, sizeof(float) * bias.h_tensor().size()); + } + float* bias_p = (float*)(bias.h_tensor().mutable_data()); + + batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; + std::vector w_scale = weights.h_tensor().get_scale(); + //swap n and c + int tn = c; + c = n; + n = tn; + + int chw = c * h * w; + int hw = h * w; + for (int i = 0; i < c; i++) { + float alpha = 1.f; + float beta = 0.f; + // insert batchnorm parameters + alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; + alpha = 1.f / sqrtf(alpha); + beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); + beta = beta * alpha; + w_scale[i] *= alpha; + if (w_scale[i] < 0){ + w_scale[i] = fabs(w_scale[i]); + for (int ni = 0; ni < n; ++ni){ + for (int j = 0; j < hw; j++) { + weights_p[ni * chw + i * hw + j] *= -1; + } + } + } + bias_p[i] *= alpha; + bias_p[i] += beta; + } + weights.h_tensor().set_scale(w_scale); + weights.d_tensor().copy_from(weights.h_tensor()); + weights.d_tensor().set_scale(w_scale); + bias.d_tensor().copy_from(bias.h_tensor()); +} +#if defined USE_CUDA +template class WeightsFusion; +template class WeightsFusion; +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +template class WeightsFusion; +template class WeightsFusion; +#endif +#if defined USE_ARM_PLACE +template class WeightsFusion; +template class WeightsFusion; +#endif + +} diff --git a/framework/utils/parameter_fusion.h b/framework/utils/parameter_fusion.h index 0b8d2dc5c..d17281150 100644 --- a/framework/utils/parameter_fusion.h +++ b/framework/utils/parameter_fusion.h @@ -22,90 +22,213 @@ namespace anakin { -/** - * \brief update conv weights with batchnorm and scale parameters. - */ template -void update_weights(PBlock weights, PBlock bias, - int n, int c, int h, int w, bool conv_bias_term, - float batchnorm_scale, float batchnorm_eps, - std::vector batchnorm_mean, - std::vector batchnorm_variance, - std::vector scale_w, - std::vector scale_b, - bool scale_bias_term) { - D* weights_p = (D* )(weights.h_tensor().mutable_data()); - if(!conv_bias_term) { - bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); - void* new_bias_data = bias.h_tensor().mutable_data(); - memset(new_bias_data, 0, sizeof(D) * bias.h_tensor().size()); - } - D* bias_p = (D* )(bias.h_tensor().mutable_data()); - - batchnorm_scale = (batchnorm_scale == 0) ? 
1.f : 1.f / batchnorm_scale; - int chw = c * h * w; - for (int i = 0; i < n; i++) { - D alpha = 1.f; - D beta = 0.f; - // insert batchnorm parameters - alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; - alpha = 1.f / sqrtf(alpha); - beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); - beta = beta * alpha; - - // insert scale parameters - alpha = scale_w[i] * alpha; - if(scale_bias_term) { - beta = beta * scale_w[i] + scale_b[i]; - } else { - beta = beta * scale_w[i]; - } - for(int j=0; j < chw; j++) { - weights_p[i * chw + j] *= alpha; - } - bias_p[i] *= alpha; - bias_p[i] += beta; - } - weights.d_tensor().copy_from(weights.h_tensor()); - bias.d_tensor().copy_from(bias.h_tensor()); -} - -/** - * \brief update conv weights with batchnorm. - */ -template -void update_weights_without_scale(PBlock weights, PBlock bias, - int n, int c, int h, int w, bool conv_bias_term, - float batchnorm_scale, float batchnorm_eps, - std::vector batchnorm_mean, - std::vector batchnorm_variance) { - D* weights_p = (D* )(weights.h_tensor().mutable_data()); - if(!conv_bias_term) { - bias.re_alloc(Shape4d({1, batchnorm_mean.size(), 1, 1})); - void* new_bias_data = bias.h_tensor().mutable_data(); - memset(new_bias_data, 0, sizeof(D) * bias.h_tensor().size()); - } - D* bias_p = (D* )(bias.h_tensor().mutable_data()); - - batchnorm_scale = (batchnorm_scale == 0) ? 1.f : 1.f / batchnorm_scale; - int chw = c * h * w; - for (int i = 0; i < n; i++) { - D alpha = 1.f; - D beta = 0.f; - // insert batchnorm parameters - alpha = batchnorm_variance[i] * batchnorm_scale + batchnorm_eps; - alpha = 1.f / sqrtf(alpha); - beta = -1.f * (batchnorm_mean[i] * batchnorm_scale); - beta = beta * alpha; - for(int j=0; j < chw; j++) { - weights_p[i * chw + j] *= alpha; - } - bias_p[i] *= alpha; - bias_p[i] += beta; - } - weights.d_tensor().copy_from(weights.h_tensor()); - bias.d_tensor().copy_from(bias.h_tensor()); -} +class WeightsFusion{ +public: + WeightsFusion(){}; + /** + * \brief update conv weights with batchnorm and scale parameters. + */ + static void update_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term){ + LOG(ERROR) << "unsupport weights dtype"; + } + + /** + * \brief update conv weights with affine channel parameters. + */ + static void update_conv_affine_channel_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, + std::vector affine_channel_w, + std::vector affine_channel_b){ + LOG(ERROR) << "unsupport weights dtype"; + }; + + /** + * \brief update conv weights with batchnorm. + */ + static void update_weights_without_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance){ + LOG(ERROR) << "unsupport weights dtype"; + } + + /** + * \brief update conv weights with scale. + */ + static void update_weights_conv_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term){ + LOG(ERROR) << "unsupport weights dtype"; + } + + + /** + * \brief update conv weights with batchnorm and scale parameters. 
+ */ + static void update_deconv_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term){ + LOG(ERROR) << "unsupport weights dtype"; + } + + /** + * \brief update conv weights with batchnorm. + */ + static void update_deconv_weights_without_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance){ + LOG(ERROR) << "unsupport weights dtype"; + }; +}; + +template +class WeightsFusion{ +public: + WeightsFusion(){}; + /** + * \brief update conv weights with batchnorm and scale parameters. + */ + static void update_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term); + + /** + * \brief update conv weights with affine channel parameters. + */ + static void update_conv_affine_channel_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, + std::vector affine_channel_w, + std::vector affine_channel_b); + + /** + * \brief update conv weights with batchnorm. + */ + static void update_weights_without_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance); + + /** + * \brief update conv weights with scale. + */ + static void update_weights_conv_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term); + + /** + * \brief update conv weights with batchnorm and scale parameters. + */ + static void update_deconv_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term); + + /** + * \brief update conv weights with batchnorm. + */ + static void update_deconv_weights_without_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance); +}; + +template +class WeightsFusion{ +public: + WeightsFusion(){}; + /** + * \brief update conv weights with batchnorm and scale parameters. + */ + static void update_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term); + + /** + * \brief update conv weights with affine channel parameters. + */ + static void update_conv_affine_channel_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, + std::vector affine_channel_w, + std::vector affine_channel_b); + + /** + * \brief update conv weights with batchnorm. 
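The header rewrite above replaces the old free-function templates with a WeightsFusion class templated on the weight data type (and target): the primary template merely logs "unsupport weights dtype", while the two specializations declared here, for float and int8/char weights, carry the real implementations defined in parameter_fusion.cpp. A best-effort usage sketch with the stripped template arguments restored; the <float, NV> spelling is an assumption based on the explicit instantiations in parameter_fusion.cpp.

    // Hypothetical call site in a conv + batchnorm + scale fusion pass; the exact
    // template arguments are assumptions, since they are elided in the listing above.
    WeightsFusion<float, NV>::update_weights(weights, bias,
                                             n, c, h, w, conv_bias_term,
                                             batchnorm_scale, batchnorm_eps,
                                             batchnorm_mean, batchnorm_variance,
                                             scale_w, scale_b, scale_bias_term);
    // Passing int8 weights instead selects the char specialization, which folds
    // into the per-channel scales as shown in parameter_fusion.cpp.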
+ */ + static void update_weights_without_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance); + + /** + * \brief update conv weights with scale. + */ + static void update_weights_conv_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term); + + + /** + * \brief update conv weights with batchnorm and scale parameters. + */ + static void update_deconv_weights(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance, + std::vector scale_w, + std::vector scale_b, + bool scale_bias_term); + + /** + * \brief update conv weights with batchnorm. + */ + static void update_deconv_weights_without_scale(PBlock weights, PBlock bias, + int n, int c, int h, int w, bool conv_bias_term, + float batchnorm_scale, float batchnorm_eps, + std::vector batchnorm_mean, + std::vector batchnorm_variance); +}; + + } /* namespace anakin */ #endif diff --git a/saber/.DS_Store b/saber/.DS_Store new file mode 100644 index 000000000..4ef147706 Binary files /dev/null and b/saber/.DS_Store differ diff --git a/saber/CMakeLists.txt b/saber/CMakeLists.txt index 86d4b0836..f18995af8 100644 --- a/saber/CMakeLists.txt +++ b/saber/CMakeLists.txt @@ -17,6 +17,8 @@ set(ANAKIN_SABER_STATIC_RELAY "" ) set(ANAKIN_SABER_BASE_SRC "") anakin_fetch_include_recursively(${ANAKIN_SABER}) anakin_fetch_include_recursively(${ANAKIN_UTILS}) +anakin_fetch_include_recursively(${ANAKIN_THIRD_PARTY_PATH}/hash) + # add ak_base_source files anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core "cpp" ANAKIN_SABER_BASE_SRC) @@ -26,7 +28,8 @@ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs "cpp" ANAKIN_SABER_BASE_SRC if(USE_ARM_PLACE) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/arm "cpp" ANAKIN_SABER_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/arm "cpp" ANAKIN_SABER_BASE_SRC) - anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/arm/impl "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/arm/neon "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/arm/neon/impl "cpp" ANAKIN_SABER_BASE_SRC) endif() if(USE_BM_PLACE) @@ -36,7 +39,7 @@ if(USE_BM_PLACE) endif() if(USE_GPU_PLACE) - if(USE_CUDA) + if(USE_CUDA) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/cuda "cpp" ANAKIN_SABER_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/cuda "cpp" ANAKIN_SABER_BASE_SRC) else() @@ -53,12 +56,15 @@ endif() if(USE_X86_PLACE) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/x86 "cpp" ANAKIN_SABER_BASE_SRC) anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/x86/kernel "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_THIRD_PARTY_PATH}/hash/src/bloomfilter "c" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_THIRD_PARTY_PATH}/hash/src/xxHash "c" ANAKIN_SABER_BASE_SRC) + endif() # compile cpp objs # add_library(ANAKIN_SABER_BASE_OBJS OBJECT ${ANAKIN_SABER_BASE_SRC}) -set(ANAKIN_SABER_TEMP_COMMMON_LIB "anakin_saber_common") +set(ANAKIN_SABER_TEMP_COMMON_LIB "anakin_saber_common") if(USE_CUDA) # set root @@ -66,7 +72,7 @@ if(USE_CUDA) # set select arch for cuda 
add_subdirectory(${ANAKIN_SABER}/funcs/impl/cuda/base) - set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) + set(FLAGS_BACKUP ${CMAKE_CXX_FLAGS}) set(CMAKE_CXX_FLAGS "") if(BUILD_SHARED) CUDA_COMPILE(ANAKIN_SABER_CUDA_C_SRC_OBJS SHARED ${ANAKIN_SABER_CUDA_C_SRC} OPTIONS ${ANAKIN_NVCC_FLAG}) @@ -76,39 +82,58 @@ if(USE_CUDA) endif() set(CMAKE_CXX_FLAGS ${FLAGS_BACKUP}) - set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} - ${BEGIN_WHOLE_ARCHIVE} - ${ANAKIN_SABER_SASS_STATIC_LIB} - ${WHOLE_ARCHIVE_END}) + set(ANAKIN_SABER_STATIC_RELAY ${ANAKIN_SABER_STATIC_RELAY} + ${BEGIN_WHOLE_ARCHIVE} + ${ANAKIN_SABER_SASS_STATIC_LIB} + ${WHOLE_ARCHIVE_END}) +endif() + + +if(USE_MLU) + if (USE_BANG) + add_subdirectory(${ANAKIN_SABER}/funcs/impl/mlu/base) + endif() + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/mlu "cpp" ANAKIN_SABER_BASE_SRC) + anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/mlu "cpp" ANAKIN_SABER_BASE_SRC) endif() # add saber library to static if(UNIX OR APPLE) - if (USE_ARM_PLACE) - ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY - ${ANAKIN_ROOT}/${AK_OUTPUT_PATH}/) + if (USE_ARM_PLACE) + add_library(${ANAKIN_SABER_TEMP_COMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) + set_target_properties(${ANAKIN_SABER_TEMP_COMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${ANAKIN_ROOT}/${AK_OUTPUT_PATH}/) + else() + if (BUILD_SHARED) + add_library(${ANAKIN_SABER_TEMP_COMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) + #$) + if(USE_X86_PLACE OR USE_CUDA) + list(LENGTH ANAKIN_SABER_DEPENDENCIES dependencies_len) + if(dependencies_len GREATER 0) + add_dependencies(${ANAKIN_SABER_TEMP_COMMON_LIB} ${ANAKIN_SABER_DEPENDENCIES}) + endif() + endif() + set_target_properties(${ANAKIN_SABER_TEMP_COMMON_LIB} PROPERTIES VERSION ${VERSION}) + target_link_libraries(${ANAKIN_SABER_TEMP_COMMON_LIB} ${ANAKIN_LINKER_LIBS}) + target_link_libraries(${ANAKIN_SABER_TEMP_COMMON_LIB} ${ANAKIN_SABER_STATIC_RELAY}) + set_target_properties(${ANAKIN_SABER_TEMP_COMMON_LIB} PROPERTIES LINK_FLAGS "") + set_target_properties(${ANAKIN_SABER_TEMP_COMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${ANAKIN_ROOT}/${AK_OUTPUT_PATH}/) else() - if (BUILD_SHARED) - ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} SHARED ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) - #$) - if(USE_X86_PLACE OR USE_CUDA) - list(LENGTH ANAKIN_SABER_DEPENDENCIES dependencies_len) - if(dependencies_len GREATER 0) - add_dependencies(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_DEPENDENCIES}) - endif() - endif() - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES VERSION ${VERSION}) - target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_LINKER_LIBS}) - target_link_libraries(${ANAKIN_SABER_TEMP_COMMMON_LIB} ${ANAKIN_SABER_STATIC_RELAY}) - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LINK_FLAGS "") - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY - ${ANAKIN_ROOT}/${AK_OUTPUT_PATH}/) - else() - ADD_LIBRARY(${ANAKIN_SABER_TEMP_COMMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} ${ANAKIN_SABER_BASE_SRC}) - set_target_properties(${ANAKIN_SABER_TEMP_COMMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY - ${ANAKIN_ROOT}/${AK_OUTPUT_PATH}/) - endif () + add_library(${ANAKIN_SABER_TEMP_COMMON_LIB} STATIC ${ANAKIN_SABER_CUDA_C_SRC_OBJS} 
${ANAKIN_SABER_BASE_SRC}) + add_dependencies(${ANAKIN_SABER_TEMP_COMMON_LIB} xbyak) + if(USE_SGX) + target_link_libraries(${ANAKIN_SABER_TEMP_COMMON_LIB} ${SGX_CONFIG_INTERFACE}) + endif() + set_target_properties(${ANAKIN_SABER_TEMP_COMMON_LIB} PROPERTIES LINK_FLAGS "") + set_target_properties(${ANAKIN_SABER_TEMP_COMMON_LIB} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${ANAKIN_ROOT}/${AK_OUTPUT_PATH}/) endif() + endif() +endif() + +if (USE_BANG) + target_link_libraries(${ANAKIN_SABER_TEMP_COMMON_LIB} ${CMAKE_CURRENT_SOURCE_DIR}/funcs/impl/mlu/base/bang_kernel.o) endif() -set(ANAKIN_SABER_LIB_TARGET ${ANAKIN_SABER_TEMP_COMMMON_LIB} PARENT_SCOPE) + +set(ANAKIN_SABER_LIB_TARGET ${ANAKIN_SABER_TEMP_COMMON_LIB} PARENT_SCOPE) diff --git a/saber/core/buffer.h b/saber/core/buffer.h index f76174329..7dd54951c 100644 --- a/saber/core/buffer.h +++ b/saber/core/buffer.h @@ -193,8 +193,7 @@ class Buffer { if (_capacity < vec_cap) { alloc(vec_cap); } - API::sync_memcpy(_data, 0, _id, &data[0], \ - 0, 0, vec_cap, flag_type()); + API::sync_memcpy(_data, 0, _id, data.data(), 0, 0, vec_cap, flag_type()); return SaberSuccess; } @@ -202,14 +201,14 @@ class Buffer { /** * \brief return const data pointer */ - const TPtr get_data(){ + const TPtr get_data()const { return _data; } /** * \brief return mutable data pointer */ - TPtr get_data_mutable(){ + TPtr get_data_mutable()const{ return _data; } @@ -299,7 +298,7 @@ static inline int BufferMemShare(std::shared_ptr>& dst, \ typedef typename IF::value, then_type, else_type>::Type flag_type; CHECK_EQ(src == nullptr, false) << "input buffer is null!"; if (!dst){ - dst = std::make_shared>(src->get_count()); + dst = std::make_shared>(); } return MemShare(dst, src, flag_type()); } diff --git a/saber/core/common.h b/saber/core/common.h index e10d4ce07..a755f51d8 100644 --- a/saber/core/common.h +++ b/saber/core/common.h @@ -5,18 +5,19 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
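Two small correctness changes in the saber/core/buffer.h hunk above are easy to miss: the vector copy now passes data.data() instead of &data[0], and the read-only accessors are const-qualified. The first matters because &data[0] on an empty std::vector is undefined behaviour, while data() is always valid; a minimal illustration, not taken from the patch, is sketched below.

    #include <vector>

    // Minimal illustration of the data() vs &data[0] distinction.
    template <typename T>
    const T* safe_ptr(const std::vector<T>& v) {
        return v.data();   // nullptr or a valid pointer even when v is empty; never UB
    }
    // By contrast, &v[0] on an empty vector indexes a non-existent element,
    // which is undefined behaviour even if the pointer is never dereferenced.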
*/ #ifndef ANAKIN_SABER_CORE_COMMON_H #define ANAKIN_SABER_CORE_COMMON_H #include +#include #include #include #include @@ -170,16 +171,26 @@ const char* cudnn_get_errorstring(cudnnStatus_t status); #endif -#ifdef USE_ARM_PLACE + #ifdef USE_OPENMP #include -#include #endif //openmp -#endif //ARM -#endif //ANAKIN_SABER_CORE_COMMON_H +#ifdef USE_ARM_PLACE +#include +#include +namespace std{ + template + std::string to_string(T value) + { + std::ostringstream os ; + os << value ; + return os.str() ; + } +} +#endif //ARM -#ifdef USE_BM_PLACE +#ifdef USE_BM_PLACE #include "bmlib_runtime.h" #include "bmdnn_api.h" @@ -192,4 +203,7 @@ const char* cudnn_get_errorstring(cudnnStatus_t status); CHECK_EQ(error, BM_SUCCESS) << " Failed with error code:" << error; \ } while (0) -#endif // USE_BM_PLACE +#endif // USE_BM_PLACE + +#endif //ANAKIN_SABER_CORE_COMMON_H + diff --git a/saber/core/context.h b/saber/core/context.h index e8646b4e6..123396e26 100644 --- a/saber/core/context.h +++ b/saber/core/context.h @@ -16,8 +16,11 @@ #ifndef ANAKIN_SABER_CORE_CONTEXT_H #define ANAKIN_SABER_CORE_CONTEXT_H -#include "core/env.h" +#include "saber/core/env.h" #include "saber/saber_types.h" +#ifdef USE_ARM_PLACE +#include "saber/core/tensor.h" +#endif namespace anakin{ @@ -35,7 +38,7 @@ class Context final{ * @param compute_stream_id */ Context(int device_id = 0, int data_stream_id = 0, int compute_stream_id = 0){ -#ifdef USE_BM +#ifdef USE_BM if(std::is_same::value){ LOG(INFO) << "context init for BM"; int dev_count = 0; @@ -70,6 +73,24 @@ class Context final{ } _stream_compute = devs[_device_id]._compute_stream[compute_stream_id]; _compute_stream_id = compute_stream_id; +#ifdef USE_ARM_PLACE + //! 1 thread, big core + if (devs[_device_id]._info._big_core_ids.size() > 0){ + _act_ids = {devs[_device_id]._info._big_core_ids[0]}; + } else { + _act_ids = {0}; + } + _mode = SABER_POWER_HIGH; + int temp_mem_size = devs[_device_id]._info._L2_cache[_act_ids[0]] / sizeof(float); + _work_space.reshape(Shape({1, 1, 1, temp_mem_size})); +#ifdef TARGET_IOS + _arch = APPLE; //use 6x8 +#else + if (devs[_device_id]._info._big_core_ids.size() > 0) { + _arch = devs[_device_id]._info._archs[_act_ids[0]]; + } +#endif +#endif } Context(const Context& ctx){ @@ -88,8 +109,10 @@ class Context final{ #ifdef USE_ARM_PLACE _act_ids = ctx._act_ids; _mode = ctx._mode; + _work_space.copy_from(ctx._work_space); + _arch = ctx._arch; + _count = ctx._count; #endif - } Context& operator=(const Context& ctx){ @@ -101,6 +124,9 @@ class Context final{ #ifdef USE_ARM_PLACE this->_act_ids = ctx._act_ids; this->_mode = ctx._mode; + this->_work_space.copy_from(ctx._work_space); + this->_arch = ctx._arch; + this->_count = ctx._count; #endif #ifdef USE_BM this->_bm_handle = ctx._bm_handle; @@ -113,6 +139,12 @@ class Context final{ comp_eq = comp_eq && (_device_id == right._device_id); comp_eq = comp_eq && (_data_stream_id == right._data_stream_id); comp_eq = comp_eq && (_compute_stream_id == right._compute_stream_id); +#ifdef USE_ARM_PLACE + comp_eq = comp_eq && (_act_ids == right._act_ids); + comp_eq = comp_eq && (_mode == right._mode); + comp_eq = comp_eq && (_arch == right._arch); + comp_eq = comp_eq && (_count == right._count); +#endif #ifdef USE_BM comp_eq = comp_eq && (_bm_handle == right._bm_handle); #endif @@ -143,18 +175,6 @@ class Context final{ return _stream_compute; } - -#ifdef USE_ARM_PLACE - //void set_act_cores(std::vector ids); - //void set_power_mode(PowerMode mode); - void set_run_mode(PowerMode mode, int threads); - //void set_cache(size_t 
l1size, size_t l2size, size_t l3size); - void bind_dev(); - PowerMode get_mode(int& threads); - //PowerMode get_mode(); - //std::vector get_act_ids(); -#endif - #ifdef USE_BM bm_handle_t get_handle() { return _bm_handle; @@ -168,8 +188,23 @@ class Context final{ return "null"; } } - - +#ifdef USE_ARM_PLACE + //! SABER_POWER_HIGH stands for using big cores, + //! SABER_POWER_LOW stands for using small core, + //! SABER_POWER_FULL stands for using all cores + void set_run_mode(PowerMode mode, int threads); + void set_cache(int l1size, int l2size, int l3size); + int get_l1_cache_size() const; + int get_l2_cache_size() const; + int get_l3_cache_size() const; + void* get_work_space(); + int get_threads() const; + ARMArch get_arch() const; + PowerMode get_mode() const; + void set_arch(ARMArch arch); + void bind_dev(); + SaberStatus workspace_extend(Shape sh); +#endif private: //! current stream to process typename API::stream_t _stream_data; @@ -179,8 +214,11 @@ class Context final{ int _data_stream_id; int _compute_stream_id; #ifdef USE_ARM_PLACE + ARMArch _arch; PowerMode _mode{SABER_POWER_HIGH}; std::vector _act_ids{0}; + Tensor _work_space; + long long _count{0}; #endif #ifdef USE_BM bm_handle_t _bm_handle; diff --git a/saber/core/data_traits.h b/saber/core/data_traits.h index 7f44a1d33..373eebce7 100644 --- a/saber/core/data_traits.h +++ b/saber/core/data_traits.h @@ -16,7 +16,7 @@ #ifndef ANAKIN_SABER_CORE_DATA_TRAITS_H #define ANAKIN_SABER_CORE_DATA_TRAITS_H -#include "saber_types.h" +#include "saber/saber_types.h" #ifdef USE_BM_PLACE #include "bmlib_runtime.h" @@ -66,6 +66,8 @@ static size_t type_length(DataType type) { return 4; case AK_INT64: return 8; + case AK_UINT64: + return 8; case AK_HALF: return 2; case AK_FLOAT: @@ -143,6 +145,12 @@ struct DataTrait { typedef unsigned int* PtrDtype; }; +template +struct DataTrait { + typedef unsigned int Dtype; + typedef unsigned int* PtrDtype; +}; + #ifdef USE_BM_PLACE struct BM_mem_addr: bm_mem_desc { diff --git a/saber/core/device.h b/saber/core/device.h index ced61d6a8..37c703195 100644 --- a/saber/core/device.h +++ b/saber/core/device.h @@ -15,7 +15,8 @@ #ifndef ANAKIN_SABER_CORE_DEVICE_H #define ANAKIN_SABER_CORE_DEVICE_H -#include "core/target_wrapper.h" +#include "saber/core/target_wrapper.h" +#include namespace anakin { @@ -39,6 +40,29 @@ struct DeviceInfo { std::vector _cluster_ids; }; +#ifdef USE_ARM_PLACE +template <> +struct DeviceInfo { + int _idx; + std::string _device_name; + int _max_frequence; + int _min_frequence; + std::string _compute_ability; + int _generate_arch; + int _compute_core_num; + int _max_memory; + int _sharemem_size; + std::vector _L1_cache; + std::vector _L2_cache; + std::vector _L3_cache; + std::vector _core_ids; + std::vector _big_core_ids; + std::vector _little_core_ids; + std::vector _cluster_ids; + std::vector _archs; +}; +#endif + template struct Device { diff --git a/saber/core/env.h b/saber/core/env.h index edd72a3a4..ab89c4f84 100644 --- a/saber/core/env.h +++ b/saber/core/env.h @@ -16,7 +16,7 @@ #ifndef ANAKIN_SABER_CORE_ENV_H #define ANAKIN_SABER_CORE_ENV_H -#include "core/device.h" +#include "saber/core/device.h" namespace anakin{ @@ -56,7 +56,7 @@ class Env { Env(){} }; -#ifdef AMD_GPU +#ifdef AMD_GPU typedef std::list cl_event_list; template <> @@ -70,7 +70,7 @@ class Env { } static void env_init(int max_stream = 4); - static bool is_init(); + static bool is_init(); static cl_platform_id get_platform_id(); static void add_event(const char *tag, cl_event_list event); diff --git 
a/saber/core/events.h b/saber/core/events.h index e83f3a767..6796d9392 100644 --- a/saber/core/events.h +++ b/saber/core/events.h @@ -16,7 +16,7 @@ #ifndef ANAKIN_SABER_CORE_EVENTS_H #define ANAKIN_SABER_CORE_EVENTS_H -#include "core/target_wrapper.h" +#include "saber/core/target_wrapper.h" namespace anakin{ diff --git a/saber/core/impl/arm/arm_device.cpp b/saber/core/impl/arm/arm_device.cpp index f8b3ea9bf..366372292 100644 --- a/saber/core/impl/arm/arm_device.cpp +++ b/saber/core/impl/arm/arm_device.cpp @@ -6,14 +6,7 @@ #ifdef PLATFORM_ANDROID #include #include -#define __NCPUBITS__ (8 * sizeof (unsigned long)) - -#define __CPU_SET(cpu, cpusetp) \ - ((cpusetp)->mask_bits[(cpu) / __NCPUBITS__] |= (1UL << ((cpu) % __NCPUBITS__))) - -#define __CPU_ZERO(cpusetp) \ - memset((cpusetp), 0, sizeof(cpu_set_t)) - +#include "cpu_info.h" #endif //PLATFORM_ANDROID #if __APPLE__ @@ -31,32 +24,24 @@ namespace saber{ int arm_get_cpucount() { #ifdef PLATFORM_ANDROID - // get cpu count from /proc/cpuinfo - FILE* fp = fopen("/proc/cpuinfo", "rb"); - if (!fp) { - return 1; - } + // get cpu count from /sys/devices/system/cpu/cpunum/uevent + int max_cpu_count = 20; int count = 0; - char line[1024]; - while (!feof(fp)) { - char* s = fgets(line, 1024, fp); - if (!s) { + for (int i = 0; i < max_cpu_count; ++i) { + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); + FILE* fp = fopen(path, "rb"); + if (!fp) { break; } - - if (memcmp(line, "processor", 9) == 0) { - count++; - } + count++; + fclose(fp); } - - fclose(fp); - if (count < 1) { count = 1; } return count; - -#elif TARGET_IOS +#elif defined(TARGET_IOS) int count = 0; size_t len = sizeof(count); sysctlbyname("hw.ncpu", &count, &len, NULL, 0); @@ -69,6 +54,92 @@ int arm_get_cpucount() { #endif } +void arm_get_cpu_arch(std::vector& archs){ +#ifdef PLATFORM_ANDROID + archs.clear(); + //! 
get CPU ARCH + FILE* fp = fopen("/proc/cpuinfo", "rb"); + if (!fp) { + return; + } + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + if (strstr(line, "part") != NULL) { + int arch_id = 0; + sscanf(s, "CPU part\t: %x", &arch_id); + switch (arch_id) { + case 0xd03: + archs.push_back(A53); + break; + case 0xd05: + archs.push_back(A55); + break; + case 0xd07: + archs.push_back(A57); + break; + case 0xd08: + archs.push_back(A72); + break; + case 0xd09: + archs.push_back(A73); + break; + case 0xd0a: + archs.push_back(A75); + break; + case 0x800: + // 835 + archs.push_back(A73); + break; + case 0x205: + // 820 + archs.push_back(A72); + break; + default: + LOG(ERROR) << "unknow type"; + archs.push_back(ARM_UNKOWN); + } + } + } + fclose(fp); + int cpu_count = arm_get_cpucount(); + if (archs.size() < cpu_count) { + for (int i = archs.size(); i < cpu_count; ++i) { + archs.push_back(archs[i - 1]); + } + } +#endif +#ifdef TARGET_IOS + int cpu_count = arm_get_cpucount(); + for(int i = 0; i < cpu_count; ++i){ + archs.push_back(APPLE); + } +#endif +} + +void set_default_cache(DeviceInfo& dev){ + int cpu_count = arm_get_cpucount(); + dev._L1_cache.resize(cpu_count); + dev._L2_cache.resize(cpu_count); + dev._L3_cache.resize(cpu_count); +#ifdef TARGET_IOS + for (int i = 0; i < cpu_count; ++i){ + dev._L1_cache[i] = 64 * 1024; + dev._L2_cache[i] = 2048 * 1024; + dev._L3_cache[i] = 0; + } +#else + for (int i = 0; i < cpu_count; ++i){ + dev._L1_cache[i] = 32 * 1024; + dev._L2_cache[i] = 512 * 1024; + dev._L3_cache[i] = 0; + } +#endif +} + size_t arm_get_meminfo() { #ifdef PLATFORM_ANDROID // get cpu count from /proc/cpuinfo @@ -79,8 +150,7 @@ size_t arm_get_meminfo() { size_t memsize = 0; char line[1024]; - while (!feof(fp)) - { + while (!feof(fp)) { char* s = fgets(line, 1024, fp); if (!s) { break; @@ -91,16 +161,36 @@ size_t arm_get_meminfo() { fclose(fp); return memsize; -#elif TARGET_IOS +#elif defined(TARGET_IOS) // to be implemented - LOG(ERROR) << "not implemented"; + printf("not implemented\n"); return 0; #endif } #ifdef PLATFORM_ANDROID -static int get_max_freq_khz(int cpuid) -{ +std::string arm_get_cpu_name(){ + FILE* fp = fopen("/proc/cpuinfo", "rb"); + if (!fp) { + return ""; + } + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + if (strstr(line, "Hardware") != NULL){ + fclose(fp); + return std::string(line); + } + } + fclose(fp); + return ""; +} + + +int get_max_freq_khz(int cpuid) { // first try, for all possible cpu char path[256]; snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state",\ @@ -108,15 +198,13 @@ static int get_max_freq_khz(int cpuid) FILE* fp = fopen(path, "rb"); - if (!fp) - { + if (!fp) { // second try, for online cpu snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",\ cpuid); fp = fopen(path, "rb"); - if (!fp) - { + if (!fp) { // third try, for online cpu snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",\ cpuid); @@ -136,8 +224,7 @@ static int get_max_freq_khz(int cpuid) } int max_freq_khz = 0; - while (!feof(fp)) - { + while (!feof(fp)) { int freq_khz = 0; int nscan = fscanf(fp, "%d %*d", &freq_khz); if (nscan != 1) { @@ -156,29 +243,39 @@ static int get_max_freq_khz(int cpuid) int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector& cpuids, \ std::vector& cpu_freq, std::vector& cluster_ids) { - //const int cpu_count = cpuids.size(); if (cpu_count == 0) { return 
0; } - //std::vector cpu_max_freq_khz; cpuids.resize(cpu_count); - cpu_freq.resize(cpu_count); cluster_ids.resize(cpu_count); - for (int i = 0; i < cpu_count; i++) - { - int max_freq_khz = get_max_freq_khz(i); - //printf("%d max freq = %d khz\n", i, max_freq_khz); + for (int i = 0; i < cpu_count; i++) { cpuids[i] = i; - cpu_freq[i] = max_freq_khz / 1000; } + // sort cpuid as big core first + //simple bubble sort + + for (int i = 0; i < cpu_count; i++) + { + for (int j = i+1; j < cpu_count; j++) + { + if (cpu_freq[i] < cpu_freq[j]) + { + // swap + int tmp = cpuids[i]; + cpuids[i] = cpuids[j]; + cpuids[j] = tmp; + } + } + } // SMP - int mid_max_freq_khz = (cpu_freq.front() + cpu_freq.back()) / 2; + int mid_max_freq_khz = (cpu_freq[cpuids[0]] + cpu_freq[cpuids[cpu_count - 1]]) / 2; for (int i = 0; i < cpu_count; i++) { + cpuids[i] = i; if (cpu_freq[i] >= mid_max_freq_khz) { cluster_ids[i] = 0; } @@ -190,71 +287,64 @@ int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector& cpuids, \ return 0; } +int check_online(std::vector& core_ids){ + + if (core_ids.size() == 0){ + return 0; + } + char path[256]; + int online = 1; + for (int i = 0; i < core_ids.size(); ++i){ + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",\ + core_ids[i]); + FILE* fp = fopen(path, "rb"); + if (!fp){ + return 0; + } + int cur_online = 0; + fscanf(fp, "%d", &cur_online); + online &= cur_online; + fclose(fp); + } + return online; +} + int set_sched_affinity(const std::vector& cpuids) { // cpu_set_t definition // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity - +#define CPU_SETSIZE 1024 +#define __NCPUBITS (8 * sizeof (unsigned long)) typedef struct { - unsigned long mask_bits[1024 / __NCPUBITS__]; - }cpu_set_t; + unsigned long __bits[CPU_SETSIZE / __NCPUBITS]; + } cpu_set_t; + +#define CPU_SET(cpu, cpusetp) \ + ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) + +#define CPU_ZERO(cpusetp) \ + memset((cpusetp), 0, sizeof(cpu_set_t)) // set affinity for thread +#ifdef __GLIBC__ + pid_t pid = syscall(SYS_gettid); +#else pid_t pid = gettid(); - +#endif cpu_set_t mask; - __CPU_ZERO(&mask); - for (int i = 0; i < (int)cpuids.size(); i++) - { - __CPU_SET(cpuids[i], &mask); + CPU_ZERO(&mask); + for (int i = 0; i < cpuids.size(); i++) { + CPU_SET(cpuids[i], &mask); } int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); - if (syscallret) - { - LOG(ERROR) << "syscall error " << syscallret; + if (syscallret) { + LOG(ERROR) << "syscall error" << syscallret; return -1; } return 0; } - -int set_cpu_affinity(const std::vector& cpuids) { -#ifdef USE_OPENMP - int num_threads = cpuids.size(); - omp_set_num_threads(num_threads); - std::vector ssarets(num_threads, 0); -#pragma omp parallel for - for (int i = 0; i < num_threads; i++) { - ssarets[i] = set_sched_affinity(cpuids); - } - for (int i = 0; i < num_threads; i++) { - if (ssarets[i] != 0) { - LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[i]; - return -1; - } - } -#else - std::vector cpuid1; - cpuid1.push_back(cpuids[0]); - int ssaret = set_sched_affinity(cpuid1); - if (ssaret != 0) { - LOG(ERROR)<<"set cpu affinity failed, cpuID: " << cpuids[0]; - return -1; - } -#endif - return 0; -} -#endif //PLATFORM_ANDROID - -#ifdef TARGET_IOS -int set_cpu_affinity(const std::vector& cpuids) { -#ifdef USE_OPENMP - int num_threads = cpuids.size(); - omp_set_num_threads(num_threads); -#endif - return 0; -} -#endif +#endif //android template <> void Device::create_stream() { @@ -268,141 +358,334 
@@ void Device::create_stream() { template <> void Device::get_info() { - - //! set to const value, need to fetch from device - _info._L1_cache = 31000; - _info._L2_cache = 2000000; - _info._L3_cache = 0; - - _info._idx = 0; + set_default_cache(_info); _info._compute_core_num = arm_get_cpucount(); _info._max_memory = arm_get_meminfo(); + //get max freq +#ifdef PLATFORM_ANDROID + std::vector max_freq(_info._compute_core_num); + for (int i = 0; i < _info._compute_core_num; ++i){ + max_freq[i] = get_max_freq_khz(i) / 1000; + } + std::string cpu_name = arm_get_cpu_name(); + if (get_cpu_info_from_name(_info, cpu_name) != SaberSuccess){ + arm_sort_cpuid_by_max_frequency(_info._compute_core_num, _info._core_ids, max_freq, _info._cluster_ids); + _info._big_core_ids.clear(); + _info._little_core_ids.clear(); + for (int i = 0; i < _info._cluster_ids.size(); ++i) { + if (_info._cluster_ids[i] == 0) { + _info._big_core_ids.push_back(_info._core_ids[i]); + } else { + _info._little_core_ids.push_back(_info._core_ids[i]); + } + } + arm_get_cpu_arch(_info._archs); + } - _max_stream = _info._compute_core_num; - - std::vector max_freq; - - arm_sort_cpuid_by_max_frequency(_info._compute_core_num, _info._core_ids, max_freq, _info._cluster_ids); - - LOG(INFO) << "ARM multiprocessors number: " << _info._compute_core_num; + LOG(INFO) << "ARM multiprocessors number: " << _info._compute_core_num; for (int i = 0; i < _info._compute_core_num; ++i) { - LOG(INFO) << "ARM multiprocessors ID: " << _info._core_ids[i] \ - << ", frequence: " << max_freq[_info._core_ids[i]] << " MHz" << \ - ", cluster ID: " << _info._cluster_ids[_info._core_ids[i]]; + LOG(INFO) <<"ARM multiprocessors ID:" << _info._core_ids[i] << ", frequence:" << max_freq[i] << \ + ", cluster ID: " << _info._cluster_ids[_info._core_ids[i]] << ", CPU ARCH: " << _info._archs[i]; + } + LOG(INFO) << "L1 Cache size is: "; + if (_info._big_core_ids.size() > 0){ + LOG(INFO) << "big core: " << _info._L1_cache[_info._big_core_ids[0]] / 1024 << "KB"; + } + if (_info._little_core_ids.size() > 0){ + LOG(INFO) << "little core: " << _info._L1_cache[_info._little_core_ids[0]] / 1024 << "KB"; + } + LOG(INFO) << "L2 Cache size is: "; + if (_info._big_core_ids.size() > 0){ + LOG(INFO) << "big core: " << _info._L2_cache[_info._big_core_ids[0]] / 1024 << "KB"; + } + if (_info._little_core_ids.size() > 0){ + LOG(INFO) << "little core: " << _info._L2_cache[_info._little_core_ids[0]] / 1024 << "KB"; } - //LOG(INFO) << "L1 DataCache size: " << L1_cache << "B"; - //LOG(INFO) << "L2 Cache size: " << L2_cache << "B"; - LOG(INFO) << "Total memory: " << _info._max_memory << "kB"; + LOG(INFO) << "Total memory: " << _info._max_memory << "KB"; _info._max_frequence = max_freq[0]; for (int j = 1; j < _info._compute_core_num; ++j) { - if(_info._max_frequence < max_freq[j]){ + if (_info._max_frequence < max_freq[j]){ _info._max_frequence = max_freq[j]; } } +#elif defined(TARGET_IOS) + arm_get_cpu_arch(_info._archs); +#endif } template <> void Context::bind_dev() { - set_cpu_affinity(_act_ids); +#ifdef USE_OPENMP + int num_threads = _act_ids.size(); + omp_set_num_threads(num_threads); +#ifdef PLATFORM_ANDROID + std::vector ssarets; + for (int j = 0; j < num_threads; ++j) { + ssarets.push_back(0); + } +#pragma omp parallel for + for (int i = 0; i < num_threads; i++) { + ssarets[i] = set_sched_affinity(_act_ids); + } + for (int i = 0; i < num_threads; i++) { + if (ssarets[i] != 0) { + LOG(ERROR) << "set cpu affinity failed, cpuID: " << _act_ids[i]; + return; + } + } +#endif //PLATFORM_ANDROID 
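+    // (editor's note) With USE_OPENMP every worker thread applies the same affinity
+    // mask (_act_ids) via set_sched_affinity() above, so all omp threads stay on the
+    // selected big/little cores; the non-OpenMP fallback below pins only the calling
+    // thread to the first selected core.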
+#else //USE_OPENMP +#ifdef PLATFORM_ANDROID + std::vector cpuid1; + cpuid1.push_back(_act_ids[0]); + int ssaret = set_sched_affinity(cpuid1); + if (ssaret != 0) { + printf("set cpu affinity failed, cpuID: %d\n", _act_ids[0]); + return; + } +#endif //PLATFORM_ANDROID +#endif//USE_OPENMP } template <> void Context::set_run_mode(PowerMode mode, int threads) { - std::vector big_cores; - std::vector small_cores; - for (int i = 0; i < devs[0]._info._cluster_ids.size(); ++i) { - if (devs[0]._info._cluster_ids[i] == 0) { - big_cores.push_back(devs[0]._info._core_ids[i]); - } else { - small_cores.push_back(devs[0]._info._core_ids[i]); - } - } - int big_core_size = big_cores.size(); - int small_core_size = small_cores.size(); +#ifdef USE_OPENMP + int big_core_size = devs[_device_id]._info._big_core_ids.size(); + int small_core_size = devs[_device_id]._info._little_core_ids.size(); if (threads > big_core_size + small_core_size) { threads = big_core_size + small_core_size; } + _count++; + int shift_num = (_count / 10) % big_core_size; switch (mode) { case SABER_POWER_FULL: _mode = mode; _act_ids.clear(); for (int i = 0; i < threads; ++i) { if (i < big_core_size) { - _act_ids.push_back(big_cores[i]); + _act_ids.push_back(devs[_device_id]._info._big_core_ids[i]); } else { - _act_ids.push_back(small_cores[i - big_core_size]); + _act_ids.push_back(devs[_device_id]._info._little_core_ids[i - big_core_size]); } } + if (_act_ids.size() == 0) { + _act_ids.push_back(0); + } break; case SABER_POWER_HIGH: _act_ids.clear(); if (big_core_size > 0) { _mode = SABER_POWER_HIGH; if (threads > big_core_size) { - LOG(ERROR) << "threads: " << threads << " exceed the big cores size: " << big_core_size; - _act_ids = big_cores; + LOG(ERROR) << "threads: " << threads << ", exceed the big cores size: " << big_core_size; + _act_ids = devs[_device_id]._info._big_core_ids; } else { for (int i = 0; i < threads; ++i) { - _act_ids.push_back(big_cores[i]); + _act_ids.push_back(devs[_device_id]._info._big_core_ids[i]); } } } else { _mode = SABER_POWER_LOW; - LOG(ERROR) << "HIGH POWER MODE is not support, switch to small cores"; - if(threads > small_core_size) { - _act_ids = small_cores; + LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores"; + if (threads > small_core_size) { + _act_ids = devs[_device_id]._info._little_core_ids; } else { for (int i = 0; i < threads; ++i) { - _act_ids.push_back(small_cores[i]); + _act_ids.push_back(devs[_device_id]._info._little_core_ids[i]); } } } + if (_act_ids.size() == 0) { + _act_ids.push_back(0); + } break; case SABER_POWER_LOW: _act_ids.clear(); if (small_core_size > 0) { _mode = SABER_POWER_LOW; if (threads > small_core_size) { - LOG(ERROR) << "threads: " << threads << " exceed the small cores size: " << small_core_size; - _act_ids = small_cores; + LOG(WARNING) << "threads: " << threads << ", exceed the little cores size:" << small_core_size; + _act_ids = devs[_device_id]._info._little_core_ids; } else { for (int i = 0; i < threads; ++i) { - _act_ids.push_back(small_cores[i]); + _act_ids.push_back(devs[_device_id]._info._little_core_ids[i]); } } } else { _mode = SABER_POWER_HIGH; - LOG(ERROR) << "LOW POWER MODE is not support, switch to big cores"; - if(threads > big_core_size) { - _act_ids = big_cores; + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; + if (threads > big_core_size) { + _act_ids = devs[_device_id]._info._big_core_ids; } else { for (int i = 0; i < threads; ++i) { - _act_ids.push_back(small_cores[i]); + 
_act_ids.push_back(devs[_device_id]._info._big_core_ids[i]); } } } + if (_act_ids.size() == 0) { + _act_ids.push_back(0); + } + break; + case SABER_POWER_NO_BIND: + _mode = SABER_POWER_NO_BIND; + _act_ids.clear(); + if (threads > devs[_device_id]._info._core_ids.size()) { + _act_ids.resize(devs[_device_id]._info._core_ids.size()); + } else { + _act_ids.resize(threads); + } break; + case SABER_POWER_RAND_HIGH: + _act_ids.clear(); + if (big_core_size > 0) { + _mode = SABER_POWER_RAND_HIGH; + if (threads > big_core_size) { + LOG(WARNING) << "threads: " << threads << ", exceed the big cores size: " << big_core_size; + _act_ids = devs[_device_id]._info._big_core_ids; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(devs[_device_id]._info._big_core_ids[(i + shift_num) % big_core_size]); + } + } + } else { + _mode = SABER_POWER_LOW; + LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores"; + if (threads > small_core_size) { + _act_ids = devs[_device_id]._info._little_core_ids; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(devs[_device_id]._info._little_core_ids[i]); + } + } + + } + if (_act_ids.size() == 0) { + _act_ids.push_back(0); + } + break; + case SABER_POWER_RAND_LOW: + _act_ids.clear(); + if (small_core_size > 0) { + _mode = SABER_POWER_RAND_LOW; + if (threads > small_core_size) { + LOG(WARNING) << "threads: " << threads << ", exceed the little cores size: " << small_core_size; + _act_ids = devs[0]._info._little_core_ids; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(devs[_device_id]._info._little_core_ids[(i + shift_num) % small_core_size]); + } + } + } else { + _mode = SABER_POWER_HIGH; + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; + if (threads > big_core_size) { + _act_ids = devs[_device_id]._info._big_core_ids; + } else { + for (int i = 0; i < threads; ++i) { + _act_ids.push_back(devs[_device_id]._info._big_core_ids[i]); + } + } + + } + if (_act_ids.size() == 0) { + _act_ids.push_back(0); + } + break; + } + //! 
fix multi-threads SABER_POWER_HIGH mode + if (_mode == SABER_POWER_NO_BIND) { + int threads = _act_ids.size(); + omp_set_num_threads(threads); + } else { + if (check_online(_act_ids)){ + bind_dev(); + } else { + LOG(INFO) << "some cpu is offline, switch to NO BIND MODE"; + int threads = _act_ids.size(); + omp_set_num_threads(threads); + } } - LOG(INFO) << "mode: \n0: big cores only;\n1: small cores only;\n2: all cores"; - LOG(INFO) << "|----run mode: " << 0; - LOG(INFO) << "|----thread num: " << _act_ids.size(); - for (int j = 0; j < _act_ids.size(); ++j) { - LOG(INFO) << "|----active id: " << _act_ids[j]; +#else + if (big_core_size > 0){ + _act_ids = {devs[_device_id]._info._big_core_ids[0]}; + } else { + _act_ids = {0}; } - bind_dev(); +#endif + _arch = devs[_device_id]._info._archs[_act_ids[0]]; } template <> -PowerMode Context::get_mode(int& threads) { - threads = _act_ids.size(); +PowerMode Context::get_mode() const{ return _mode; } +template <> +ARMArch Context::get_arch() const{ + return _arch; +} +template <> +void Context::set_arch(ARMArch arch) { + _arch = arch; +} + +template <> +void Context::set_cache(int l1size, int l2size, int l3size) { + int cpu_count = arm_get_cpucount(); + devs[_device_id]._info._L1_cache.resize(cpu_count); + devs[_device_id]._info._L2_cache.resize(cpu_count); + devs[_device_id]._info._L3_cache.resize(cpu_count); + for (int i = 0;i < cpu_count; ++i){ + devs[_device_id]._info._L1_cache[i] = l1size; + devs[_device_id]._info._L2_cache[i] = l2size; + devs[_device_id]._info._L3_cache[i] = l3size; + } + int temp_mem_size = 2 * (l1size + l2size); + _work_space.reshape(Shape({1, 1, 1, temp_mem_size})); +} + +template<> +int Context::get_l1_cache_size() const{ + return devs[_device_id]._info._L1_cache[_act_ids[0]]; +} + +template<> +int Context::get_l2_cache_size() const{ + return devs[_device_id]._info._L2_cache[_act_ids[0]]; +} + +template<> +int Context::get_l3_cache_size() const{ + return devs[_device_id]._info._L3_cache[_act_ids[0]]; +} + +template<> +void* Context::get_work_space() { + return (void*)_work_space.mutable_data(); +} + +template<> +int Context::get_threads() const { + return _act_ids.size(); +} + +template<> +SaberStatus Context::workspace_extend(Shape sh) { + int count = sh.count(); + Shape old = _work_space.shape(); + _work_space.reshape(Shape({1, 1, 1, count + devs[_device_id]._info._L2_cache[_act_ids[0]] / sizeof(float)})); + + if (_work_space.data() == nullptr) { + _work_space.re_alloc(old, AK_FLOAT); + return SaberInvalidValue; + } + return SaberSuccess; +} } //namespace saber } //namespace anakin -#endif //USE_ARM_PLACE \ No newline at end of file +#endif //USE_ARM_PLACE diff --git a/saber/core/impl/arm/cpu_info.cpp b/saber/core/impl/arm/cpu_info.cpp new file mode 100644 index 000000000..d62609441 --- /dev/null +++ b/saber/core/impl/arm/cpu_info.cpp @@ -0,0 +1,263 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include +#include "saber/core/impl/arm/cpu_info.h" +namespace anakin{ + +namespace saber{ + +#ifdef PLATFORM_ANDROID + +// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 +void set_cache_info(DeviceInfo& cpu_info, int cache_id, int argc, ...){ + va_list arg_ptr; + va_start(arg_ptr, argc); + std::vector* cache; + switch (cache_id){ + case 0: + cache = &cpu_info._L1_cache; + break; + case 1: + cache = &cpu_info._L2_cache; + break; + case 2: + cache = &cpu_info._L3_cache; + break; + default: + break; + } + int core_num = cpu_info._compute_core_num; + cache->resize(core_num); + if (argc == 1){ + int cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < core_num; ++i){ + (*cache)[i] = cache_size; + } + } else { + int big_core_num = cpu_info._big_core_ids.size(); + int little_core_num = cpu_info._little_core_ids.size(); + int big_core_cache_size = va_arg(arg_ptr, int); + int little_core_cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < big_core_num; ++i){ + (*cache)[cpu_info._big_core_ids[i]] = big_core_cache_size; + } + for (int i = 0; i < little_core_num; ++i){ + (*cache)[cpu_info._little_core_ids[i]] = little_core_cache_size; + } + } + va_end(arg_ptr); +} + +void set_arch_info(DeviceInfo& cpu_info, int argc, ...){ + va_list arg_ptr; + va_start(arg_ptr, argc); + int core_num = cpu_info._compute_core_num; + cpu_info._archs.resize(core_num); + if (argc == 1){ + ARMArch arch = (ARMArch)va_arg(arg_ptr, int); + for (int i = 0; i < core_num; ++i){ + cpu_info._archs[i] = arch; + } + } else { + ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); + ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); + int big_core_num = cpu_info._big_core_ids.size(); + int little_core_num = cpu_info._little_core_ids.size(); + for (int i = 0; i < big_core_num; ++i){ + cpu_info._archs[cpu_info._big_core_ids[i]] = big_core_arch; + } + for (int i = 0; i < little_core_num; ++i){ + cpu_info._archs[cpu_info._little_core_ids[i]] = little_core_arch; + } + } + va_end(arg_ptr); +} + +SaberStatus get_cpu_info_from_name(DeviceInfo& cpu_info, std::string hardware_name){ + + /* Snapdragon */ + + if (hardware_name.find("SDM845") != std::string::npos){ //845 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {4, 5, 6, 7}; + cpu_info._little_core_ids = {0, 1, 2, 3}; + cpu_info._cluster_ids = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 2, A75, A55); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024); + set_cache_info(cpu_info, 2, 1, 2048 * 1024); + return SaberSuccess; + + } else if (hardware_name.find("SDM710") != std::string::npos){ //710 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {6, 7}; + cpu_info._little_core_ids = {0, 1, 2, 3, 4, 5}; + cpu_info._cluster_ids = {1, 1, 1, 1, 1, 1, 0, 0}; + set_arch_info(cpu_info, 2, A75, A55); + return SaberSuccess; + + } else if (hardware_name.find("MSM8998") != std::string::npos){ //835 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {4, 5, 6, 7}; + cpu_info._little_core_ids = {0, 1, 2, 3}; + cpu_info._cluster_ids = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 2, A73, A53); + set_cache_info(cpu_info, 0, 2, 64 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, + /*real cache size is 2M, while that will get bad performace on conv3x3s1 or gemm, set to 1M or 512K*/ + 1024 * 1024); + return SaberSuccess; + + } else if 
(hardware_name.find("MSM8996") != std::string::npos){ //820 + cpu_info._compute_core_num = 4; + cpu_info._core_ids = {0, 1, 2, 3}; + cpu_info._big_core_ids = {2, 3}; + cpu_info._little_core_ids = {0, 1}; + cpu_info._cluster_ids = {1, 1, 0, 0}; + set_arch_info(cpu_info, 1, A72); + set_cache_info(cpu_info, 0, 1, 24 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); + return SaberSuccess; + + } else if (hardware_name.find("SDM660") != std::string::npos || + hardware_name.find("SDM636") != std::string::npos){ // 660, 636 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {4, 5, 6, 7}; + cpu_info._little_core_ids = {0, 1, 2, 3}; + cpu_info._cluster_ids = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, A73); + set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024); + set_cache_info(cpu_info, 1, 1, 1024 * 1024); + return SaberSuccess; + + } else if (hardware_name.find("MSM8976") != std::string::npos){ // 652,653 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {4, 5, 6, 7}; + cpu_info._little_core_ids = {0, 1, 2, 3}; + cpu_info._cluster_ids = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 2, A72, A53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); + return SaberSuccess; + + } else if (hardware_name.find("MSM8953") != std::string::npos){ // 625 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._little_core_ids = {}; + cpu_info._cluster_ids = {0, 0, 0, 0, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, A53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 1, 1024 * 1024); + return SaberSuccess; + + } else if (hardware_name.find("MSM8939") != std::string::npos){ // 615 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {0, 1, 2, 3}; + cpu_info._little_core_ids = {4, 5, 6, 7}; + cpu_info._cluster_ids = {0, 0, 0, 0, 1, 1, 1, 1}; + set_arch_info(cpu_info, 1, A53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024); + return SaberSuccess; + + /* MediaTek */ + + } else if (hardware_name.find("MT6797") != std::string::npos){ // X20/X23/X25/X27 + cpu_info._compute_core_num = 10; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + cpu_info._big_core_ids = {8, 9}; + cpu_info._little_core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._cluster_ids = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + set_arch_info(cpu_info, 2, A72, A53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); + return SaberSuccess; + + } else if (hardware_name.find("MT6799") != std::string::npos){ // X30 + cpu_info._compute_core_num = 10; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + cpu_info._big_core_ids = {8, 9}; + cpu_info._little_core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._cluster_ids = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + set_arch_info(cpu_info, 2, A73, A53); + return SaberSuccess; + + }else if (hardware_name.find("MT6795") != std::string::npos || + hardware_name.find("MT6762") != std::string::npos || + hardware_name.find("MT6755T") != std::string::npos || + hardware_name.find("MT6755S") != std::string::npos || + hardware_name.find("MT6753") != std::string::npos || + hardware_name.find("MT6752") != std::string::npos || + 
hardware_name.find("MT6750") != std::string::npos){ // X10, P22, P15/P18, MT6753 \ + MT6752/MT6752M, MT6750 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._little_core_ids = {}; + cpu_info._cluster_ids = {0, 0, 0, 0, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, A53); + return SaberSuccess; + + } else if (hardware_name.find("MT6758") != std::string::npos || + hardware_name.find("MT6757") != std::string::npos || + hardware_name.find("MT6763") != std::string::npos || + hardware_name.find("MT6755M") != std::string::npos || + hardware_name.find("MT6755") != std::string::npos){ // P30, P20/P25, P23, P10 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {4, 5, 6, 7}; + cpu_info._little_core_ids = {0, 1, 2, 3}; + cpu_info._cluster_ids = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, A53); + return SaberSuccess; + + } else if (hardware_name.find("MT6771") != std::string::npos){ // P60 + cpu_info._compute_core_num = 8; + cpu_info._core_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info._big_core_ids = {4, 5, 6, 7}; + cpu_info._little_core_ids = {0, 1, 2, 3}; + cpu_info._cluster_ids = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 2, A73, A53); + return SaberSuccess; + + } else if (hardware_name.find("MT6765") != std::string::npos || + hardware_name.find("MT6739") != std::string::npos || + hardware_name.find("MT6738") != std::string::npos || + hardware_name.find("MT6737") != std::string::npos){ // A22, MT6739, MT6738, MT6767 + cpu_info._compute_core_num = 4; + cpu_info._core_ids = {0, 1, 2, 3}; + cpu_info._big_core_ids = {0, 0, 0, 0}; + cpu_info._little_core_ids = {}; + cpu_info._cluster_ids = {0, 0, 0, 0}; + set_arch_info(cpu_info, 1, A53); + return SaberSuccess; + } + + return SaberUnImplError; +} + +#endif + + +} //namespace saber + +} //namespace anakin diff --git a/saber/core/impl/arm/cpu_info.h b/saber/core/impl/arm/cpu_info.h new file mode 100644 index 000000000..5a9239d38 --- /dev/null +++ b/saber/core/impl/arm/cpu_info.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_LITE_CORE_CPU_INFO_H +#define ANAKIN_SABER_LITE_CORE_CPU_INFO_H + +#include "saber/core/device.h" +namespace anakin{ + +namespace saber{ + +#ifdef PLATFORM_ANDROID + +SaberStatus get_cpu_info_from_name(DeviceInfo& cpu_info, std::string hardware_name); + +#endif + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_LITE_CORE_CPU_INFO_H diff --git a/saber/core/shape.h b/saber/core/shape.h index 5f529a3a9..e3847ec93 100644 --- a/saber/core/shape.h +++ b/saber/core/shape.h @@ -17,11 +17,11 @@ #define ANAKIN_SABER_CORE_SHAPE_H #include -#include "core/common.h" +#include "saber/core/common.h" -namespace anakin{ +namespace anakin { -namespace saber{ +namespace saber { class Shape : public std::vector { public: @@ -35,13 +35,15 @@ class Shape : public std::vector { create_layout(layout_type); CHECK_EQ(_layout->dims(), data.size()) \ << "The shape from the vector must have the correct layout."; + for (int i = 0; i < _layout->dims(); ++i) { this->push_back(data[i]); } + if (_layout->inner_c() != -1) { CHECK_EQ(data[4], _layout->inner_c()) \ - << " Layout must be an integer multiple of " - << _layout->inner_c(); + << " Layout must be an integer multiple of " + << _layout->inner_c(); } } ~Shape() { @@ -52,17 +54,21 @@ class Shape : public std::vector { Shape(const Shape& right) : std::vector(right) { this->clear(); + for (int i = 0; i < right.size(); ++i) { this->push_back(right[i]); } + create_layout(right.get_layout()); } - Shape &operator=(const Shape& right) { + Shape& operator=(const Shape& right) { this->clear(); + for (int i = 0; i < right.size(); ++i) { this->push_back(right[i]); } + delete _layout; _layout = nullptr; create_layout(right.get_layout()); @@ -72,9 +78,11 @@ class Shape : public std::vector { Shape tmp_shape(*this); int* p = data(); + for (size_t i = 0; i < size(); i++) { tmp_shape[i] = p[i] + shape[i]; } + return tmp_shape; } @@ -82,76 +90,97 @@ class Shape : public std::vector { Shape tmp_shape(*this); int* p = data(); + for (size_t i = 0; i < size(); i++) { tmp_shape[i] = p[i] - shape[i]; } + return tmp_shape; } bool operator<(const Shape& shape) const { bool flag = size() == shape.size(); + if (!flag) { return false; } const int* p = data(); + for (size_t i = 0; i < size(); i++) { flag = flag && (p[i] < shape[i]); } + return flag; } - bool operator<=(const Shape& shape) const{ + bool operator<=(const Shape& shape) const { bool flag = size() == shape.size(); + if (!flag) { return false; } + const int* p = data(); + for (size_t i = 0; i < size(); i++) { flag = flag && (p[i] <= shape[i]); } + return flag; } bool operator>(const Shape& shape) const { bool flag = size() == shape.size(); + if (!flag) { return false; } const int* p = data(); + for (size_t i = 0; i > size(); i++) { flag = flag && (p[i] > shape[i]); } + return flag; } - bool operator>=(const Shape& shape) const{ + bool operator>=(const Shape& shape) const { bool flag = size() == shape.size(); + if (!flag) { return false; } + const int* p = data(); + for (size_t i = 0; i > size(); i++) { flag = flag && (p[i] >= shape[i]); } + return flag; } - bool operator==(const Shape& shape) const{ + bool operator==(const Shape& shape) const { bool flag = size() == shape.size(); + flag = flag && this->get_layout() == shape.get_layout(); + if (!flag) { return false; } + const int* p = data(); + for (size_t i = 0; i < size(); i++) { flag = flag && (p[i] == shape[i]); } + return flag; } int num_index() const { @@ -195,9 +224,11 @@ class Shape : public std::vector { } int channel() const { 
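        // (editor's note) channel() reports the logical channel count: for blocked
        // layouts that define inner_c() (e.g. NCHW_C4/C8/C16) the stored channel
        // dimension is the outer block count, so it is scaled back up by inner_c() below.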
int shape_channel = this->channel_index() == -1 ? 1 : this->data()[this->channel_index()]; + if (_layout->inner_c() != -1) { shape_channel *= _layout->inner_c(); } + return shape_channel; } int height() const { @@ -216,67 +247,97 @@ class Shape : public std::vector { if (start > dims()) { start = dims(); } + if (this->size() == 0) { return 0; } + long long sum = 1; - for_each(this->begin() + start, this->end(), [&](int n){sum *= n;}); + for_each(this->begin() + start, this->end(), [&](int n) { + sum *= n; + }); + + if (_layout->aligned_length() != -1 && start <= 1) { + int channel_size = channel(); + int aligned_length = _layout->aligned_length(); + sum = sum / channel_size * ((channel_size + aligned_length - 1) / aligned_length * aligned_length); + } + return sum; } long long count(int start, int end) const { if (start < 0) { start = 0; } + if (end > dims()) { end = dims(); } + if (end < start) { end = start; } + long long sum = 1; + for (int i = start; i < end; ++i) { sum *= data()[i]; } + + if (_layout->aligned_length() != -1 && start <= 1 && end > 1) { + int channel_size = channel(); + int aligned_length = _layout->aligned_length(); + sum = sum / channel_size * ((channel_size + aligned_length - 1) / aligned_length * aligned_length); + } + return sum; } Shape get_stride() const { Shape data_stride = Shape::zero(*this); + for (int i = 0; i < dims(); ++i) { data_stride[i] = count(i + 1); } + return data_stride; } int dims() const { return this->size(); } - /** - * @brief Returns the 'canonical' version of a (usually) user-specified axis, - * allowing for negative indexing.(e.g., -1 for the last axis). - * @e.g. Layout: N C H W - * Canonic: 0 1 2 3 - * Axis: -4 -3 -2 -1 - * @param axis: the axis index. - * @notice You should pay attention to the usage when shape.dims() > 4. - */ + /** + * @brief Returns the 'canonical' version of a (usually) user-specified axis, + * allowing for negative indexing.(e.g., -1 for the last axis). + * @e.g. Layout: N C H W + * Canonic: 0 1 2 3 + * Axis: -4 -3 -2 -1 + * @param axis: the axis index. + * @notice You should pay attention to the usage when shape.dims() > 4. 
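+     * @example For a 4-D NCHW shape, canon_axis(-1) returns 3 (the W axis) and
+     *          canon_axis(1) returns 1; any axis outside [-dims(), dims()) fails
+     *          the CHECK below.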
+ */ int canon_axis(int axis) const { const int dims = this->dims(); CHECK_GE(axis, -dims); CHECK_LT(axis, dims); - if (axis < 0) { return axis + dims; } + + if (axis < 0) { + return axis + dims; + } + return axis; } bool is_continue(const Shape real_shape) const { - if (real_shape.size() != this->size()){ + if (real_shape.size() != this->size()) { return false; } const int* p = data(); + for (int i = this->size() - 1; i >= 0; i--) { if (p[i] != real_shape[i]) { int size = this->count() / this->count(i); return size == 1; } } + return true; } LayoutType get_layout() const { @@ -286,133 +347,243 @@ class Shape : public std::vector { return Layout_invalid; } } - void set_num (const int num) { + void set_num(const int num) { CHECK_GT(num, 0); + if (_layout->num_index() != -1) { this->data()[_layout->num_index()] = num; } } - void set_channel (const int channel) { + void set_channel(const int channel) { CHECK_GT(channel, 0); + if (_layout->channel_index() != -1) { int shape_channel = channel; + if (_layout->inner_c() != -1) { CHECK_EQ(channel % _layout->inner_c(), 0); shape_channel /= _layout->inner_c(); } + this->data()[_layout->channel_index()] = shape_channel; } } - void set_height (const int height) { + void set_height(const int height) { CHECK_GT(height, 0); + if (_layout->height_index() != -1) { this->data()[_layout->height_index()] = height; } } - void set_width (const int width) { + void set_width(const int width) { CHECK_GT(width, 0); + if (_layout->width_index() != -1) { this->data()[_layout->width_index()] = width; } } - void set_depth (const int depth) { + void set_depth(const int depth) { CHECK_GT(depth, 0); + if (_layout->depth_index() != -1) { this->data()[_layout->depth_index()] = depth; } } - void set_shape_without_layout(const Shape &right){ + void set_shape_without_layout(const Shape& right) { this->clear(); - this->resize(right.size()); - for (int i = 0; i < right.size(); ++i) { - this->data()[i]=right[i]; + if (this->size()==0){ + this->resize(right.size()); } + + this->set_num(right.num()); + this->set_channel(right.channel()); + this->set_height(right.height()); + this->set_width(right.width()); + } +// void set_layout_without_shape(LayoutType layout_type) { +// Shape sh = *this; +// Layout* layout = this->_layout; +// create_layout(layout_type); +// delete layout; +// +// if (sh._layout == nullptr) { +// return; +// } +// } void set_layout(LayoutType layout_type, std::vector new_shape = {}) { Shape sh = *this; Layout* layout = this->_layout; create_layout(layout_type); - if (sh._layout== nullptr) { + + if (sh._layout == nullptr || sh.empty()) { return; } + this->clear(); + if (new_shape.size() != 0) { CHECK_EQ(_layout->dims(), new_shape.size()) << "new_shape dims miss match with layout dims"; + for (auto i : new_shape) { this->push_back(i); } + return; } + this->resize(_layout->dims()); + + if (_layout->num_index() != -1) { this->data()[_layout->num_index()] = sh.num(); } + if (_layout->channel_index() != -1) { this->data()[_layout->channel_index()] = sh.channel(); + if (_layout->inner_c() != -1) { CHECK_EQ(sh.channel() % _layout->inner_c(), 0); this->data()[_layout->channel_index()] /= _layout->inner_c(); this->data()[4] = _layout->inner_c(); } } + if (_layout->height_index() != -1) { this->data()[_layout->height_index()] = sh.height(); } + if (_layout->width_index() != -1) { this->data()[_layout->width_index()] = sh.width(); } + if (_layout->depth_index() != -1) { this->data()[_layout->depth_index()] = sh.depth(); } + delete layout; } - static Shape zero(const Shape 
&right){ + static Shape zero(const Shape& right) { Shape sh = right; + for (int i = 0; i < right.size(); ++i) { sh[i] = 0; } + return sh; } - static Shape minusone(const Shape &right){ + static Shape minusone(const Shape& right) { Shape sh = right; + for (int i = 0; i < right.size(); ++i) { sh[i] = -1; } + return sh; } + static Shape cvt_shape(const Shape& right,LayoutType layoutType) { + CHECK_EQ(right._layout->dims(),4)<<"only support 4 dim shape"; + Shape sh({1,1,1,1},layoutType); + CHECK_EQ(sh._layout->dims(),4)<<"only support 4 dim shape"; + sh.set_num(right.num()); + sh.set_channel(right.channel()); + sh.set_height(right.height()); + sh.set_width(right.width()); + return sh; + } + + int get_layout_aligned_length() { + return _layout->aligned_length(); + } +#ifndef USE_SGX friend std::ostream& operator<<(std::ostream& out, const Shape& s) { for (int i = 0; i < s.dims() - 1; i++) { out << s.data()[i] << ", "; } + out << s.data()[s.dims() - 1]; + out << " , layout_type = " << s.get_layout() << ", size = " << s.count(); return out; } +#endif protected: Layout* _layout{nullptr}; private: void create_layout(LayoutType layout_type) { - switch(layout_type) { - case Layout_invalid: this->_layout = nullptr; \ - LOG(FATAL) << "The layout_type is invalid."; break; - case Layout_W: this->_layout = new W(); break; - case Layout_HW: this->_layout = new HW(); break; - case Layout_WH: this->_layout = new WH(); break; - case Layout_NC: this->_layout = new NC(); break; - case Layout_NH: this->_layout = new NH(); break; - case Layout_NW: this->_layout = new NW(); break; - case Layout_NHW: this->_layout = new NHW(); break; - case Layout_NCHW: this->_layout = new NCHW(); break; - case Layout_NHWC: this->_layout = new NHWC(); break; - case Layout_NCHW_C4: this->_layout = new NCHW_C4(); break; - case Layout_NCHW_C8: this->_layout = new NCHW_C8(); break; - case Layout_NCHW_C16: this->_layout = new NCHW_C16(); break; +// if(this->_layout != nullptr){ +// delete this->_layout; +// this->_layout = nullptr; +// } + + switch (layout_type) { + case Layout_invalid: + this->_layout = nullptr; + \ + LOG(FATAL) << "The layout_type is invalid."; + break; + + case Layout_W: + this->_layout = new W(); + break; + + case Layout_HW: + this->_layout = new HW(); + break; + + case Layout_WH: + this->_layout = new WH(); + break; + + case Layout_NC: + this->_layout = new NC(); + break; + + case Layout_NH: + this->_layout = new NH(); + break; + + case Layout_NW: + this->_layout = new NW(); + break; + + case Layout_NHW: + this->_layout = new NHW(); + break; + + case Layout_NCHW: + this->_layout = new NCHW(); + break; + + case Layout_NHWC: + this->_layout = new NHWC(); + break; + + case Layout_NCHW_C4: + this->_layout = new NCHW_C4(); + break; + + case Layout_NCHW_C8: + this->_layout = new NCHW_C8(); + break; + + case Layout_NCHW_C16: + this->_layout = new NCHW_C16(); + break; + + case Layout_NCHW_C8R: + this->_layout = new NCHW_C8R(); + break; + + case Layout_NCHW_C16R: + this->_layout = new NCHW_C16R(); + break; } } }; diff --git a/saber/core/target_traits.h b/saber/core/target_traits.h index e1878c050..ed16166c2 100644 --- a/saber/core/target_traits.h +++ b/saber/core/target_traits.h @@ -15,7 +15,7 @@ #ifndef ANAKIN_SABER_CORE_TARGET_TRAITS_H #define ANAKIN_SABER_CORE_TARGET_TRAITS_H -#include "core/common.h" +#include "saber/core/common.h" namespace anakin{ diff --git a/saber/core/tensor.h b/saber/core/tensor.h index 2c15b4fb4..4fc36020a 100644 --- a/saber/core/tensor.h +++ b/saber/core/tensor.h @@ -39,6 +39,7 @@ class Tensor { 
*/ Tensor(DataType type = AK_FLOAT) : _valid_shape(), _shape(), _offset() { _dtype = type; + _buf_dtype = type; _type_len = type_length(type); _buf = std::make_shared>(); _is_subbuf = false; @@ -52,6 +53,7 @@ class Tensor { _valid_shape = shape; _offset = Shape::zero(shape); _dtype = type; + _buf_dtype = type; _type_len = type_length(type); _buf = std::make_shared>(shape.count() * _type_len); _is_shared = false; @@ -85,6 +87,7 @@ class Tensor { _valid_shape = tensor._valid_shape; _offset = tensor._offset; _dtype = tensor._dtype; + _buf_dtype = tensor._buf_dtype; _type_len = tensor._type_len; _buf = tensor._buf; _is_subbuf = tensor._is_subbuf; @@ -92,7 +95,7 @@ class Tensor { _seq_offset = tensor._seq_offset; _scale = tensor._scale; } - +#if 0 /** * \brief Copy constructor without events control. */ @@ -109,6 +112,7 @@ class Tensor { _seq_offset = tensor._seq_offset; _scale = tensor._scale; } +#endif #if 0 /** * \brief create tensor with buffer @@ -158,6 +162,7 @@ class Tensor { LOG(FATAL) << "tensor is shared, memory can not be re-alloced"; return SaberOutOfAuthority; } + _buf_dtype = type; _buf->re_alloc(_shape.count() * _type_len); } return SaberSuccess; @@ -170,32 +175,48 @@ class Tensor { DataType get_dtype() const { return _dtype; } + + size_t get_type_size(DataType type) const{ + switch(type) { + case AK_HALF: { + return sizeof(unsigned short); + } + case AK_FLOAT: { + return sizeof(float); + } + case AK_DOUBLE: { + return sizeof(double); + } + case AK_INT8: { + return sizeof(int8_t); + } + case AK_INT16: { + return sizeof(int16_t); + } + case AK_INT32: { + return sizeof(int); + } + case AK_UINT8: { + return sizeof(uint8_t); + } + default: { + LOG(ERROR) << "tensor's data type is not supported. "; + return 0u; + } + } + } size_t get_dtype_size() const { - switch(_dtype) { - case AK_HALF: { - return sizeof(unsigned short); - } - case AK_FLOAT: { - return sizeof(float); - } - case AK_DOUBLE: { - return sizeof(double); - } - case AK_INT8: { - return sizeof(int8_t); - } - case AK_INT32: { - return sizeof(int); - } - default: { - LOG(ERROR) << "tensor's data type is not supported. "; - return 0u; - } - } + return get_type_size(_dtype); } + DataType get_buf_dtype() const { + return _buf_dtype; + } + size_t get_buf_dtype_size() const { + return get_type_size(_buf_dtype); + } /** * \brief change tensor's layout and type * @param layout @@ -206,6 +227,11 @@ class Tensor { _valid_shape.set_layout(layout, data); return SaberSuccess; } +// SaberStatus set_layout_without_shape(LayoutType layout) { +// _valid_shape.set_layout_without_shape(layout); +// return SaberSuccess; +// } + LayoutType get_layout() const { return _valid_shape.get_layout(); } @@ -293,6 +319,7 @@ class Tensor { CHECK_EQ(_is_shared || _is_subbuf, false) << "shared tensor could not re_alloc"; if (type != AK_INVALID) { _dtype = type; + _buf_dtype = type; } _type_len = type_length(type); _shape = shape; @@ -350,6 +377,10 @@ class Tensor { return _valid_shape.is_continue(_shape); } + size_t capacity() const { + return _buf->get_capacity(); + } + /** * \brief Return shape count, from start index to end index(end index is excluded). * \param start Input start index. 
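(Editorial aside, not part of the patch.) A minimal sketch of how the new _buf_dtype bookkeeping and the get_type_size()/get_buf_dtype_size()/capacity() accessors above are expected to behave. It assumes a host build where the X86 target type is available and uses only the constructors and accessors shown in this hunk; capacity() is presumed to report the raw buffer size in bytes, matching shape.count() times the element size at construction.

#include "saber/core/tensor.h"
using namespace anakin::saber;

void tensor_dtype_sketch() {
    // 1x3x8x8 FP32 tensor: _dtype and _buf_dtype both start as AK_FLOAT.
    Tensor<X86> t_fp32(Shape({1, 3, 8, 8}), AK_FLOAT);
    size_t elem_bytes = t_fp32.get_dtype_size();     // 4, via the new get_type_size()
    size_t buf_bytes  = t_fp32.capacity();           // expected 1*3*8*8*4 = 768 bytes

    // An INT8 tensor keeps a 1-byte buffer element size, queryable separately
    // from the logical dtype through get_buf_dtype()/get_buf_dtype_size().
    Tensor<X86> t_int8(Shape({1, 3, 8, 8}), AK_INT8);
    size_t buf_elem = t_int8.get_buf_dtype_size();   // 1
    (void)elem_bytes; (void)buf_bytes; (void)buf_elem;
}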
@@ -609,7 +640,7 @@ class Tensor { */ SaberStatus share_from(const Tensor& tensor) { - CHECK_LE(size(), tensor.size()) << "current tensor size should <= input tensor size"; + //CHECK_LE(size()*get_dtype_size(), tensor.size()*tensor.get_dtype_size()) << "current tensor size should <= input tensor size"; //_is_shared = BufferMemShare(_buf, tensor.get_buf()) > 0; @@ -1022,14 +1053,22 @@ class Tensor { _events_tree._events.record(stream); } + bool get_posstive_flag(){ + return _is_all_positive; + } + void set_posstive_flag(bool is_all_posstive){ + _is_all_positive=is_all_posstive; + } private: //! scale for quantization std::vector _scale; + bool _is_all_positive{false}; ///< Length of datatype. DataType _dtype{AK_FLOAT}; size_t _type_len{4}; + DataType _buf_dtype{AK_FLOAT}; ///< Represent the raw mem shape. Shape _shape; diff --git a/saber/core/tensor_op.cpp b/saber/core/tensor_op.cpp index 0ab9a6edb..860417862 100644 --- a/saber/core/tensor_op.cpp +++ b/saber/core/tensor_op.cpp @@ -1,10 +1,115 @@ -#include "tensor_op.h" +#include "saber/core/tensor_op.h" #include +#include +#include namespace anakin { namespace saber { +template +static void reorder_nhwc_nchw(const Tensor& input, + Tensor& output) { + CHECK_EQ(input.get_dtype(),AK_FLOAT)<<"only support AK_FLOAT"; + CHECK_EQ(output.get_dtype(),AK_FLOAT)<<"only support AK_FLOAT"; + const float* input_ptr= static_cast(input.data()); + float* output_ptr= static_cast(output.mutable_data()); + int n_value=input.num(); + int c_value=input.channel(); + int h_value=input.height(); + int w_value=input.width(); + if (input.get_layout()==Layout_NHWC&&output.get_layout()==Layout_NCHW){ +#pragma omp parallel for collapse(4) schedule(static) + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index=n*h_value*w_value*c_value+h*w_value*c_value+w*c_value+c; + int out_index=n*c_value*h_value*w_value+c*h_value*w_value+h*w_value+w; + output_ptr[out_index]=input_ptr[in_index]; + } + } + } + } + }else if (input.get_layout()==Layout_NCHW&&output.get_layout()==Layout_NHWC){ +#pragma omp parallel for collapse(4) schedule(static) + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index=n*c_value*h_value*w_value+c*h_value*w_value+h*w_value+w; + int out_index=n*h_value*w_value*c_value+h*w_value*c_value+w*c_value+c; + output_ptr[out_index]=input_ptr[in_index]; + } + } + } + } + }else{ + LOG(FATAL)<<"not support layout "< +static void reorder_nchwc_nchw(Tensor& input, + Tensor& output) { + + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK((input.get_layout()==Layout_NCHW_C16R||input.get_layout()==Layout_NCHW_C8R)&&output.get_layout()==Layout_NCHW)<<"not support "< 0"; + int c_round_divk = shape_input[1]; + + c_round_divk = (shape_input.channel() + aligned_length-1) / aligned_length; + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); +#pragma omp parallel for collapse(4) schedule(static) + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + //#pragma ivdep + for (int w = 0; w < w_value; ++w) { + int round_c = c / aligned_length; + int remainder_c = c % aligned_length; + int input_idx = n * c_round_divk * h_value * w_value * aligned_length + round_c * h_value * w_value * aligned_length + + h * w_value * 
aligned_length + w * aligned_length + remainder_c; + int output_idx = n * c_value * h_value * w_value + c * h_value * w_value + + h * w_value + w ; + + *(output_ptr + output_idx) = input_ptr[input_idx]; + } + } + } + } + +} + +template +void tensor_reorder(Tensor& input, Tensor& output){ + if (input.valid_shape()==output.valid_shape()){ + output.copy_from(input); + return; + } + LayoutType in_layout= input.get_layout(); + LayoutType out_layout= output.get_layout(); + bool nhwc_flag=(in_layout==Layout_NHWC&&in_layout==Layout_NCHW)||(out_layout==Layout_NCHW&&out_layout==Layout_NHWC); + if ((in_layout==Layout_NCHW_C16R||in_layout==Layout_NCHW_C8R)&&out_layout==Layout_NCHW){ + reorder_nchwc_nchw(input,output); + }else if (nhwc_flag){ + reorder_nhwc_nchw(input,output); + }else{ + LOG(FATAL)<<"not support this "< void fill_tensor_host_const_impl(Dtype* dio, Dtype value, long long size) { for (long long i = 0; i < size; ++i) { @@ -100,6 +205,7 @@ template void fill_tensor_host_rand_impl2(Dtype* dio, Dtype vstart, Dtype vend, long long size) { std::random_device rd; std::mt19937 gen(rd()); +// std::mt19937 gen(1234); std::uniform_real_distribution dis(0, 1.f); for (long long i = 0; i < size; ++i) { Dtype random_num = static_cast(vstart + (vend - vstart) * dis(gen)); @@ -141,7 +247,36 @@ void print_tensor_host_impl(const Dtype* din, long long size, int width) { } printf("\n"); } - +template <> +void print_tensor_host_impl(const int8_t* din, long long size, int width) { + for (int i = 0; i < size; ++i) { + printf("%d ", static_cast(din[i])); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} +template <> +void print_tensor_host_impl(const uint8_t* din, long long size, int width) { + for (int i = 0; i < size; ++i) { + printf("%d ", static_cast(din[i])); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} +template <> +void print_tensor_host_impl(const int32_t* din, long long size, int width) { + for (int i = 0; i < size; ++i) { + printf("%d ", din[i]); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} template void print_tensor(Tensor& tensor, typename Tensor::API::stream_t stream) { @@ -149,6 +284,12 @@ void print_tensor(Tensor& tensor, typename Tensor::API:: const void* data_ptr = tensor.data(); long long size = tensor.size(); int width = tensor.width(); + if (tensor.get_layout()==Layout_NCHW_C8){ + width*=8; + }else if (tensor.get_layout()==Layout_NHWC){ + width=tensor.channel(); + } + DataType type = tensor.get_dtype(); switch(type) { case AK_UINT8: print_tensor_host_impl((const unsigned char*)data_ptr, size, width); break; @@ -220,23 +361,48 @@ void tensor_cmp_host(const Dtype* src1, const Dtype* src2, \ } template -double tensor_mean_value_host_impl(const Dtype* din, long long size) { +void tensor_cmp_host_mlu(const Dtype* correct, const Dtype* sample, \ + int size, double& diff) { + + double sum_diff = 0.0; + double sum_abs = 0.0; + + for (int i = 1; i < size; ++i) { + double diff = fabs(correct[i] - sample[i]); + sum_diff += diff*diff; + sum_abs += fabsf(correct[i])*fabsf(correct[i]); + } + diff = sqrt(sum_diff / sum_abs); + +} + +template +double tensor_mean_value_host_impl(const Dtype* din, long long size, double scale=1.f) { double sum = 0.0; for (long long i = 0; i < size; ++i) { - sum += din[i]; + sum += (double)din[i]*scale; } return sum / size; } + template double tensor_mean_value(Tensor& tensor, typename Tensor::API::stream_t stream) { const void* data_ptr = tensor.data(); long long size = tensor.size(); DataType type 
= tensor.get_dtype(); + double scale = 1.0; + if (type==AK_INT8){ + CHECK_EQ(tensor.get_scale().size(),1); + scale=tensor.get_scale()[0]; + }else if (type==AK_UINT8){ + CHECK_EQ(tensor.get_scale().size(),1); + scale=tensor.get_scale()[0]*(127.f/255.f); + } switch (type) { - case AK_UINT8: return tensor_mean_value_host_impl((const unsigned char*)data_ptr, size); - case AK_INT8: return tensor_mean_value_host_impl((const char*)data_ptr, size); + case AK_UINT8: return tensor_mean_value_host_impl((const unsigned char*)data_ptr, size, scale); + case AK_INT8: return tensor_mean_value_host_impl((const char*)data_ptr, size, scale); case AK_UINT16: return tensor_mean_value_host_impl((const unsigned short*)data_ptr, size); case AK_INT16: return tensor_mean_value_host_impl((const short*)data_ptr, size); case AK_UINT32: return tensor_mean_value_host_impl((const unsigned int*)data_ptr, size); @@ -254,11 +420,18 @@ double tensor_mean_value_valid(Tensor& tensor, typename Tensor& tensor, typename Tensor +void tensor_reorder(Tensor& input, Tensor& output); #endif #ifdef USE_CUDA FILL_TENSOR_HOST(NVHX86) +template<> +void tensor_reorder(Tensor& input, Tensor& output); #endif #ifdef AMD_GPU @@ -303,14 +480,21 @@ FILL_TENSOR_HOST(AMDHX86) FILL_TENSOR_HOST(ARM) #endif -#ifdef USE_BM_PLACE +#ifdef USE_BM_PLACE #endif +template void tensor_cmp_host_mlu(const float* correct, const float* sample, \ + int size, double& diff); +template void tensor_cmp_host_mlu(const int* correct, const int* sample, \ + int size, double& diff); + template void tensor_cmp_host(const float* src1, const float* src2, \ int size, double& max_ratio, double& max_diff); template void tensor_cmp_host(const int* src1, const int* src2, \ int size, double& max_ratio, double& max_diff); +template void tensor_cmp_host(const signed char* src1, const signed char* src2, int size, \ + double& max_ratio, double& max_diff); template void tensor_cmp_host(const char* src1, const char* src2, int size, \ double& max_ratio, double& max_diff); diff --git a/saber/core/tensor_op.h b/saber/core/tensor_op.h index ba76ed501..8f9e3f21a 100644 --- a/saber/core/tensor_op.h +++ b/saber/core/tensor_op.h @@ -16,8 +16,8 @@ #ifndef ANAKIN_SABER_TENSOR_OP_H #define ANAKIN_SABER_TENSOR_OP_H -#include "core/tensor.h" -#include "context.h" +#include "saber/core/tensor.h" +#include "saber/core/context.h" #include "anakin_config.h" namespace anakin{ @@ -26,6 +26,15 @@ namespace saber{ const float eps = 1e-6f; +/** + * tensor_reorder + * @tparam TargetType + * @param input + * @param output + */ +template +void tensor_reorder(Tensor& input, Tensor& output); + /** * \brief reorder reorder tensors from src layout to dst layout * \param src source tensor reference @@ -94,6 +103,9 @@ double tensor_mean_value_valid(Tensor& tensor, typename Tensor void tensor_cmp_host(const Dtype* src1, const Dtype* src2, int size, double& max_ratio, double& max_diff); +template +void tensor_cmp_host_mlu(const Dtype* correct, const Dtype* sample, \ + int size, double& diff); #ifdef USE_CUDA /// This transform helper is only used to transform inputs or outputs, diff --git a/saber/funcs/.DS_Store b/saber/funcs/.DS_Store new file mode 100644 index 000000000..2d1f0f3cd Binary files /dev/null and b/saber/funcs/.DS_Store differ diff --git a/saber/funcs/activation.h b/saber/funcs/activation.h index b1874e5b6..66e35bdd8 100644 --- a/saber/funcs/activation.h +++ b/saber/funcs/activation.h @@ -29,8 +29,8 @@ #include "saber/funcs/impl/x86/saber_activation.h" #endif -#ifdef AMD_GPU -#include 
"saber/funcs/impl/amd/saber_activation.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_activation.h" #endif #ifdef USE_ARM_PLACE @@ -74,6 +74,9 @@ class Activation : public BaseFunc< Shape output_shape = (input[0]->valid_shape()); output[0]->set_seq_offset(input[0]->get_seq_offset()); + if (param.active == Active_sigmoid || param.active == Active_relu || param.active == Active_clipped_relu){ + output[0]->set_posstive_flag(true); + } return output[0]->set_shape(output_shape); } @@ -96,7 +99,7 @@ class Activation : public BaseFunc< private: virtual void pick_best_static() override { - if (this->_param.active == Active_prelu) { + if (this->_param.active == Active_prelu || this->_param.active == Active_gelu || this->_param.active == Active_swish) { this->_best_impl = this->_impl[1]; } else { this->_best_impl = this->_impl[0]; diff --git a/saber/funcs/affine_channel.h b/saber/funcs/affine_channel.h index b1ea28c88..1221d2d81 100644 --- a/saber/funcs/affine_channel.h +++ b/saber/funcs/affine_channel.h @@ -60,7 +60,7 @@ class AffineChannel : public BaseFunc< virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ Param_t& param) override { SaberStatus status; - CHECK_EQ(input.size(), 3); + CHECK_EQ(input.size(), 1); Shape output_shape = input[0]->valid_shape(); output[0]->set_shape(output_shape); diff --git a/saber/funcs/aligned_mat_mul.h b/saber/funcs/aligned_mat_mul.h new file mode 100644 index 000000000..712a7e0dc --- /dev/null +++ b/saber/funcs/aligned_mat_mul.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_ALIGNED_MAT_MUL_H +#define ANAKIN_SABER_FUNCS_ALIGNED_MAT_MUL_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_aligned_mat_mul.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_aligned_mat_mul.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_aligned_mat_mul.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_aligned_mat_mul.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_aligned_mat_mul.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_aligned_mat_mul.h" +#endif + + +namespace anakin { +namespace saber { + +template +class AlignedMatMul : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + AlignedMatMulParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + AlignedMatMulParam>::BaseFunc; + + AlignedMatMul() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef AlignedMatMulParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + auto seq_offset_0 = input[0]->get_seq_offset()[0]; + auto seq_offset_1 = input[1]->get_seq_offset()[0]; + int seq_num = seq_offset_0.size() - 1; + int inner_size_A = input[0]->count_valid(1, input[0]->dims()); + int inner_size_B = input[1]->count_valid(1, input[1]->dims()); + int batch_A = seq_offset_0[1]; + int batch_B = seq_offset_1[1]; + int M = param.is_transpose_X ? inner_size_A : batch_A; + int N = param.is_transpose_Y ? batch_B : inner_size_B; + Shape output_shape({seq_num * M, N, 1, 1}, Layout_NCHW); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderAlignedMatMul ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberAlignedMatMul ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/fake_quantize_abs_max.h b/saber/funcs/anchor_generator.h similarity index 64% rename from saber/funcs/fake_quantize_abs_max.h rename to saber/funcs/anchor_generator.h index ea77a9a8d..7e7aceaf9 100644 --- a/saber/funcs/fake_quantize_abs_max.h +++ b/saber/funcs/anchor_generator.h @@ -13,45 +13,46 @@ limitations under the License. 
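// Illustrative sketch (not from the patch): how AlignedMatMul::compute_output_shape
// derives its output shape. Every sequence of input X is assumed to have the same
// length len_x (read from seq_offset[1]) and likewise len_y for Y, so each sequence
// contributes one (M x K) * (K x N) GEMM and the results are stacked into
// (seq_num * M, N). The helper and parameter names below are hypothetical.
#include <utility>

std::pair<int, int> aligned_matmul_out_dims(int seq_num,
                                            int len_x, int inner_x,   // rows / cols of one X sequence
                                            int len_y, int inner_y,   // rows / cols of one Y sequence
                                            bool transpose_x, bool transpose_y) {
    int M = transpose_x ? inner_x : len_x;   // rows of X (or X^T) per sequence
    int N = transpose_y ? len_y  : inner_y;  // cols of Y (or Y^T) per sequence
    return {seq_num * M, N};                 // output tensor shape is {seq_num*M, N, 1, 1} in NCHW
}
// Example: 3 sequences, X blocks of 5x64, Y blocks of 4x64, Y transposed ->
// each per-sequence result is 5x4 and the stacked output shape is {15, 4, 1, 1}.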
*/ -#ifndef ANAKIN_SABER_FUNCS_FAKE_QUANTIZE_ABS_MAX_H -#define ANAKIN_SABER_FUNCS_FAKE_QUANTIZE_ABS_MAX_H +#ifndef ANAKIN_SABER_FUNCS_ANCHOR_GENERATOR_H +#define ANAKIN_SABER_FUNCS_ANCHOR_GENERATOR_H #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/impl_fake_quantize_abs_max.h" +#include "saber/funcs/impl/impl_anchor_generator.h" #ifdef NVIDIA_GPU -#include "saber/funcs/impl/cuda/saber_fake_quantize_abs_max.h" +#include "saber/funcs/impl/cuda/saber_anchor_generator.h" +//#include "saber/funcs/impl/cuda/vender_anchor_generator.h" #endif #ifdef USE_X86_PLACE -#include "saber/funcs/impl/x86/saber_fake_quantize_abs_max.h" +#include "saber/funcs/impl/x86/saber_anchor_generator.h" #endif #ifdef USE_ARM_PLACE //todo -#include "saber/funcs/impl/impl_fake_quantize_abs_max.h" +#include "saber/funcs/impl/impl_anchor_generator.h" #endif namespace anakin { namespace saber { template -class FakeQuantizeAbsMax : public BaseFunc< +class AnchorGenerator : public BaseFunc< TargetType, OpDtype, ImplBase, - FakeQuantizeAbsMaxParam> { + AnchorGeneratorParam> { public: using BaseFunc< TargetType, OpDtype, ImplBase, - FakeQuantizeAbsMaxParam>::BaseFunc; + AnchorGeneratorParam>::BaseFunc; - FakeQuantizeAbsMax() = default; + AnchorGenerator() = default; typedef Tensor InDataTensor; typedef Tensor OutDataTensor; typedef Tensor OpTensor; - typedef FakeQuantizeAbsMaxParam Param_t; + typedef AnchorGeneratorParam Param_t; typedef std::vector Input_v; typedef std::vector Output_v; typedef std::vector Shape_v; @@ -60,20 +61,13 @@ class FakeQuantizeAbsMax : public BaseFunc< Param_t& param) override { SaberStatus status; CHECK_EQ(input.size(), 1); - - Shape output_shape = input[0]->valid_shape(); + CHECK_EQ(output.size(), 2); + auto anchor_sizes = param.anchor_sizes; + auto aspect_ratios = param.aspect_ratios; + int num_anchors = anchor_sizes.size() * aspect_ratios.size(); + Shape output_shape = std::vector{input[0]->height(), input[0]->width(), num_anchors, 4}; output[0]->set_shape(output_shape); - switch (param.bit_length) { - case 8: - output[0]->set_dtype(AK_INT8); - break; - case 16: - output[0]->set_dtype(AK_INT16); - break; - default: - LOG(FATAL) << "other bit length has not been supported"; - - } + output[1]->set_shape(output_shape); return SaberSuccess; } @@ -81,11 +75,11 @@ class FakeQuantizeAbsMax : public BaseFunc< virtual SaberStatus init_impl(ImplEnum implenum) override { switch (implenum) { case VENDER_IMPL: - this->_impl.push_back(new VenderFakeQuantizeAbsMax ); + this->_impl.push_back(new VenderAnchorGenerator ); return SaberSuccess; case SABER_IMPL: - this->_impl.push_back(new SaberFakeQuantizeAbsMax ); + this->_impl.push_back(new SaberAnchorGenerator ); return SaberSuccess; default: @@ -110,4 +104,4 @@ class FakeQuantizeAbsMax : public BaseFunc< } -#endif //ANAKIN_SABER_FUNCS_FAKE_QUANTIZE_ABS_MAX_H +#endif //ANAKIN_SABER_FUNCS_ANCHOR_GENERATOR_H diff --git a/saber/funcs/argmax.h b/saber/funcs/argmax.h index 221046989..302cdf6d0 100644 --- a/saber/funcs/argmax.h +++ b/saber/funcs/argmax.h @@ -28,8 +28,11 @@ #endif #ifdef USE_ARM_PLACE -//todo -#include "saber/funcs/impl/impl_argmax.h" +#include "saber/funcs/impl/arm/saber_argmax.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_argmax.h" #endif namespace anakin { diff --git a/saber/funcs/arithmetic.h b/saber/funcs/arithmetic.h new file mode 100644 index 000000000..3319dc173 --- /dev/null +++ b/saber/funcs/arithmetic.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. 
All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_ARITHMETIC_H +#define ANAKIN_SABER_FUNCS_ARITHMETIC_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_arithmetic.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_arithmetic.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_arithmetic.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_arithmetic.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_arithmetic.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_arithmetic.h" +#endif + + +namespace anakin { +namespace saber { + +template +class Arithmetic : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + ArithmeticParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + ArithmeticParam>::BaseFunc; + + Arithmetic() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ArithmeticParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + Shape output_shape = (input[0]->valid_shape()); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderArithmetic ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberArithmetic ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/attention_padding_mask.h b/saber/funcs/attention_padding_mask.h new file mode 100644 index 000000000..7d242a8b2 --- /dev/null +++ b/saber/funcs/attention_padding_mask.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_ATTENTION_PADDING_MASK_H +#define ANAKIN_SABER_FUNCS_ATTENTION_PADDING_MASK_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_attention_padding_mask.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_attention_padding_mask.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_attention_padding_mask.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/saber_attention_padding_mask.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_attention_padding_mask.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_attention_padding_mask.h" +#endif + + +namespace anakin { +namespace saber { + +template +class AttentionPaddingMask : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + AttentionPaddingMaskParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + AttentionPaddingMaskParam>::BaseFunc; + + AttentionPaddingMask() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef AttentionPaddingMaskParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + Shape output_shape = input[0]->valid_shape(); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderAttentionPaddingMask ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberAttentionPaddingMask ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/axpy.h b/saber/funcs/axpy.h index 126f7b41c..73ae51f07 100644 --- a/saber/funcs/axpy.h +++ b/saber/funcs/axpy.h @@ -18,6 +18,10 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_axpy.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_axpy.h" +#endif + #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_axpy.h" #endif @@ -27,10 +31,9 @@ #endif #ifdef USE_ARM_PLACE -//todo -#include "saber/funcs/impl/impl_axpy.h" +#include "saber/funcs/impl/arm/saber_axpy.h" #endif - + namespace anakin { namespace saber { @@ -83,7 +86,7 @@ class Axpy : public BaseFunc< } private: - + virtual void pick_best_static() override { if (true) // some condition? 
this->_best_impl = this->_impl[0]; @@ -99,4 +102,4 @@ class Axpy : public BaseFunc< } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/base.h b/saber/funcs/base.h index 4e2995235..7133947b3 100644 --- a/saber/funcs/base.h +++ b/saber/funcs/base.h @@ -18,10 +18,13 @@ #include "saber/saber_funcs_param.h" #include "saber/core/context.h" -#include "timer.h" #include #include +#ifndef USE_SGX +#include "timer.h" +#endif + namespace anakin { namespace saber { @@ -170,9 +173,11 @@ class BaseFunc { case STATIC: pick_best_static(); break; +#ifndef USE_SGX case RUNTIME: pick_best_runtime(input, output, param, ctx); break; +#endif case SPECIFY: pick_best_specify(implenum); break; @@ -187,6 +192,12 @@ class BaseFunc { //typedef std::unordered_map static_map; virtual void pick_best_static() = 0; +#ifdef USE_SGX + virtual void pick_best_runtime(const Input_v& input, Output_v& output, Param_t& param, \ + Context &ctx) { + _best_impl = _impl[0]; + } +#else virtual void pick_best_runtime(const Input_v& input, Output_v& output, Param_t& param, \ Context &ctx) { @@ -230,7 +241,8 @@ class BaseFunc { _best_impl = _impl[idx]; } - +#endif + virtual void pick_best_specify(ImplEnum implenum) = 0; }; diff --git a/saber/funcs/box_clip.h b/saber/funcs/box_clip.h new file mode 100644 index 000000000..5961aa2ff --- /dev/null +++ b/saber/funcs/box_clip.h @@ -0,0 +1,84 @@ +#ifndef ANAKIN_SABER_FUNCS_BOX_CLIP_H +#define ANAKIN_SABER_FUNCS_BOX_CLIP_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_box_clip.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_box_clip.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_box_clip.h" +#endif + +namespace anakin { +namespace saber { + +template + +class BoxClip : public BaseFunc < + TargetType, + OpDtype, + ImplBase, + EmptyParam + > { +public: + using BaseFunc < + TargetType, + OpDtype, + ImplBase, + EmptyParam >::BaseFunc; + + BoxClip() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef EmptyParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, + Output_v& output, Param_t& param) override { + + output[0]->set_seq_offset(input[1]->get_seq_offset()); + return output[0]->set_shape_without_layout(input[1]->valid_shape()); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderBoxClip ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberBoxClip ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) { // some condition? + this->_best_impl = this->_impl[0]; + } + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif //ANAKIN_BOX_CLIP_H diff --git a/saber/funcs/box_coder.h b/saber/funcs/box_coder.h new file mode 100644 index 000000000..3717003a7 --- /dev/null +++ b/saber/funcs/box_coder.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
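// Illustrative sketch (not from the patch): the SGX change in BaseFunc keeps the
// public strategy API but compiles the timing-based RUNTIME selection out inside
// an enclave, where a trustworthy wall clock is unavailable, and falls back to the
// first registered implementation. A minimal standalone version of that pattern,
// with hypothetical names (the patch itself times with saber::SaberTimer):
#include <chrono>
#include <limits>
#include <vector>

struct Impl {
    virtual void dispatch() = 0;   // stands in for the real forward call
    virtual ~Impl() = default;
};

struct FuncBase {
    std::vector<Impl*> impls;
    Impl* best = nullptr;

#ifdef USE_SGX
    // No reliable timer inside the enclave: pick the first (reference) impl,
    // mirroring the pick_best_runtime stub added above.
    void pick_best_runtime() { best = impls.front(); }
#else
    // Outside SGX: time each candidate on real inputs and keep the fastest.
    void pick_best_runtime() {
        double best_ms = std::numeric_limits<double>::max();
        for (Impl* impl : impls) {
            auto t0 = std::chrono::steady_clock::now();
            impl->dispatch();
            std::chrono::duration<double, std::milli> dt =
                std::chrono::steady_clock::now() - t0;
            if (dt.count() < best_ms) { best_ms = dt.count(); best = impl; }
        }
    }
#endif
};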
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_BOX_CODER_H +#define ANAKIN_SABER_FUNCS_BOX_CODER_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_box_coder.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_box_coder.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_box_coder.h" +#endif + +namespace anakin { +namespace saber { + +template +class BoxCoder : public BaseFunc < + TargetType, + OpDtype, + ImplBase, + BoxCoderParam > { +public: + using BaseFunc < + TargetType, + OpDtype, + ImplBase, + BoxCoderParam >::BaseFunc; + + BoxCoder() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef BoxCoderParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, + Output_v& output, Param_t& param) override { + auto prior_box_tensor = input[0]; + auto loc_tensor = input[1]; + output[0]->set_seq_offset(loc_tensor->get_seq_offset()); + + if (param.axis == 0) { + CHECK_EQ(prior_box_tensor->num(), loc_tensor->channel()); + } else if (param.axis == 1) { + CHECK_EQ(prior_box_tensor->num(), loc_tensor->num()); + } else { + LOG(FATAL) << "invalid axis " << param.axis; + } + CHECK_EQ(prior_box_tensor->channel(), loc_tensor->width() + 1); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape(loc_tensor->valid_shape()); + + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderBoxCoder ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberBoxCoder ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) { // some condition? 
+ this->_best_impl = this->_impl[0]; + } + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin +#endif //ANAKIN_SABER_FUNCS_BOX_CODER_H diff --git a/saber/funcs/calibrate.h b/saber/funcs/calibrate.h index 4e36e2c29..a7ffafdf1 100644 --- a/saber/funcs/calibrate.h +++ b/saber/funcs/calibrate.h @@ -7,6 +7,13 @@ namespace anakin { namespace saber { +// keep origin layout +template +SaberStatus flatten_calibrate( + Tensor &out_tensor, + const Tensor &in_tensor, + Context &ctx); + template SaberStatus conv_calibrate_fp32_int8_c4( Tensor &out_tensor, @@ -26,6 +33,29 @@ SaberStatus conv_calibrate_int8_c4_fp32( const Tensor &in_tensor, const float* weight_scale, Context ctx); +template +SaberStatus calibrate_int8_c4_fp32( + Tensor &out_tensor, + const Tensor &in_tensor, + const float out_scale, + Context ctx); +template +SaberStatus conv_data_calibrate(Tensor &out_tensor, + const Tensor &in_tensor, + const float in_scale, + const float* weight_scale, + Context ctx); + +template +SaberStatus layout_trans_nchwc4_2_nchw( + Tensor &out_tensor, + const Tensor &in_tensor, + float scale, + Context ctx); template void float2char(bool col_direct, signed char* dst, const float* src, @@ -40,39 +70,52 @@ void fix2float(float * dst, template SaberStatus get_tensor_scale(std::vector &vector_scale, - const Tensor &tensor, const int axis) { + const Tensor &tensor, const int axis, bool scale_per_k) { int out_dims = tensor.valid_shape()[axis]; - vector_scale.resize(out_dims); - long long inner_dim = tensor.count_valid(axis + 1, tensor.dims()); + if (scale_per_k) { + vector_scale.resize(out_dims); + } else { + vector_scale.resize(1); + } const float* in_data = (const float*)(tensor.data()); + if (scale_per_k) { + long long inner_dim = tensor.count_valid(axis + 1, tensor.dims()); + for (int c = 0; c < out_dims; ++c) { + float max_val = -1.f; - for (int c = 0; c < out_dims; ++c) { - float max_val = -1.f; + for (int i = 0; i < inner_dim; ++i) { + float read_data = fabs(in_data[i]); + max_val = (read_data > max_val) ? read_data : max_val; + } - for (int i = 0; i < inner_dim; ++i) { + vector_scale[c] = max_val / 127.f; + in_data += inner_dim; + } + } else { + long long count = tensor.valid_size(); + float max_val = -1.f; + for (int i = 0; i < count; ++i) { float read_data = fabs(in_data[i]); max_val = (read_data > max_val) ? 
read_data : max_val; } - - vector_scale[c] = max_val / 127.f; - in_data += inner_dim; + vector_scale[0] = max_val / 127.f; } + return SaberSuccess; } template -SaberStatus convert_weights_to_nchw_c4_host(Tensor& out_tensor, +SaberStatus scale_conv_weights_to_nchw_host(Tensor& out_tensor, const Tensor& in_tensor, Context ctx) { - - int input_channel = in_tensor.channel(); - int output_channel = out_tensor.num(); + CHECK_EQ(in_tensor.data(),AK_FLOAT)<<"input must be ak_float"; + CHECK_EQ(out_tensor.data(),AK_INT8)<<"output must be int 8"; std::vector vector_weight_scale; get_tensor_scale(vector_weight_scale, in_tensor, 0); int o_num = out_tensor.num(); - int o_channel = out_tensor.valid_shape()[1]; + int o_channel = out_tensor.channel(); int o_height = out_tensor.height(); int o_width = out_tensor.width(); @@ -86,52 +129,317 @@ SaberStatus convert_weights_to_nchw_c4_host(Tensor& out_tensor, for (int idx = 0; idx < o_num * o_channel * o_height * o_width; ++idx) { + int n = (idx / (out_n_stride)) % o_num; + + out_weight_data[idx]= static_cast(in_weight_data[idx]/vector_weight_scale[n]); + + } + out_tensor.set_scale(vector_weight_scale); + + return SaberSuccess; +} + +template +SaberStatus convert_weights_to_nchw_c4_host(Tensor& out_tensor, + const Tensor& in_tensor, const Context &ctx, + bool scale_per_k = false) { + + int output_channel = out_tensor.num(); + std::vector vector_weight_scale; + get_tensor_scale(vector_weight_scale, in_tensor, 0, scale_per_k); + + int o_num = out_tensor.num(); + int out_channel = in_tensor.channel(); + int out_channel_4 = in_tensor.channel() / 4; + bool channel_rest_4 = (out_channel & 0x3) != 0; + out_channel_4 += channel_rest_4 ? 1 : 0; + int o_height = out_tensor.height(); + int o_width = out_tensor.width(); + + int out_n_stride = out_channel_4 * o_height * o_width; + int out_c_stride = o_height * o_width; + int out_h_stride = o_width; + + Shape in_stride = in_tensor.get_stride(); + const float* in_weight_data = (const float*)in_tensor.data(); + char* out_weight_data = (char*)out_tensor.mutable_data(); + + for (int idx = 0; idx < o_num * out_channel_4 * o_height * o_width; ++idx) { + int n = (idx / (out_n_stride)) % o_num; int in_offset = ((idx / (out_n_stride)) % o_num) * in_stride[0] - + ((idx / (out_c_stride)) % o_channel) * (in_stride[1] * 4) + + ((idx / (out_c_stride)) % out_channel_4) * (in_stride[1] * 4) + ((idx / (out_h_stride)) % o_height) * in_stride[2] + (idx % o_width) * in_stride[3]; - + int read_channel = ((idx / (out_c_stride)) % out_channel_4); int out_offset = ((idx / (out_n_stride)) % o_num) * out_n_stride - + ((idx / (out_c_stride)) % o_channel) * out_c_stride + + ((idx / (out_c_stride)) % out_channel_4) * out_c_stride + ((idx / (out_h_stride)) % o_height) * out_h_stride + (idx % o_width); + float scale = scale_per_k ? 
vector_weight_scale[n] : vector_weight_scale[0]; + bool p0, p1, p2, p3; + p0 = (4 * read_channel + 0) < out_channel; + p1 = (4 * read_channel + 1) < out_channel; + p2 = (4 * read_channel + 2) < out_channel; + p3 = (4 * read_channel + 3) < out_channel; + float read; + if (p0) { + read = in_weight_data[in_offset + 0 * in_stride[1]]; + } else { + read = 0.f; + } + out_weight_data[out_offset * 4 + 0] = (char)(round(read / scale)); + if (p1) { + read = in_weight_data[in_offset + 1 * in_stride[1]]; + } else { + read = 0; + } + out_weight_data[out_offset * 4 + 1] = (char)(round(read / scale)); + if (p2) { + read = in_weight_data[in_offset + 2 * in_stride[1]]; + } else { + read = 0; + } + out_weight_data[out_offset * 4 + 2] = (char)(round(read / scale)); + if (p3) { + read = in_weight_data[in_offset + 3 * in_stride[1]]; + } else { + read = 0; + } + out_weight_data[out_offset * 4 + 3] = (char)(round(read / scale)); + } + out_tensor.set_scale(vector_weight_scale); +// for (auto i : vector_weight_scale) { +// LOG(INFO) << i; +// } + return SaberSuccess; +} +template +SaberStatus layout_trans_depthwise( + dtype* out_ptr, const dtype* in_ptr, + int num, int height, int width) { + // layout transform + int num_4 = num >> 2; + num_4 += ((num & 0x3) == 0) ? 0 : 1; + for (int n = 0; n < num_4; ++n) { + for (int i = 0; i < height * width; ++i) { + int in_idx = i + (n * 4) * height * width; + int out_idx = (n * height * width + i) * 4; + out_ptr[out_idx] = in_ptr[in_idx]; + if (n * 4 + 1 < num) { + in_idx += height * width; + out_ptr[out_idx + 1] = in_ptr[in_idx]; + } + if (n * 4 + 2 < num) { + in_idx += height * width; + out_ptr[out_idx + 2] = in_ptr[in_idx]; + } + if (n * 4 + 3 < num) { + in_idx += height * width; + out_ptr[out_idx + 3] = in_ptr[in_idx]; + } + } + } + return SaberSuccess; +} + +template +SaberStatus convert_weights_to_depthwise(Tensor& out_tensor, + const Tensor& in_tensor, const Context &ctx, + bool scale_per_k = false) { + + Tensor weight_temp; + weight_temp.re_alloc(in_tensor.valid_shape(), AK_INT8); - out_weight_data[out_offset * 4 + 0] = (char)(round( - in_weight_data[in_offset + 0 * in_stride[1]] / vector_weight_scale[n])); - out_weight_data[out_offset * 4 + 1] = (char)(round( - in_weight_data[in_offset + 1 * in_stride[1]] / vector_weight_scale[n])); - out_weight_data[out_offset * 4 + 2] = (char)(round( - in_weight_data[in_offset + 2 * in_stride[1]] / vector_weight_scale[n])); - out_weight_data[out_offset * 4 + 3] = (char)(round( - in_weight_data[in_offset + 3 * in_stride[1]] / vector_weight_scale[n])); + std::vector vector_weight_scale; + get_tensor_scale(vector_weight_scale, in_tensor, 0, scale_per_k); + + int num = in_tensor.num(); + int channel = in_tensor.channel(); + int height = in_tensor.height(); + int width = in_tensor.width(); + int count = in_tensor.valid_size(); + int out_n_stride = channel * height * width; + const float* in_weight_data = (const float*)in_tensor.data(); + char* weight_temp_data = (char*)weight_temp.mutable_data(); + char* out_tensor_data = (char*)out_tensor.mutable_data(); + + for (int i = 0; i < count; ++i) { + int n = (i / (out_n_stride)) % num; + float scale = scale_per_k ? 
vector_weight_scale[n] : vector_weight_scale[0]; + weight_temp_data[i] = (char)(round( + in_weight_data[i] / scale)); } + // finished scale + layout_trans_depthwise( + out_tensor_data, weight_temp_data, num, height, width); out_tensor.set_scale(vector_weight_scale); + return SaberSuccess; +} + +template +SaberStatus convert_weights_to_direct(Tensor& out_tensor, + const Tensor& in_tensor, const Context &ctx, + bool scale_per_k = false) { + + Tensor weight_temp; + weight_temp.re_alloc(in_tensor.valid_shape(), AK_INT8); +// CHECK_EQ((in_tensor.channel() % 4), 0); +// CHECK_EQ((in_tensor.num() % 4), 0); + int input_channel = in_tensor.channel(); + int output_channel = in_tensor.num(); + std::vector vector_weight_scale; + get_tensor_scale(vector_weight_scale, in_tensor, 0, scale_per_k); + + int num = in_tensor.num(); + int channel = in_tensor.channel(); + int channel_4 = channel >> 2; + bool channel_rest_4 = (channel & 0x3) != 0; + channel_4 += channel_rest_4 ? 1 : 0; + int height = in_tensor.height(); + int width = in_tensor.width(); + int out_n_stride = channel * height * width; + int out_c_stride = height * width; + int out_h_stride = width; + + Shape in_stride = in_tensor.get_stride(); + const float* in_weight_data = (const float*)in_tensor.data(); + char* out_weight_data = (char*)out_tensor.mutable_data(); + // data scale + for (int idx = 0; idx < num * channel * height * width; ++idx) { + int n = (idx / (out_n_stride)) % num; + float scale = scale_per_k ? vector_weight_scale[n] : vector_weight_scale[0]; + out_weight_data[idx] = (char)(round( + in_weight_data[idx] / scale)); + } + // finished scale + // layout transform + char *weight_temp_ptr = (char*)weight_temp.mutable_data(); + const int in_loop = in_tensor.channel() * in_tensor.height() * in_tensor.width(); + for (int var_k = 0; var_k < in_tensor.num(); var_k++) { + for (int var_crs = 0; var_crs < in_loop; var_crs++) { + weight_temp_ptr[var_crs * in_tensor.num() + var_k] = + out_weight_data[var_k * in_loop + var_crs]; + } + } + int read_in = 0; + int write_out = 0; + const int out_loop = channel_4; + const int inner_loop = in_tensor.num() * in_tensor.height() * in_tensor.width() * 4; + for (int i = 0; i < out_loop; ++i) { + for (int j = 0; j < inner_loop; ++j) { + write_out = i * inner_loop + j; + if ((i * 4 + j % 4) < channel) { + read_in = ((i * 4) + (j % 4)) * (inner_loop / 4) + j / 4; + out_weight_data[write_out] = weight_temp_ptr[read_in]; + } else { + out_weight_data[write_out] = 0; + } + } + } + // finished transform + + out_tensor.set_scale(vector_weight_scale); + // for (auto i : vector_weight_scale) { // LOG(INFO) << i; // } return SaberSuccess; } + template SaberStatus convert_bias_host(Tensor& out_tensor, - const Tensor& in_tensor, - float in_scale, std::vector vector_weight_scale, - Context ctx) { + const Tensor& in_tensor, + float in_scale, std::vector vector_weight_scale, + Context ctx, bool scale_per_k = false) { unsigned long weight_size = vector_weight_scale.size(); unsigned long bias_size = in_tensor.size(); - CHECK_GT(in_scale, 0); - CHECK_GT(weight_size, 0); - CHECK_EQ(bias_size, weight_size); + CHECK_GT(in_scale, 0); + CHECK_GT(weight_size, 0); const float* in_data = (const float*)in_tensor.data(); float* out_data = (float*)out_tensor.mutable_data(); for (int i = 0; i < bias_size; ++i) { - out_data[i] = in_data[i] / in_scale / vector_weight_scale[i]; + float weights_scale = (scale_per_k && weight_size != 1) ? 
vector_weight_scale[i] : vector_weight_scale[0]; + out_data[i] = in_data[i] / in_scale / weights_scale; } return SaberSuccess; } +template +void transpose_filter_kcrs_2_crskc4(const Dtype *input, Dtype *temp, Dtype *output, \ + int K, int C, int R, int S) { + const int CRS = C * R * S; + for (int var_k = 0; var_k < K; var_k++) { + for (int var_crs = 0; var_crs < CRS; var_crs++) { + temp[var_crs * K + var_k] = input[var_k * CRS + var_crs]; + } + } + int read_in = 0; + int write_out = 0; + int out_loop = C / 4; + int inner_loop = K * R * S * 4; + for (int i = 0; i < out_loop; ++i) { + for (int j = 0; j < inner_loop; ++j) { + write_out = i * inner_loop + j; + read_in = ((i * 4) + (j % 4)) * (inner_loop / 4) + j / 4; + output[write_out] = temp[read_in]; + } + } +} +template +void transpose_weight_nchw_2_nchwc4(const Dtype* input, Dtype *output, + int N, int C, int H, int W) { + + int out_n = N; + int out_c = ((C + 3) >> 2); + int out_h = H; + int out_w = W * 4; + + for (int o_n = 0; o_n < out_n; ++o_n) { + for (int o_c = 0; o_c < out_c; ++o_c) { + for (int o_h = 0; o_h < out_h; ++o_h) { + for (int o_w = 0; o_w < out_w; ++o_w) { + int i_c = o_c * 4 + (o_w & 0x3); + int read_idx = o_n * C * H * W + + i_c * H * W + + o_h * W + + (o_w / 4); + int write_idx = o_n * out_c * out_h * out_w + + o_c * out_h * out_w + + o_h * out_w + + o_w; + if (i_c < C) { + output[write_idx] = input[read_idx]; + } else { + output[write_idx] = 0; + } + } + } + } + } +} +//// reverse quantization +//template +//class Dequantization { +//public: +// +//}; +// +//// high precision quantize to low precision +//template +//class Quantization { +//public: +// +//}; +// +//// scale transform while keep precision +//template +//class Requantization { +//public: +// +//}; } // namespace saber } // namespace anakin diff --git a/saber/funcs/cast.h b/saber/funcs/cast.h index 265783f70..0faa4fb07 100644 --- a/saber/funcs/cast.h +++ b/saber/funcs/cast.h @@ -18,6 +18,9 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_cast.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_cast.h" +#endif #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_cast.h" #endif @@ -27,8 +30,7 @@ #endif #ifdef USE_ARM_PLACE -//todo -#include "saber/funcs/impl/impl_cast.h" +#include "saber/funcs/impl/arm/saber_cast.h" #endif namespace anakin { @@ -98,4 +100,4 @@ class Cast : public BaseFunc< } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/concat.h b/saber/funcs/concat.h index ba45d5ee7..569dc5e51 100644 --- a/saber/funcs/concat.h +++ b/saber/funcs/concat.h @@ -19,6 +19,10 @@ #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_concat.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_concat.h" +#endif + #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_concat.h" #endif @@ -77,13 +81,14 @@ class Concat : public BaseFunc< for (int i = 1; i < input_size; ++i) { Shape sh = shapes_in[i]; for (int j = 0; j < sh.dims(); ++j) { + CHECK_EQ(sh.get_layout(), shape_out.get_layout()) << "This should be same"; if (j == param.axis) { continue; } else if (sh[j] != -1) { CHECK_EQ(shape_out[j], sh[j]) \ << "All inputs must have the same shape, except at concat_axis."; } else { - sh[j] = shape_out[j]; - SABER_CHECK(input[i]->set_shape(sh)); +// sh[j] = shape_out[j]; +// SABER_CHECK(input[i]->set_shape(sh)); } } shape_out[param.axis] += sh[param.axis]; diff --git a/saber/funcs/conv.h b/saber/funcs/conv.h index 
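// Illustrative sketch (not from the patch): the arithmetic behind the calibrate.h
// helpers above. A weight scale is max(|w|)/127 (for the whole tensor, or per
// output channel when scale_per_k is set), weights are stored as round(w/scale)
// in int8, and the fp32 bias is pre-divided by in_scale*weight_scale so the int8
// GEMM of (x/in_scale) and (w/weight_scale) can be rescaled with one multiply.
// Standalone helpers with hypothetical names; the NCHW_C4 packing is omitted.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

float tensor_scale(const std::vector<float>& w) {
    float max_abs = 0.f;
    for (float v : w) max_abs = std::max(max_abs, std::fabs(v));
    return max_abs / 127.f;                       // same rule as get_tensor_scale
}

std::vector<int8_t> quantize_weights(const std::vector<float>& w, float scale) {
    std::vector<int8_t> q(w.size());
    for (std::size_t i = 0; i < w.size(); ++i)
        q[i] = static_cast<int8_t>(std::round(w[i] / scale));
    return q;
}

std::vector<float> rescale_bias(const std::vector<float>& bias,
                                float in_scale, float weight_scale) {
    std::vector<float> out(bias.size());
    for (std::size_t i = 0; i < bias.size(); ++i)
        out[i] = bias[i] / in_scale / weight_scale;   // matches convert_bias_host
    return out;
}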
414edd837..96d142d82 100644 --- a/saber/funcs/conv.h +++ b/saber/funcs/conv.h @@ -27,15 +27,23 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_conv.h" +#ifndef USE_SGX +#include "saber/funcs/impl/x86/vender_conv.h" +#endif #endif #ifdef USE_ARM_PLACE -//#include "saber/funcs/impl/arm/saber_conv.h" +#include "saber/funcs/impl/arm/saber_conv.h" #endif #ifdef USE_BM_PLACE //#include "saber/funcs/impl/bm/vender_conv.h" #endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_conv.h" +#include "saber/funcs/impl/amd/include/vender_conv.h" +#endif namespace anakin { namespace saber { @@ -67,7 +75,8 @@ class Conv : public BaseFunc< Output_v &output, Param_t ¶m) override { Shape conv_shape = conv_compute_shape(input[0]->valid_shape(), param); output[0]->set_seq_offset(input[0]->get_seq_offset()); - return output[0]->set_shape(conv_shape); + Shape result=Shape::cvt_shape(conv_shape,output[0]->get_layout()); + return output[0]->set_shape_without_layout(result); } virtual SaberStatus init_impl(ImplEnum implenum) override { diff --git a/saber/funcs/conv_pooling.h b/saber/funcs/conv_pooling.h index da105870e..bf14ba823 100644 --- a/saber/funcs/conv_pooling.h +++ b/saber/funcs/conv_pooling.h @@ -29,6 +29,14 @@ #include "saber/funcs/impl/x86/saber_conv_pooling.h" #endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_conv_pooling.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_conv_pooling.h" +#include "saber/funcs/impl/amd/include/vender_conv_pooling.h" +#endif namespace anakin { namespace saber { diff --git a/saber/funcs/coord2patch.h b/saber/funcs/coord2patch.h new file mode 100644 index 000000000..b61b690d3 --- /dev/null +++ b/saber/funcs/coord2patch.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_COORD2PATCH_H +#define ANAKIN_SABER_FUNCS_COORD2PATCH_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_coord2patch.h" + +namespace anakin { + +namespace saber { + +template +class Coord2Patch : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + Coord2PatchParam +> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + Coord2PatchParam>::BaseFunc; + + Coord2Patch() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef Coord2PatchParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ + Param_t& param) override { + CHECK_GT(input.size(), 1) << "coord2patch need 2 inputs"; + Shape output_shape = input[1]->valid_shape(); + output_shape[2] = param.output_h; + output_shape[3] = param.output_w; + output[0]->set_shape(output_shape); + return SaberSuccess; + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderCoord2Patch ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberCoord2Patch ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) // some condition? + this->_best_impl = this->_impl[0]; + } + + //virtual void pick_best_runtime(Input_v input, Output_v output, Param_t& param) override {} + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} +} + +#endif //ANAKIN_SABER_FUNCS_COORD2PATCH_H diff --git a/saber/funcs/cos_sim.h b/saber/funcs/cos_sim.h new file mode 100644 index 000000000..314c2bccd --- /dev/null +++ b/saber/funcs/cos_sim.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_COS_SIM_H +#define ANAKIN_SABER_FUNCS_COS_SIM_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_cos_sim.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_cos_sim.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_cos_sim.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_cos_sim.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_cos_sim.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_cos_sim.h" +#endif + + +namespace anakin { +namespace saber { + +template +class CosSim : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + CosSimParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + CosSimParam>::BaseFunc; + + CosSim() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef CosSimParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + Shape output_shape({input[0]->num(), 1, 1, 1}, Layout_NCHW); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + //this->_impl.push_back(new VenderCosSim _impl.push_back(new VenderCosSim ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberCosSim ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/crop.h b/saber/funcs/crop.h index 2fbc930de..f4b7c262d 100644 --- a/saber/funcs/crop.h +++ b/saber/funcs/crop.h @@ -31,6 +31,9 @@ #include "saber/funcs/impl/impl_crop.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_crop.h" +#endif namespace anakin { namespace saber { diff --git a/saber/funcs/ctc_align.h b/saber/funcs/ctc_align.h index 7435d03d9..4d3ab2590 100644 --- a/saber/funcs/ctc_align.h +++ b/saber/funcs/ctc_align.h @@ -19,6 +19,9 @@ #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_ctc_align.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_ctc_align.h" +#endif #ifdef NVIDIA_GPU //#include "saber/funcs/impl/cuda/saber_ctc_align.h" #endif diff --git a/saber/funcs/debug.h b/saber/funcs/debug.h index 031423e7f..503c58611 100644 --- a/saber/funcs/debug.h +++ b/saber/funcs/debug.h @@ -16,7 +16,17 @@ #ifndef ANAKIN_SABER_FUNCS_DEBUG_H #define ANAKIN_SABER_FUNCS_DEBUG_H -#include "tensor.h" +#include "anakin_config.h" +#include +#include +#include +#include + +#ifndef USE_SGX +#include "saber/core/tensor.h" +#include "saber/core/tensor_op.h" +#include "saber/core/tensor.h" +#include "saber/funcs/saber_util.h" namespace anakin { namespace saber { @@ -34,44 +44,497 @@ template <> struct DefaultHostType { typedef ARM Host_type; }; +template +std::string to_string(T value) +{ + std::ostringstream os ; + os << value; + return os.str(); +} +template +static void reorder_nhwc_nchw(const Tensor& input, + Tensor& output) { + + + + + int n_value = input.num(); + int c_value = 
input.channel(); + int h_value = input.height(); + int w_value = input.width(); + + if (input.get_layout() == Layout_NHWC && output.get_layout() == Layout_NCHW) { + if (input.get_dtype() == AK_INT8 && output.get_dtype() == AK_FLOAT) { + float* output_ptr = static_cast(output.mutable_data()); + CHECK(input.get_scale().size() >= 1); + float scale = input.get_scale()[0]; + const int8_t* input_ptr = static_cast(input.data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + int out_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + output_ptr[out_index] = input_ptr[in_index] * scale; + } + } + } + } + } else if (input.get_dtype() == AK_UINT8 && output.get_dtype() == AK_FLOAT) { + LOG(INFO) << "print uint 8"; + CHECK(input.get_scale().size() >= 1); + float scale = (input.get_scale()[0]) * (127.f / 255.f); + LOG(INFO) << "scale = " << scale; + double sum = 0.0; + double max = 0.0; + const uint8_t* input_ptr = static_cast(input.data()); + float* output_ptr = static_cast(output.mutable_data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + int out_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + output_ptr[out_index] = (float)input_ptr[in_index] * scale; + sum += output_ptr[out_index]; + max = output_ptr[out_index] > max ? output_ptr[out_index] : max; + } + } + } + } + + LOG(INFO) << "avg = " << (sum / input.valid_size()) << "," << max; + } else if (input.get_dtype() == AK_UINT8 && output.get_dtype() == AK_UINT8) { + LOG(INFO) << "reorder uint 8"; + uint8_t* output_ptr = static_cast(output.mutable_data()); + const uint8_t* input_ptr = static_cast(input.data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + int out_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + output_ptr[out_index] = input_ptr[in_index]; + } + } + } + } + } else if (input.get_dtype() == AK_FLOAT && output.get_dtype() == AK_FLOAT) { + const float* input_ptr = static_cast(input.data()); + float* output_ptr = static_cast(output.mutable_data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + int out_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + output_ptr[out_index] = input_ptr[in_index]; + } + } + } + } + } else { + LOG(FATAL) << "not support input type " << input.get_dtype(); + } + } else if (input.get_layout() == Layout_NCHW && output.get_layout() == Layout_NHWC) { + if (input.get_dtype() == AK_FLOAT && output.get_dtype() == AK_FLOAT) { + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + 
int in_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + int out_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + output_ptr[out_index] = input_ptr[in_index]; + } + } + } + } + } else if (input.get_dtype() == AK_UINT8 && output.get_dtype() == AK_UINT8) { + uint8_t* output_ptr = static_cast(output.mutable_data()); + const uint8_t* input_ptr = static_cast(input.data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + int out_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + output_ptr[out_index] = input_ptr[in_index]; + } + } + } + } + } else if (input.get_dtype() == AK_INT8 && output.get_dtype() == AK_INT8) { + int8_t* output_ptr = static_cast(output.mutable_data()); + const int8_t* input_ptr = static_cast(input.data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + int out_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + output_ptr[out_index] = input_ptr[in_index]; + } + } + } + } + } else if (input.get_dtype() == AK_FLOAT && output.get_dtype() == AK_INT8) { + CHECK(output.get_scale().size() >= 1); + float scale = 1.f / (output.get_scale()[0]); + int8_t* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + int out_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + output_ptr[out_index] = saturate(roundf(input_ptr[in_index] * scale)); + } + } + } + } + } else if (input.get_dtype() == AK_FLOAT && output.get_dtype() == AK_UINT8) { + CHECK(output.get_scale().size() >= 1); + float scale = 1.f / (output.get_scale()[0]* (127.f / 255.f)); + uint8_t* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int in_index = n * c_value * h_value * w_value + c * h_value * w_value + h * w_value + w; + int out_index = n * h_value * w_value * c_value + h * w_value * c_value + w * c_value + c; + output_ptr[out_index] = saturate(roundf(input_ptr[in_index] * scale)); + } + } + } + } + }else { + LOG(FATAL) << "not support in/ou type " << input.get_dtype() << "," << output.get_dtype(); + } + } else { + LOG(FATAL) << "not support layout " << input.get_layout() << "," << output.get_layout(); + } + +} + +template +static void reorder_nchwc_nchw(Tensor& input, + Tensor& output) { + if (input.valid_shape() == output.valid_shape()) { + output.copy_from(input); + return; + } + + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + LayoutType in_layout = input.get_layout(); + LayoutType out_layout = output.get_layout(); + bool is_nchwc_nchw = (in_layout == Layout_NCHW_C16R || in_layout == Layout_NCHW_C8R) + && (out_layout == Layout_NCHW); + 
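// Illustrative sketch (not from the patch): the index arithmetic used by
// reorder_nhwc_nchw above. For a tensor of shape (N, C, H, W), element (n, c, h, w)
// lives at n*H*W*C + h*W*C + w*C + c in NHWC storage and at
// n*C*H*W + c*H*W + h*W + w in NCHW storage; the int8/uint8 branches additionally
// multiply by the activation scale while copying. A standalone, type-agnostic
// version with hypothetical names:
#include <cstddef>

template <typename T>
void nhwc_to_nchw(const T* src, T* dst, int N, int C, int H, int W) {
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
            for (int h = 0; h < H; ++h)
                for (int w = 0; w < W; ++w) {
                    std::size_t nhwc = ((std::size_t(n) * H + h) * W + w) * C + c;
                    std::size_t nchw = ((std::size_t(n) * C + c) * H + h) * W + w;
                    dst[nchw] = src[nhwc];
                }
}
// The inverse (NCHW -> NHWC) simply swaps the two index expressions; a
// dequantizing copy would instead write
//   dst[nchw] = static_cast<float>(src[nhwc]) * scale;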
bool is_nchw_nchwc = (out_layout == Layout_NCHW_C16R || out_layout == Layout_NCHW_C8R) + && (in_layout == Layout_NCHW); + CHECK(is_nchw_nchwc || is_nchwc_nchw) << "not support " << input.get_layout(); + + if (is_nchwc_nchw) { + Shape shape = output.valid_shape(); + int n_value = shape[0]; + int c_value = shape[1]; + int h_value = shape[2]; + int w_value = shape[3]; + Shape shape_input = input.valid_shape(); + int aligned_length = shape_input.get_layout_aligned_length(); + CHECK_GT(aligned_length, 0) << "input aligned should > 0"; + int c_round_divk = shape_input[1]; + + c_round_divk = (shape_input.channel() + aligned_length - 1) / aligned_length; + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(4) schedule(static) + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + //#pragma ivdep + for (int w = 0; w < w_value; ++w) { + int round_c = c / aligned_length; + int remainder_c = c % aligned_length; + int input_idx = n * c_round_divk * h_value * w_value * aligned_length + round_c * h_value * + w_value * aligned_length + + h * w_value * aligned_length + w * aligned_length + remainder_c; + int output_idx = n * c_value * h_value * w_value + c * h_value * w_value + + h * w_value + w ; + + *(output_ptr + output_idx) = input_ptr[input_idx]; + } + } + } + } + } else if (is_nchw_nchwc) { + Shape shape = input.valid_shape(); + int n_value = shape[0], c_value = shape[1], h_value = shape[2], w_value = shape[3]; + + int aligned_length = output.valid_shape().get_layout_aligned_length(); + CHECK_GT(aligned_length, 0) << "input aligned should > 0"; + + int c_round_divk = (c_value + aligned_length - 1) / aligned_length; + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(5) schedule(static) + + for (int n = 0; n < n_value; ++n) { + for (int c_idx = 0; c_idx < c_round_divk; ++c_idx) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + for (int c = 0; c < aligned_length; ++c) { + int input_idx = n * c_value * h_value * w_value + (c_idx * aligned_length + c) * h_value * w_value + + h * w_value + w; + int output_idx = n * c_round_divk * h_value * w_value * aligned_length + c_idx * h_value * w_value * + aligned_length + + h * w_value * aligned_length + w * aligned_length + c; + + *(output_ptr + output_idx) = ((c_idx * aligned_length + c) < c_value) ? 
* + (input_ptr + input_idx) : 0; + } + } + } + } + } + + } else { + LOG(FATAL) << "not support this shape"; + } + + +} + +template +static void reorder_nchwc8_nchw(Tensor& input, + Tensor& output) { + + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = output.valid_shape(); + int n_value = shape[0]; + int c_value = shape[1]; + int h_value = shape[2]; + int w_value = shape[3]; + Shape shape_input = input.valid_shape(); + int c_round_div8 = shape_input[1]; + + if (input.get_layout() == Layout_NCHW_C8R) { + c_round_div8 = (shape_input.channel() + 7) / 8; + } + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(4) schedule(static) + + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + //#pragma ivdep + for (int w = 0; w < w_value; ++w) { + int round_c = c / 8; + int remainder_c = c % 8; + int input_idx = n * c_round_div8 * h_value * w_value * 8 + round_c * h_value * w_value * 8 + + h * w_value * 8 + w * 8 + remainder_c; + int output_idx = n * c_value * h_value * w_value + c * h_value * w_value + + h * w_value + w ; + + *(output_ptr + output_idx) = input_ptr[input_idx]; + } + } + } + } +} + +template +inline void calibrate_int8c4_to_fp32_host(Tensor& host_tensor, + const Tensor & int8_tensor) { + + CHECK_EQ(host_tensor.get_dtype(), AK_FLOAT); + CHECK_EQ(host_tensor.get_layout(), Layout_NCHW); + CHECK_EQ(int8_tensor.get_dtype(), AK_INT8); + CHECK_EQ(int8_tensor.get_layout(), Layout_NCHW_C4); + CHECK_EQ(host_tensor.valid_size(), int8_tensor.valid_size()); + CHECK_GE(int8_tensor.get_scale().size(), 1); + + Shape out_stride = host_tensor.get_stride(); + Shape in_shape = int8_tensor.valid_shape(); + Shape out_shape = host_tensor.valid_shape(); + int valid_width = in_shape.width(); + int valid_height = in_shape.height(); + int valid_channel_4 = in_shape.channel() / 4; + int valid_num = in_shape.num(); + int in_n_stride = in_shape[1] * in_shape[2] * in_shape[3] / 4; + int in_c_stride = in_shape[2] * in_shape[3]; + int in_h_stride = in_shape[3]; + int in_w_stride = 1; + + int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3] / 4; + const char* in_data = (const char*)int8_tensor.data(); + float* out_data = (float*)host_tensor.mutable_data(); + float scale = int8_tensor.get_scale()[0]; + + for (int gid = 0; gid < count; ++ gid) { + float load0, load1, load2, load3; + + int read_w = (gid) % valid_width; + int read_h = (gid / (in_h_stride)) % valid_height; + int read_c = (gid / (in_c_stride)) % valid_channel_4; + int read_n = (gid / (in_n_stride)) % valid_num; + + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; + + int out_offset = read_n * out_stride[0] + + read_c * (out_stride[1] << 2) + + read_h * out_stride[2] + + read_w * out_stride[3]; + + if (gid < count) { + + char readin0 = in_data[4 * in_offset + 0]; + char readin1 = in_data[4 * in_offset + 1]; + char readin2 = in_data[4 * in_offset + 2]; + char readin3 = in_data[4 * in_offset + 3]; + + load0 = static_cast(readin0); + load1 = static_cast(readin1); + load2 = static_cast(readin2); + load3 = static_cast(readin3); + + out_data[out_offset] = load0 * scale; + out_offset += out_stride[1]; + out_data[out_offset] = load1 * scale; + out_offset += out_stride[1]; + out_data[out_offset] = load2 * scale; + out_offset += out_stride[1]; + out_data[out_offset] = load3 * scale; + } + } +} + template -static void 
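// Editor's sketch of the NCHW_C4 -> NCHW dequantization above for one (n, c4, h, w)
// position, assuming channels packed in groups of 4 with the 4 lanes stored contiguously.
// This only restates the arithmetic; it is not the framework's kernel.
#include <cstdint>

static void dequant_c4_group(const int8_t* in_c4, float* out_nchw,
                             size_t in_offset,            // element offset in the C4 tensor
                             size_t out_offset,           // offset of lane 0 in the NCHW tensor
                             size_t out_channel_stride,   // out_stride[1] above
                             float scale) {
    for (int lane = 0; lane < 4; ++lane) {
        // lane k of the packed group becomes channel (4 * c4 + k) of the NCHW output
        out_nchw[out_offset + lane * out_channel_stride] =
            static_cast<float>(in_c4[4 * in_offset + lane]) * scale;
    }
}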
write_tensorfile(const Tensor& tensor, const char* locate) { +static void write_tensorfile(const Tensor& tensor, const char* locate, + bool trans_tensor = true) { typedef typename DefaultHostType::Host_type HOST_TYPE; Tensor host_tensor; - host_tensor.re_alloc(tensor.valid_shape(), tensor.get_dtype()); - host_tensor.copy_from(tensor); + + if (trans_tensor) { + if (tensor.get_dtype() == AK_INT8 && tensor.get_layout() == Layout_NCHW_C4) { + Tensor temp_tensor; + temp_tensor.re_alloc(tensor.valid_shape(), tensor.get_dtype()); + temp_tensor.copy_from(tensor); + temp_tensor.set_scale(tensor.get_scale()); + Shape fp32_shape = tensor.valid_shape(); + fp32_shape.set_layout(Layout_NCHW); + host_tensor.re_alloc(fp32_shape, AK_FLOAT); + calibrate_int8c4_to_fp32_host(host_tensor, temp_tensor); + } else if (tensor.get_layout() == Layout_NHWC) { + Tensor temp_tensor; + temp_tensor.re_alloc(tensor.valid_shape(), tensor.get_dtype()); + temp_tensor.copy_from(tensor); + LOG(INFO) << "scale size = " << tensor.get_scale().size(); + LOG(INFO) << "scale value = " << tensor.get_scale()[0]; + temp_tensor.set_scale(tensor.get_scale()); + Shape fp32_shape = tensor.valid_shape(); + fp32_shape.set_layout(Layout_NCHW); + host_tensor.re_alloc(fp32_shape, AK_FLOAT); + reorder_nhwc_nchw(temp_tensor, host_tensor); + LOG(INFO) << "record int8 tensor"; + // calibrate_int8nhwc_to_fp32_host(host_tensor, temp_tensor); + } else { + host_tensor.re_alloc(tensor.valid_shape(), tensor.get_dtype()); + host_tensor.copy_from(tensor); + } + + if (host_tensor.get_layout() == Layout_NCHW_C8R) { + Tensor temp_tensor(host_tensor.valid_shape()); + temp_tensor.copy_from(host_tensor); + Shape old_shape = host_tensor.valid_shape(); + host_tensor.reshape(Shape({old_shape[0], old_shape[1], old_shape[2], old_shape[3]})); + reorder_nchwc8_nchw(temp_tensor, host_tensor); + } + } else { + host_tensor.re_alloc(tensor.valid_shape(), tensor.get_dtype()); + host_tensor.copy_from(tensor); + } + LOG(INFO) << "target tensor data:" << tensor.valid_size(); - FILE* fp = fopen(locate, "w+"); + FILE* fp = fopen(locate, "w"); if (fp == nullptr) { LOG(ERROR) << "file open field " << locate; } else { - if (tensor.get_dtype() == AK_FLOAT) { + if (host_tensor.get_dtype() == AK_FLOAT) { const float* data_ptr = (const float*)host_tensor.data(); int size = host_tensor.valid_size(); for (int i = 0; i < size; ++i) { fprintf(fp, "[%d] %f \n", i, (data_ptr[i])); } - } else if (tensor.get_dtype() == AK_INT8) { + } else if (host_tensor.get_dtype() == AK_INT8) { const char* data_ptr = (const char*)host_tensor.data(); int size = host_tensor.valid_size(); for (int i = 0; i < size; ++i) { fprintf(fp, "[%d] %d \n", i, (data_ptr[i])); } + } else if (host_tensor.get_dtype() == AK_UINT8) { + const unsigned char* data_ptr = (const unsigned char*)host_tensor.data(); + int size = host_tensor.valid_size(); + + for (int i = 0; i < size; ++i) { + fprintf(fp, "[%d] %u \n", i, (data_ptr[i])); + } } else { LOG(FATAL) << "not supported write type"; } + if (tensor.get_seq_offset().size() > 0) { + auto seq_offset = tensor.get_seq_offset(); + + for (int i = 0; i < seq_offset.size(); i++) { + for (int offset_data : seq_offset[i]) { + fprintf(fp, "[offset_%d] %d \n", i, offset_data); + } + } + } + fclose(fp); } LOG(INFO) << "!!! 
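// Editor's sketch of the text format written by write_tensorfile above: one "[index] value"
// line per element, then optional "[offset_i] v" lines per sequence-offset level. Shown only
// to document the format; file name and values here are made up.
#include <cstdio>
#include <vector>

static void write_debug_txt(const char* path, const std::vector<float>& data,
                            const std::vector<std::vector<int>>& seq_offset) {
    FILE* fp = std::fopen(path, "w");
    if (fp == nullptr) { return; }                        // the real code logs an error here
    for (size_t i = 0; i < data.size(); ++i) {
        std::fprintf(fp, "[%zu] %f \n", i, data[i]);
    }
    for (size_t lvl = 0; lvl < seq_offset.size(); ++lvl) {
        for (int off : seq_offset[lvl]) {
            std::fprintf(fp, "[offset_%zu] %d \n", lvl, off);
        }
    }
    std::fclose(fp);
}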
write success: " << locate; } +static void split_string(const std::string& s, char delim, + std::vector& elems) { + std::stringstream ss(s); + std::string item; + + while (std::getline(ss, item, delim)) { + elems.push_back(item); + } +} + + static std::string& replace_all(std::string& str, const std::string& old_value, const std::string& new_value) { while (true) { @@ -89,16 +552,122 @@ static std::string& replace_all(std::string& str, const std::string& old_val template static void record_tensor_in_format(const Tensor& tensor, - const std::string& op_type, const std::string& op_name, bool is_out, int index) { + const std::string& op_type, const std::string& op_name, + bool is_out, int index, int iter = 0) { + // CHECK_EQ(tensor.get_dtype(), AK_FLOAT) << "now record func only support ak_float"; std::string path = "record+" + op_type + "+" + op_name + "+" + (is_out ? "out" : "in") + - "+" + std::to_string(index) + "+"; + "+" + to_string(index) + "+"; + + if (tensor.valid_size() > 1 && tensor.shape().size() == 4) { + path += to_string(tensor.num()) + "_" + to_string(tensor.channel()) + "_" + + to_string(tensor.height()) + "_" + to_string(tensor.width()) + "_"; + } else { + for (auto x : tensor.valid_shape()) { + path += to_string(x) + "_"; + } + } + + path += "+nchw+"; + path += "ak_float+"; + path += to_string(iter); + + path = replace_all(path, "/", "_"); + write_tensorfile(tensor, (path + ".txt").c_str()); +} +static void get_shape(std::string shape_string, std::vector& shape_vec) { + std::vector shape_s_vec; + split_string(shape_string, '_', shape_s_vec); + shape_vec.clear(); + + for (int i = 0; i < shape_s_vec.size(); i++) { + shape_vec.push_back(atoi(shape_s_vec[i].c_str())); + } +} +static std::string get_basename(std::string path) { + std::vector elems; + split_string(path, '/', elems); + + if (elems.size() >= 1) { + return elems[elems.size() - 1]; + } else { + return ""; + } +} + +template +static void read_tensor(Tensor& tensor, std::string location) { + FILE* fp = fopen(location.c_str(), "r"); + float* tensor_data = static_cast(tensor.mutable_data()); + int index = 0; + + if (fp == nullptr) { + LOG(FATAL) << "can`t open " << location; + } else { + char buf[1024]; + std::vector seq_offset; + + while (fgets(buf, 1024, fp) != NULL) { + std::string str(buf); + std::vector s_vec; + split_string(str, ' ', s_vec); + + if (s_vec[0].find("offset") != std::string::npos) { + if (s_vec[0] == "[offset_0]") { + seq_offset.push_back(atoi(s_vec[1].c_str())); + } else { + LOG(FATAL) << "not support " << s_vec[0]; + } + } else { + CHECK_LT(index, tensor.valid_size()) << "index must less than valid size"; + tensor_data[index++] = atof(s_vec[1].c_str()); + } + } + } + +} + +template +static void load_tensor_in_io_format(Tensor& tensor, bool& is_input, + std::string& op_name, std::string location) { + std::string base_name(get_basename(location)); + LOG(INFO) << "base name " << base_name; + std::vector base_split; + split_string(base_name, '+', base_split); + op_name = base_split[2]; + std::string in_out_flag = base_split[3]; + std::string shape = base_split[5]; + std::string layout = base_split[6]; + std::string data_type = base_split[7]; + std::vector shape_vec; + get_shape(shape, shape_vec); + CHECK(in_out_flag == "in" + || in_out_flag == "out") << "in/out flag must be in or out, not " << in_out_flag; + CHECK(layout == "nchw") << "load layout now only support nchw not " << layout; + CHECK(data_type == "ak_float") << "data type now only support ak_float not " << data_type; + is_input = in_out_flag 
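// Editor's sketch of the record-file naming scheme used above: fields joined by '+',
// shape dims joined by '_', '/' in op names replaced by '_'. The parsing mirrors the
// field indices used in load_tensor_in_io_format; the sample name below is made up.
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> split(const std::string& s, char delim) {
    std::vector<std::string> out;
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, delim)) { out.push_back(item); }
    return out;
}

// e.g. "record+conv+conv1_relu+out+0+1_32_112_112_+nchw+ak_float+0.txt"
static void parse_record_name(const std::string& base_name) {
    std::vector<std::string> f = split(base_name, '+');
    const std::string& op_name     = f[2];   // "conv1_relu"
    const std::string& in_out_flag = f[3];   // "in" or "out"
    const std::string& shape_str   = f[5];   // "1_32_112_112_"
    const std::string& layout      = f[6];   // "nchw"
    const std::string& data_type   = f[7];   // "ak_float"
    std::vector<std::string> dims = split(shape_str, '_');  // {"1","32","112","112"}
    (void)op_name; (void)in_out_flag; (void)layout; (void)data_type; (void)dims;
}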
== "in"; + Shape ak_shape(shape_vec, Layout_NCHW); + tensor.re_alloc(ak_shape); + read_tensor(tensor, location); +} + +template +static void record_tensor_in_io_format(const Tensor& tensor, std::string tensor_name, + bool is_out, int index, int iter = 0) { + CHECK_EQ(tensor.get_dtype(), AK_FLOAT) << "now record func only support ak_float"; + CHECK_EQ(tensor.get_layout(), Layout_NCHW) << "now record func only support ak_float"; + std::string path = ""; + path = path + "record+" + (is_out ? "out+" : "in+") + tensor_name + "+"; for (auto x : tensor.valid_shape()) { - path += std::to_string(x) + "_"; + path += to_string(x) + "_"; } + path += "+nchw+"; + path += "ak_float+"; + path += to_string(iter); + path = replace_all(path, "/", "_"); write_tensorfile(tensor, (path + ".txt").c_str()); } @@ -108,7 +677,7 @@ static std::string vector_2_string(std::vector vec) { std::string ans = "["; for (auto a : vec) { - ans += std::to_string(a) + ","; + ans += to_string(a) + ","; } ans += "]"; @@ -120,14 +689,130 @@ static void printf_intrin_var(Dtype data) { std::string ans = ""; for (int i = 0; i < sizeof(data) / 4; i++) { - ans += std::to_string(data[i]) + ","; + ans += to_string(data[i]) + ","; + } + + LOG(INFO) << ans; +} + +template +static void printf_intrin_var_epi16(Dtype data) { + std::string ans = ""; + + for (int i = 0; i < sizeof(data) / 4; i++) { + ans += to_string(data[i]) + ","; + } + + LOG(INFO) << ans; +} + +template +static void printf_pointer(Dtype* data, size_t length) { + std::string ans = ""; + + for (int i = 0; i < length; i++) { + ans += to_string(data[i]) + ","; + } + + LOG(INFO) << ans << " [length = "< +void printf_pointer(uint8_t* data, size_t length){ + std::string ans = ""; + + for (int i = 0; i < length; i++) { + ans += to_string((int)data[i]) + ","; + } + + LOG(INFO) << ans << " [length = "< +void printf_pointer(int8_t* data, size_t length){ + std::string ans = ""; + + for (int i = 0; i < length; i++) { + ans += to_string((int)data[i]) + ","; + } + + LOG(INFO) << ans << " [length = "< +void printf_pointer(void* data, size_t length){ + LOG(INFO)<<"printf_pointer do not want to print void*"; +} + +#if defined(__AVX2__) + +template<> +void printf_intrin_var<__m256i>(__m256i data) { + int avx2_print_buf[8]; + std::string ans = ""; + _mm256_storeu_si256((__m256i*)(&avx2_print_buf[0]), data); + + for (int i = 0; i < 8; i++) { + ans += to_string(avx2_print_buf[i]) + ","; } LOG(INFO) << ans; } +template<> +void printf_intrin_var<__m256>(__m256 data) { + float avx2_print_buf[8]; + std::string ans = ""; + _mm256_storeu_ps((&avx2_print_buf[0]), data); + for (int i = 0; i < 8; i++) { + ans += to_string(avx2_print_buf[i]) + ","; + } + LOG(INFO) << ans; } +template<> +void printf_intrin_var_epi16<__m256i>(__m256i data) { + short avx2_print_buf[16]; + std::string ans = ""; + _mm256_storeu_si256((__m256i*)(&avx2_print_buf[0]), data); + + for (int i = 0; i < 16; i++) { + ans += to_string(avx2_print_buf[i]) + ","; + } + + std::cout << ans << std::endl; } +#endif + +#if defined(__AVX512F__) +template<> +void printf_intrin_var<__m512i>(__m512i data) { + std::string ans = ""; + int avx512_print_buf[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + _mm512_storeu_si512((__m512i*)(&avx512_print_buf[0]), data); + + for (int i = 0; i < 16; i++) { + ans += to_string(avx512_print_buf[i]) + ","; + } + + LOG(INFO) << ans; +} +template<> +void printf_intrin_var<__v32hi>(__v32hi data) { + std::string ans = ""; + short avx512_print_buf[32] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
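// Editor's usage sketch for the intrinsic debug helpers above: dump the 8 int32 lanes of a
// __m256i by storing it to a plain buffer first. Guarded the same way as the code above;
// printf is used instead of the project logger only to keep the sketch self-contained.
#if defined(__AVX2__)
#include <immintrin.h>
#include <cstdio>

static void dump_m256i_epi32(__m256i v) {
    int lanes[8];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(lanes), v);
    for (int i = 0; i < 8; ++i) {
        std::printf("%d%s", lanes[i], i + 1 < 8 ? "," : "\n");
    }
}
// Example: dump_m256i_epi32(_mm256_set1_epi32(42)); prints 42 for all eight lanes.
#endif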
-1, -1, -1, -1, -1, -1 + - 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + _mm512_storeu_si512((__m512i*)(&avx512_print_buf[0]), (__m512i)data); + + for (int i = 0; i < 32; i++) { + ans += to_string(avx512_print_buf[i]) + ","; + } + LOG(INFO) << ans; +} +#endif + +} +} + +#endif #endif //ANAKIN_DEBUG_H diff --git a/saber/funcs/deconv.h b/saber/funcs/deconv.h index d4f96637b..7922706cd 100644 --- a/saber/funcs/deconv.h +++ b/saber/funcs/deconv.h @@ -20,14 +20,23 @@ #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_deconv.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/vender_deconv.h" +#endif + #ifdef USE_CUDA #include "saber/funcs/impl/cuda/saber_deconv.h" #include "saber/funcs/impl/cuda/vender_deconv.h" #endif + #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_deconv.h" #endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_deconv.h" +#endif + namespace anakin { namespace saber { @@ -57,10 +66,8 @@ class Deconv : public BaseFunc< virtual SaberStatus compute_output_shape(const Input_v &input, \ Output_v &output, Param_t ¶m) override { - Shape deconv_shape = deconv_compute_shape(input[0]->valid_shape(), param); - deconv_shape.set_layout(Layout_NCHW); - return output[0]->set_shape(deconv_shape); + return output[0]->set_shape_without_layout(deconv_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { diff --git a/saber/funcs/deformable_conv.h b/saber/funcs/deformable_conv.h index c60b0137a..d7cd9a9eb 100644 --- a/saber/funcs/deformable_conv.h +++ b/saber/funcs/deformable_conv.h @@ -22,6 +22,9 @@ #ifdef NVIDIA_GPU //#include "saber/funcs/impl/cuda/saber_deformable_conv.h" #endif +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/include/vender_deformable_conv.h" +#endif namespace anakin { namespace saber { diff --git a/saber/funcs/detection_output.h b/saber/funcs/detection_output.h index 1af46c05b..0a6f81d5c 100644 --- a/saber/funcs/detection_output.h +++ b/saber/funcs/detection_output.h @@ -23,10 +23,16 @@ #include "saber/funcs/impl/cuda/saber_detection_output.h" #endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_detection_output.h" +#endif + #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_detection_output.h" #endif - +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_detection_output.h" +#endif namespace anakin { namespace saber { @@ -56,7 +62,19 @@ class DetectionOutput : public BaseFunc< virtual SaberStatus compute_output_shape(const Input_v &input, \ Output_v &output, Param_t ¶m) override { - Shape shape_out = Shape({1, 1, param.keep_top_k * input[0]->num(), 7}, Layout_NCHW); + Shape shape_out; + if (param.share_location) { + // for one stage + shape_out = Shape({1, 1, param.keep_top_k * input[0]->num(), 7}, Layout_NCHW); + } else { + // for two stage + auto offset = input[0]->get_seq_offset(); + CHECK_GT(offset.size(), 0) << "input tensors must have seq_offset"; + CHECK_GT(offset[0].size(), 0) << "seq offset must have at least 2 elements"; + int num = offset[0].size() - 1; + shape_out = Shape({1, 1, param.keep_top_k * num, 7}, Layout_NCHW); + } + return output[0]->set_shape(shape_out); } diff --git a/saber/funcs/dfmb_psroi_align.h b/saber/funcs/dfmb_psroi_align.h index c8b8e55c2..483902220 100644 --- a/saber/funcs/dfmb_psroi_align.h +++ b/saber/funcs/dfmb_psroi_align.h @@ -12,7 +12,6 @@ #ifndef ANAKIN_SABER_FUNCS_DFMB_PSROI_ALIGN_H #define ANAKIN_SABER_FUNCS_DFMB_PSROI_ALIGN_H #include "saber/core/tensor.h" -#include "saber/funcs/timer.h" #include 
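// Editor's sketch of the two-stage DetectionOutput shape rule added above: the batch count
// comes from the first-level sequence offsets (offsets.size() - 1), and the output shape is
// {1, 1, keep_top_k * num, 7}. The offsets below are illustrative values, not real data.
#include <vector>

static std::vector<int> detection_out_shape(const std::vector<int>& lvl0_offsets,
                                            int keep_top_k) {
    // e.g. lvl0_offsets = {0, 300, 650} describes 2 images with 300 and 350 rois
    int num = static_cast<int>(lvl0_offsets.size()) - 1;
    // 7 per detection, commonly [img_id, label, score, xmin, ymin, xmax, ymax]
    return {1, 1, keep_top_k * num, 7};
}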
"saber/funcs/base.h" #include "saber/saber_funcs_param.h" #include "saber/funcs/impl/impl_base.h" @@ -26,7 +25,7 @@ #endif #ifdef USE_ARM_PLACE //todo -#include "saber/funcs/impl/impl_dfmb_psroi_algin.h" +//#include "saber/funcs/impl/impl_dfmb_psroi_algin.h" #endif namespace anakin { namespace saber { @@ -97,4 +96,4 @@ class DFMBPSROIAlign : public BaseFunc < } } -#endif //ANAKIN_SABER_FUNCS_DFMB_PSROI_ALIGN_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_DFMB_PSROI_ALIGN_H diff --git a/saber/funcs/eltwise.h b/saber/funcs/eltwise.h index c698f5d2f..cd8afcd46 100644 --- a/saber/funcs/eltwise.h +++ b/saber/funcs/eltwise.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_ELTWISE_H @@ -27,7 +27,10 @@ #include "saber/funcs/impl/x86/saber_eltwise.h" #endif #ifdef USE_ARM_PLACE -//#include "saber/funcs/impl/arm/saber_eltwise.h" +#include "saber/funcs/impl/arm/saber_eltwise.h" +#endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_eltwise.h" #endif namespace anakin { namespace saber { @@ -59,15 +62,21 @@ class Eltwise : public BaseFunc< virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ Param_t& param) override { - for (int i = 1; i < input.size(); ++i) { - CHECK_EQ(input[0]->num(), input[i]->num()); - CHECK_EQ(input[0]->channel(), input[i]->channel()); - CHECK_EQ(input[0]->height(), input[i]->height()); - CHECK_EQ(input[0]->width(), input[i]->width()); + if (param.operation != Eltwise_div) { + for (int i = 1; i < input.size(); ++i) { + CHECK_EQ(input[0]->num(), input[i]->num()); + CHECK_EQ(input[0]->channel(), input[i]->channel()); + CHECK_EQ(input[0]->height(), input[i]->height()); + CHECK_EQ(input[0]->width(), input[i]->width()); + } } Shape output_shape = input[0]->valid_shape(); output[0]->set_shape(output_shape); + if (param.operation == Eltwise_sum) { + CHECK_EQ(param.coeff.size(), input.size()) << "eltwise sum coeff num is not right"; + } + output[0]->set_seq_offset(input[0]->get_seq_offset()); return SaberSuccess; } @@ -107,4 +116,4 @@ class Eltwise : public BaseFunc< } -#endif //ANAKIN_SABER_FUNCS_ELTWISE_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_ELTWISE_H diff --git a/saber/funcs/eltwise_act.h b/saber/funcs/eltwise_act.h index 939709a6b..7347eb9ef 100644 --- a/saber/funcs/eltwise_act.h +++ b/saber/funcs/eltwise_act.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_ELTWISE_ACT_H @@ -27,8 +27,7 @@ //#include "saber/funcs/impl/x86/saber_eltwise_act.h" #endif #ifdef USE_ARM_PLACE -//todo -//#include "saber/funcs/impl/arm/saber_eltwise_active.h" +#include "saber/funcs/impl/arm/saber_eltwise_act.h" #endif namespace anakin { @@ -107,4 +106,4 @@ class EltwiseActive : public BaseFunc< } } -#endif //ANAKIN_SABER_FUNCS_ELTWISE_ACTIVE_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_ELTWISE_ACTIVE_H diff --git a/saber/funcs/embedding.h b/saber/funcs/embedding.h index f54f82543..dbeff3c76 100644 --- a/saber/funcs/embedding.h +++ b/saber/funcs/embedding.h @@ -25,12 +25,12 @@ #include "saber/funcs/impl/x86/saber_embedding.h" #endif -// #ifdef USE_AMD -// #include "saber/funcs/impl/amd/saber_embedding.h" -// #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_embedding.h" +#endif #ifdef USE_ARM_PLACE -#include "saber/funcs/impl/arm/saber_embedding.h" +//#include "saber/funcs/impl/arm/saber_embedding.h" #endif namespace anakin { @@ -64,7 +64,7 @@ class Embedding : public BaseFunc< Output_v &output, Param_t ¶m) override { Shape output_shape({input[0]->valid_size(), param.emb_dim, 1, 1}); - CHECK_EQ(output.size(), param.num_direct) + CHECK_EQ(output.size(), param.num_direct) << "output tensor num is not equal to the direct number in param"; for (int i = 0; i < output.size(); i++) { output[i]->set_seq_offset(input[0]->get_seq_offset()); diff --git a/saber/funcs/fc.h b/saber/funcs/fc.h index 80accc09a..513b2c9ca 100644 --- a/saber/funcs/fc.h +++ b/saber/funcs/fc.h @@ -30,7 +30,11 @@ #include "saber/funcs/impl/arm/saber_fc.h" #endif -namespace anakin { +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/vender_fc.h" +#endif + +namespace anakin{ namespace saber { @@ -73,7 +77,7 @@ class Fc : public BaseFunc < Shape shape_out({m, n, 1, 1}, Layout_NCHW); output[0]->set_seq_offset(input[0]->get_seq_offset()); - return output[0]->set_shape(shape_out); + return output[0]->set_shape_without_layout(shape_out); } virtual SaberStatus init_impl(ImplEnum implenum) override { diff --git a/saber/funcs/funcs_utils.h b/saber/funcs/funcs_utils.h index 51828bca1..691d82d7c 100644 --- a/saber/funcs/funcs_utils.h +++ b/saber/funcs/funcs_utils.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
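// Editor's scalar reference for the Eltwise_sum contract checked in the eltwise
// compute_output_shape above: one coefficient per input tensor and
// out[i] = sum_k coeff[k] * in_k[i]. Plain sketch, not the saber kernel.
#include <cstddef>
#include <vector>

static void eltwise_sum_ref(const std::vector<const float*>& ins,
                            const std::vector<float>& coeff,
                            float* out, size_t len) {
    // coeff.size() must equal ins.size(), as the added CHECK_EQ enforces
    for (size_t i = 0; i < len; ++i) {
        float acc = 0.f;
        for (size_t k = 0; k < ins.size(); ++k) {
            acc += coeff[k] * ins[k][i];
        }
        out[i] = acc;
    }
}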
*/ #ifndef SABER_FUNCS_UTILS_H @@ -30,20 +30,25 @@ Shape conv_compute_shape(const Shape input_shape, Param ¶m) { Shape output_shape = (input_shape); CHECK_GE(input_shape.size(), 4) << "using reshape2d to reshape a 1d conv?"; - output_shape.set_num(input_shape.num()); // N - output_shape.set_channel(param.weight()->num()); // K + int num_idx = output_shape.num_index(); + int channel_idx = output_shape.channel_index(); + int height_idx = output_shape.height_index(); + int width_idx = output_shape.width_index(); + + output_shape[num_idx] = input_shape.num(); // N + output_shape[channel_idx] = param.weight()->num(); // K int input_dim = input_shape.height(); // P int kernel_exten = param.dilation_h * (param.weight()->height() - 1) + 1; int output_height = (input_dim + 2 * param.pad_h - kernel_exten) / param.stride_h + 1; - output_shape.set_height(output_height); + output_shape[height_idx] = output_height; input_dim = input_shape.width(); // Q kernel_exten = param.dilation_w * (param.weight()->width() - 1) + 1; int output_width = (input_dim + 2 * param.pad_w - kernel_exten) / param.stride_w + 1; - output_shape.set_width(output_width); + output_shape[width_idx] = output_width; return output_shape; } @@ -54,8 +59,13 @@ Shape deconv_compute_shape(const Shape input_shape, ConvParam ¶m // append the $n and $c/$k, output: N * K * P * Q - output_shape.set_num(input_shape.num()); // N - output_shape.set_channel(param.weight()->num() * param.group); // K + int num_idx = output_shape.num_index(); + int channel_idx = output_shape.channel_index(); + int height_idx = output_shape.height_index(); + int width_idx = output_shape.width_index(); + + output_shape[num_idx] = input_shape.num(); // N + output_shape[channel_idx] = param.weight()->num() * param.group; // K int kernel_extent_h = param.dilation_h * (param.weight()->height() - 1) + 1; @@ -66,8 +76,8 @@ Shape deconv_compute_shape(const Shape input_shape, ConvParam ¶m int output_dim_w = (input_shape.width() - 1) * param.stride_w + kernel_extent_w - 2 * param.pad_w; - output_shape.set_height(output_dim_h); - output_shape.set_width(output_dim_w); + output_shape[height_idx] = output_dim_h; + output_shape[width_idx] = output_dim_w; return output_shape; } @@ -99,16 +109,16 @@ Shape pool_compute_shape(const Shape input_shape, Param ¶m) { } else { if (param.cmp_out_shape_floor_as_conv) { out_height = static_cast((static_cast( - in_height + 2 * pad_h - window_h) / stride_h)) + 1; + in_height + 2 * pad_h - window_h) / stride_h)) + 1; out_width = static_cast((static_cast( - in_width + 2 * pad_w - window_w) / stride_w)) + 1; + in_width + 2 * pad_w - window_w) / stride_w)) + 1; } else { out_height = static_cast(ceilf(static_cast( - in_height + 2 * pad_h - window_h) / stride_h)) + 1; + in_height + 2 * pad_h - window_h) / stride_h)) + 1; out_width = static_cast(ceilf(static_cast( - in_width + 2 * pad_w - window_w) / stride_w)) + 1; + in_width + 2 * pad_w - window_w) / stride_w)) + 1; } } @@ -120,8 +130,10 @@ Shape pool_compute_shape(const Shape input_shape, Param ¶m) { -- out_width; } } - output_shape.set_height(out_height); - output_shape.set_width(out_width); + int height_idx = output_shape.height_index(); + int width_idx = output_shape.width_index(); + output_shape[height_idx] = out_height; + output_shape[width_idx] = out_width; return output_shape; } @@ -165,7 +177,7 @@ void merge_matrix_to_matrix_in_leddim(const Dtype* input, } template -void transform_3x3_weight_2_4x4(const Dtype* input, +void transform_3x3_weight_2_4x4(const Dtype* input, Dtype* output, int K, 
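// Editor's sketch: the conv / deconv / pooling spatial-size formulas used by the shape
// helpers above, pulled out as plain functions so the index-based updates are easy to check
// by hand. The final pooling clamp follows the usual convention of dropping a window that
// starts entirely inside the padding; the exact guard is not fully shown in this hunk.
#include <cmath>

static inline int conv_out_dim(int in, int kernel, int pad, int stride, int dilation) {
    int k_ext = dilation * (kernel - 1) + 1;
    return (in + 2 * pad - k_ext) / stride + 1;
}
static inline int deconv_out_dim(int in, int kernel, int pad, int stride, int dilation) {
    int k_ext = dilation * (kernel - 1) + 1;
    return (in - 1) * stride + k_ext - 2 * pad;
}
static inline int pool_out_dim(int in, int window, int pad, int stride, bool floor_mode) {
    float span = static_cast<float>(in + 2 * pad - window) / stride;
    int out = (floor_mode ? static_cast<int>(span) : static_cast<int>(std::ceil(span))) + 1;
    if (pad > 0 && (out - 1) * stride >= in + pad) {
        --out;                               // last window would start in the padding
    }
    return out;
}
// e.g. conv_out_dim(224, 7, 3, 2, 1) == 112 and deconv_out_dim(112, 4, 1, 2, 1) == 224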
int k_align_up, @@ -189,7 +201,7 @@ void transform_3x3_weight_2_4x4(const Dtype* input, }else{ g[i][j] = 0.f; } - + } } G[0][0] = g[0][0]; @@ -198,20 +210,20 @@ void transform_3x3_weight_2_4x4(const Dtype* input, G[0][3] = g[0][2]; G[1][0] = 0.50*(g[0][0] + g[1][0] + g[2][0]); - G[1][1] = 0.25*(g[0][0] + g[0][1] + g[0][2] - + g[1][0] + g[1][1] + g[1][2] + G[1][1] = 0.25*(g[0][0] + g[0][1] + g[0][2] + + g[1][0] + g[1][1] + g[1][2] + g[2][0] + g[2][1] + g[2][2]); - G[1][2] = 0.25*(g[0][0] - g[0][1] + g[0][2] - + g[1][0] - g[1][1] + g[1][2] + G[1][2] = 0.25*(g[0][0] - g[0][1] + g[0][2] + + g[1][0] - g[1][1] + g[1][2] + g[2][0] - g[2][1] + g[2][2]); G[1][3] = 0.50*(g[0][2] + g[1][2] + g[2][2]); G[2][0] = 0.50*(g[0][0] - g[1][0] + g[2][0]); - G[2][1] = 0.25*(g[0][0] + g[0][1] + g[0][2] - - g[1][0] - g[1][1] - g[1][2] + G[2][1] = 0.25*(g[0][0] + g[0][1] + g[0][2] + - g[1][0] - g[1][1] - g[1][2] + g[2][0] + g[2][1] + g[2][2]); - G[2][2] = 0.25*(g[0][0] - g[0][1] + g[0][2] - - g[1][0] + g[1][1] - g[1][2] + G[2][2] = 0.25*(g[0][0] - g[0][1] + g[0][2] + - g[1][0] + g[1][1] - g[1][2] + g[2][0] - g[2][1] + g[2][2]); G[2][3] = 0.50*(g[0][2] - g[1][2] + g[2][2]); @@ -237,10 +249,10 @@ void transform_3x3_weight_2_4x4(const Dtype* input, int idx_0 = (i * 4 + j) % 2; int idx_1 = (i * 4 + j) / 2; - int offset = + int offset = kidx_1 * 32 * 2 * 8 - + cidx_1 * (k_align_up * 2 * 8 * 8) - + cidx_0 * 2 * 32 + idx_1 * (k_align_up * 2 * 8) + + cidx_1 * (k_align_up * 2 * 8 * 8) + + cidx_0 * 2 * 32 + idx_1 * (k_align_up * 2 * 8) + idx_0 * 32 + kidx_16 * 16 + kidx_height * 4 + kidx_width; output[offset] = G[i][j]; } @@ -249,7 +261,7 @@ void transform_3x3_weight_2_4x4(const Dtype* input, } } -// transform +// transform // PAY ATTENTION!!!![zs] // The shape of weights is suppose to be {in_channel, out_channel, kernel_size, kernel_size}; // but caffe is reshaped their shape as {out, in, kernel_size, kernel_size} @@ -261,7 +273,7 @@ void transform_3x3_weight_2_4x4(const Dtype* input, // int out_channel : the real output filter num(as much as you can, this is the proto param) // // const float * - // weights_src : the real data is orgnized as + // weights_src : the real data is orgnized as // (in_channel, out_channel, kernel_size, kernel_size) // const float * // XX_out : the output data is orgnized as @@ -314,6 +326,42 @@ void transpose_filter_KCRS_2_CRSK(const Dtype *input, Dtype *output, \ } } +template +void transpose_filter_KCRS_2_CRSKC4(Tensor weights, + int K, int C, int R, int S) { + Tensor temp; + Tensor temp_in; + Tensor target_temp; + temp.re_alloc(weights.valid_shape(), Dtype); + temp_in.re_alloc(weights.valid_shape(), Dtype); + target_temp.re_alloc(weights.valid_shape(), Dtype); + + temp_in.copy_from(weights); + const dtype *input = (const dtype*)temp_in.data(); + dtype *temp_ptr = (dtype*)temp.mutable_data(); + dtype *target_temp_ptr = (dtype*)target_temp.mutable_data(); + + const int CRS = C * R * S; + for (int var_k = 0; var_k < K; var_k++) { + for (int var_crs = 0; var_crs < CRS; var_crs++) { + temp_ptr[var_crs * K + var_k] = input[var_k * CRS + var_crs]; + } + } + + int read_in = 0; + int write_out = 0; + int out_loop = C / 4; + int inner_loop = K * R * S * 4; + for (int i = 0; i < out_loop; ++i) { + for (int j = 0; j < inner_loop; ++j) { + write_out = i * inner_loop + j; + read_in = ((i * 4) + (j % 4)) * (inner_loop / 4) + j / 4; + target_temp_ptr[write_out] = temp_ptr[read_in]; + } + } + weights.copy_from(target_temp); +} + template < typename Tensor_t, template class Param > void 
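// Editor's sketch: the 4x4 filter transform expanded by hand above matches the Winograd
// F(2x2,3x3) weight transform U = G * g * G^T with the standard G below; the 0.25/0.50
// terms in transform_3x3_weight_2_4x4 are exactly these two small matrix products.
// Plain reference only, ignoring the interleaved output-offset computation.
static void winograd_f2x3_weight_transform(const float g[3][3], float U[4][4]) {
    static const float G[4][3] = {
        {1.0f,  0.0f, 0.0f},
        {0.5f,  0.5f, 0.5f},
        {0.5f, -0.5f, 0.5f},
        {0.0f,  0.0f, 1.0f},
    };
    float Gg[4][3];                          // Gg = G * g   (4x3)
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 3; ++j) {
            Gg[i][j] = G[i][0] * g[0][j] + G[i][1] * g[1][j] + G[i][2] * g[2][j];
        }
    }
    for (int i = 0; i < 4; ++i) {            // U = Gg * G^T (4x4)
        for (int j = 0; j < 4; ++j) {
            U[i][j] = Gg[i][0] * G[j][0] + Gg[i][1] * G[j][1] + Gg[i][2] * G[j][2];
        }
    }
}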
update_conv_weights(Param& param) { #ifdef USE_ARM_PLACE @@ -335,7 +383,7 @@ void update_conv_weights(Param& param) { new_weight.copy_from(*(param.conv_param.weight())); Shape bias_shape; - if (param.conv_param.bias()->size() > 0) { + if (param.conv_param.bias() && param.conv_param.bias()->size() > 0) { bias_shape = param.conv_param.bias()->shape(); new_bias.re_alloc(bias_shape, AK_FLOAT); new_bias.copy_from(*(param.conv_param.bias())); diff --git a/saber/funcs/gemm.h b/saber/funcs/gemm.h index 788d0dcfe..b24dac46c 100644 --- a/saber/funcs/gemm.h +++ b/saber/funcs/gemm.h @@ -23,15 +23,31 @@ namespace anakin { namespace saber { +template +class MatrixFunc { +public: + virtual SaberStatus init( + const bool trans_A, const bool trans_B, + const int m, const int n, const int k, + Context ctx) = 0; + + virtual SaberStatus dispatch( + const outDtype alpha, const outDtype beta, + const inDtype* a, const inDtype* b, + outDtype* c) = 0; +}; + template -class Gemm { +class Gemm : public MatrixFunc { // Row major gemm public: Gemm() = default; - ~Gemm() {} + ~Gemm() = default; SaberStatus init(const bool trans_A, const bool trans_B, const int m, const int n, const int k, @@ -57,7 +73,7 @@ class Gemv { // Row major gemm public: Gemv() = default; - ~Gemv() {} + ~Gemv() = default; SaberStatus init(const bool trans_A, const int m, const int n, const int incx, const int incy, diff --git a/saber/funcs/generate_proposals.h b/saber/funcs/generate_proposals.h new file mode 100644 index 000000000..3ef8a6021 --- /dev/null +++ b/saber/funcs/generate_proposals.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
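// Editor's sketch: a naive row-major reference for the MatrixFunc::dispatch contract
// introduced above, assuming the usual BLAS-style semantics C = alpha * op(A) * op(B) + beta * C
// with op() controlled by the trans flags passed to init(). Purely illustrative; the real
// Gemm dispatches to a platform implementation.
static void gemm_ref(bool trans_a, bool trans_b, int m, int n, int k,
                     float alpha, float beta,
                     const float* a, const float* b, float* c) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            float acc = 0.f;
            for (int p = 0; p < k; ++p) {
                float av = trans_a ? a[p * m + i] : a[i * k + p];
                float bv = trans_b ? b[j * k + p] : b[p * n + j];
                acc += av * bv;
            }
            c[i * n + j] = alpha * acc + beta * c[i * n + j];
        }
    }
}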
+*/ + +#ifndef ANAKIN_SABER_FUNCS_GENERATE_PROPOSALS_H +#define ANAKIN_SABER_FUNCS_GENERATE_PROPOSALS_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_generate_proposals.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_generate_proposals.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_generate_proposals.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_generate_proposals.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_generate_proposals.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_generate_proposals.h" +#endif + +namespace anakin { +namespace saber { + +template +class GenerateProposals : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + GenerateProposalsParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + GenerateProposalsParam>::BaseFunc; + + GenerateProposals() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef GenerateProposalsParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + Shape output_shape({input[2]->num() * param.post_nms_top_n, 5, 1, 1}, Layout_NCHW); + std::vector seq_offset; + for (int i = 0; i < input[2]->num() + 1; i++) { + seq_offset.push_back(i*param.post_nms_top_n); + } + + output[0]->set_seq_offset({seq_offset}); + output[1]->set_shape(Shape({input[2]->num() * param.post_nms_top_n, 1, 1, 1}, Layout_NCHW)); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + //this->_impl.push_back(new VenderGenerateProposals _impl.push_back(new VenderGenerateProposals ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberGenerateProposals ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/im2sequence.h b/saber/funcs/im2sequence.h index 20caf864e..b857d8af6 100644 --- a/saber/funcs/im2sequence.h +++ b/saber/funcs/im2sequence.h @@ -22,6 +22,9 @@ #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_im2sequence.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_im2sequence.h" +#endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_im2sequence.h" diff --git a/saber/funcs/impl/.DS_Store b/saber/funcs/impl/.DS_Store new file mode 100644 index 000000000..b36771bbc Binary files /dev/null and b/saber/funcs/impl/.DS_Store differ diff --git a/saber/funcs/impl/arm/impl/neon_mathfun.h b/saber/funcs/impl/arm/impl/neon_mathfun.h deleted file mode 100644 index 8c074b56d..000000000 --- a/saber/funcs/impl/arm/impl/neon_mathfun.h +++ /dev/null @@ -1,320 +0,0 @@ -/* NEON implementation of sin, cos, exp and log - * - * Inspired by Intel Approximate Math library, and based on the - * corresponding algorithms of the cephes math library - */ - -/* Copyright (C) 2011 Julien Pommier - * - * This software is provided 'as-is', without any express or implied - * warranty. 
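// Editor's sketch of the GenerateProposals output bookkeeping above: every image contributes
// post_nms_top_n rois, so the sequence offsets are {0, top_n, 2*top_n, ..., batch*top_n},
// the rois output is {batch*top_n, 5, 1, 1} and the scores output is {batch*top_n, 1, 1, 1}.
#include <vector>

static std::vector<int> proposals_seq_offset(int batch, int post_nms_top_n) {
    std::vector<int> offsets;
    for (int i = 0; i <= batch; ++i) {
        offsets.push_back(i * post_nms_top_n);   // batch + 1 monotonically increasing entries
    }
    return offsets;
}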
In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - * - * (this is the zlib license) - */ -#ifndef ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H -#define ANAKIN_SABER_FUNCS_ARM_IMPL_NEON_MATHFUN_H - -#include "saber/core/common.h" - -#define c_inv_mant_mask ~0x7f800000u -#define c_cephes_SQRTHF 0.707106781186547524 -#define c_cephes_log_p0 7.0376836292E-2 -#define c_cephes_log_p1 - 1.1514610310E-1 -#define c_cephes_log_p2 1.1676998740E-1 -#define c_cephes_log_p3 - 1.2420140846E-1 -#define c_cephes_log_p4 + 1.4249322787E-1 -#define c_cephes_log_p5 - 1.6668057665E-1 -#define c_cephes_log_p6 + 2.0000714765E-1 -#define c_cephes_log_p7 - 2.4999993993E-1 -#define c_cephes_log_p8 + 3.3333331174E-1 -#define c_cephes_log_q1 -2.12194440e-4 -#define c_cephes_log_q2 0.693359375 - -/* natural logarithm computed for 4 simultaneous float - * return NaN for x <= 0 - */ -static inline float32x4_t log_ps(float32x4_t x) -{ - float32x4_t one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ - uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); - - int32x4_t ux = vreinterpretq_s32_f32(x); - - int32x4_t emm0 = vshrq_n_s32(ux, 23); - - /* keep only the fractional part */ - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - float32x4_t e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - /* part2: - * if( x < SQRTHF ) { - * e -= 1; - * x = x + x - 1.0; - * } else { x = x - 1.0; } - */ - uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - float32x4_t z = vmulq_f32(x,x); - - float32x4_t y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = 
vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN - return x; -} - -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f - -#define c_cephes_LOG2EF 1.44269504088896341 -#define c_cephes_exp_C1 0.693359375 -#define c_cephes_exp_C2 -2.12194440e-4 - -#define c_cephes_exp_p0 1.9875691500E-4 -#define c_cephes_exp_p1 1.3981999507E-3 -#define c_cephes_exp_p2 8.3334519073E-3 -#define c_cephes_exp_p3 4.1665795894E-2 -#define c_cephes_exp_p4 1.6666665459E-1 -#define c_cephes_exp_p5 5.0000001201E-1 - -/* exp() computed for 4 float at once */ -static inline float32x4_t exp_ps(float32x4_t x) -{ - float32x4_t tmp, fx; - - float32x4_t one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - uint32x4_t mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 }; - float32x4_t y = vld1q_dup_f32(cephes_exp_p+0); - float32x4_t c1 = vld1q_dup_f32(cephes_exp_p+1); - float32x4_t c2 = vld1q_dup_f32(cephes_exp_p+2); - float32x4_t c3 = vld1q_dup_f32(cephes_exp_p+3); - float32x4_t c4 = vld1q_dup_f32(cephes_exp_p+4); - float32x4_t c5 = vld1q_dup_f32(cephes_exp_p+5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - float32x4_t pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -#define c_minus_cephes_DP1 -0.78515625 -#define c_minus_cephes_DP2 -2.4187564849853515625e-4 -#define c_minus_cephes_DP3 -3.77489497744594108e-8 -#define c_sincof_p0 -1.9515295891E-4 -#define c_sincof_p1 8.3321608736E-3 -#define c_sincof_p2 -1.6666654611E-1 -#define c_coscof_p0 2.443315711809948E-005 -#define c_coscof_p1 -1.388731625493765E-003 -#define c_coscof_p2 4.166664568298827E-002 -#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI - -/* evaluation of 4 sines & cosines at once. - * - * The code is the exact rewriting of the cephes sinf function. - * Precision is excellent as long as x < 8192 (I did not bother to - * take into account the special handling they have for greater values - * -- it does not return garbage for arguments over 8192, though, but - * the extra precision is missing). - * - * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - * surprising but correct result. - * - * Note also that when you compute sin(x), cos(x) is available at - * almost no extra price so both sin_ps and cos_ps make use of - * sincos_ps.. 
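// Editor's scalar walk-through of the cephes-style range reduction used by the exp_ps() in
// the NEON math header being removed above: exp(x) = 2^n * exp(g) with n = round(x * log2(e)),
// g reconstructed via the split constants C1 + C2, then a degree-5 polynomial. Sketch only,
// as a reading aid for the intrinsics; the constants are copied from the code above.
#include <cmath>

static float exp_cephes_scalar(float x) {
    x = std::fmin(x, 88.3762626647949f);          // c_exp_hi
    x = std::fmax(x, -88.3762626647949f);         // c_exp_lo
    float fx = std::floor(x * 1.44269504088896341f + 0.5f);   // n = round(x * LOG2EF)
    x -= fx * 0.693359375f;                        // c_cephes_exp_C1
    x -= fx * -2.12194440e-4f;                     // c_cephes_exp_C2
    float y = 1.9875691500e-4f;                    // p0 .. p5, Horner evaluation
    y = y * x + 1.3981999507e-3f;
    y = y * x + 8.3334519073e-3f;
    y = y * x + 4.1665795894e-2f;
    y = y * x + 1.6666665459e-1f;
    y = y * x + 5.0000001201e-1f;
    y = y * x * x + x + 1.f;
    return std::ldexp(y, static_cast<int>(fx));    // multiply by 2^n
}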
- */ -static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos) -{ - // any x - float32x4_t xmm1, xmm2, xmm3, y; - - uint32x4_t emm2; - - uint32x4_t sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - /* scale by 4/Pi */ - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - /* store the integer part of y in mm0 */ - emm2 = vcvtq_u32_f32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - /* get the polynom selection mask - * there is one polynom for 0 <= x <= Pi/4 - * and another one for Pi/4 -SaberStatus SaberActivation::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - ActivationParam ¶m) { - - int num = inputs[0]->num(); - int channel = inputs[0]->channel(); - float* ptr_out = (float*)outputs[0]->mutable_data(); - const float* ptr_in = (const float*)inputs[0]->data(); - int size = inputs[0]->valid_size(); - int csize= size / (channel * num); - int threads = 1; - this->_ctx->get_mode(threads); - //multi threads - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - //openmp 16 - int neon_loop_cnt = nums_per_thread >> 4; - int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); - //deal with 4 data - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - float32x4_t vzero = vdupq_n_f32(0.f); - float coef = param.coef; - float slope = param.negative_slope; - bool channel_shared = param.prelu_param.channel_shared; - float* slopes_ptr = nullptr; - switch (param.active){ - //x > 0 ? x :0 - case Active_relu: - #pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = ptr_in + i * nums_per_thread; - float* ptr_out_thread = ptr_out + i * nums_per_thread; - int cnt = neon_loop_cnt; -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; num++){ - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - // ptr_in_thread+=4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread + 4); - // ptr_in_thread+=4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread + 8); - // ptr_in_thread+=4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread + 12); - //ptr_in_thread+=4; - ptr_in_thread += 16; - vr0 = vmaxq_f32(vr0, vzero); - vr1 = vmaxq_f32(vr1, vzero); - vr2 = vmaxq_f32(vr2, vzero); - vr3 = vmaxq_f32(vr3, vzero); - vst1q_f32(ptr_out_thread, vr0); - //ptr_out_thread+=4; - vst1q_f32(ptr_out_thread + 4, vr1); - // ptr_out_thread+=4; - vst1q_f32(ptr_out_thread + 8, vr2); - // ptr_out_thread+=4; - vst1q_f32(ptr_out_thread + 12, vr3); - //ptr_out_thread+=4; - ptr_out_thread += 16; - } -#else - if (cnt > 0) { - asm volatile ( - "1: @ loop header\n" - "vld1.32 {d0-d1}, [%[din]]! @ load din 0\n" - "vld1.32 {d2-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d5}, [%[din]]! @ load din 0\n" - "vld1.32 {d6-d7}, [%[din]]! @ load din 0\n" - - "vmax.f32 q8, q0, %q[vzero] @ relu\n" - "vmax.f32 q9, q1, %q[vzero] @ relu\n" - "vmax.f32 q10, q2, %q[vzero] @ relu\n" - "vmax.f32 q11, q3, %q[vzero] @ relu\n" - - "vst1.32 {d16-d17}, [%[dout]]! @ store result, add pointer\n" - "pld [%[din]] @ preload data\n" - "vst1.32 {d18-d19}, [%[dout]]! @ store result, add pointer\n" - "pld [%[din], #128] @ preload data\n" - "vst1.32 {d20-d21}, [%[dout]]! @ store result, add pointer\n" - "pld [%[din], #256] @ preload data\n" - "vst1.32 {d22-d23}, [%[dout]]! 
@ store result, add pointer\n" - "pld [%[din], #384] @ preload data\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 1b @ jump to main loop start point\n" - :[dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread), [cnt] "+r"(cnt) - :[vzero] "w" (vzero) - :"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); - } -#endif - for (int j = 0; j < neon_loop_remain; j++) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f; - ptr_in_thread++; - ptr_out_thread++; - } - } - ptr_out = ptr_out + threads * nums_per_thread; - ptr_in = ptr_in + threads * nums_per_thread; - for (int i = 0; i < remain; i++) { - ptr_out[0] = ptr_in[0] > 0.f ? ptr_in[0] : 0.f; - ptr_in++; - ptr_out++; - } - break; - - // x > 0 ? x : 0; - // x < threshold ? x : threshold - case Active_clipped_relu: - //coef = param.coef; - #pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = ptr_in + i * nums_per_thread; - float* ptr_out_thread = ptr_out + i * nums_per_thread; - int cnt = neon_loop_cnt; - float32x4_t vthreshold = vdupq_n_f32(coef); -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; num++){ - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - float32x4_t vr1 = vld1q_f32(ptr_in_thread + 4); - float32x4_t vr2 = vld1q_f32(ptr_in_thread + 8); - float32x4_t vr3 = vld1q_f32(ptr_in_thread + 12); - ptr_in_thread += 16; - - vr0 = vmaxq_f32(vr0,vzero); - vr1 = vmaxq_f32(vr1,vzero); - vr2 = vmaxq_f32(vr2,vzero); - vr3 = vmaxq_f32(vr3,vzero); - - uint32x4_t vmask0 = vcgeq_f32(vr0, vthreshold); - uint32x4_t vmask1 = vcgeq_f32(vr1, vthreshold); - uint32x4_t vmask2 = vcgeq_f32(vr2, vthreshold); - uint32x4_t vmask3 = vcgeq_f32(vr3, vthreshold); - - float32x4_t vout0 =vbslq_f32(vmask0, vthreshold, vr0); - float32x4_t vout1 =vbslq_f32(vmask1, vthreshold, vr1); - float32x4_t vout2 =vbslq_f32(vmask2, vthreshold, vr2); - float32x4_t vout3 =vbslq_f32(vmask3, vthreshold, vr3); - - - vst1q_f32(ptr_out_thread, vout0); - vst1q_f32(ptr_out_thread + 4, vout1); - vst1q_f32(ptr_out_thread + 8, vout2); - vst1q_f32(ptr_out_thread + 12, vout3); - //ptr_out_thread+=4; - ptr_out_thread += 16; - } -#else - if (cnt > 0) { - asm volatile ( - "3: @ loop header\n" - "vld1.32 {d0-d1}, [%[din]]! @ load din 0\n" - "vld1.32 {d2-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d5}, [%[din]]! @ load din 0\n" - "vld1.32 {d6-d7}, [%[din]]! @ load din 0\n" - - "vmax.f32 q8, q0, %q[vzero] @ relu\n" - "vmax.f32 q9, q1, %q[vzero] @ relu\n" - "vmax.f32 q10, q2, %q[vzero] @ relu\n" - "vmax.f32 q11, q3, %q[vzero] @ relu\n" - - "vcgt.f32 q0, q8, %q[vthreshold] @ v0 > threshold\n" - "vcgt.f32 q1, q9, %q[vthreshold] @ v0 > threshold\n" - "vcgt.f32 q2, q10, %q[vthreshold] @ v0 > threshold\n" - "vcgt.f32 q3, q11, %q[vthreshold] @ v0 > threshold\n" - - "vbit.f32 q8, %q[vthreshold], q0 @ \n" - "vbit.f32 q9, %q[vthreshold], q1 @ \n" - "vbit.f32 q10, %q[vthreshold], q2 @ \n" - "vbit.f32 q11, %q[vthreshold], q3 @ \n" - - "vst1.32 {d16-d17}, [%[dout]]! @ store result, add pointer\n" - "pld [%[din]] @ preload data\n" - "vst1.32 {d18-d19}, [%[dout]]! @ store result, add pointer\n" - "pld [%[din], #128] @ preload data\n" - "vst1.32 {d20-d21}, [%[dout]]! @ store result, add pointer\n" - "pld [%[din], #256] @ preload data\n" - "vst1.32 {d22-d23}, [%[dout]]! 
@ store result, add pointer\n" - "pld [%[din], #384] @ preload data\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 3b @ jump to main loop start point\n" - :[dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread), [cnt] "+r"(cnt) - :[vzero] "w" (vzero), [vthreshold] "w" (vthreshold) - :"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); - } -#endif - for (int j = 0; j < neon_loop_remain; j++) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? (ptr_in_thread[0] > coef ? coef : ptr_in_thread[0]) : 0.f; - ptr_in_thread++; - ptr_out_thread++; - } - } - ptr_out = ptr_out + threads * nums_per_thread; - ptr_in = ptr_in + threads * nums_per_thread; - for (int i = 0; i < remain; i++) { - ptr_out[0] = ptr_in[0] > 0.f ? (ptr_in[0] > coef ? coef : ptr_in[0]) : 0.f; - ptr_in++; - ptr_out++; - } - break; - //sigmoid: 1/(exp(-x) + 1) - case Active_sigmoid: - #pragma omp parallel for - for (int i = 0; i < threads; i++) { - float32x4_t exp_vec = vdupq_n_f32(0.0f); - float32x4_t recip = vdupq_n_f32(0.0f); - const float* ptr_in_thread = ptr_in + i * nums_per_thread; - float* ptr_out_thread = ptr_out + i * nums_per_thread; - for (int j = 0; j < neon_loop_cnt_dim4; j++ ) { - exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread))); - exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f)); - recip = vrecpeq_f32(exp_vec); - recip = vmulq_f32 (vrecpsq_f32 (exp_vec, recip), recip); - recip = vmulq_f32 (vrecpsq_f32 (exp_vec, recip), recip); - vst1q_f32(ptr_out_thread, recip); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; j++){ - ptr_out_thread[0] = 1 / (1 + exp(-ptr_in_thread[0])); - ptr_in_thread++; - ptr_out_thread++; - } - } - ptr_out = ptr_out + threads * nums_per_thread; - ptr_in = ptr_in + threads * nums_per_thread; - for (int i = 0; i < remain; i++) { - ptr_out[0] = 1/(1+exp(-ptr_in[0])); - ptr_in++; - ptr_out++; - } - break; - - // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) - case Active_tanh: - //LOG(INFO) << "Active_tanh"; - #pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t vtwo = vdupq_n_f32(2.0f); - float32x4_t vone = vdupq_n_f32(1.0f); - const float* ptr_in_thread = ptr_in + i * nums_per_thread; - float* ptr_out_thread = ptr_out + i * nums_per_thread; - int cnt4 = neon_loop_cnt_dim4; - int remain4 = size; - cnt4 = cnt4 < 5 ? cnt4 : 0; - remain4 = cnt4 == 0 ? 
remain4 : neon_loop_remain_dim4; - for (int j = 0; j < cnt4; j++) { - float32x4_t vdin = vld1q_f32(ptr_in_thread); - float32x4_t vsum = vmulq_f32(vdin, vtwo); - float32x4_t vexp_sum = exp_ps(vsum); - float32x4_t vadd_sum = vaddq_f32(vexp_sum, vone); - float32x4_t vrecip = div_ps(vtwo, vadd_sum); - float32x4_t vout = vsubq_f32(vone, vrecip); - vst1q_f32(ptr_out_thread, vout); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for(int j = 0; j < remain4; j++){ - ptr_out_thread[0] = 1.0 - 2.0 / (1.0 + exp(2.0 * ptr_in_thread[0])); - //(exp(ptr_in_thread[0]) - exp(-ptr_in_thread[0])) / (exp(ptr_in_thread[0]) + exp(-ptr_in_thread[0])); - ptr_in_thread++; - ptr_out_thread++; - } - } - ptr_out = ptr_out + threads * nums_per_thread; - ptr_in = ptr_in + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = 1.0 - 2.0 / (1.0 + exp(2.0 * ptr_in[0]));//(exp(ptr_in[0]) - exp(-ptr_in[0])) / (exp(ptr_in[0]) + exp(-ptr_in[0])); - ptr_in++; - ptr_out++; - } - break; - - // stanh : b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} - case Active_stanh: - #pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vslope = vdupq_n_f32(slope); - float32x4_t vtwo = vdupq_n_f32(2.0f); - float32x4_t vone = vdupq_n_f32(1.0f); - const float* ptr_in_thread = ptr_in + i * nums_per_thread; - float* ptr_out_thread = ptr_out + i * nums_per_thread; - int cnt4 = neon_loop_cnt_dim4; - int remain4 = size; - cnt4 = cnt4 < 10 ? cnt4 : 0; - remain4 = cnt4 == 0 ? remain4 : neon_loop_remain_dim4; - for (int j = 0; j < cnt4; j++) { - float32x4_t vdin = vld1q_f32(ptr_in_thread); - float32x4_t vmul_sum = vmulq_f32(vdin, vslope); - float32x4_t vsum = vmulq_f32(vmul_sum, vtwo); - float32x4_t vexp_sum = exp_ps(vsum); - float32x4_t vadd_sum = vaddq_f32(vexp_sum, vone); - float32x4_t vrecip = div_ps(vtwo, vadd_sum); - float32x4_t vout = vsubq_f32(vone, vrecip); - vout = vmulq_f32(vout, vcoef); - vst1q_f32(ptr_out_thread, vout); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for(int j = 0; j < remain4; j++){ - float din = ptr_in_thread[0] * slope; - ptr_out_thread[0] = coef * (1.0 - 2.0 / (1.0 + exp(2.0 * din))); - ptr_in_thread++; - ptr_out_thread++; - } - } - ptr_out = ptr_out + threads * nums_per_thread; - ptr_in = ptr_in + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - float din = ptr_in[0] * slope; - ptr_out[0] = coef * (1.0 - 2.0 / (1.0 + exp(2.0 * din))); - ptr_in++; - ptr_out++; - } - break; - - //prelu: x > 0 ? x : slope[c] * x - case Active_prelu: - slopes_ptr = (float*)param.prelu_param.slope->data(); - for (int n = 0; n < num; n++){ - const float* data_in_batch = ptr_in + n * channel * csize; - float* data_out_batch = ptr_out + n * channel * csize; -#pragma omp parallel for - for (int c = 0; c < channel; c++){ - const float* data_in_channel = data_in_batch + c * csize; - float* data_out_channel = data_out_batch + c * csize; - float slope_val = channel_shared ? 
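// Editor's note on the Active_tanh / Active_stanh paths in the ARM implementation being
// removed above: they rely on the identity tanh(x) = 1 - 2 / (1 + exp(2x)), so only one
// exp is needed per element, and stanh is just coef * tanh(slope * x). Scalar reference only.
#include <cmath>

static inline float tanh_via_exp(float x) {
    return 1.f - 2.f / (1.f + std::exp(2.f * x));
}
static inline float stanh_ref(float x, float slope, float coef) {
    return coef * tanh_via_exp(slope * x);         // b * tanh(a * x)
}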
slopes_ptr[0] : slopes_ptr[c]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vslope = vdupq_n_f32(slope_val); - int dim4 = csize >> 2; - int dim4_remain = csize - (dim4 * 4); -#ifdef __aarch64__ - for (int i = 0; i < dim4; i++){ - float32x4_t vr0 = vld1q_f32(data_in_channel); - uint32x4_t vmask = vcltq_f32(vr0, vzero);//vr0 <= vzero - float32x4_t vout = vmulq_f32(vr0, vslope);//vr0 * vslope - float32x4_t vout_sel = vbslq_f32(vmask, vout, vr0); - vst1q_f32(data_out_channel, vout_sel); - data_in_channel += 4; - data_out_channel += 4; - } -#else - int cnt = dim4; - if (dim4 > 0){ - asm volatile( - "2: @main loop\n" - "vld1.f32 {d0-d1}, [%[ptr_in]]! @load q1\n" - "vclt.f32 q1, q0, %q[vzero] @vcle q0 <= vzero\n" - "vmul.f32 q2, q0, %q[vslope] @vmul q0 * vslope\n" - "vbit.32 q0, q2, q1 @vbit q0, q2, q1\n" - "subs %[cnt], #1 @subs nn, 1\n" - "vst1.f32 {d0-d1}, [%[ptr_out]]! @store data\n" - "bne 2b @bne nn\n" - :[ptr_in] "+r" (data_in_channel), [cnt] "+r" (cnt), \ - [ptr_out] "+r" (data_out_channel) - :[vzero] "w" (vzero), [vslope] "w" (vslope) - :"q0", "q1", "q2" - ); - } -#endif //__aarch64__ - for (int i = 0 ; i < dim4_remain ; i++) { - data_out_channel[0] = data_in_channel[0] > 0 ? data_in_channel[0] : data_in_channel[0] * slope_val; - data_in_channel++; - data_out_channel++; - } - } - } - break; - - //elu: x > 0 ? x : coef * (exp(x) - 1) - case Active_elu: - #pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = ptr_in + i * nums_per_thread; - float* ptr_out_thread = ptr_out + i * nums_per_thread; - int cnt = neon_loop_cnt; - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vcoef = vdupq_n_f32(coef); - for (int num = 0; num < neon_loop_cnt; num++){ - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - // ptr_in_thread+=4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread + 4); - // ptr_in_thread+=4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread + 8); - // ptr_in_thread+=4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread + 12); - //ptr_in_thread+=4; - ptr_in_thread += 16; - - float32x4_t vsum0 = exp_ps(vr0); - float32x4_t vsum1 = exp_ps(vr1); - float32x4_t vsum2 = exp_ps(vr2); - float32x4_t vsum3 = exp_ps(vr3); - uint32x4_t vmask0 = vcgeq_f32(vr0, vzero); - uint32x4_t vmask1 = vcgeq_f32(vr1, vzero); - uint32x4_t vmask2 = vcgeq_f32(vr2, vzero); - uint32x4_t vmask3 = vcgeq_f32(vr3, vzero); - vsum0 = vsubq_f32(vsum0, vone); - vsum1 = vsubq_f32(vsum1, vone); - vsum2 = vsubq_f32(vsum2, vone); - vsum3 = vsubq_f32(vsum3, vone); - - vsum0 = vmulq_f32(vsum0, vcoef); - vsum1 = vmulq_f32(vsum1, vcoef); - vsum2 = vmulq_f32(vsum2, vcoef); - vsum3 = vmulq_f32(vsum3, vcoef); - - - - float32x4_t vout0 =vbslq_f32(vmask0, vr0, vsum0); - float32x4_t vout1 =vbslq_f32(vmask1, vr1, vsum1); - float32x4_t vout2 =vbslq_f32(vmask2, vr2, vsum2); - float32x4_t vout3 =vbslq_f32(vmask3, vr3, vsum3); - - vst1q_f32(ptr_out_thread, vout0); - //ptr_out_thread+=4; - vst1q_f32(ptr_out_thread + 4, vout1); - // ptr_out_thread+=4; - vst1q_f32(ptr_out_thread + 8, vout2); - // ptr_out_thread+=4; - vst1q_f32(ptr_out_thread + 12, vout3); - //ptr_out_thread+=4; - ptr_out_thread += 16; - } - - for (int j = 0; j < neon_loop_remain; j++) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : coef * (exp(ptr_in_thread[0]) - 1); - ptr_in_thread++; - ptr_out_thread++; - } - } - ptr_out = ptr_out + threads * nums_per_thread; - ptr_in = ptr_in + threads * nums_per_thread; - for (int i = 0; i < remain; i++) { - ptr_out[0] = ptr_in[0] > 0.f ? 
ptr_in[0] : coef * (exp(ptr_in[0]) - 1); - ptr_in++; - ptr_out++; - } - break; - default: - return SaberUnKownError; - } - return SaberSuccess; -} -DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, ARM, AK_HALF); -DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, ARM, AK_INT8); -} -} // namespace anakin diff --git a/saber/funcs/impl/arm/saber_activation.h b/saber/funcs/impl/arm/saber_activation.h deleted file mode 100644 index 10ef82f8a..000000000 --- a/saber/funcs/impl/arm/saber_activation.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_ACTIVATION_H -#define ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_ACTIVATION_H - -#include "saber/funcs/impl/impl_activation.h" - -namespace anakin{ - -namespace saber{ - -template -class SaberActivation : \ - public ImplBase< - ARM, - OpDtype, - ActivationParam > -{ -public: - typedef typename DataTrait::Dtype OpDataType; - - SaberActivation() - {} - - ~SaberActivation() {} - - virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - ActivationParam& param, Context& ctx) { - this->_ctx = &ctx; - return SaberSuccess; - } - - virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - ActivationParam& param, Context &ctx) { - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector *>& inputs, - std::vector *>& outputs, - ActivationParam& param); - - -}; - -//template class SaberActivation; - -} - -} -#endif //ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_ACTIVATION_H diff --git a/saber/funcs/impl/arm/saber_concat.cpp b/saber/funcs/impl/arm/saber_concat.cpp deleted file mode 100644 index 6fb3e3af5..000000000 --- a/saber/funcs/impl/arm/saber_concat.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "saber/funcs/impl/arm/saber_concat.h" - -namespace anakin{ - -namespace saber{ - -template -void concat_kernel_arm(const int len, const dtype* src, dtype* dst) { - if (dst != src) { - memcpy(dst, src, sizeof(dtype) * len); - } -} - -template <> -SaberStatus SaberConcat::dispatch(\ - const std::vector *>& inputs, - std::vector *>& outputs, - ConcatParam ¶m) { - - int input_size = inputs.size(); - - //! 
get output data, valid shape and stride shape - int offset_concat_axis = 0; - Shape out_shape = outputs[0]->valid_shape(); - const int out_concat_axis = out_shape[param.axis]; - - if (inputs.size() == 1) { - outputs[0]->copy_from(*inputs[0]); - return SaberSuccess; - } - - OpDataType* dout = (OpDataType*)outputs[0]->mutable_data(); - - for (int i = 0; i < input_size; ++i) { - Shape sh_in = inputs[i]->valid_shape(); - const OpDataType* din = (const OpDataType*)inputs[i]->data(); - const int in_concat_axis = sh_in[param.axis]; - for (int n = 0; n < _num_concats; ++n) { - concat_kernel_arm(in_concat_axis * _concat_input_size, - din + n * in_concat_axis * _concat_input_size, - dout + (n * out_concat_axis + offset_concat_axis) - * _concat_input_size); - } - offset_concat_axis += in_concat_axis; - } - return SaberSuccess; -} -DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, ARM, AK_HALF); -DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, ARM, AK_INT8); -//template class SaberConcat; - -} //namespace anakin - -} //namespace anakin diff --git a/saber/funcs/impl/arm/saber_concat.h b/saber/funcs/impl/arm/saber_concat.h deleted file mode 100644 index 1370b7ed8..000000000 --- a/saber/funcs/impl/arm/saber_concat.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_CONCAT_H -#define ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_CONCAT_H - -#include "saber/funcs/impl/impl_concat.h" -#include "saber/core/tensor.h" - -#ifdef USE_ARM_PLACE - -namespace anakin{ - -namespace saber{ - -template -class SaberConcat : \ - public ImplBase< - ARM, OpDtype, - ConcatParam > { -public: - typedef typename DataTrait::Dtype OpDataType; - - SaberConcat() = default; - ~SaberConcat() {} - - virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - ConcatParam ¶m, Context &ctx){ - // get context - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - ConcatParam ¶m, Context &ctx){ - - _num_concats = inputs[0]->count_valid(0, param.axis); - _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector *>& inputs, - std::vector *>& outputs, - ConcatParam ¶m); - -private: - int _num_concats; - int _concat_input_size; -}; - -} //namespace saber - -} //namespace anakin - -#endif //USE_ARM_PLACE - -#endif //ANAKIN_SABER_FUNCS_IMPL_ARM_SABER_CONCAT_H diff --git a/saber/funcs/impl/cuda/.DS_Store b/saber/funcs/impl/cuda/.DS_Store new file mode 100644 index 000000000..29d5ce236 Binary files /dev/null and b/saber/funcs/impl/cuda/.DS_Store differ diff --git a/saber/funcs/impl/cuda/base/.DS_Store b/saber/funcs/impl/cuda/base/.DS_Store new file mode 100644 index 000000000..a1253754d Binary files /dev/null and b/saber/funcs/impl/cuda/base/.DS_Store differ diff --git a/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu b/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu index cb0ba4e90..4e971f8f4 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/calibrate.cu @@ -7,13 +7,32 @@ namespace anakin { namespace saber { +template +__global__ +void convert_data_type4(out_dtype* out_data, const in_dtype* in_data, + int count, float scale) { + int gid = threadIdx.x + blockIdx.x * blockDim.x; + if (gid < count) { + in_vtype load = ((in_vtype*)in_data)[gid]; + out_vtype store; + float load0 = static_cast(load.x) * scale; + float load1 = static_cast(load.y) * scale; + float load2 = static_cast(load.z) * scale; + float load3 = static_cast(load.w) * scale; + store.x = static_cast(__float2int_rn(load0)); + store.y = static_cast(__float2int_rn(load1)); + store.z = static_cast(__float2int_rn(load2)); + store.w = static_cast(__float2int_rn(load3)); + ((out_vtype*)out_data)[gid] = store; + } +} + __global__ void transform_nchw_2_c4(char* out_data, const float* in_data, - int valid_num, int valid_channel_4, int valid_height, int valid_width, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, - float scale, - int count) { + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + float scale, int count, int out_channel) { int load0, load1, load2, load3; int gid = threadIdx.x + blockIdx.x * blockDim.x; @@ -34,57 +53,48 @@ void transform_nchw_2_c4(char* out_data, const float* in_data, + write_w; if (gid < count) { + bool p0, p1, p2, p3; + p0 = (4 * write_c) < out_channel; + p1 = (4 * write_c) + 1 < out_channel; + p2 = (4 * write_c) + 2 < out_channel; + p3 = (4 * 
write_c) + 3 < out_channel; + float r0; char4 write; - load0 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + if (p0) r0 = __ldg(&in_data[in_offset]); + else r0 = 0; + load0 = __float2int_rn(r0 * scale); write.x = static_cast(load0); in_offset += in_c_stride; - load1 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + if (p1) r0 = __ldg(&in_data[in_offset]); + else r0 = 0; + load1 = __float2int_rn(r0 * scale); write.y = static_cast(load1); in_offset += in_c_stride; - load2 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + if (p2) r0 = __ldg(&in_data[in_offset]); + else r0 = 0; + load2 = __float2int_rn(r0 * scale); write.z = static_cast(load2); in_offset += in_c_stride; - load3 = __float2int_rn(__ldg(&in_data[in_offset]) * scale); + if (p3) r0 = __ldg(&in_data[in_offset]); + else r0 = 0; + load3 = __float2int_rn(r0 * scale); write.w = static_cast(load3); ((char4*)out_data)[out_offset] = write; } } -template<> -SaberStatus conv_calibrate_fp32_int8_c4(Tensor &out_tensor, - const Tensor &in_tensor, const float in_scale, Context ctx) { - - const float * in_data = (const float*)in_tensor.data(); - char * out_data = (char*)out_tensor.mutable_data(); - - Shape in_stride = in_tensor.get_stride(); - - Shape in_shape = in_tensor.valid_shape(); - Shape out_shape = out_tensor.valid_shape(); - int count = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]; - cudaStream_t cuda_stream = ctx.get_compute_stream(); - transform_nchw_2_c4<<>>(out_data, in_data, - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - in_stride[0], in_stride[1], in_stride[2], in_stride[3], - out_shape[1] * out_shape[2] * out_shape[3], - out_shape[2] * out_shape[3], out_shape[3], 1, - (1.f / in_scale), count); - - return SaberSuccess; -} - __global__ void transform_nchw_2_nchw(float * out_data, - const float* in_data, const int count, - int in_n, int in_c, int in_h, int in_w, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n, int out_c, int out_h, int out_w, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, - const float *scale, const float input_scale) { + const float* in_data, const int count, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n, int out_c, int out_h, int out_w, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + const float *scale, const float input_scale) { + CUDA_KERNEL_LOOP(tid, count){ int read_w = tid % in_w; int read_h = (tid / (in_w)) % in_h; @@ -112,41 +122,12 @@ __global__ void transform_nchw_2_nchw(float * out_data, } } -template<> -SaberStatus conv_calibrate_int32_fp32( - Tensor &out_tensor, const Tensor &in_tensor, - const float in_scale, const float* weight_scale, Context ctx) { - - Shape in_shape = in_tensor.valid_shape(); - Shape out_shape = out_tensor.valid_shape(); - - Shape stride_in = in_tensor.get_stride(); - Shape stride_out = out_tensor.get_stride(); - - const float *in_data = (const float*)in_tensor.data(); - float *out_data = (float*)out_tensor.mutable_data(); - - const int count = in_tensor.valid_size(); - cudaStream_t cuda_stream = ctx.get_compute_stream(); - - transform_nchw_2_nchw - <<>>( - out_data, in_data, count, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - stride_in[0], stride_in[1], stride_in[2], stride_in[3], - out_shape[0], out_shape[1], out_shape[2], out_shape[3], - stride_out[0], stride_out[1], stride_out[2], stride_out[3], - weight_scale, in_scale); - - return SaberSuccess; -} - 
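The calibration routines in this file all reduce to the same scalar recipe: a float value x is quantized with round-to-nearest as q = __float2int_rn(x * (1 / in_scale)) and stored as a signed char, four consecutive channels are packed into one char4 so the NCHW_C4 layout can be moved with single 32-bit accesses, and the reverse path multiplies each lane by its scale to recover floats. The host-side C++ sketch below is illustrative only and is not part of the patch; the helper names quantize_rn, dequantize and nchw_c4_offset are invented for this example, and it assumes a single per-tensor scale and a channel count padded up to a multiple of four.

#include <cmath>
#include <cstdint>

// Round-to-nearest quantization, mirroring __float2int_rn in the kernels.
// Note: the device kernels cast the rounded int straight to char without
// clamping; values are assumed to fit in [-127, 127] after calibration.
inline int8_t quantize_rn(float x, float in_scale) {
    return static_cast<int8_t>(std::lrint(x * (1.0f / in_scale)));
}

// Recover an approximate float from an int8 lane and its scale.
inline float dequantize(int8_t q, float scale) {
    return static_cast<float>(q) * scale;
}

// Byte offset of element (n, c, h, w) in an NCHW_C4 buffer: channels are
// grouped four at a time and each group is stored as one char4, matching
// the stride arithmetic (C4*H*W, H*W, W, 1) used by transform_nchw_2_c4
// and int8nchwc4_fp32nchw.
inline size_t nchw_c4_offset(int n, int c, int h, int w, int C, int H, int W) {
    const int C4 = (C + 3) / 4;                              // padded channel groups
    const size_t group = (static_cast<size_t>(n) * C4 + c / 4) * H * W
                       + static_cast<size_t>(h) * W + w;     // index of the char4
    return group * 4 + (c % 4);                              // lane inside the char4
}

With this convention, the int8-to-float kernels that follow are just dequantize applied lane by lane, using either one scale per tensor or one scale per output channel.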
__global__ void int8nchwc4_fp32nchw(float* out_data, const char* in_data, - int valid_num, int valid_channel_4, int valid_height, int valid_width, - int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, - int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, - const float* scale, int count) { + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + const float* scale, int count) { float load0, load1, load2, load3; int gid = threadIdx.x + blockIdx.x * blockDim.x; @@ -183,38 +164,91 @@ void int8nchwc4_fp32nchw(float* out_data, const char* in_data, } } -template<> -SaberStatus conv_calibrate_int8_c4_fp32( - Tensor &out_tensor, - const Tensor &in_tensor, - const float* weight_scale, - Context ctx) { +template +__global__ +void nchwc4_2_nchw(dtype* out_data, const char* in_data, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, int count) { - Shape out_stride = out_tensor.get_stride(); - Shape in_shape = in_tensor.valid_shape(); - Shape out_shape = out_tensor.valid_shape(); - int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3]; + dtype load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; - const char * in_data = (const char*)in_tensor.data(); - float * out_data = (float*)out_tensor.mutable_data(); + int read_w = (gid) % valid_width; + int read_h = (gid / (in_h_stride)) % valid_height; + int read_c = (gid / (in_c_stride)) % valid_channel_4; + int read_n = (gid / (in_n_stride)) % valid_num; - cudaStream_t cuda_stream = ctx.get_compute_stream(); - int8nchwc4_fp32nchw<<>>(out_data, in_data, - in_shape[0], in_shape[1], in_shape[2], in_shape[3], - in_shape[1] * in_shape[2] * in_shape[3], - in_shape[2] * in_shape[3], - in_shape[3], 1, - out_stride[0], out_stride[1], out_stride[2], out_stride[3], - weight_scale, count); + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; - return SaberSuccess; + int out_offset = read_n * out_n_stride + + read_c * (out_c_stride << 2) + + read_h * out_h_stride + + read_w * out_w_stride; + + if (gid < count) { + + char4 readin = __ldg(&((const char4*)in_data)[in_offset]); + load0 = static_cast(readin.x); + load1 = static_cast(readin.y); + load2 = static_cast(readin.z); + load3 = static_cast(readin.w); + + out_data[out_offset] = load0; out_offset += out_c_stride; + out_data[out_offset] = load1; out_offset += out_c_stride; + out_data[out_offset] = load2; out_offset += out_c_stride; + out_data[out_offset] = load3; + } } -#define JUDGESIGN(x) (((x) >= 0) ? 
+1 : -1) +__global__ +void int8nchwc4_fp32nchw_s(float* out_data, const char* in_data, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + const float scale, int count) { + + float load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int read_w = (gid) % valid_width; + int read_h = (gid / (in_h_stride)) % valid_height; + int read_c = (gid / (in_c_stride)) % valid_channel_4; + int read_n = (gid / (in_n_stride)) % valid_num; + + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; + + int out_offset = read_n * out_n_stride + + read_c * (out_c_stride << 2) + + read_h * out_h_stride + + read_w * out_w_stride; + + if (gid < count) { + + char4 readin = __ldg(&((const char4*)in_data)[in_offset]); + + load0 = static_cast(readin.x); + load1 = static_cast(readin.y); + load2 = static_cast(readin.z); + load3 = static_cast(readin.w); + + out_data[out_offset] = load0 * scale; out_offset += out_c_stride; + out_data[out_offset] = load1 * scale; out_offset += out_c_stride; + out_data[out_offset] = load2 * scale; out_offset += out_c_stride; + out_data[out_offset] = load3 * scale; + } +} +#define JUDGESIGN(x) (((x) >= 0) ? +1 : -1) __global__ void calibrate_float2char_col(signed char* dst, const float* src, - float * scale, int height, int width) { + float * scale, int height, int width) { int gid = threadIdx.x + blockIdx.x * blockDim.x; float col_max = 0.0f; @@ -244,7 +278,7 @@ void calibrate_float2char_col(signed char* dst, const float* src, __global__ void calibrate_float2char_row(signed char* dst, const float* src, - float * scale, int height, int width) { + float * scale, int height, int width) { int gid = threadIdx.x + blockIdx.x * blockDim.x; float row_max = 0.0f; @@ -293,6 +327,354 @@ __global__ void calibrate_fix2float(float * dst, } } +template <> +SaberStatus conv_data_calibrate( + Tensor &out_tensor, const Tensor &in_tensor, const float in_scale, + const float* weight_scale, Context ctx) { + if (in_tensor.get_dtype() != AK_FLOAT) { + LOG(FATAL) << "input tensor dtype error!"; + } + if (out_tensor.get_dtype() != AK_INT8) { + LOG(FATAL) << "output tensor dtype error!"; + } + if (in_tensor.get_layout() != out_tensor.get_layout()) { + LOG(FATAL) << "convert layout is not same!"; + } + if (in_tensor.valid_size() != out_tensor.valid_size()) { + LOG(FATAL) << "convert size is not same!"; + } + char* out_data = (char*)out_tensor.mutable_data(); + const float* in_data = (const float*)in_tensor.data(); + float scale = 1 / (in_tensor.get_scale()[0]); + int count = in_tensor.valid_size() / 4; // need to check if is multiple of 4 + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + convert_data_type4 + <<>> ( + out_data, in_data, count, scale); + return SaberSuccess; +} + +template <> +SaberStatus conv_data_calibrate( + Tensor &out_tensor, const Tensor &in_tensor, const float in_scale, + const float* weight_scale, Context ctx) { + if (out_tensor.get_dtype() != AK_FLOAT) { + LOG(FATAL) << "output tensor dtype error!"; + } + if (in_tensor.get_dtype() != AK_INT8) { + LOG(FATAL) << "input tensor dtype error!"; + } + if (in_tensor.get_layout() != out_tensor.get_layout()) { + LOG(FATAL) << "convert layout is not same!"; + } + if (in_tensor.valid_size() != out_tensor.valid_size()) { + LOG(FATAL) << "convert size is not same!"; + } + float* out_data = 
(float*)out_tensor.mutable_data(); + const char* in_data = (const char*)in_tensor.data(); + float scale = in_tensor.get_scale()[0]; + int count = in_tensor.valid_size() / 4; // TODO: check that valid_size() is a multiple of 4 + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + convert_data_type4 + <<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, cuda_stream>>> ( + out_data, in_data, count, scale); + + return SaberSuccess; +} + +template <> +SaberStatus conv_data_calibrate( + Tensor &out_tensor, const Tensor &in_tensor, const float in_scale, + const float* weight_scale, Context ctx) { + Shape out_stride = out_tensor.get_stride(); + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3]; + + const char * in_data = (const char*)in_tensor.data(); + float * out_data = (float*)out_tensor.mutable_data(); + + cudaStream_t cuda_stream = ctx.get_compute_stream(); + int8nchwc4_fp32nchw<<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, cuda_stream>>>(out_data, in_data, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3], + in_shape[2] * in_shape[3], + in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], + weight_scale, count); + + return SaberSuccess; +} + +template <> +SaberStatus conv_data_calibrate( + Tensor &out_tensor, const Tensor &in_tensor, const float in_scale, + const float* weight_scale, Context ctx) { + const float * in_data = (const float*)in_tensor.data(); + char * out_data = (char*)out_tensor.mutable_data(); + + Shape in_stride = in_tensor.get_stride(); + + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + + int out_num = out_shape.num(); + int out_channel = out_shape.channel(); + int out_height = out_shape.height(); + int out_width = out_shape.width(); + int out_channel_4 = out_channel >> 2; + bool multiplier_4 = (out_channel & 0x3) != 0; + out_channel_4 += multiplier_4 ? 1 : 0; + int count = out_num * out_channel_4 * out_height * out_width; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + transform_nchw_2_c4<<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, cuda_stream>>>(out_data, in_data, + out_num, out_channel_4, out_height, out_width, + in_stride[0], in_stride[1], in_stride[2], in_stride[3], + out_channel_4 * out_height * out_width, + out_height * out_width, out_width, 1, + (1.f / in_scale), count, out_channel); + + return SaberSuccess; +} + +// This template applies the calibration scales.
+template <> +SaberStatus conv_data_calibrate( + Tensor &out_tensor, const Tensor &in_tensor, const float in_scale, + const float* weight_scale, Context ctx) { + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + + Shape stride_in = in_tensor.get_stride(); + Shape stride_out = out_tensor.get_stride(); + + const float *in_data = (const float*)in_tensor.data(); + float *out_data = (float*)out_tensor.mutable_data(); + + const int count = in_tensor.valid_size(); + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + transform_nchw_2_nchw + <<>>( + out_data, in_data, count, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + out_shape[0], out_shape[1], out_shape[2], out_shape[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3], + weight_scale, in_scale); + + return SaberSuccess; +} + +template <> +SaberStatus flatten_calibrate( + Tensor &out_tensor, + const Tensor &in_tensor, + Context &ctx) { + + if (out_tensor.get_dtype() != AK_FLOAT) { + LOG(FATAL) << "output tensor dtype error!"; + } + if (in_tensor.get_dtype() != AK_INT8) { + LOG(FATAL) << "input tensor dtype error!"; + } + if (in_tensor.get_layout() != out_tensor.get_layout()) { + LOG(FATAL) << "convert layout is not same!"; + } + if (in_tensor.valid_size() != out_tensor.valid_size()) { + LOG(FATAL) << "convert size is not same!"; + } + float* out_data = (float*)out_tensor.mutable_data(); + const char* in_data = (const char*)in_tensor.data(); + float scale = in_tensor.get_scale()[0]; + int count = in_tensor.valid_size() / 4; // need to check if is multiple of 4 + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + convert_data_type4 + <<>> ( + out_data, in_data, count, scale); + + return SaberSuccess; +} + +template <> +SaberStatus flatten_calibrate( + Tensor &out_tensor, + const Tensor &in_tensor, + Context &ctx) { + if (in_tensor.get_dtype() != AK_FLOAT) { + LOG(FATAL) << "input tensor dtype error!"; + } + if (out_tensor.get_dtype() != AK_INT8) { + LOG(FATAL) << "output tensor dtype error!"; + } + if (in_tensor.get_layout() != out_tensor.get_layout()) { + LOG(FATAL) << "convert layout is not same!"; + } + if (in_tensor.valid_size() != out_tensor.valid_size()) { + LOG(FATAL) << "convert size is not same!"; + } + char* out_data = (char*)out_tensor.mutable_data(); + const float* in_data = (const float*)in_tensor.data(); + float scale = 1 / (in_tensor.get_scale()[0]); + int count = in_tensor.valid_size() / 4; // need to check if is multiple of 4 + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + convert_data_type4 + <<>> ( + out_data, in_data, count, scale); + return SaberSuccess; +} + +template<> +SaberStatus conv_calibrate_fp32_int8_c4(Tensor &out_tensor, + const Tensor &in_tensor, const float in_scale, Context ctx) { + + const float * in_data = (const float*)in_tensor.data(); + char * out_data = (char*)out_tensor.mutable_data(); + + Shape in_stride = in_tensor.get_stride(); + + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + + int out_num = out_shape.num(); + int out_channel = in_shape.channel(); + int out_height = out_shape.height(); + int out_width = out_shape.width(); + int out_channel_4 = out_channel >> 2; + bool multipler_4 = (out_channel & 0x3) != 0; + out_channel_4 += multipler_4 ? 
1 : 0; + int count = out_num * out_channel_4 * out_height * out_width; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + transform_nchw_2_c4<<>>(out_data, in_data, + out_num, out_channel_4, out_height, out_width, + in_stride[0], in_stride[1], in_stride[2], in_stride[3], + out_channel_4 * out_height * out_width, + out_height * out_width, out_width, 1, + (1.f / in_scale), count, out_channel); + + return SaberSuccess; +} + +template<> +SaberStatus conv_calibrate_int32_fp32( + Tensor &out_tensor, const Tensor &in_tensor, + const float in_scale, const float* weight_scale, Context ctx) { + + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + + Shape stride_in = in_tensor.get_stride(); + Shape stride_out = out_tensor.get_stride(); + + const float *in_data = (const float*)in_tensor.data(); + float *out_data = (float*)out_tensor.mutable_data(); + + const int count = in_tensor.valid_size(); + cudaStream_t cuda_stream = ctx.get_compute_stream(); + + transform_nchw_2_nchw + <<>>( + out_data, in_data, count, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + out_shape[0], out_shape[1], out_shape[2], out_shape[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3], + weight_scale, in_scale); + + return SaberSuccess; +} + +template<> +SaberStatus conv_calibrate_int8_c4_fp32( + Tensor &out_tensor, + const Tensor &in_tensor, + const float* weight_scale, + Context ctx) { + + Shape out_stride = out_tensor.get_stride(); + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3] / 4; + + const char * in_data = (const char*)in_tensor.data(); + float * out_data = (float*)out_tensor.mutable_data(); + + cudaStream_t cuda_stream = ctx.get_compute_stream(); + int8nchwc4_fp32nchw<<>>(out_data, in_data, + in_shape[0], in_shape[1] / 4, in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3], + in_shape[2] * in_shape[3], + in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], + weight_scale, count); + + return SaberSuccess; +} + +template <> +SaberStatus layout_trans_nchwc4_2_nchw( + Tensor &out_tensor, + const Tensor &in_tensor, + float scale, + Context ctx) { + + Shape out_stride = out_tensor.get_stride(); + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3] / 4; + + cudaStream_t cuda_stream = ctx.get_compute_stream(); + if (in_tensor.get_dtype() == AK_FLOAT) { + flatten_calibrate(out_tensor, in_tensor, ctx); + } else if (in_tensor.get_dtype() == AK_INT8) { + const char * in_data = (const char*)in_tensor.data(); + char * out_data = (char*)out_tensor.mutable_data(); + nchwc4_2_nchw<<>>(out_data, in_data, + in_shape[0], in_shape[1] / 4, in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3] / 4, + in_shape[2] * in_shape[3], in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], count); + } else { + LOG(FATAL) << "tensor dtype is wrong!!!"; + } + + return SaberSuccess; +} + +template<> +SaberStatus calibrate_int8_c4_fp32( + Tensor &out_tensor, + const Tensor &in_tensor, + const float out_scale, + Context ctx) { + + Shape out_stride = out_tensor.get_stride(); + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3] / 4; + const char 
* in_data = (const char*)in_tensor.data(); + float * out_data = (float*)out_tensor.mutable_data(); + + cudaStream_t cuda_stream = ctx.get_compute_stream(); + int8nchwc4_fp32nchw_s<<>>(out_data, in_data, + in_shape[0], in_shape[1] / 4, in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3] / 4, + in_shape[2] * in_shape[3], + in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], + out_scale, count); + + return SaberSuccess; +} + template <> void float2char(bool col_direct, signed char* dst, const float* src, float *scale, int height, int width, Context ctx) { @@ -316,5 +698,7 @@ void fix2float(float * dst, calibrate_fix2float<<>>(dst, sA, sB, alpha, beta, height, width, threads); } + + } } \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/reorder.cu b/saber/funcs/impl/cuda/base/cuda_c/reorder.cu new file mode 100644 index 000000000..ce3923c99 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/reorder.cu @@ -0,0 +1,166 @@ + +#include "saber/funcs/impl/cuda/reorder.h" + +namespace anakin { +namespace saber { + +template +__global__ +void transform_nchw_2_c4(dtype* out_data, const dtype* in_data, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + int count) { + + dtype load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int write_w = (gid) % valid_width; + int write_h = (gid / (out_h_stride)) % valid_height; + int write_c = (gid / (out_c_stride)) % valid_channel_4; + int write_n = (gid / (out_n_stride)) % valid_num; + + int in_offset = write_n * in_n_stride + + write_c * in_c_stride * 4 + + write_h * in_h_stride + + write_w * in_w_stride; + + int out_offset = write_n * out_n_stride + + write_c * out_c_stride + + write_h * out_h_stride + + write_w; + + if (gid < count) { + vtype write; + load0 = in_data[in_offset]; + write.x = load0; + + in_offset += in_c_stride; + load1 = in_data[in_offset]; + write.y = load1; + + in_offset += in_c_stride; + load2 = in_data[in_offset]; + write.z = load2; + + in_offset += in_c_stride; + load3 = in_data[in_offset]; + write.w = load3; + + ((vtype*)out_data)[out_offset] = write; + } +} + +template<> +SaberStatus convert_nchw_to_nchwc4(Tensor &out_tensor, + const Tensor &in_tensor, Context ctx) { + + CHECK_EQ(out_tensor.get_dtype(), in_tensor.get_dtype()); + const void * in_data = in_tensor.data(); + void * out_data = out_tensor.mutable_data(); + + Shape in_stride = in_tensor.get_stride(); + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]; + cudaStream_t cuda_stream = ctx.get_compute_stream(); + if (out_tensor.get_dtype() == AK_INT8) { + transform_nchw_2_c4 + << < CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, + 0, cuda_stream >> > ((char*)out_data, (const char*)in_data, + out_shape[0], out_shape[1], out_shape[2], out_shape[3], + in_stride[0], in_stride[1], in_stride[2], in_stride[3], + out_shape[1] * out_shape[2] * out_shape[3], + out_shape[2] * out_shape[3], out_shape[3], 1, + count); + } else if (out_tensor.get_dtype() == AK_FLOAT) { + transform_nchw_2_c4 + << < CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, + 0, cuda_stream >> > ((float*)out_data, (const float*)in_data, + out_shape[0], out_shape[1], out_shape[2], out_shape[3], + in_stride[0], in_stride[1], in_stride[2], in_stride[3], + out_shape[1] * 
out_shape[2] * out_shape[3], + out_shape[2] * out_shape[3], out_shape[3], 1, + count); + } else { + LOG(FATAL) << "NOT SUPPORT THIS DATATYPE in reorder!!!"; + } + return SaberSuccess; +} + +template +__global__ +void transform_nchwc4_2_nchw(dtype* out_data, const dtype* in_data, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + int count) { + + int gid = threadIdx.x + blockIdx.x * blockDim.x; + int read_w = (gid) % valid_width; + int read_h = (gid / (in_h_stride)) % valid_height; + int read_c = (gid / (in_c_stride)) % valid_channel_4; + int read_n = (gid / (in_n_stride)) % valid_num; + + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; + + int out_offset = read_n * out_n_stride + + read_c * (out_c_stride << 2) + + read_h * out_h_stride + + read_w * out_w_stride; + + if (gid < count) { + vtype readin = ((const vtype*)in_data)[in_offset]; + out_data[out_offset] = readin.x; out_offset += out_c_stride; + out_data[out_offset] = readin.y; out_offset += out_c_stride; + out_data[out_offset] = readin.z; out_offset += out_c_stride; + out_data[out_offset] = readin.w; + } +} + +template<> +SaberStatus convert_nchwc4_to_nchw( + Tensor &out_tensor, + const Tensor &in_tensor, + Context ctx) { + + CHECK_EQ(out_tensor.get_dtype(), in_tensor.get_dtype()); + + Shape out_stride = out_tensor.get_stride(); + Shape in_shape = in_tensor.valid_shape(); + Shape out_shape = out_tensor.valid_shape(); + int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3]; + + const void * in_data = in_tensor.data(); + void * out_data = out_tensor.mutable_data(); + + cudaStream_t cuda_stream = ctx.get_compute_stream(); + if (out_tensor.get_dtype() == AK_INT8) { + transform_nchwc4_2_nchw + << < CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, cuda_stream >> > ( + (char*)out_data, (const char*)in_data, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3], + in_shape[2] * in_shape[3], in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], count); + } else if (out_tensor.get_dtype() == AK_FLOAT) { + transform_nchwc4_2_nchw + << < CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, cuda_stream >> > ( + (float*)out_data, (const float*)in_data, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3], + in_shape[2] * in_shape[3], in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], count); + } else { + LOG(FATAL) << "NOT SUPPORT THIS DATATYPE in reorder!!!"; + } + + return SaberSuccess; +} + + +} // namespace saber +} // namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu index 4717081bb..fbaf156c8 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_activation.cu @@ -1,5 +1,8 @@ #include "saber/funcs/impl/cuda/saber_activation.h" -#include "cuda_fp16.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/calibrate.h" + +#define BUILD_DEV __device__ namespace anakin{ namespace saber{ @@ -55,6 +58,7 @@ __global__ void ker_sigmoid_fwd(Dtype * out_data, + w * out_w_stride; Dtype in_var = in_data[in_idx]; + out_data[out_idx] = Dtype( Dtype(1) / (Dtype(1)+ exp(-in_var))); } @@ -148,6 +152,34 @@ __global__ void 
ker_clipped_relu_fwd(Dtype * out_data, out_data[out_idx] = in_var < clipped_threadhold? in_var : clipped_threadhold; } } + +template +__global__ void ker_swish_fwd(Dtype * out_data, + const Dtype* in_data, const int count, Dtype beta, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { + CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; + + Dtype in_var = in_data[in_idx]; + out_data[out_idx] = Dtype( in_var / (Dtype(1)+ exp(-(beta * in_var)))); + } +} + template __global__ void ker_elu_fwd(Dtype * out_data, const Dtype* in_data, const int count, Dtype coef, @@ -175,6 +207,34 @@ __global__ void ker_elu_fwd(Dtype * out_data, } } +template +__global__ void ker_gelu_fwd(Dtype * out_data, + const Dtype* in_data, const int count, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride) { + CUDA_KERNEL_LOOP(tid, count){ + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; + + Dtype in_var = in_data[in_idx]; + Dtype coeff = 0.5 * (std::erf(in_var / pow(2, 0.5)) + 1); + out_data[out_idx] = in_var * coeff; + } +} + template __global__ void ker_prelu_fwd(Dtype * out_data, const Dtype* in_data, const int count, @@ -207,31 +267,50 @@ __global__ void ker_prelu_fwd(Dtype * out_data, } } -template -SaberStatus SaberActivation::dispatch( \ +template <> +SaberStatus SaberActivation::create( \ const std::vector*>& inputs, std::vector*>& outputs, - ActivationParam& param) { + ActivationParam& param, Context& ctx) { + + this->_ctx = &ctx; + return SaberSuccess; +} +template <> +SaberStatus SaberActivation::init( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param, Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberActivation::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param) { Shape in_shape = inputs[0]->valid_shape(); Shape out_shape = outputs[0]->valid_shape(); Shape stride_in = inputs[0]->get_stride(); Shape stride_out = outputs[0]->get_stride(); - const OpDataType *in_data = (const OpDataType*)inputs[0]->data(); - OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); + const float *in_data = (const float*)inputs[0]->data(); + float *out_data = (float*)outputs[0]->mutable_data(); const int count = inputs[0]->valid_size(); cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); - OpDataType negative_slope = param.negative_slope; - OpDataType coef = param.coef; + float negative_slope = param.negative_slope; + float coef = param.coef; switch (param.active) { //x > 0 ? 
x : 0 case Active_relu: - ker_relu_fwd + ker_relu_fwd <<>>( out_data, in_data, count, negative_slope, in_shape[0], in_shape[1], in_shape[2], in_shape[3], @@ -242,7 +321,7 @@ SaberStatus SaberActivation::dispatch( \ // sigmoid: 1/(exp(-x) + 1) case Active_sigmoid: - ker_sigmoid_fwd + ker_sigmoid_fwd <<>>( out_data, in_data, count, in_shape[0], in_shape[1], in_shape[2], in_shape[3], @@ -250,10 +329,21 @@ SaberStatus SaberActivation::dispatch( \ stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; + // swish: x / (exp(-b * x) + 1) + case Active_swish: + + ker_swish_fwd + <<>>( + out_data, in_data, count, coef, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + break; + // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) case Active_tanh: - ker_tanh_fwd + ker_tanh_fwd <<>>( out_data, in_data, count, in_shape[0], in_shape[1], in_shape[2], in_shape[3], @@ -264,7 +354,7 @@ SaberStatus SaberActivation::dispatch( \ // stanh : b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} case Active_stanh: - ker_stanh_fwd + ker_stanh_fwd <<>>( out_data, in_data, count, negative_slope, coef, in_shape[0], in_shape[1], in_shape[2], in_shape[3], @@ -276,7 +366,7 @@ SaberStatus SaberActivation::dispatch( \ // x < threshold ? x : threshold case Active_clipped_relu: - ker_clipped_relu_fwd + ker_clipped_relu_fwd <<>>( out_data, in_data, count, coef, in_shape[0], in_shape[1], in_shape[2], in_shape[3], @@ -287,20 +377,29 @@ SaberStatus SaberActivation::dispatch( \ //elu: x > 0 ? x : coef * (exp(x) - 1) case Active_elu: - ker_elu_fwd + ker_elu_fwd <<>>( out_data, in_data, count, coef, in_shape[0], in_shape[1], in_shape[2], in_shape[3], stride_in[0], stride_in[1], stride_in[2], stride_in[3], stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; + //gelu: x * 0.5(erf(x/sqrt(2)) + 1) + case Active_gelu: + ker_gelu_fwd + <<>>( + out_data, in_data, count, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + stride_in[0], stride_in[1], stride_in[2], stride_in[3], + stride_out[0], stride_out[1], stride_out[2], stride_out[3]); + break; //prelu: x > 0 ? x : slope[c] * x case Active_prelu: auto prelu_param = param.prelu_param; - const OpDataType* slope_ptr = (const OpDataType*)prelu_param.slope->data(); + const float* slope_ptr = (const float*)prelu_param.slope->data(); bool shared = prelu_param.channel_shared; - ker_prelu_fwd + ker_prelu_fwd <<>>( out_data, in_data, count, slope_ptr, shared, @@ -309,12 +408,394 @@ SaberStatus SaberActivation::dispatch( \ stride_out[0], stride_out[1], stride_out[2], stride_out[3]); break; } + CUDA_POST_KERNEL_CHECK; + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; +} + +// =================================int8 ================== +class ReluDev{ +public: + static __device__ float run(float in, float negative_slope, float placeholder) { + return (in > 0.f) ? 
in : in * negative_slope; + } +}; +class SigmoidDev{ +public: + static __device__ float run(float in, float placeholder1, float placeholder2) { + return float( float(1) / (float(1)+ exp(-in))); + } +}; + +template +__global__ +void ker_act_fwd_fp32_to_int8(char* out_data, const float* in_data, + int in_num, int in_channel_4, int in_height, int in_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + const float negtive_slope, const float coef, float scale, int count) { + + int load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int write_w = (gid) % in_width; + int write_h = (gid / (out_h_stride)) % in_height; + int write_c = (gid / (out_c_stride)) % in_channel_4; + int write_n = (gid / (out_n_stride)) % in_num; + + int in_offset = write_n * in_n_stride + + write_c * in_c_stride * 4 + + write_h * in_h_stride + + write_w * in_w_stride; + + int out_offset = write_n * out_n_stride + + write_c * out_c_stride + + write_h * out_h_stride + + write_w; + + if (gid < count) { + char4 write; + float temp; + temp = in_data[in_offset] * scale; + temp = Op::run(temp, negtive_slope, coef); + load0 = __float2int_rn(temp); + write.x = static_cast(load0); + + in_offset += in_c_stride; + temp = in_data[in_offset] * scale; + temp = Op::run(temp, negtive_slope, coef); + load1 = __float2int_rn(temp); + write.y = static_cast(load1); + + in_offset += in_c_stride; + temp = in_data[in_offset] * scale; + temp = Op::run(temp, negtive_slope, coef); + load2 = __float2int_rn(temp); + write.z = static_cast(load2); + + in_offset += in_c_stride; + temp = in_data[in_offset] * scale; + temp = Op::run(temp, negtive_slope, coef); + load3 = __float2int_rn(temp); + write.w = static_cast(load3); + + ((char4*)out_data)[out_offset] = write; + } +} + +template +__global__ +void ker_act_fwd_int8_to_fp32(float* out_data, const char* in_data, + int in_num, int in_channel_4, int in_height, int in_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + const float negtive_slope, const float coef, const float scale, int count) { + + float load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int read_w = (gid) % in_width; + int read_h = (gid / (in_h_stride)) % in_height; + int read_c = (gid / (in_c_stride)) % in_channel_4; + int read_n = (gid / (in_n_stride)) % in_num; + + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; + + int out_offset = read_n * out_n_stride + + read_c * (out_c_stride << 2) + + read_h * out_h_stride + + read_w * out_w_stride; + + if (gid < count) { + char4 readin = ((const char4*)in_data)[in_offset]; + load0 = static_cast(readin.x) * scale; + load1 = static_cast(readin.y) * scale; + load2 = static_cast(readin.z) * scale; + load3 = static_cast(readin.w) * scale; + load0 = Op::run(load0, negtive_slope, coef); + load1 = Op::run(load1, negtive_slope, coef); + load2 = Op::run(load2, negtive_slope, coef); + load3 = Op::run(load3, negtive_slope, coef); + out_data[out_offset] = load0; out_offset += out_c_stride; + out_data[out_offset] = load1; out_offset += out_c_stride; + out_data[out_offset] = load2; out_offset += out_c_stride; + out_data[out_offset] = load3; + } +} + +__global__ void ker_sigmoid_fwd_int8(char * out_data, + const char* in_data, const int count, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, 
int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + float in_scale = 1.f, float out_scale = 1.f) { + + CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; + + char in_var = in_data[in_idx]; + float in = static_cast(in_var) * in_scale; + in = float( float(1) / (float(1)+ exp(-in))); + in /= out_scale; + out_data[out_idx] = static_cast(in); + } +} + +template <> +SaberStatus SaberActivation::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param, Context& ctx) { + + this->_ctx = &ctx; + if (inputs[0]->get_dtype() == AK_FLOAT) { + Shape in_shape = inputs[0]->valid_shape(); + _int8_input.reshape(in_shape); + _int8_input.set_scale(inputs[0]->get_scale()); + _int8_input.set_layout(Layout_NCHW_C4); + } + return SaberSuccess; +} + +template <> +SaberStatus SaberActivation::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param, Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +__global__ void ker_clipped_relu_fwd_s8s8(char * out_data, + const char* in_data, const int count, float clipped_threadhold, + int in_n, int in_c, int in_h, int in_w, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + float in_scale, float out_scale) { + + CUDA_KERNEL_LOOP(tid, count) { + int w = tid % in_w; + int h = (tid / (in_w)) % in_h; + int c = (tid / (in_h * in_w)) % in_c; + int n = (tid / (in_c * in_h * in_w)) % in_n; + + int in_idx = n * in_n_stride + + c * in_c_stride + + h * in_h_stride + + w * in_w_stride; + + int out_idx = n * out_n_stride + + c * out_c_stride + + h * out_h_stride + + w * out_w_stride; + + char in_var = in_data[in_idx]; + if (in_var < 0) { + out_data[out_idx] = 0; + } else { + float temp = static_cast(in_var) * in_scale; + if (temp > clipped_threadhold) { + temp = clipped_threadhold * in_scale / out_scale; + out_data[out_idx] = static_cast(__float2int_rn(temp)); + } else { + out_data[out_idx] = in_var; + } + } + } +} + +__global__ +void ker_clipped_relu_fwd_s8s8(void* out_data, const void* in_data, const float clipped_threadhold, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + const float scale, const float out_scale, int count) { + + float load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int read_w = (gid) % valid_width; + int read_h = (gid / (in_h_stride)) % valid_height; + int read_c = (gid / (in_c_stride)) % valid_channel_4; + int read_n = (gid / (in_n_stride)) % valid_num; + + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; + + if (gid < count) { + + char4 readin = __ldg(&((const char4*)in_data)[in_offset]); + + load0 = static_cast(readin.x) * scale; + load1 = static_cast(readin.y) * scale; + load2 = static_cast(readin.z) * scale; + load3 = static_cast(readin.w) * scale; + + load0 = load0 > 0 ? load0 : 0; + load0 = load0 < clipped_threadhold? 
load0 : clipped_threadhold; + load1 = load1 > 0 ? load1 : 0; + load1 = load1 < clipped_threadhold? load1 : clipped_threadhold; + load2 = load2 > 0 ? load2 : 0; + load2 = load2 < clipped_threadhold? load2 : clipped_threadhold; + load3 = load3 > 0 ? load3 : 0; + load3 = load3 < clipped_threadhold? load3 : clipped_threadhold; + char4 store; + + store.x = static_cast(__float2int_rn(load0 * out_scale)); + store.y = static_cast(__float2int_rn(load1 * out_scale)); + store.z = static_cast(__float2int_rn(load2 * out_scale)); + store.w = static_cast(__float2int_rn(load3 * out_scale)); + + ((char4*)out_data)[in_offset] = store; + } +} + +__global__ +void ker_clipped_relu_fwd_s8f32(void* out_data, const void* in_data, + const float clipped_threadhold, + int valid_num, int valid_channel_4, int valid_height, int valid_width, + int in_n_stride, int in_c_stride, int in_h_stride, int in_w_stride, + int out_n_stride, int out_c_stride, int out_h_stride, int out_w_stride, + const float scale, const float out_scale, int count) { + + float load0, load1, load2, load3; + int gid = threadIdx.x + blockIdx.x * blockDim.x; + + int read_w = (gid) % valid_width; + int read_h = (gid / (in_h_stride)) % valid_height; + int read_c = (gid / (in_c_stride)) % valid_channel_4; + int read_n = (gid / (in_n_stride)) % valid_num; + int scale_index = read_c << 2; + + int in_offset = read_n * in_n_stride + + read_c * in_c_stride + + read_h * in_h_stride + + read_w; + + int out_offset = read_n * out_n_stride + + read_c * (out_c_stride << 2) + + read_h * out_h_stride + + read_w * out_w_stride; + + if (gid < count) { + + char4 readin = __ldg(&((const char4*)in_data)[in_offset]); + + load0 = static_cast(readin.x) * scale; + load1 = static_cast(readin.y) * scale; + load2 = static_cast(readin.z) * scale; + load3 = static_cast(readin.w) * scale; + load0 = load0 > 0 ? load0 : 0; + load0 = load0 < clipped_threadhold? load0 : clipped_threadhold; + load1 = load1 > 0 ? load1 : 0; + load1 = load1 < clipped_threadhold? load1 : clipped_threadhold; + load2 = load2 > 0 ? load2 : 0; + load2 = load2 < clipped_threadhold? load2 : clipped_threadhold; + load3 = load3 > 0 ? load3 : 0; + load3 = load3 < clipped_threadhold? 
load3 : clipped_threadhold; + ((float*)out_data)[out_offset] = load0; out_offset += out_c_stride; + ((float*)out_data)[out_offset] = load1; out_offset += out_c_stride; + ((float*)out_data)[out_offset] = load2; out_offset += out_c_stride; + ((float*)out_data)[out_offset] = load3; + } +} + +template <> +SaberStatus SaberActivation::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param) { + + const void *in_data = inputs[0]->data(); + void *out_data = outputs[0]->mutable_data(); + + const int count = inputs[0]->valid_size(); + int in_c_4 = inputs[0]->channel() / 4; + int out_c_4 = outputs[0]->channel() / 4; + +// float negative_slope = param.negative_slope; + float coef = param.coef; + + float in_scale = inputs[0]->get_scale()[0]; + float out_scale = 1.f / outputs[0]->get_scale()[0]; + + Shape out_stride = outputs[0]->get_stride(); + Shape in_shape = inputs[0]->valid_shape(); + Shape out_shape = outputs[0]->valid_shape(); +// int count = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3]; + + cudaStream_t cuda_stream = _ctx->get_compute_stream(); + + if (inputs[0]->get_dtype() == AK_FLOAT) { + conv_calibrate_fp32_int8_c4(_int8_input, *inputs[0], in_scale, *(this->_ctx)); + in_data = _int8_input.data(); + } else { + in_data = inputs[0]->data(); + } + + if (outputs[0]->get_dtype() == AK_INT8) { + switch (param.active) { + case Active_clipped_relu: + ker_clipped_relu_fwd_s8s8 + <<>>( + out_data, in_data, coef, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3], + in_shape[2] * in_shape[3], + in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], + in_scale, out_scale, count); + break; + default: + LOG(FATAL) << "Not implement this activation in this data config" << param.active; + break; + } + } else if (outputs[0]->get_dtype() == AK_FLOAT) { + switch (param.active) { + case Active_clipped_relu: + ker_clipped_relu_fwd_s8f32 + <<>>( + out_data, in_data, coef, + in_shape[0], in_shape[1], in_shape[2], in_shape[3], + in_shape[1] * in_shape[2] * in_shape[3], + in_shape[2] * in_shape[3], + in_shape[3], 1, + out_stride[0], out_stride[1], out_stride[2], out_stride[3], + in_scale, out_scale, count); + break; + default: + LOG(FATAL) << "Not implement this activation in this data config" << param.active; + break; + } + } else { + LOG(FATAL) << "not supported yet!!!"; + } + CUDA_POST_KERNEL_CHECK; return SaberSuccess; } template class SaberActivation; -DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, NV, AK_INT8); +template class SaberActivation; DEFINE_OP_TEMPLATE(SaberActivation, ActivationParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_affine_channel.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_affine_channel.cu index c5d1db212..5821b2fd5 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_affine_channel.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_affine_channel.cu @@ -27,8 +27,8 @@ SaberStatus SaberAffineChannel::dispatch(\ AffineChannelParam& param) { const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); - const OpDataType* scale_data = (const OpDataType*)inputs[1]->data(); - const OpDataType* bias_data = (const OpDataType*)inputs[2]->data(); + const OpDataType* scale_data = (const OpDataType*)param.weight()->data(); + const OpDataType* bias_data = (const OpDataType*)param.bias()->data(); OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); int count = 
outputs[0]->valid_size(); @@ -36,8 +36,8 @@ SaberStatus SaberAffineChannel::dispatch(\ int outer_num = inputs[0]->count_valid(0, channel_idx); int channel = inputs[0]->channel(); int inner_num = inputs[0]->count_valid(channel_idx+1, inputs[0]->dims()); - CHECK_EQ(inputs[1]->valid_size(), channel) << "affine channel input scale dims are not valid"; - CHECK_EQ(inputs[2]->valid_size(), channel) << "affine channel input bias dims are not valid"; + CHECK_EQ(param.weight()->valid_size(), channel) << "affine channel input scale dims are not valid"; + CHECK_EQ(param.bias()->valid_size(), channel) << "affine channel input bias dims are not valid"; if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { ker_affine_channel_fwd\ diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_anchor_generator.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_anchor_generator.cu new file mode 100644 index 000000000..7ba84a000 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_anchor_generator.cu @@ -0,0 +1,107 @@ +#include "saber/funcs/impl/cuda/saber_anchor_generator.h" +#include "saber/core/tensor_op.h" +#include "cuda_fp16.h" + +namespace anakin { +namespace saber { + +template +__global__ void ker_anchor_generator_fwd(Dtype * out_data, \ + Dtype* var_data, + const Dtype* in_data, + const int in_h, + const int in_w, + const float* anchor_sizes_data, + const int anchor_sizes_size, + const float* aspect_ratios_data, + const int aspect_ratios_size, + const int num_anchors, + const int stride_h, + const int stride_w, + const float var_0, + const float var_1, + const float var_2, + const float var_3, + const float offset, + const int count) +{ + CUDA_KERNEL_LOOP(tid, count){ + int h_id = tid / (num_anchors * in_w); + int w_id = (tid / num_anchors) % in_w; + int anchor_sizes_id = (tid % anchor_sizes_size); + int aspect_id = (tid / anchor_sizes_size) % aspect_ratios_size; + Dtype x_ctr = w_id * stride_w + offset * (stride_w - 1); + Dtype y_ctr = h_id * stride_h + offset * (stride_h - 1); + float anchor_size = anchor_sizes_data[anchor_sizes_id]; + float ar = aspect_ratios_data[aspect_id]; + Dtype area = stride_w * stride_h; + Dtype area_ratios = area / ar; + Dtype base_w = round(sqrt(area_ratios)); + Dtype base_h = round(base_w * ar); + Dtype scale_w = anchor_size / stride_w; + Dtype scale_h = anchor_size / stride_h; + Dtype half_width = 0.5 * (scale_w * base_w - 1); + Dtype half_height = 0.5 * (scale_h * base_h - 1); + Dtype* out_tmp = out_data + tid * 4; + Dtype* var_tmp = var_data + tid * 4; + out_tmp[0] = x_ctr - half_width; + out_tmp[1] = y_ctr - half_height; + out_tmp[2] = x_ctr + half_width; + out_tmp[3] = y_ctr + half_height; + var_tmp[0] = var_0; + var_tmp[1] = var_1; + var_tmp[2] = var_2; + var_tmp[3] = var_3; + } +} + +template +SaberStatus SaberAnchorGenerator::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + AnchorGeneratorParam& param) { + + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); + OpDataType* var_data = (OpDataType*)outputs[1]->mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + const float* anchor_sizes_data = (const float*)_anchor_sizes.data(); + const float* aspect_ratios_data = (const float*)_aspect_ratios.data(); + + + int in_n = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int num_anchors = param.aspect_ratios.size() * param.anchor_sizes.size(); + int stride_h 
= param.stride[1]; + int stride_w = param.stride[0]; + float offset = param.offset; + int count = in_h * in_w * num_anchors; + + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { + ker_anchor_generator_fwd\ + <<>>(\ + out_data, var_data, in_data, \ + in_h, in_w, \ + anchor_sizes_data, + param.anchor_sizes.size(), \ + aspect_ratios_data, + param.aspect_ratios.size(), + num_anchors, + stride_h, stride_w, + param.variances[0], + param.variances[1], + param.variances[2], + param.variances[3], + offset, + count); + } + + return SaberSuccess; +} + +DEFINE_OP_TEMPLATE(SaberAnchorGenerator, AnchorGeneratorParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberAnchorGenerator, AnchorGeneratorParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu index 8f24c66d4..3d7456a88 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_argmax.cu @@ -180,7 +180,7 @@ __global__ void block_top1(const Dtype* in_data, volatile Dtype *vmax = share_data; volatile Dtype *vindex = share_index; if (blockSize >= 64) { - int index2 = index + 64; + int index2 = index + 32; if (vmax[index2] > vmax[index]) { vmax[index] = vmax[index2]; vindex[index] = vindex[index2]; @@ -294,7 +294,7 @@ __global__ void top1(const Dtype* in_data, volatile Dtype *vmax = share_data; volatile Dtype *vindex = share_index; if (blockSize >= 64) { - int index2 = index + 64; + int index2 = index + 32; if (vmax[index2] > vmax[index]) { vmax[index] = vmax[index2]; vindex[index] = vindex[index2]; diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_arithmetic.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_arithmetic.cu new file mode 100644 index 000000000..a181833b5 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_arithmetic.cu @@ -0,0 +1,186 @@ +#include "saber/funcs/impl/cuda/saber_arithmetic.h" +#include "saber/core/tensor_op.h" +#include "saber/core/target_wrapper.h" + +namespace anakin{ +namespace saber{ + +template +__global__ void ker_arithmetic_sum_fwd(Dtype * out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id+1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = in_data_0[tid] + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_sub_fwd(Dtype * out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id+1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = in_data_0[tid] - in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_mul_fwd(Dtype * 
out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id+1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = in_data_0[tid] * in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + + + +template <> +SaberStatus SaberArithmetic::create( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam& param, Context& ctx) { + + this->_ctx = &ctx; + return SaberSuccess; +} + +template <> +SaberStatus SaberArithmetic::init( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam& param, Context& ctx) { + + this->_ctx = &ctx; + Shape shape({inputs[0]->num(), 1, 1, 1}, Layout_NCHW); + word_id_to_seq_id.re_alloc(shape, AK_INT32); + + int offset_size = inputs[0]->get_seq_offset()[0].size(); + Shape offset_shape(std::vector{offset_size, 1, 1, 1}, Layout_NCHW); + offset_tensor_0.re_alloc(offset_shape, AK_INT32); + offset_tensor_1.re_alloc(offset_shape, AK_INT32); + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberArithmetic::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam& param) { + + const float *in_data_0 = (const float*)inputs[0]->data(); + const float *in_data_1 = (const float*)inputs[1]->data(); + float *out_data = (float*)outputs[0]->mutable_data(); + + const int inner_size = inputs[0]->valid_size() / inputs[0]->num(); + const int count = inputs[0]->valid_size(); + + Shape shape({inputs[0]->num(), 1, 1, 1}, Layout_NCHW); + word_id_to_seq_id.reshape(shape); + + auto offset_0 = inputs[0]->get_seq_offset()[0]; + auto offset_1 = inputs[1]->get_seq_offset()[0]; + std::vector word_seq_map; + for (int i = 0; i < offset_0.size() - 1; i++) { + for (int j = offset_0[i]; j < offset_0[i+1]; j++) { + word_seq_map.push_back(i); + } + } + + int seq_num = offset_0.size() - 1; + Shape offset_shape({seq_num + 1, 1, 1, 1}, Layout_NCHW); + offset_tensor_0.reshape(offset_shape); + offset_tensor_1.reshape(offset_shape); + auto offset_data_0 = (int*)offset_tensor_0.mutable_data(); + auto offset_data_1 = (int*)offset_tensor_1.mutable_data(); + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + int* gpu_map_data = (int *)word_id_to_seq_id.mutable_data(); + + cudaMemcpyAsync(gpu_map_data, &word_seq_map[0], sizeof(int) * word_seq_map.size(), cudaMemcpyHostToDevice,cuda_stream); + + cudaMemcpyAsync(offset_data_0, &offset_0[0], sizeof(int) * offset_0.size(), cudaMemcpyHostToDevice, cuda_stream); + + cudaMemcpyAsync(offset_data_1, &offset_1[0], sizeof(int) * offset_1.size(), cudaMemcpyHostToDevice, cuda_stream); + + switch (param.op_type) { + //out[0] = input_0[0] + input_1[0] + case SUM: + + ker_arithmetic_sum_fwd + <<>>( + out_data, in_data_0, in_data_1, offset_data_0, offset_data_1, + gpu_map_data, seq_num, inner_size, count); + break; + + //out[0] = input_0[0] - input_1[0] + case SUB: + ker_arithmetic_sub_fwd + <<>>( + out_data, in_data_0, in_data_1, offset_data_0, offset_data_1, + gpu_map_data, seq_num, inner_size, count); + break; + + //out[0] = input_0[0] * input_1[0] + case MUL: + ker_arithmetic_mul_fwd + <<>>( + 
out_data, in_data_0, in_data_1, offset_data_0, offset_data_1, + gpu_map_data, seq_num, inner_size, count); + break; + + } + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + +template class SaberArithmetic; +DEFINE_OP_TEMPLATE(SaberArithmetic, ArithmeticParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberArithmetic, ArithmeticParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_attention_padding_mask.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_attention_padding_mask.cu new file mode 100644 index 000000000..a1bef4f50 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_attention_padding_mask.cu @@ -0,0 +1,95 @@ +#include "saber/funcs/impl/cuda/saber_attention_padding_mask.h" +#include "saber/core/tensor_op.h" +#define BUILD_DEV __device__ + +namespace anakin{ +namespace saber{ + +template +__global__ void ker_attention_padding_mask_fwd(Dtype * out_data, + const Dtype* attn_data, + const int* src_offset, + const int attn_seq_num, + const int attn_seq_len, + const int src_seq_num, + const int src_seq_len, + const Dtype mask, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int src_word_id = tid % src_seq_len; + int tmp_tid = tid / src_seq_len; + int attn_seq_id = tmp_tid / attn_seq_len; + int attn_word_id = tmp_tid % attn_seq_len; + int src_seq_id = attn_seq_id % src_seq_num; + int cur_len = src_offset[src_seq_id+1] - src_offset[src_seq_id]; + if (src_word_id >= cur_len) { + out_data[tid] = mask; + } else { + out_data[tid] = attn_data[tid]; + } + } +} + +template +SaberStatus SaberAttentionPaddingMask::create( \ + const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam& param, Context& ctx) { + + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberAttentionPaddingMask::init( \ + const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam& param, Context& ctx) { + _src_offset.set_dtype(AK_INT32); + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberAttentionPaddingMask::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam& param) { + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + const OpDataType *attn_data = (const OpDataType*)inputs[0]->data(); + const OpDataType *src_data = (const OpDataType*)inputs[1]->data(); + OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); + + const int count = outputs[0]->valid_size(); + int attn_seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + int attn_seq_len = inputs[0]->get_seq_offset()[0][1]; + int src_seq_len = inputs[0]->count_valid(1, inputs[0]->dims()); + auto src_offset = inputs[1]->get_seq_offset()[0]; + int src_seq_num = src_offset.size() - 1; + + _src_offset.reshape(Shape({src_seq_num+1, 1, 1, 1}, Layout_NCHW)); + int* src_offset_data = (int*)_src_offset.mutable_data(); + cudaMemcpyAsync(src_offset_data, &src_offset[0], sizeof(int) * (src_seq_num+1), cudaMemcpyHostToDevice, cuda_stream); + + ker_attention_padding_mask_fwd<<>>(out_data, + attn_data, + src_offset_data, + attn_seq_num, + attn_seq_len, + src_seq_num, + src_seq_len, + param.mask, + count); + + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + + +template class SaberAttentionPaddingMask; +DEFINE_OP_TEMPLATE(SaberAttentionPaddingMask, AttentionPaddingMaskParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberAttentionPaddingMask, AttentionPaddingMaskParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_box_clip.cu 
b/saber/funcs/impl/cuda/base/cuda_c/saber_box_clip.cu new file mode 100644 index 000000000..208555083 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_box_clip.cu @@ -0,0 +1,73 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "saber/funcs/impl/cuda/saber_box_clip.h" +#include "saber/funcs/saber_util.h" +#include "tensor_op.h" +#include "debug.h" +namespace anakin { + +namespace saber { + +static constexpr int ImInfoSize = 3; + +template +static __global__ void GPUBoxClip(const Dtype* input, const int* lod, + const int width, const Dtype* im_info, + Dtype* output) { + Dtype im_w = round(im_info[blockIdx.x * ImInfoSize + 1] / + im_info[blockIdx.x * ImInfoSize + 2]); + Dtype im_h = round(im_info[blockIdx.x * ImInfoSize] / + im_info[blockIdx.x * ImInfoSize + 2]); + + for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width; + i += BlockSize) { + int idx = lod[blockIdx.x] * width + i; + Dtype im_size = (idx % 2 == 0) ? im_w : im_h; + output[idx] = max(min(input[idx], im_size - 1), Dtype(0.)); + } +} + +template +SaberStatus SaberBoxClip::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, EmptyParam& param) { + static constexpr int im_info_size = 3; + static constexpr int box_info_size = 4; + auto seq_offset = inputs[1]->get_seq_offset(); + CHECK_EQ(inputs.size(), 2) << "need two input"; + CHECK_EQ(seq_offset.size(), 1) << "need offset to cal batch"; + CHECK_GT(seq_offset[0].size(), 1) << "need offset to cal batch"; + auto offset = seq_offset[0]; + auto img = inputs[1]; + auto im_info = inputs[0]; + const float* im_info_ptr = static_cast(im_info->data()); + float* box_ptr = static_cast(img->data()); + int batch_size = offset.size() - 1; + CHECK_EQ(batch_size * im_info_size, im_info->valid_size()) << "im_info should be valid"; + utils::try_expand_tensor(cuda_seq_offset, offset.size()); + CUDA_CHECK(cudaMemcpyAsync(cuda_seq_offset.data(), offset.data(), sizeof(int)*offset.size(), + cudaMemcpyHostToDevice, this->_ctx->get_compute_stream())); + GPUBoxClip <<< batch_size, 256, 0, this->_ctx->get_compute_stream() >>> ( + static_cast(img->data()), static_cast(cuda_seq_offset.data()), + box_info_size, static_cast(im_info->data()), static_cast(outputs[0]->data())); + return SaberSuccess; +} + +template class SaberBoxClip; +DEFINE_OP_TEMPLATE(SaberBoxClip, EmptyParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberBoxClip, EmptyParam, NV, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_box_coder.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_box_coder.cu new file mode 100644 index 000000000..60d22c60a --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_box_coder.cu @@ -0,0 +1,152 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "saber/funcs/impl/cuda/saber_box_coder.h" + +namespace anakin { + +namespace saber { + +enum BOX_CODER_VAR { + FIX_SIZE_VAR = 0, + NO_VAR = 1, + FROM_INPUT_VAR = 2 +}; + +template +__global__ void decode_center_size_kernel( + const float* prior_box_data, const float* prior_box_var_data, + const float* target_box_data, const int row, const int col, const int len, + const int axis, float* output, float nomalized) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + int prior_box_offset = 0; + int out_len = 4; + int var_len = 4; + int delta_len = 4; + int anchor_len = len; + + if (idx < row * col) { + const int col_idx = idx % col; + const int row_idx = idx / col; + prior_box_offset = axis == 0 ? col_idx * anchor_len : row_idx * anchor_len; + prior_box_offset += 1; + float prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + nomalized; + float prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + nomalized; + float prior_box_center_x = + prior_box_data[prior_box_offset] + prior_box_width * 0.5; + float prior_box_center_y = + prior_box_data[prior_box_offset + 1] + prior_box_height * 0.5; + + float box_var_x = 1.f; + float box_var_y = 1.f; + float box_var_w = 1.f; + float box_var_h = 1.f; + + if (fix_size_var == FROM_INPUT_VAR) { + int prior_var_offset = axis == 0 ? 
col_idx * var_len : row_idx * var_len; + box_var_x = prior_box_var_data[prior_var_offset]; + box_var_y = prior_box_var_data[prior_var_offset + 1]; + box_var_w = prior_box_var_data[prior_var_offset + 2]; + box_var_h = prior_box_var_data[prior_var_offset + 3]; + } else if (fix_size_var == FIX_SIZE_VAR) { + box_var_x = prior_box_var_data[0]; + box_var_y = prior_box_var_data[1]; + box_var_w = prior_box_var_data[2]; + box_var_h = prior_box_var_data[3]; + } + + float target_box_width = + exp(box_var_w * target_box_data[idx * delta_len + 2]) * prior_box_width; + float target_box_height = + exp(box_var_h * target_box_data[idx * delta_len + 3]) * prior_box_height; + float target_box_center_x = + box_var_x * target_box_data[idx * delta_len] * prior_box_width + + prior_box_center_x; + float target_box_center_y = + box_var_y * target_box_data[idx * delta_len + 1] * prior_box_height + + prior_box_center_y; + + output[idx * out_len] = target_box_center_x - target_box_width / 2; + output[idx * out_len + 1] = target_box_center_y - target_box_height / 2; + output[idx * out_len + 2] = + target_box_center_x + target_box_width / 2 - nomalized; + output[idx * out_len + 3] = + target_box_center_y + target_box_height / 2 - nomalized; + } +} + +template +static inline void box_coder(Tensor* proposals, + const Tensor* anchors, + const Tensor* bbox_deltas, + const Tensor* variances, + BoxCoderParam& param, + cudaStream_t stream + ) { + const size_t row = bbox_deltas->num(); + const size_t col = bbox_deltas->channel(); + const size_t anchor_nums = row * col; + const size_t len = anchors->valid_shape()[1]; + CHECK_EQ(len, 5) << "anchor length is 5"; + const float* anchor_data = (const float*) anchors->data(); + const float* bbox_deltas_data = (const float*) bbox_deltas->data(); + float* proposals_data = (float*) proposals->data(); + const float* variances_data = nullptr; + float normalized = !param.box_normalized ? 
1.f : 0; + + if (variances) { + variances_data = (const float*)variances->data(); + } + + int block = 512; + int grid = (row * col + block - 1) / block; + + decode_center_size_kernel <<< grid, block, 0, stream>>>(anchor_data, variances_data, + bbox_deltas_data, + row, col, len, param.axis, proposals_data, normalized); +}; + +template +SaberStatus SaberBoxCoder::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, BoxCoderParam& param) { + Tensor* anchor = inputs[0]; + Tensor* delta = inputs[1]; + Tensor* variances = nullptr; + Tensor* proposal = outputs[0]; + + if (param.variance() != nullptr && param.variance()->valid_size() > 0) { + variances = param.variance(); + CHECK(variances->valid_size() == 4); + box_coder(proposal, anchor, delta, variances, param, + this->_ctx ->get_compute_stream()); + } else if (inputs.size() >= 3) { + variances = inputs[2]; + box_coder(proposal, anchor, delta, variances, param, + this->_ctx ->get_compute_stream()); + } else { + box_coder(proposal, anchor, delta, variances, param, this->_ctx ->get_compute_stream()); + } + + return SaberSuccess; +} + +template class SaberBoxCoder; +DEFINE_OP_TEMPLATE(SaberBoxCoder, BoxCoderParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberBoxCoder, BoxCoderParam, NV, AK_INT8); +} //namespace anakin + +} //name diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu index fa599fdef..979dbbb4c 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_concat.cu @@ -1,4 +1,6 @@ #include "saber/funcs/impl/cuda/saber_concat.h" +#include "saber/funcs/impl/cuda/reorder.h" +#include "saber/funcs/calibrate.h" namespace anakin{ @@ -17,7 +19,6 @@ __global__ void concat_impl_cuda(const int nthreads, const dtype* in_data, const int concat_index = index % total_concat_size; const int top_index = concat_index + (concat_num * top_concat_axis + offset_concat_axis) * concat_size; - out_data[top_index] = in_data[index]; } } @@ -37,9 +38,28 @@ __global__ void concat_impl_2d_impl(const int inner_size, const int num_concats, concat_size + idx_inner; out_data[idx_output] = in_data[idx_input]; } +} +template <> +SaberStatus SaberConcat::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam& param, + Context& ctx) { + + _num_concats = inputs[0]->count_valid(0, param.axis); + _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); + return SaberSuccess; } +template <> +SaberStatus SaberConcat::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam& param, + Context &ctx) { + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} template <> SaberStatus SaberConcat::dispatch(const std::vector *>& inputs, @@ -70,7 +90,7 @@ SaberStatus SaberConcat::dispatch(const std::vector *>& const int nthreads = in_concat_size * _num_concats; float ratio = (float)in_concat_size / _num_concats; bool is_balance = (ratio > 0.1 && ratio < 10); - if (is_balance){ + if (is_balance) { int block_x = BLOCK_SIZE; int block_y = BLOCK_SIZE; int grid_x = (in_concat_size + block_x - 1) / block_x; @@ -91,7 +111,7 @@ SaberStatus SaberConcat::dispatch(const std::vector *>& } } else { //! 
inputs or outputs memory is not continuous Shape offset_out = outputs[0]->offset(); - Tensor tsub; + Tensor tsub; for (int i = 0; i < input_size; ++i) { Shape in_shape = inputs[i]->valid_shape(); tsub.share_sub_buffer(*outputs[0], in_shape, offset_out); @@ -99,11 +119,135 @@ SaberStatus SaberConcat::dispatch(const std::vector *>& tsub.async_copy_from(*inputs[i], stream); } } + return SaberSuccess; +} + +template <> +SaberStatus SaberConcat::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam& param, + Context& ctx) { + + _num_concats = inputs[0]->count_valid(0, param.axis); + _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); + _input_v.resize(inputs.size()); + for (int i = 0; i < inputs.size(); ++i) { + + if (inputs[i]->get_dtype() == AK_FLOAT) { + _input_v[i].re_alloc(inputs[i]->valid_shape(), AK_INT8); + } else if (inputs[i]->get_dtype() == AK_INT8 && inputs[i]->get_layout() == Layout_NCHW_C4) { + Shape new_shape = Shape({inputs[i]->num(), inputs[i]->channel(), + inputs[i]->height(), inputs[i]->width()}, Layout_NCHW); + _input_v[i].re_alloc(new_shape, AK_INT8); + } else if (inputs[i]->get_dtype() == AK_INT8 && inputs[i]->get_layout() == Layout_NCHW) { + // good, nothing to do + } else { + LOG(FATAL) << "Not support this situation, pls contact the r&d."; + } + } + + if (outputs[0]->get_dtype() == AK_FLOAT) { + _output.re_alloc(outputs[0]->valid_shape(), AK_INT8); + _output.set_scale(outputs[0]->get_scale()); + } else if (outputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_layout() == Layout_NCHW_C4) { + Shape new_shape = outputs[0]->valid_shape(); + new_shape.set_layout(Layout_NCHW); + _output.re_alloc(new_shape, AK_INT8); + _output.set_scale(outputs[0]->get_scale()); + } else if (outputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_layout() == Layout_NCHW) { + // good, nothing to do. + } else { + LOG(FATAL) << "Not support this situation, pls contact the r&d."; + } + return SaberSuccess; +} + +template <> +SaberStatus SaberConcat::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConcatParam& param, + Context &ctx) { + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberConcat::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, ConcatParam& param) { + + cudaStream_t stream = this->_ctx->get_compute_stream(); + int input_size = inputs.size(); + //! get output data, valid shape and stride shape + char* out_data = nullptr; + + if (outputs[0]->get_dtype() == AK_FLOAT) { + out_data = (char*)_output.mutable_data(); + } else if (outputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_layout() == Layout_NCHW_C4) { + out_data = (char*)_output.mutable_data(); + } else if (outputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_layout() == Layout_NCHW) { + out_data = (char*)outputs[0]->mutable_data(); + } else { + LOG(FATAL) << "Not support this situation, pls contact the r&d."; + } + int offset_concat_axis = 0; + Shape out_shape = outputs[0]->valid_shape(); + const int out_concat_axis = out_shape[param.axis]; + + //! 
inputs and outputs are all with continuous memory + for (int i = 0; i < input_size; ++i) { + Shape in_shape = inputs[i]->valid_shape(); + //std::vector bottom_shape = {tmp[3], tmp[2], tmp[1], tmp[0]}; + const char* in_data = nullptr; + if (inputs[i]->get_dtype() == AK_FLOAT) { + flatten_calibrate (_input_v[i], *inputs[i], *_ctx); + in_data = (char*)_input_v[i].mutable_data(); + } else if (inputs[i]->get_dtype() == AK_INT8 && inputs[i]->get_layout() == Layout_NCHW_C4) { + convert_nchwc4_to_nchw(_input_v[i], *inputs[i], *_ctx); + in_data = (char*)_input_v[i].mutable_data(); + } else if (inputs[i]->get_dtype() == AK_INT8 && inputs[i]->get_layout() == Layout_NCHW) { + in_data = (char*)inputs[i]->mutable_data(); + } else { + LOG(FATAL) << "Not support this situation, pls contact the r&d."; + } + const int in_concat_axis = in_shape[param.axis]; + const int in_concat_size = in_concat_axis * _concat_input_size; + const int nthreads = in_concat_size * _num_concats; + float ratio = (float)in_concat_size / _num_concats; + bool is_balance = (ratio > 0.1 && ratio < 10); + if (is_balance) { + int block_x = BLOCK_SIZE; + int block_y = BLOCK_SIZE; + int grid_x = (in_concat_size + block_x - 1) / block_x; + int grid_y = (_num_concats + block_y - 1) / block_y; + dim3 block(block_x, block_y); + dim3 grid(grid_x, grid_y); + concat_impl_2d_impl<<>>( + in_concat_size, _num_concats, in_data, _concat_input_size, + out_concat_axis, offset_concat_axis, out_data); + } else { + // NOLINT_NEXT_LINE(whitespace/operators) + concat_impl_cuda<<>>( + nthreads, in_data, _num_concats, _concat_input_size, + out_concat_axis, in_concat_axis, offset_concat_axis, out_data); + } + offset_concat_axis += in_concat_axis; + } + if (outputs[0]->get_dtype() == AK_FLOAT) { + flatten_calibrate(*outputs[0], _output, *_ctx); + } else if (outputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_layout() == Layout_NCHW_C4) { + convert_nchw_to_nchwc4(*outputs[0], _output, *_ctx); + } else if (outputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_layout() == Layout_NCHW) { + // good, nothing to be done; + } else { + LOG(FATAL) << "Not support this situation, pls contact the r&d."; + } return SaberSuccess; } -DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, NV, AK_INT8); + DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, NV, AK_HALF); + } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_cos_sim.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_cos_sim.cu new file mode 100644 index 000000000..9a1743a0d --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_cos_sim.cu @@ -0,0 +1,151 @@ +#include "saber/funcs/impl/cuda/saber_cos_sim.h" +#include "cuda_fp16.h" + +namespace anakin{ +namespace saber{ + +template +__global__ void ker_cos_sim_fwd(Dtype * out_data, + const Dtype* in_0, + const Dtype* in_1, + const int num, + const int len, + const float epsilon) { + int block_idx = blockIdx.x; + int thread_idx = threadIdx.x; + extern __shared__ Dtype share_mem[]; + Dtype* aa_sum = share_mem; + Dtype* bb_sum = share_mem + blockDim.x; + Dtype* ab_sum = bb_sum + blockDim.x; + aa_sum[thread_idx] = 0; + bb_sum[thread_idx] = 0; + ab_sum [thread_idx] = 0; + const Dtype* in_0_tmp = in_0 + block_idx * len; + const Dtype* in_1_tmp = in_1 + block_idx * len; + for (int i = thread_idx; i < len; i += blockDim.x) { + aa_sum[thread_idx] += in_0_tmp[i] * in_0_tmp[i]; + bb_sum[thread_idx] += in_1_tmp[i] * in_1_tmp[i]; + ab_sum[thread_idx] += in_0_tmp[i] * in_1_tmp[i]; + } + __syncthreads(); + if (blockDim.x >= 512) { + if (thread_idx < 256) 
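+            // fold the partial sums of the upper 256 threads (x*x, y*y and x*y accumulators)
+            // into the lower 256; the same halving pattern repeats at each level below,
+            // ending in the volatile warp-level tail once fewer than 32 threads remain.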
{ + int index = thread_idx + 256; + aa_sum[thread_idx] += aa_sum[index]; + bb_sum[thread_idx] += bb_sum[index]; + ab_sum[thread_idx] += ab_sum[index]; + } + __syncthreads(); + } + if (blockDim.x >= 256) { + if (thread_idx < 128) { + int index = thread_idx + 128; + aa_sum[thread_idx] += aa_sum[index]; + bb_sum[thread_idx] += bb_sum[index]; + ab_sum[thread_idx] += ab_sum[index]; + } + __syncthreads(); + } + if (blockDim.x >= 128) { + if (thread_idx < 64) { + int index = thread_idx + 64; + aa_sum[thread_idx] += aa_sum[index]; + bb_sum[thread_idx] += bb_sum[index]; + ab_sum[thread_idx] += ab_sum[index]; + } + __syncthreads(); + } + if (blockDim.x >= 64) { + if (thread_idx < 32) { + int index = thread_idx + 32; + aa_sum[thread_idx] += aa_sum[index]; + bb_sum[thread_idx] += bb_sum[index]; + ab_sum[thread_idx] += ab_sum[index]; + } + __syncthreads(); + } + if (blockDim.x >= 32) { + volatile Dtype *vaa_sum = aa_sum; + volatile Dtype *vbb_sum= bb_sum; + volatile Dtype *vab_sum= ab_sum; + if (thread_idx < 16) { + int index = thread_idx + 16; + vaa_sum[thread_idx] += vaa_sum[index]; + vbb_sum[thread_idx] += vbb_sum[index]; + vab_sum[thread_idx] += vab_sum[index]; + } + if (thread_idx < 8) { + int index = thread_idx + 8; + vaa_sum[thread_idx] += vaa_sum[index]; + vbb_sum[thread_idx] += vbb_sum[index]; + vab_sum[thread_idx] += vab_sum[index]; + } + if (thread_idx < 4) { + int index = thread_idx + 4; + vaa_sum[thread_idx] += vaa_sum[index]; + vbb_sum[thread_idx] += vbb_sum[index]; + vab_sum[thread_idx] += vab_sum[index]; + } + if (thread_idx < 4) { + int index = thread_idx + 2; + vaa_sum[thread_idx] += vaa_sum[index]; + vbb_sum[thread_idx] += vbb_sum[index]; + vab_sum[thread_idx] += vab_sum[index]; + } + if (thread_idx < 2) { + int index = thread_idx + 1; + vaa_sum[thread_idx] += vaa_sum[index]; + vbb_sum[thread_idx] += vbb_sum[index]; + vab_sum[thread_idx] += vab_sum[index]; + } + } + if (thread_idx == 0) { + auto c = aa_sum[0] * bb_sum[0]; + if (c < epsilon) { + out_data[block_idx] = 0; + } else { + out_data[block_idx] = ab_sum[0] / sqrt(c); + } + } +} + +template +SaberStatus SaberCosSim::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam& param) { + + CHECK_EQ(inputs.size(), 2) << "CosSim input num need be 2, but is" << inputs.size(); + CHECK_EQ(outputs.size(), 1) << "CosSim input num need be 1, but is" << outputs.size(); + size_t count_0 = inputs[0]->valid_size(); + size_t count_1 = inputs[1]->valid_size(); + CHECK_EQ(count_0, count_1) << "input0 and input1 valid size is not equal"; + + size_t num = inputs[0]->num(); + size_t inner_size = count_0 / inputs[0]->num(); + + const OpDataType *in_0_data = (const OpDataType*)inputs[0]->data(); + const OpDataType *in_1_data = (const OpDataType*)inputs[1]->data(); + + OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + float epsilon = param.epsilon; + + int block_size = exp2(floor(log2(float(inner_size)))); + block_size = std::min(block_size, CUDA_NUM_THREADS); + + ker_cos_sim_fwd + <<>>( + out_data, in_0_data, in_1_data, num, inner_size, epsilon); + + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + +template class SaberCosSim; +DEFINE_OP_TEMPLATE(SaberCosSim, CosSimParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberCosSim, CosSimParam, NV, AK_HALF); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu index ea3aef53d..75bef56f6 100644 --- 
a/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_depthwiseconv_act.cu @@ -1,17 +1,20 @@ //#include "saber/funcs/impl/cuda/saber_conv_act.h" + #include "saber/saber_types.h" #include "saber/core/common.h" +#include namespace anakin{ namespace saber{ -template +template __global__ void depthwise_conv_1d(const int nthreads, - const Dtype* const din, const int num, const int channels, - const int hin, const int win, const int hout, - const int wout, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const dout, const Dtype* const weight, const Dtype* const bias) { + const float* const din, const int num, const int channels, + const int hin, const int win, const int hout, + const int wout, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + float* const dout, const float* const weight, const float* const bias) { + int size_channel_in = hin * win; int size_channel_out = hout * wout; int size_kernel = kernel_h * kernel_w; @@ -22,149 +25,305 @@ __global__ void depthwise_conv_1d(const int nthreads, const int n = index / size_channel_out / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, hin + pad_h); - int wend = min(wstart + kernel_w, win + pad_w); + int hend = hstart + kernel_h; + int wend = wstart + kernel_w; + + int khstart = hstart < 0 ? 0 - hstart : 0; + int kwstart = wstart < 0 ? 0 - wstart : 0; hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, hin); wend = min(wend, win); - Dtype aveval = 0; - const Dtype* const bottom_slice = - din + (n * channels + c) * size_channel_in; - const Dtype* const weight_slice = - weight + c * size_kernel; - - int khstart = hend < kernel_h ? kernel_h - hend : 0; - int kwstart = wend < kernel_w ? 
kernel_w - wend : 0; - + float aveval = 0; + const float* const bottom_slice = din + (n * channels + c) * size_channel_in; + const float* const weight_slice = weight + c * size_kernel; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - aveval += bottom_slice[h * win + w] * weight_slice[(khstart + h - hstart) * kernel_w + (kwstart + w - wstart)]; + aveval += bottom_slice[h * win + w] + * weight_slice[(khstart + h - hstart) * kernel_w + (kwstart + w - wstart)]; } } if (bias_flag) { aveval+=bias[c]; } if (relu_flag) { - aveval = max(aveval, (Dtype)0); + aveval = max(aveval, (float)0); } dout[index] = aveval; } } -template -__global__ void depthwise_conv_2d(const int channel_in_stride, const int channel_out_stride, - const int kernel_size, - const Dtype* const din, const int num, const int channels, - const int hin, const int win, const int hout, - const int wout, const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, const int pad_h, const int pad_w, - Dtype* const dout, const Dtype* const weight, const Dtype* const bias) { - - int w = blockIdx.x * blockDim.x + threadIdx.x; - int h = blockIdx.y * blockDim.y + threadIdx.y; - int c = blockIdx.z % channels; - //int n = blockIdx.z / channels; - int i = blockIdx.z; - int index = i * channel_out_stride + h * wout + w; - - if (w < wout && h < hout) { - int hstart = h * stride_h - pad_h; - int wstart = w * stride_w - pad_w; - int hend = min(hstart + kernel_h, hin + pad_h); - int wend = min(wstart + kernel_w, win + pad_w); +template +SaberStatus saber_depthwise_conv_act(const float* input, float* output, + int num, int cin, int hin, int win, int hout, int wout, + int kw, int kh, int stride_w, int stride_h, int pad_w, int pad_h, + const float* weights, const float* bias, cudaStream_t stream) { + + const int count = num * cin * hout * wout; + if (bias != nullptr) { + depthwise_conv_1d<<>>( + count, input, num, cin, hin, win, hout, wout, kh, + kw, stride_h, stride_w, pad_h, pad_w, + output, weights, bias); + } else { + depthwise_conv_1d<<< CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>> ( + count, input, num, cin, hin, win, hout, wout, kh, + kw, stride_h, stride_w, pad_h, + pad_w, output, weights, nullptr); + } + return SaberSuccess; +} + +#define MASK3 0xff000000 +#define MASK2 0x00ff0000 +#define MASK1 0x0000ff00 +#define MASK0 0x000000ff + +template +__global__ void depthwise_conv_1d_s8_s8(const int nthreads, + const void* din, const int num, const int channels, + const int hin, const int win, const int hout, + const int wout, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + void* dout, const void* weight, const float* bias, float alpha = 1.f) { +#if __CUDA_ARCH__ > 600 + int size_channel_in = hin * win; + int size_channel_out = hout * wout; + int size_kernel = kernel_h * kernel_w; + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % wout; + const int ph = (index / wout) % hout; + const int c = (index / size_channel_out) % channels; + const int n = index / size_channel_out / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = hstart + kernel_h; + int wend = wstart + kernel_w; + + int khstart = hstart < 0 ? 0 - hstart : 0; + int kwstart = wstart < 0 ? 
0 - wstart : 0; hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, hin); wend = min(wend, win); - Dtype aveval = 0; - const Dtype* const bottom_slice = din + i * channel_in_stride; - const Dtype* const weight_slice = weight + c * kernel_size; - int khstart = hend < kernel_h? kernel_h - hend : 0; - int kwstart = wend < kernel_w? kernel_w - wend : 0; + int aveval0 = 0; + int aveval1 = 0; + int aveval2 = 0; + int aveval3 = 0; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - aveval += bottom_slice[ih * win + iw] * weight_slice[(khstart + ih - hstart) * kernel_w + (kwstart + iw - wstart)]; + const int* bottom_slice = ((const int*)din); + bottom_slice += (n * channels + c) * size_channel_in; + const int* weight_slice= (const int*)weight; + weight_slice += c * size_kernel; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int in_data = bottom_slice[h * win + w]; + int weight_data = weight_slice[(khstart + h - hstart) * kernel_w + + (kwstart + w - wstart)]; + + int mask_weight; + mask_weight = MASK0 & weight_data; + aveval0 = __dp4a(in_data, mask_weight, aveval0); + mask_weight = MASK1 & weight_data; + aveval1 = __dp4a(in_data, mask_weight, aveval1); + mask_weight = MASK2 & weight_data; + aveval2 = __dp4a(in_data, mask_weight, aveval2); + mask_weight = MASK3 & weight_data; + aveval3 = __dp4a(in_data, mask_weight, aveval3); } } + float fa0 = static_cast(aveval0); + float fa1 = static_cast(aveval1); + float fa2 = static_cast(aveval2); + float fa3 = static_cast(aveval3); + fa0 *= alpha; + fa1 *= alpha; + fa2 *= alpha; + fa3 *= alpha; if (bias_flag) { - aveval+=bias[c]; + fa0 += bias[4 * c + 0]; + fa1 += bias[4 * c + 1]; + fa2 += bias[4 * c + 2]; + fa3 += bias[4 * c + 3]; } if (relu_flag) { - aveval = max(aveval, (Dtype)0); + fa0 = max(fa0, (float)0); + fa1 = max(fa1, (float)0); + fa2 = max(fa2, (float)0); + fa3 = max(fa3, (float)0); } - dout[index] = aveval; + char4 res = make_char4(static_cast(fa0), + static_cast(fa1), + static_cast(fa2), + static_cast(fa3)); + char4* d = ((char4*)dout); + d[index] = res; } +#endif } -template -SaberStatus saber_depthwise_conv_act(const dtype* input, dtype* output, \ - int num, int cin, int hin, int win, int hout, int wout, \ - int kw, int kh, int stride_w, int stride_h, \ - int pad_w, int pad_h, const dtype* weights, const dtype* bias, \ - cudaStream_t stream) { +template +SaberStatus saber_depthwise_conv_act_s8_s8(const void* input, void* output, + int num, int cin, int hin, int win, int hout, int wout, + int kw, int kh, int stride_w, int stride_h, int pad_w, int pad_h, float alpha, + const void* weights, const float* bias, cudaStream_t stream) { -#define D1 + CHECK_EQ(cin % 4, 0); + int cin_4 = cin / 4; + const int count = num * cin_4 * hout * wout; -#ifdef D1 - const int count = num * cin * hout * wout; -#else - dim3 block(32, 32); - int gx = (wout + block.x - 1) / block.x; - int gy = (hout + block.y - 1) / block.y; - dim3 grid(gx, gy, num * cin); - int channel_in_stride = hin * win; - int channel_out_stride = hout * wout; - int kernel_size = kw * kh; -#endif + if (bias != nullptr) { + depthwise_conv_1d_s8_s8<<>>( + count, input, num, cin_4, hin, win, hout, wout, kh, + kw, stride_h, stride_w, pad_h, pad_w, + output, weights, bias, alpha); + } else { + depthwise_conv_1d_s8_s8<<< CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>> ( + count, input, num, cin_4, hin, win, hout, wout, kh, + kw, stride_h, stride_w, pad_h, + pad_w, output, weights, nullptr, alpha); + } + return 
SaberSuccess; +} - if (bias_flag) { -#ifdef D1 - depthwise_conv_1d<<>>( - count, input, num, cin, hin, win, hout, wout, kh, \ - kw, stride_h, stride_w, pad_h, pad_w, \ - output, weights, bias); -#else - depthwise_conv_2d<<>>( - channel_in_stride, channel_out_stride, kernel_size, \ - input, num, cin, hin, win, hout, wout, kh, \ - kw, stride_h, stride_w, pad_h, pad_w, \ - output, weights, bias); +template +__global__ void depthwise_conv_1d_s8_f32(const int nthreads, + const void* din, const int num, const int channels, + const int hin, const int win, const int hout, + const int wout, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + void* dout, const void* weight, const float* bias, float alpha = 1.f) { +#if __CUDA_ARCH__ > 600 + int size_channel_in = hin * win; + int size_channel_out = hout * wout; + int size_kernel = kernel_h * kernel_w; + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % wout; + const int ph = (index / wout) % hout; + const int c = (index / size_channel_out) % channels; + const int n = index / size_channel_out / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = hstart + kernel_h; + int wend = wstart + kernel_w; + + int khstart = hstart < 0 ? 0 - hstart : 0; + int kwstart = wstart < 0 ? 0 - wstart : 0; + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, hin); + wend = min(wend, win); + + int aveval0 = 0; + int aveval1 = 0; + int aveval2 = 0; + int aveval3 = 0; + + const int* bottom_slice = (const int*)din + (n * channels + c) * size_channel_in; + const int* weight_slice = (const int*)weight + c * size_kernel; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int in_data = bottom_slice[h * win + w]; + int weight_data = weight_slice[(khstart + h - hstart) * kernel_w + + (kwstart + w - wstart)]; + int mask_weight; + mask_weight = MASK0 & weight_data; + aveval0 = __dp4a(in_data, mask_weight, aveval0); + mask_weight = MASK1 & weight_data; + aveval1 = __dp4a(in_data, mask_weight, aveval1); + mask_weight = MASK2 & weight_data; + aveval2 = __dp4a(in_data, mask_weight, aveval2); + mask_weight = MASK3 & weight_data; + aveval3 = __dp4a(in_data, mask_weight, aveval3); + } + } + float fa0 = static_cast(aveval0); + float fa1 = static_cast(aveval1); + float fa2 = static_cast(aveval2); + float fa3 = static_cast(aveval3); + fa0 *= alpha; + fa1 *= alpha; + fa2 *= alpha; + fa3 *= alpha; + + if (bias_flag) { + fa0 += bias[4 * c + 0]; + fa1 += bias[4 * c + 1]; + fa2 += bias[4 * c + 2]; + fa3 += bias[4 * c + 3]; + } + if (relu_flag) { + fa0 = max(fa0, (float)0); + fa1 = max(fa1, (float)0); + fa2 = max(fa2, (float)0); + fa3 = max(fa3, (float)0); + } + + int output_slice = hout * wout; + int out_idx = (index % output_slice) + 4 * c * output_slice; + ((float*)dout)[out_idx] = fa0; out_idx += output_slice; + ((float*)dout)[out_idx] = fa1; out_idx += output_slice; + ((float*)dout)[out_idx] = fa2; out_idx += output_slice; + ((float*)dout)[out_idx] = fa3; + } #endif +} + +template +SaberStatus saber_depthwise_conv_act_s8_f32(const void* input, void* output, + int num, int cin, int hin, int win, int hout, int wout, + int kw, int kh, int stride_w, int stride_h, int pad_w, int pad_h, float alpha, + const void* weights, const float* bias, cudaStream_t stream) { + + CHECK_EQ(cin % 4, 0); + int cin_4 = cin / 4; + const int count = num * cin_4 * hout * wout; + + if (bias != nullptr) { + depthwise_conv_1d_s8_f32<<>>( + count, 
input, num, cin_4, hin, win, hout, wout, kh, + kw, stride_h, stride_w, pad_h, pad_w, + output, weights, bias, alpha); } else { -#ifdef D1 - depthwise_conv_1d<<< CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>> ( - count, input, num, cin, hin, win, hout, wout, kh, \ - kw, stride_h, stride_w, pad_h, \ - pad_w, output, weights, nullptr); -#else - depthwise_conv_2d<<>>( - channel_in_stride, channel_out_stride, kernel_size, \ - input, num, cin, hin, win, hout, wout, kh, \ - kw, stride_h, stride_w, pad_h, pad_w, \ - output, weights, nullptr); -#endif + depthwise_conv_1d_s8_f32<<< CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>> ( + count, input, num, cin_4, hin, win, hout, wout, kh, + kw, stride_h, stride_w, pad_h, + pad_w, output, weights, nullptr, alpha); } - return SaberSuccess; } -#define INSTANCE_CONVACT(dtype, ifbias, ifrelu) \ +#define INSTANCE_CONVACT(ifrelu) \ template \ - SaberStatus saber_depthwise_conv_act (const dtype* input, dtype* output, \ + SaberStatus saber_depthwise_conv_act (const float* input, float* output, \ int num, int cin, int hin, int win, int hout, int wout, \ int kw, int kh, int stride_w, int stride_h, \ - int pad_h, int pad_w, const dtype* weights, const dtype* bias, cudaStream_t stream); + int pad_h, int pad_w, const float* weights, const float* bias, cudaStream_t stream); -INSTANCE_CONVACT(float, true, true); -INSTANCE_CONVACT(float, true, false); -INSTANCE_CONVACT(float, false, true); -INSTANCE_CONVACT(float, false, false); +#define INSTANCE_CONVACT_S8_S8(ifrelu) \ +template \ +SaberStatus saber_depthwise_conv_act_s8_s8(const void* input, void* output, \ + int num, int cin, int hin, int win, int hout, int wout, \ + int kw, int kh, int stride_w, int stride_h, int pad_w, int pad_h, float alpha, \ + const void* weights, const float* bias, cudaStream_t stream); -} //namespace anakin +#define INSTANCE_CONVACT_S8_F32(ifrelu) \ +template \ +SaberStatus saber_depthwise_conv_act_s8_f32(const void* input, void* output, \ + int num, int cin, int hin, int win, int hout, int wout, \ + int kw, int kh, int stride_w, int stride_h, int pad_w, int pad_h, float alpha, \ + const void* weights, const float* bias, cudaStream_t stream); +INSTANCE_CONVACT(true); +INSTANCE_CONVACT(false); +INSTANCE_CONVACT_S8_S8(true); +INSTANCE_CONVACT_S8_S8(false); +INSTANCE_CONVACT_S8_F32(true); +INSTANCE_CONVACT_S8_F32(false); + +} //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_detection_output.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_detection_output.cu index 19e915c07..8c88ddae1 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_detection_output.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_detection_output.cu @@ -5,25 +5,24 @@ namespace anakin{ namespace saber{ template __global__ void permute_data_kernel(const int nthreads, - const dtype* data, const int num_classes, const int num_data, - const int num_dim, dtype* new_data) { + const dtype* data, const int num_classes, const int priors, + const int num_dim, dtype* new_data) { CUDA_KERNEL_LOOP(index, nthreads) { const int i = index % num_dim; const int c = (index / num_dim) % num_classes; - const int d = (index / num_dim / num_classes) % num_data; - const int n = index / num_dim / num_classes / num_data; - const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; + const int d = (index / num_dim / num_classes) % priors; + const int n = index / num_dim / num_classes / priors; + const int new_index = ((n * num_classes + c) * priors + d) * num_dim + i; 
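+        // the linear index decomposes as ((n * priors + d) * num_classes + c) * num_dim + i,
+        // i.e. the source layout is [N, priors, classes, dim]; new_index regroups the same
+        // element into [N, classes, priors, dim]. For example, with priors = 2, num_classes = 3
+        // and num_dim = 4, the element at index 12 (n=0, d=1, c=0, i=0) moves to new_index 4.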
new_data[new_index] = data[index]; } } template -void permute_data(const int nthreads, - const dtype* data, const int num_classes, const int num_data, - const int num_dim, dtype* new_data, cudaStream_t stream) { +void permute_data(const int nthreads, const dtype* data, const int num_classes, const int priors, \ + const int num_dim, dtype* new_data, cudaStream_t stream) { // NOLINT_NEXT_LINE(whitespace/operators) permute_data_kernel<<>>(nthreads, data, num_classes, num_data, num_dim, new_data); + CUDA_NUM_THREADS, 0, stream>>>(nthreads, data, num_classes, priors, num_dim, new_data); } template @@ -35,44 +34,93 @@ SaberStatus SaberDetectionOutput::dispatch(const std::vector* t_loc = inputs[0]; Tensor* t_conf = inputs[1]; - Tensor* t_prior = inputs[2]; + Tensor* t_prior; - const dtype* loc_data = static_cast(t_loc->data()); - const dtype* prior_data = static_cast(t_prior->data()); - const int num = t_loc->num(); + CHECK_EQ(t_loc->get_dtype(), AK_FLOAT) << "input data type must be float"; + CHECK_EQ(t_conf->get_dtype(), AK_FLOAT) << "input data type must be float"; - // Decode predictions. - dtype* bbox_data = static_cast(_bbox_preds.mutable_data()); - const int loc_count = _bbox_preds.valid_size(); - decode_bboxes(loc_count, loc_data, prior_data, param.type, \ - param.variance_encode_in_target, _num_priors, param.share_location, \ - _num_loc_classes, param.background_id, bbox_data, stream); - // Retrieve all decoded location predictions. - if (!param.share_location) { - dtype * bbox_permute_data = static_cast(_bbox_permute.mutable_data()); - permute_data(loc_count, bbox_data, _num_loc_classes, _num_priors, - 4, bbox_permute_data, stream); - } - // Retrieve all confidences. - dtype* conf_permute_data = static_cast(_conf_permute.mutable_data()); - permute_data(t_conf->valid_size(), static_cast(t_conf->data()), \ - this->_num_classes, _num_priors, 1, conf_permute_data, stream); - - CUDA_CHECK(cudaMemcpyAsync(_bbox_cpu_data, static_cast(_bbox_preds.data()), \ - _bbox_preds.valid_size() * sizeof(dtype), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(_conf_cpu_data, static_cast(_conf_permute.data()), \ - _conf_permute.valid_size() * sizeof(dtype), cudaMemcpyDeviceToHost, stream)); - cudaStreamSynchronize(stream); - - std::vector result; + std::vector priors; - nms_detect(_bbox_cpu_data, _conf_cpu_data, result, num, this->_num_classes, _num_priors, param.background_id, \ - param.keep_top_k, param.nms_top_k, param.conf_thresh, param.nms_thresh, param.nms_eta, param.share_location); + if (_shared_loc) { + //! for one stage + const int num = t_loc->num(); + for (int i = 0; i < num; ++i) { + priors.push_back(_num_priors / num); + } + //! for ssd + bool is_ssd = inputs.size() > 2; + if (is_ssd) { + t_prior = inputs[2]; + } + if (is_ssd) { + int num_priors = _num_priors / num; + auto loc_data = static_cast(t_loc->data()); + auto prior_data = static_cast(t_prior->data()); + // Decode predictions. + float* bbox_data = static_cast(_bbox_preds.mutable_data()); + const int loc_count = _bbox_preds.valid_size(); + decode_bboxes(loc_count, loc_data, prior_data, param.type, \ + param.variance_encode_in_target, num_priors, param.share_location, \ + _num_loc_classes, param.background_id, bbox_data, stream); + // Retrieve all decoded location predictions. + if (!param.share_location) { + float * bbox_permute_data = static_cast(_bbox_permute.mutable_data()); + permute_data(loc_count, bbox_data, _num_loc_classes, num_priors, + 4, bbox_permute_data, stream); + } + // Retrieve all confidences. 
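+            // conf is permuted from [N, priors, classes] to [N, classes, priors] so that each
+            // class's scores are contiguous for the host-side NMS pass below.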
+ float* conf_permute_data = static_cast(_conf_permute.mutable_data()); + permute_data(t_conf->valid_size(), static_cast(t_conf->data()), \ + this->_num_classes, num_priors, 1, conf_permute_data, stream); + CUDA_CHECK(cudaMemcpyAsync(_bbox_cpu_data, static_cast(_bbox_preds.data()), \ + _bbox_preds.valid_size() * sizeof(float), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(_conf_cpu_data, static_cast(_conf_permute.data()), \ + _conf_permute.valid_size() * sizeof(float), cudaMemcpyDeviceToHost, stream)); + } else { //! for multiclass nms + CUDA_CHECK(cudaMemcpyAsync(_bbox_cpu_data, t_loc->data(), \ + t_loc->valid_size() * sizeof(float), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(_conf_cpu_data, t_conf->data(), \ + t_conf->valid_size() * sizeof(float), cudaMemcpyDeviceToHost, stream)); + } + cudaStreamSynchronize(stream); + } else { + auto conf_permute = static_cast(_conf_permute.mutable_data()); + auto bbox_permute = static_cast(_bbox_permute.mutable_data()); + auto conf_ori = static_cast(t_conf->data()); + auto bbox_ori = static_cast(t_loc->data()); + //! for two stage + //! sizeof seq offset is N + 1 + auto offset = t_loc->get_seq_offset()[0]; + for (int i = 0; i < offset.size() - 1; ++i) { + int num_priors = offset[i + 1] - offset[i]; + priors.push_back(num_priors); + const float* conf_ori_batch = conf_ori + this->_num_classes * offset[i]; + const float* bbox_ori_batch = bbox_ori + this->_num_classes * 4 * offset[i]; + float* conf_permute_batch = conf_permute + this->_num_classes * offset[i]; + float* bbox_permute_batch = bbox_permute + this->_num_classes * 4 * offset[i]; + //! permute conf and bbox + //! input bbox layout is [M, C, 4], multi-batch view: [{priors0, C, 4}, {priors1, C, 4}, ...] + //! permute bbox data to [{C, priors0, 4}, {C, priors1, 4}, ...] + //! input conf layout is [M, C], multi-batch view: [{priors0, C}, {priors1, C}, ...] + //! permute conf data to [{C, priors0}, {C, priors1}, ...] 
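+            //! each batch item may hold a different number of priors (taken from the seq
+            //! offsets above), so the permute kernels are launched once per batch item on
+            //! that item's own slice of the conf/bbox data.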
+ permute_data(num_priors * this->_num_classes, conf_ori_batch, + this->_num_classes, num_priors, 1, conf_permute_batch, stream); + permute_data(num_priors * this->_num_classes * 4, bbox_ori_batch, + this->_num_classes, num_priors, 4, bbox_permute_batch, stream); + } + CUDA_CHECK(cudaMemcpyAsync(_bbox_cpu_data, bbox_permute, \ + _bbox_permute.valid_size() * sizeof(float), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(_conf_cpu_data, conf_permute, \ + _conf_permute.valid_size() * sizeof(float), cudaMemcpyDeviceToHost, stream)); + } + std::vector result; + nms_detect(_bbox_cpu_data, _conf_cpu_data, result, priors, this->_num_classes, param.background_id, \ + param.keep_top_k, param.nms_top_k, param.conf_thresh, param.nms_thresh, param.nms_eta, _shared_loc); if(result.size() == 0) { result.resize(7); for (int i = 0; i < 7; ++i) { - result[i] = (dtype)-1; + result[i] = (float)-1; } outputs[0]->reshape(Shape({1, 1, 1, 7})); } else { @@ -80,7 +128,7 @@ SaberStatus SaberDetectionOutput::dispatch(const std::vectormutable_data(), result.data(), \ - result.size() * sizeof(dtype), cudaMemcpyHostToDevice, stream)); + result.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); return SaberSuccess; } diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu index 8fdf09bbd..6932d3e68 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_eltwise.cu @@ -63,9 +63,9 @@ static __global__ void ker_multi_elt_max(Dtype* out_data, const Dtype** in_data, } #endif -template -__global__ void ker_elt_production(Dtype* out_data, const Dtype* in_data_a, const Dtype* in_data_b, - int count) { +template +__global__ void ker_elt_prod(Dtype* out_data, const Dtype* in_data_a, const Dtype* in_data_b, + int count, bool with_relu) { CUDA_KERNEL_LOOP(tid, count) { Dtype tmp = in_data_a[tid] * in_data_b[tid]; @@ -77,9 +77,9 @@ __global__ void ker_elt_production(Dtype* out_data, const Dtype* in_data_a, cons } } -template +template __global__ void ker_elt_sum(Dtype* out_data, const Dtype* in_data1, const Dtype* in_data2, - Dtype coeff1, Dtype coeff2, int count) { + Dtype coeff1, Dtype coeff2, int count, bool with_relu) { CUDA_KERNEL_LOOP(tid, count) { Dtype tmp = coeff1 * in_data1[tid] + coeff2 * in_data2[tid]; @@ -91,9 +91,9 @@ __global__ void ker_elt_sum(Dtype* out_data, const Dtype* in_data1, const Dtype* } } -template +template __global__ void ker_elt_max(Dtype* out_data, const Dtype* in_data_a, const Dtype* in_data_b, - int count) { + int count, bool with_relu) { CUDA_KERNEL_LOOP(tid, count) { Dtype tmp; @@ -110,115 +110,184 @@ __global__ void ker_elt_max(Dtype* out_data, const Dtype* in_data_a, const Dtype } } +template +__global__ void ker_elt_div(Dtype* out_data, const Dtype* in_data1, const Dtype* in_data2, + int count, bool with_relu) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp = in_data1[tid] /in_data2[tid]; + + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} + +template +__global__ void ker_elt_with_axis_div(Dtype* out_data, const Dtype* in_data1, const Dtype* in_data2, + int outer_num, int mid_num, int inner_num, int count, bool with_relu) { + CUDA_KERNEL_LOOP(tid, count) { + int mid_id = (tid /inner_num) % mid_num; + Dtype tmp = in_data1[tid] /in_data2[mid_id]; + + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? 
tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} -template -SaberStatus SaberEltwise::dispatch(\ +template +__global__ void ker_elt_sum_v(Dtype* out_data, const Dtype** in_data_v, const Dtype* coeff, int in_num, int count, + bool with_relu) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp = 0.f; + for (int i = 0; i < in_num; i++) { + tmp += coeff[i] * in_data_v[i][tid]; + } + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} + +template +__global__ void ker_elt_prod_v(Dtype* out_data, const Dtype** in_data_v,int in_num, int count, + bool with_relu) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp = 1.f; + for (int i = 0; i < in_num; i++) { + tmp *=in_data_v[i][tid]; + } + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} + +template +__global__ void ker_elt_max_v(Dtype* out_data, const Dtype** in_data_v, int in_num, int count, + bool with_relu) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp = in_data_v[0][tid]; + for (int i = 1; i < in_num; i++) { + tmp = in_data_v[i][tid] > tmp ? in_data_v[i][tid] : tmp; + } + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} + +template +__global__ void ker_elt_div_v(Dtype* out_data, const Dtype** in_data_v, int in_num, int count, + bool with_relu) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype tmp = in_data_v[0][tid]; + for (int i = 1; i < in_num; i++) { + tmp = tmp / in_data_v[i][tid]; + } + if (with_relu) { + out_data[tid] = tmp > static_cast(0.0f) ? tmp : static_cast(0.0f); + } else { + out_data[tid] = tmp; + } + } +} + + +template <> +SaberStatus SaberEltwise::dispatch(\ const std::vector *>& inputs, \ std::vector *>& outputs, \ EltwiseParam& param) { const int count = outputs[0]->valid_size(); - OpDataType* out_data = static_cast(outputs[0]->mutable_data()); - const OpDataType* in_data_a = static_cast(inputs[0]->data()); - const OpDataType* in_data_b = static_cast(inputs[1]->data()); + float* out_data = static_cast(outputs[0]->mutable_data()); + const float* in_data_a = static_cast(inputs[0]->data()); + const float* in_data_b = static_cast(inputs[1]->data()); cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + int in_num = inputs.size(); + uint64_t in_data_h[in_num]; + for (int i = 0; i < in_num; i++) { + in_data_h[i] = (uint64_t)inputs[i]->data(); + } + uint64_t* in_data_d = (uint64_t*) _inputs_d.mutable_data(); + const float* coeff_data_d = (const float*) _coeff_d.data(); + cudaMemcpyAsync(in_data_d, in_data_h, sizeof(uint64_t) * in_num, cudaMemcpyHostToDevice, cuda_stream); int grid_dim = CUDA_GET_BLOCKS(count); int block_dim = CUDA_NUM_THREADS; + switch (param.operation) { case Eltwise_prod: - if (_with_relu) { - if (inputs.size() <= 2) { - ker_elt_production <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, in_data_a, - in_data_b, count); - } else { - ker_elt_production <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, - in_data_a, - in_data_b, count); - - for (int i = 2; i < inputs.size() - 1; i++) { - ker_elt_production - <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, - static_cast(inputs[i]->data()), count); - } - - ker_elt_production - <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, - static_cast(inputs[inputs.size() - 1]->data()), count); - } - + if (inputs.size() <= 2) { + ker_elt_prod <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, 
in_data_a, + in_data_b, count, _with_relu); } else { - - ker_elt_production <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, - in_data_a, - in_data_b, count); - - for (int i = 2; i < inputs.size(); i++) { - ker_elt_production - <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, - static_cast(inputs[i]->data()), count); - } - + ker_elt_prod_v <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, + (const float**)in_data_d, + in_num, + count, + _with_relu); } break; case Eltwise_sum: - if (_with_relu) { - ker_elt_sum - <<< - grid_dim, block_dim, 0, cuda_stream >>> (out_data, + if (inputs.size() <= 2) { + ker_elt_sum <<>> (out_data, in_data_a, in_data_b, - param.coeff[0], param.coeff[1], count); + param.coeff[0], param.coeff[1], count, _with_relu); } else { - ker_elt_sum - <<< - grid_dim, block_dim, 0, cuda_stream >>> (out_data, - in_data_a, in_data_b, - param.coeff[0], param.coeff[1], count); + ker_elt_sum_v<<>> (out_data, + (const float**)in_data_d, + coeff_data_d, in_num, count, _with_relu); } break; case Eltwise_max: + if (inputs.size() <= 2) { + ker_elt_max <<>> (out_data, + in_data_a, in_data_b, + count, _with_relu); + } else { + ker_elt_max_v<<>> (out_data, + (const float**)in_data_d, + in_num, + count, _with_relu); + } - // mask = (float *) _max_idx.mutable_data(); - if (_with_relu) { - if (inputs.size() <= 2) { - ker_elt_max - <<< grid_dim, block_dim, 0, cuda_stream >>>(out_data, - in_data_a, in_data_b, count); + break; + case Eltwise_div: + if (inputs.size() <= 2) { + if (inputs[0]->valid_size() == inputs[1]->valid_size()) { + ker_elt_div <<>> (out_data, + in_data_a, in_data_b, + count, _with_relu); } else { - ker_elt_max <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, - in_data_a, - in_data_b, count); - - for (int i = 2; i < inputs.size() - 1; i++) { - ker_elt_max - <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, - static_cast(inputs[i]->data()), count); - } - - ker_elt_max - <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, - static_cast(inputs[inputs.size() - 1]->data()), count); + int outer_num = inputs[0]->count(0, param.axis); + int mid_num = outputs[0]->valid_size(); + int inner_num = inputs[0]->count(param.axis, inputs[0]->dims()) / mid_num; + ker_elt_with_axis_div <<>> (out_data, + in_data_a, in_data_b, outer_num, mid_num, inner_num, + count, _with_relu); } } else { - - ker_elt_max <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, - in_data_a, - in_data_b, count); - - for (int i = 2; i < inputs.size() ; i++) { - ker_elt_max - <<< grid_dim, block_dim, 0, cuda_stream>>>(out_data, out_data, - static_cast(inputs[i]->data()), count); - } - + ker_elt_div_v<<>> (out_data, + (const float**)in_data_d, in_num, count, _with_relu); } - break; default: @@ -233,9 +302,38 @@ SaberStatus SaberEltwise::dispatch(\ return SaberSuccess; } +template <> +SaberStatus SaberEltwise::create( + const std::vector *>& inputs, + std::vector *>& outputs, + EltwiseParam& param, + Context& ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberEltwise::init( + const std::vector *>& inputs, + std::vector *>& outputs, + EltwiseParam& param, + Context& ctx) { + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberEltwise::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + EltwiseParam& param) { + return SaberSuccess; +} + template class SaberEltwise; +template class SaberEltwise; DEFINE_OP_TEMPLATE(SaberEltwise, EltwiseParam, NV, AK_HALF); -DEFINE_OP_TEMPLATE(SaberEltwise, EltwiseParam, NV, 
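// Index math for the broadcast division above (ker_elt_with_axis_div): with the full
// tensor flattened as tid = (outer * mid_num + mid) * inner_num + inner, the divisor
// element is mid = (tid / inner_num) % mid_num. For example, dividing an NCHW tensor
// of shape 2x3x4x5 by a per-channel vector of length 3 at axis = 1 gives
// inner_num = 4 * 5 = 20 and mid_num = 3, so flat index 47 reads divisor element
// (47 / 20) % 3 = 2, i.e. channel 2 of image 0.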
AK_INT8); + +} } -} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_fake_quantize_abs_max.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_fake_quantize_abs_max.cu deleted file mode 100644 index ab7f43abf..000000000 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_fake_quantize_abs_max.cu +++ /dev/null @@ -1,172 +0,0 @@ -#include "saber/funcs/impl/cuda/saber_fake_quantize_abs_max.h" -#include "cuda_fp16.h" -#include "saber/funcs/impl/cuda/cudnn_helper.h" - -namespace anakin { -namespace saber { - -template <> -SaberStatus SaberFakeQuantizeAbsMax::\ - create(const std::vector *>& inputs, - std::vector *>& outputs, - FakeQuantizeAbsMaxParam& param, Context& ctx) { - if (&ctx != this->_ctx) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - this->_ctx = &ctx; - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - - - Shape in_stride = inputs[0]->get_stride(); - Shape max_abs_stride = std::vector{1, 1, 1, 1}; - - int dim_a[] = {input_num, input_channel, - input_height, input_width}; - int dim_b[] = {1, 1, 1, 1}; - - cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); - - cudnn::setTensorNdDesc(&_output_descs, - _max_abs.dims(), dim_b, &max_abs_stride[0]); - - cudnn::setReduceTensorDesc(&_reduce_tensor_descs, - CUDNN_REDUCE_TENSOR_AMAX, - CUDNN_PROPAGATE_NAN, - CUDNN_REDUCE_TENSOR_NO_INDICES, - CUDNN_64BIT_INDICES); - - // Get fastest implement of cudnn - // set up algo and workspace size - size_t workspace_size = 0; - - CUDNN_CHECK(cudnnGetReductionWorkspaceSize( - _handle, _reduce_tensor_descs, _input_descs, _output_descs, &workspace_size)); - - if (workspace_size > _workspaceSizeInBytes) { - _workspaceSizeInBytes = workspace_size; - if (_workspace != NULL) { - cudaFree(_workspace); - } - cudaMalloc(&_workspace, _workspaceSizeInBytes); - } - - size_t indices_size = 0; - CUDNN_CHECK(cudnnGetReductionIndicesSize(_handle, _reduce_tensor_descs, - _input_descs, _output_descs, &indices_size)); - if (indices_size > _indices_size) { - _indices_size = indices_size; - if (_indices != NULL) { - cudaFree(_indices); - } - cudaMalloc(&_indices, _indices_size); - } - - return SaberSuccess; -} - -template <> -SaberStatus SaberFakeQuantizeAbsMax::\ - init(const std::vector *>& inputs, - std::vector *>& outputs, - FakeQuantizeAbsMaxParam& param, Context& ctx) { - _workspaceSizeInBytes = 0; - _workspace = NULL; - _indices = NULL; - _indices_size = 0; - - this->_ctx = &ctx; - // ---- get cuda resources ---- - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - int in_channels = inputs[0]->channel(); - // ---- create cudnn Descs ---- - cudnn::createReduceTensorDesc(&_reduce_tensor_descs); - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - Shape max_abs_shape = std::vector{1, 1, 1, 1}; - _max_abs.reshape(max_abs_shape); - - return create(inputs, outputs, param, ctx); -} - - -template -__global__ void ker_fake_quantize_max_abs_fwd(Ttype * out_data, \ - const Dtype* in_data, - const Dtype scale, - const int count) -{ - CUDA_KERNEL_LOOP(tid, count){ - out_data[tid] = round(in_data[tid] * scale); - //printf("%d, %d\n", tid, 
(int)out_data[tid]); - } -} - - -template -SaberStatus SaberFakeQuantizeAbsMax::dispatch(\ - const std::vector *>& inputs, \ - std::vector *>& outputs, \ - FakeQuantizeAbsMaxParam& param) { - const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); - OpDataType* max_abs_data = (OpDataType*) _max_abs.mutable_data(); - - cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); - int count = outputs[0]->valid_size(); - float alpha = 1.0f; - float beta = 0.f; - OpDataType cpu_max_abs_data; - - if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { - cudnnReduceTensor(_handle, - _reduce_tensor_descs, - _indices, - _indices_size, - _workspace, - _workspaceSizeInBytes, - &alpha, - _input_descs, - in_data, - &beta, - _output_descs, - max_abs_data); - cudaMemcpyAsync((void*)&cpu_max_abs_data, (void*)max_abs_data, sizeof(OpDataType) * 1, cudaMemcpyDeviceToHost, cuda_stream); - OpDataType scale = ((1 << (param.bit_length - 1)) - 1) / cpu_max_abs_data; - auto out_data = outputs[0]->mutable_data(); - //LOG(INFO) <<"gpu max_data" << cpu_max_abs_data; - if (param.bit_length == 8) { - ker_fake_quantize_max_abs_fwd\ - <<>>(\ - (char*)out_data, in_data, \ - scale, count); - } else if (param.bit_length == 16) { - ker_fake_quantize_max_abs_fwd\ - <<>>(\ - (int16_t*)out_data, in_data, \ - scale, count); - } else { - LOG(FATAL) << "other bit length has not been supported"; - } - } - - return SaberSuccess; -} - -DEFINE_OP_TEMPLATE(SaberFakeQuantizeAbsMax, FakeQuantizeAbsMaxParam, NV, AK_HALF); -DEFINE_OP_TEMPLATE(SaberFakeQuantizeAbsMax, FakeQuantizeAbsMaxParam, NV, AK_INT8); -} -} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu index da53bd435..b08c54a05 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_fc.cu @@ -1,4 +1,5 @@ #include "saber/funcs/impl/cuda/saber_fc.h" +#include "saber/funcs/calibrate.h" #include "sass_funcs.h" namespace anakin{ @@ -13,42 +14,84 @@ __global__ void add_bias(int n, int output_size, const dtype* bias, dtype* dout) } } -template -SaberStatus SaberFc::dispatch( +template <> +SaberStatus SaberFc::create( + const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, Context& ctx){ + + if (!(&ctx == this->_ctx)) { + this->_ctx = &ctx; + } + + Shape shape_out = inputs[0]->valid_shape(); + _M = inputs[0]->count_valid(0, param.axis); + _K = inputs[0]->count_valid(param.axis, inputs[0]->dims()); + _N = param.num_output; + _flag_trans_weights = param.is_transpose_weights; + if (_N <= 0) { + int weight_size = param.weights->valid_size(); + _N = weight_size / _K; + } + //! 
weights dims must be in h and w + _gemm->init(false, !_flag_trans_weights, _M, _N, _K, *_ctx); + + return SaberSuccess; +} + +template <> +SaberStatus SaberFc::init( + const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, Context &ctx) { + // get context + this->_ctx = &ctx; + int generate_arch = Env::cur_env()[_ctx->get_device_id()]._info._generate_arch; + bool arch_check = (generate_arch == 50) || (generate_arch == 61); + if (arch_check) { + _gemm = new Gemm; + } else { + _gemm = new Gemm; + } + return create(inputs, outputs, param, ctx); +} + + +template <> +SaberStatus SaberFc::dispatch( const std::vector *>& inputs, std::vector *>& outputs, FcParam& param) { cudaStream_t stream = this->_ctx->get_compute_stream(); - const OpDataType *din = (const OpDataType *)inputs[0]->data(); - OpDataType *dout = (float *)outputs[0]->mutable_data(); - const OpDataType *weight = (OpDataType *)param.weights->data(); - const OpDataType *bias = nullptr; - + const float *din = (const float *)inputs[0]->data(); + float *dout = (float *)outputs[0]->mutable_data(); + const float *weight = (float *)param.weights->data(); + const float *bias = nullptr; bool bias_term = param.bias != nullptr; if (bias_term) { - bias = (const OpDataType *)param.bias->data(); + bias = (const float *)param.bias->data(); } float alpha = 1.f; float beta = 0.f; - _kernel(_M, _N, _K, alpha, din, beta, weight, dout, stream); + _gemm->dispatch(alpha, beta, din, weight, dout); if (bias_term) { int total_size = _M * _N; - add_bias<<>>\ + add_bias<<>>\ (total_size, _N, bias, dout); } return SaberSuccess; } template class SaberFc; -DEFINE_OP_TEMPLATE(SaberFc, FcParam, NV, AK_HALF); DEFINE_OP_TEMPLATE(SaberFc, FcParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberFc, FcParam, NV, AK_HALF); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_generate_proposals.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_generate_proposals.cu new file mode 100644 index 000000000..4de842c0c --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_generate_proposals.cu @@ -0,0 +1,582 @@ +#include "saber/funcs/impl/cuda/saber_generate_proposals.h" +#include "cuda_fp16.h" +#include "saber/funcs/debug.h" +#define TILE_DIM 16 +#define NMS_THREADS_PER_BLOCK 64 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +namespace anakin{ +namespace saber{ +//const float bbox_clip_default = std::log(1000.0 / 16.0); +template +__global__ void ker_nchw_to_nhwc(Dtype * out_data, + const int n, + const int c, + const int hw, + const int row_block_num_per_im, + const Dtype* in_data) +{ + __shared__ float tile[TILE_DIM][TILE_DIM]; + int im_id = blockIdx.y / row_block_num_per_im; + int block_id_y = blockIdx.y % row_block_num_per_im; + int x_index = blockIdx.x * TILE_DIM + threadIdx.x; + int y_index = block_id_y * TILE_DIM + threadIdx.y; + int index_in = im_id * c * hw + x_index + y_index * hw; + + if (x_index < hw && y_index < c) { + tile[threadIdx.y][threadIdx.x] = in_data[index_in]; + } + __syncthreads(); + + x_index = block_id_y * TILE_DIM + threadIdx.x; + y_index = blockIdx.x * TILE_DIM + threadIdx.y; + int index_out = im_id * hw * c + x_index + y_index * c; + + if (x_index < c && y_index < hw) { + out_data[index_out] = tile[threadIdx.x][threadIdx.y]; + } +} +template +void trans(Tensor* in_tensor, Tensor* out_tensor, cudaStream_t stream) { + int n = in_tensor->num(); + int c = in_tensor->channel(); + int hw = in_tensor->height() * in_tensor->width(); + auto in_data = (const Dtype*)in_tensor->data(); + auto out_data 
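// Dimension bookkeeping in SaberFc::create above: the gemm treats everything before
// param.axis as rows and everything from axis onward as the reduced dimension, so
// _M = count_valid(0, axis), _K = count_valid(axis, dims) and _N = num_output (or
// weight_size / _K when num_output is not set). For an input of shape [4, 256, 7, 7]
// with axis = 1 and a 12544x1000 weight, that is M = 4, K = 256 * 7 * 7 = 12544 and
// N = 1000; dispatch then runs one (MxK)x(KxN) gemm plus the add_bias kernel.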
= (Dtype*)out_tensor->mutable_data(); + dim3 block_dim(TILE_DIM, TILE_DIM); + dim3 grid_dim((hw + TILE_DIM -1) / TILE_DIM, n * (c + TILE_DIM -1) / TILE_DIM); + int row_block_num_per_im = (c + TILE_DIM -1) / TILE_DIM; + ker_nchw_to_nhwc<<>>(out_data, + n, + c, + hw, + row_block_num_per_im, + in_data); + +} +__global__ void index_init(int* out_data, int h, int w) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = idx; i < h * w; i += blockDim.x * gridDim.x) { + int w_id = i % w; + out_data[i] = w_id; + } +} + + +template +void sort_descending(Tensor* out_value, + Tensor* out_index, + Tensor* in_value, + Tensor* in_index, + const int pre_nms_num, + cudaStream_t stream) { + in_index->reshape(in_value->valid_shape()); + out_value->reshape(Shape({in_value->num(), pre_nms_num, 1, 1}, Layout_NCHW)); + out_index->reshape(Shape({in_value->num(), pre_nms_num, 1, 1}, Layout_NCHW)); + in_index->set_dtype(AK_INT32); + out_index->set_dtype(AK_INT32); + int sort_length = in_value->valid_size() / in_value->num(); + index_init<<valid_size()), CUDA_NUM_THREADS, 0, stream>>>((int*)in_index->mutable_data(), in_value->num(), sort_length); + + Tensor in_h(in_value->valid_shape()); + Tensor index_h(in_index->valid_shape()); + cudaMemcpyAsync(in_h.data(), in_value->data(), sizeof(Dtype) * in_value->valid_size(), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(index_h.data(), in_index->data(), sizeof(int) * in_index->valid_size(), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + auto in_score = (Dtype*)in_h.mutable_data(); + auto out_score = (Dtype*) out_value->mutable_data(); + auto in_index_data = (int*)index_h.mutable_data(); + auto out_index_data = (int *) out_index->mutable_data(); + + auto compare = [in_score](const int &i, const int &j) { + return in_score[i] > in_score[j]; + }; + std::vector sorted_scores; + std::vector sorted_index; + for (int i = 0; i < in_value->num(); i++) { + std::partial_sort(in_index_data, in_index_data + pre_nms_num, in_index_data + sort_length, compare); + for (int j = 0; j < pre_nms_num; j++) { + sorted_scores.push_back(in_score[in_index_data[j]]); + sorted_index.push_back(in_index_data[j]); + } + in_score += sort_length; + in_index_data += sort_length; + } + cudaMemcpyAsync(out_index_data, &sorted_index[0], sizeof(int)*out_index->valid_size(), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(out_score, &sorted_scores[0], sizeof(Dtype)*out_value->valid_size(), cudaMemcpyHostToDevice, stream); +} + +//template +//void sort_descending(Tensor* out_value, +// Tensor* out_index, +// Tensor* in_value, +// Tensor* in_index, +// cudaStream_t stream) { +// in_index->set_dtype(AK_INT32); +// out_index->set_dtype(AK_INT32); +// in_index->reshape(in_value->valid_shape()); +// out_value->reshape(in_value->valid_shape()); +// out_index->reshape(in_value->valid_shape()); +// auto in_data = (Dtype*)in_value->mutable_data(); +// auto out_data = (Dtype*) out_value->mutable_data(); +// auto in_index_data = (int*)in_index->mutable_data(); +// auto out_index_data = (int *) out_index->mutable_data(); +// int sort_length = in_value->valid_size()/in_value->num(); +// int count = in_value->valid_size(); +// index_init<<>>(in_index_data, in_value->num(), sort_length); +// cudaMemcpyAsync(out_data, in_data, sizeof(Dtype) * in_value->valid_size(), cudaMemcpyDeviceToDevice, stream); +// cudaStreamSynchronize(stream); +// +// size_t temp_storage_bytes = 0; +// void* temp_storage = NULL; +// cub::DoubleBuffer d_keys(in_data, out_data); +// cub::DoubleBuffer 
d_values(in_index_data, out_index_data); +// cub::DeviceRadixSort::SortPairsDescending( +// temp_storage, temp_storage_bytes, d_keys, d_values, sort_length); +// cudaMalloc((void**)&temp_storage, temp_storage_bytes); +// for (int i = 0; i < in_value->num(); i++) { +// cub::DoubleBuffer d_keys(in_data, out_data); +// cub::DoubleBuffer d_values(in_index_data, out_index_data); +// size_t temp_storage_bytes = 0; +// cub::DeviceRadixSort::SortPairsDescending( +// temp_storage, temp_storage_bytes, d_keys, d_values, sort_length); +// // thrust::device_vector D(sort_length); +// // thrust::device_vector Index(sort_length); +// // thrust::sequence(Index.begin(), Index.end ()); +// // thrust::stable_sort_by_key(D.begin(), D.end(), Index.begin, thrust::greater()); +// +// //thrust::stable_sort_by_key(out_data, out_data + sort_length, out_index_data, thrust::greater()); +// in_data += sort_length; +// out_data += sort_length; +// in_index_data += sort_length; +// out_index_data += sort_length; +// } +//} +template +__device__ T Min(T a, T b) { return a > b ? b : a; } + +template +__device__ T Max(T a, T b) { return a > b ? a : b; } + +template +__global__ void ker_box_decode_and_clip(Dtype* proposals_data, + const Dtype* anchors_data, + const Dtype* deltas_data, + const Dtype* var_data, + const int* index_data, + const Dtype* im_info_data, + const float bbox_clip_default, + const int img_num, + const int index_length, + const int anchor_num, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int im_id = tid / index_length; + int anchor_id = index_data[tid]; + auto cur_anchor = anchors_data + anchor_id * 4; + auto cur_delta = deltas_data + anchor_id * 4 + im_id * anchor_num * 4; + auto cur_proposal = proposals_data + tid * 5; + auto cur_im_info = im_info_data + im_id * 3; + Dtype axmin = cur_anchor[0]; + Dtype aymin = cur_anchor[1]; + Dtype axmax = cur_anchor[2]; + Dtype aymax = cur_anchor[3]; + auto w = axmax - axmin + 1.0; + auto h = aymax - aymin + 1.0; + auto cx = axmin + 0.5 * w; + auto cy = aymin + 0.5 * h; + auto dxmin = cur_delta[0]; + auto dymin = cur_delta[1]; + auto dxmax = cur_delta[2]; + auto dymax = cur_delta[3]; + Dtype d_cx, d_cy, d_w, d_h; + if (var_data) { + auto cur_var = var_data + anchor_id * 4; + d_cx = cx + dxmin * w * cur_var[0]; + d_cy = cy + dymin * h * cur_var[1]; + d_w = exp(Min(dxmax * cur_var[2], bbox_clip_default)) * w; + d_h = exp(Min(dymax * cur_var[3], bbox_clip_default)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, bbox_clip_default)) * w; + d_h = exp(Min(dymax, bbox_clip_default)) * h; + } + auto oxmin = d_cx - d_w * 0.5; + auto oymin = d_cy - d_h * 0.5; + auto oxmax = d_cx + d_w * 0.5 - 1.; + auto oymax = d_cy + d_h * 0.5 - 1.; + cur_proposal[0] = im_id; + cur_proposal[1] = Max(Min(oxmin, cur_im_info[1] - 1.), 0.); + cur_proposal[2] = Max(Min(oymin, cur_im_info[0] - 1.), 0.); + cur_proposal[3] = Max(Min(oxmax, cur_im_info[1] - 1.), 0.); + cur_proposal[4] = Max(Min(oymax, cur_im_info[0] - 1.), 0.); + } + +} + +template +void box_decode_and_clip(Tensor* proposals, + const Tensor* anchors, + const Tensor* deltas, + const Tensor* variances, + const Tensor* index, + const Tensor* im_info, + cudaStream_t stream) { + int img_num = index->num(); + int anchor_num = anchors->valid_size() / 4; + auto anchors_data = (const Dtype*)anchors->data(); + auto deltas_data = (const Dtype*) deltas->data(); + auto var_data = (const Dtype*) variances->data(); + auto index_data = (const int*) index->data(); + auto im_info_data = (const 
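// Worked example for ker_box_decode_and_clip above: an anchor (0, 0, 15, 15) has
// w = h = 16 and center (7.5, 7.5). With deltas (0.1, 0, log(2), 0) and no variances,
// d_cx = 7.5 + 0.1 * 16 = 9.1 and d_w = exp(log(2)) * 16 = 32, so the decoded box is
// (9.1 - 16, 7.5 - 8, 9.1 + 16 - 1, 7.5 + 8 - 1) = (-6.9, -0.5, 24.1, 14.5), which is
// then clamped into [0, im_w - 1] x [0, im_h - 1]. bbox_clip_default = log(1000 / 16)
// caps the exponent so exp() cannot inflate the predicted width or height beyond
// 1000/16 times the anchor size.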
Dtype*) im_info->data(); + int index_valid_size = index->valid_size(); + int index_length = index->channel(); + proposals->reshape(Shape({img_num * index_length, 5, 1, 1})); + auto proposals_data = (Dtype*) proposals->mutable_data(); + const float bbox_clip_default = std::log(1000.0 / 16.0); + ker_box_decode_and_clip<<>>( + proposals_data, anchors_data, deltas_data, var_data, index_data, + im_info_data, bbox_clip_default, img_num, index_length, anchor_num, index->valid_size()); +} + +template +__global__ void ker_filter_bboxes( + int *keep, + int *keep_num, + const Dtype* bboxes, + const Dtype* im_info, + const Dtype min_size, + const int img_num, + const int pre_nms_num) { + int im_id = blockIdx.x; + Dtype im_h = im_info[0]; + Dtype im_w = im_info[1]; + Dtype im_scale = im_info[2]; + + int cnt = 0; + __shared__ int keep_index[CUDA_NUM_THREADS]; + for (int tid = threadIdx.x; tid < pre_nms_num; tid += blockDim.x) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + auto bboxes_tmp = bboxes + (tid + blockIdx.x * pre_nms_num) * 5; + Dtype xmin = bboxes_tmp[1]; + Dtype ymin = bboxes_tmp[2]; + Dtype xmax = bboxes_tmp[3]; + Dtype ymax = bboxes_tmp[4]; + + Dtype w = xmax - xmin + 1.0; + Dtype h = ymax - ymin + 1.0; + Dtype cx = xmin + w / 2.; + Dtype cy = ymin + h / 2.; + + Dtype w_s = (xmax - xmin) / im_scale + 1.; + Dtype h_s = (ymax - ymin) / im_scale + 1.; + + if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = tid; + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (pre_nms_num - tid) < CUDA_NUM_THREADS ? pre_nms_num - tid : CUDA_NUM_THREADS; + for (int j = 0; j < size; ++j) { + if (keep_index[j] > -1) { + keep[im_id * pre_nms_num + cnt++] = keep_index[j]; + } + } + } + __syncthreads(); + } + + if (threadIdx.x == 0) { + keep_num[im_id] = cnt; + } + +} + +template +void filter_bboxes(Tensor* keep_num, + Tensor* keep, + Tensor* proposals, + Tensor* im_info, + const Dtype min_size, + const int img_num, + const int pre_nms_num, + cudaStream_t stream) { + keep_num->reshape(Shape({img_num, 1, 1, 1}, Layout_NCHW)); + keep->reshape(Shape({img_num, pre_nms_num, 1, 1}, Layout_NCHW)); + keep->set_dtype(AK_INT32); + keep_num->set_dtype(AK_INT32); + auto proposals_data = (const Dtype*)proposals->data(); + auto im_info_data = (const Dtype*)im_info->data(); + auto keep_num_data = (int*)keep_num->data(); + auto keep_data = (int*)keep->data(); + Dtype min_size_final = std::max(min_size, 1.0f); + + ker_filter_bboxes<<>>( + keep_data, + keep_num_data, + proposals_data, + im_info_data, + min_size_final, + img_num, + pre_nms_num); +} + +template + __device__ inline Dtype IoU(const Dtype *a, const Dtype *b) { + Dtype left = max(a[0], b[0]), right = min(a[2], b[2]); + Dtype top = max(a[1], b[1]), bottom = min(a[3], b[3]); + Dtype width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + Dtype inter_s = width * height; + Dtype s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + Dtype s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return inter_s / (s_a + s_b - inter_s); +} + + +__global__ void NMSKernel(uint64_t *dev_mask, + const int n_boxes, + const int* keep_index, + const float nms_overlap_thresh, + const int col_blocks, + const float *dev_boxes) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = + min(n_boxes - row_start * NMS_THREADS_PER_BLOCK, NMS_THREADS_PER_BLOCK); + const int col_size = + min(n_boxes - col_start * NMS_THREADS_PER_BLOCK, NMS_THREADS_PER_BLOCK); + + __shared__ float 
block_boxes[NMS_THREADS_PER_BLOCK * 4]; + if (threadIdx.x < col_size) { + int box_id = keep_index[NMS_THREADS_PER_BLOCK * col_start + threadIdx.x]; + block_boxes[threadIdx.x * 4 + 0] = dev_boxes[box_id * 5 + 1]; + block_boxes[threadIdx.x * 4 + 1] = dev_boxes[box_id * 5 + 2]; + block_boxes[threadIdx.x * 4 + 2] = dev_boxes[box_id * 5 + 3]; + block_boxes[threadIdx.x * 4 + 3] = dev_boxes[box_id * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = NMS_THREADS_PER_BLOCK * row_start + threadIdx.x; + const float *cur_box = dev_boxes + keep_index[cur_box_idx] * 5 + 1; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + + +template +void NMS(Tensor *keep_out, + const Tensor *proposals, + const int boxes_num, + const int* keep_index, + const Dtype nms_threshold, + const int post_nms_top_n, + cudaStream_t stream) { + const int col_blocks = DIVUP(boxes_num, NMS_THREADS_PER_BLOCK); + dim3 blocks(DIVUP(boxes_num, NMS_THREADS_PER_BLOCK), + DIVUP(boxes_num, NMS_THREADS_PER_BLOCK)); + dim3 threads(NMS_THREADS_PER_BLOCK); + keep_out->set_dtype(AK_INT32); + + Tensor mask(Shape({boxes_num, col_blocks, 1, 1}, Layout_NCHW), AK_UINT64); + auto boxes_data = (const Dtype*)proposals->data(); + auto mask_data = (uint64_t*) mask.mutable_data(); + NMSKernel<<>>(mask_data, + boxes_num, keep_index, nms_threshold, col_blocks, boxes_data); + + + Tensor mask_h(Shape({boxes_num, col_blocks, 1, 1}, Layout_NCHW), AK_UINT64); + auto mask_data_h = (uint64_t*) mask_h.mutable_data(); + cudaMemcpyAsync(mask_data_h, mask_data, sizeof(uint64_t) * mask.valid_size(), cudaMemcpyDeviceToHost, stream); + std::vector keep_index_h(boxes_num); + cudaMemcpyAsync(keep_index_h.data(), keep_index, sizeof(int)* boxes_num, cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + std::vector keep_vec; + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / NMS_THREADS_PER_BLOCK; + int inblock = i % NMS_THREADS_PER_BLOCK; + if (num_to_keep >= post_nms_top_n) { + break; + } + + if (!(remv[nblock] & (1ULL << inblock))) { + ++num_to_keep; + keep_vec.push_back(keep_index_h[i]); + uint64_t *p = mask_data_h + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + keep_out->reshape(Shape({num_to_keep, 1, 1, 1}, Layout_NCHW)); + cudaMemcpyAsync(keep_out->mutable_data(), &keep_vec[0], sizeof(int)*num_to_keep, cudaMemcpyHostToDevice, stream); +} + +template +__global__ void ker_gather(Dtype* boxes_out, + const Dtype* proposals, + const int box_num, + const int box_dim, + const int* keep_index) { + CUDA_KERNEL_LOOP(tid, box_num * box_dim) { + int box_id = tid / box_dim; + int dim_id = tid % box_dim; + boxes_out[tid] = proposals[keep_index[box_id] * box_dim + dim_id]; + } +} + + +template +void gather_box(Tensor *boxes_out, + const Tensor*proposals, + const int* index, + const int num, + cudaStream_t stream) { + const Dtype* proposals_data = (const Dtype*) proposals->data(); + boxes_out->reshape(std::vector{num, 5, 1, 1}); + Dtype* boxes_out_data = (Dtype*) boxes_out->mutable_data(); + ker_gather<<valid_size()), CUDA_NUM_THREADS, 0, stream>>>(boxes_out_data, proposals_data, num, 5, index); + +} + +template +void 
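// Bookkeeping for the bitmask NMS above: boxes are tiled in groups of
// NMS_THREADS_PER_BLOCK = 64, and block (row, col) lets each of its 64 "row" boxes
// build one 64-bit word whose bit j is set when IoU with the j-th "col" box exceeds
// nms_overlap_thresh. The mask therefore holds boxes_num * col_blocks words, e.g.
// 200 boxes give col_blocks = DIVUP(200, 64) = 4 and 800 uint64_t entries. The host
// then walks the candidate boxes in order, keeps box i only if no previously kept box
// has set its bit in remv, ORs box i's row of the mask into remv, and stops once
// post_nms_top_n boxes are kept.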
gather_score(Tensor *scores_out, + const Tensor*scores, + const int* index, + const int num, + cudaStream_t stream) { + const Dtype* scores_data = (const Dtype*) scores->data(); + scores_out->reshape(Shape({num, 1, 1, 1}, Layout_NCHW)); + Dtype* scores_out_data = (Dtype*) scores_out->mutable_data(); + ker_gather<<valid_size()), CUDA_NUM_THREADS, 0, stream>>>(scores_out_data, scores_data, num, 1, index); + +} + + +template +SaberStatus SaberGenerateProposals::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam& param) { + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + auto anchors = *inputs[0]; + auto bbox_deltas = *inputs[1]; + auto im_info = *inputs[2]; + auto scores = *inputs[3]; + auto variances = *inputs[4]; + auto rpn_rois = outputs[0]; + auto rpn_roi_probs = outputs[1]; + int pre_nms_top_n = param.pre_nms_top_n; + int post_nms_top_n = param.post_nms_top_n; + float nms_threshold = param.nms_thresh; + float min_size = param.min_size; + float eta = param.eta; + CHECK_EQ(eta, 1.0f) << "eta is not equal to 1, now other param has not been supported"; + Shape scores_shape = scores.valid_shape(); + Shape scores_swap_shape({scores_shape[0], scores_shape[2], scores_shape[3] , scores_shape[1]}, Layout_NCHW); + Shape bbox_deltas_shape = bbox_deltas.valid_shape(); + Shape bbox_deltas_swap_shape({bbox_deltas_shape[0], bbox_deltas_shape[2], + bbox_deltas_shape[3] , bbox_deltas_shape[1]}, Layout_NCHW); + _scores_swap.reshape(scores_swap_shape); + _bbox_deltas_swap.reshape(bbox_deltas_swap_shape); + /*swap and sort*/ + trans(&scores, &_scores_swap, cuda_stream); + trans(&bbox_deltas, &_bbox_deltas_swap, cuda_stream); + cudaStreamSynchronize(cuda_stream); + + int bbox_num = bbox_deltas.valid_size() / 4; + rpn_rois->reshape(std::vector{post_nms_top_n, 5, 1, 1}); + rpn_roi_probs->reshape(std::vector{post_nms_top_n, 1, 1, 1}); + int pre_nms_num = (_scores_swap.valid_size() <= 0 || _scores_swap.valid_size() > pre_nms_top_n) ? pre_nms_top_n : _scores_swap.valid_size(); + int img_num = _scores_swap.num(); + sort_descending(&_sorted_scores, &_sorted_index, &_scores_swap, &_scores_index, pre_nms_num, cuda_stream); + + // 2. box decode and clipping + box_decode_and_clip(&_proposals, + &anchors, &_bbox_deltas_swap, + &variances, + &_sorted_index, + &im_info, + cuda_stream); + // 3. filter bbox + filter_bboxes(&_keep_num, &_keep, &_proposals, &im_info, + min_size, img_num, pre_nms_num, + cuda_stream); + + // 4. 
NMS + std::vector keep_num_vec; + keep_num_vec.resize(img_num); + cudaMemcpyAsync(&keep_num_vec[0], _keep_num.data(), sizeof(int)*img_num, cudaMemcpyDeviceToHost, cuda_stream); + + int total_boxes = 0; + std::vector seq_offset; + seq_offset.push_back(0); + for (int i = 0; i < img_num; i++) { + Shape score_slice_shape = _sorted_scores.valid_shape(); + Shape proposals_slice_shape = _proposals.valid_shape(); + proposals_slice_shape[0] = pre_nms_num; + score_slice_shape[0] = 1; + Tensor sorted_scores_slice((void*)((OpDataType*)_sorted_scores.mutable_data() + i * _sorted_scores.get_stride()[0]), NV(), this->_ctx->get_device_id(), score_slice_shape); + Tensor proposals_slice((void*)((OpDataType*)_proposals.mutable_data() + i * pre_nms_num * _proposals.get_stride()[0]), NV(), this->_ctx->get_device_id(), proposals_slice_shape); + + auto keep_data = (const int*)_keep.data() + i * pre_nms_num; + auto keep_num = keep_num_vec[i]; + if (nms_threshold <= 0) { + gather_box(&_boxes_out, &proposals_slice, keep_data, keep_num, cuda_stream); + gather_score(&_scores_out, &sorted_scores_slice, keep_data, keep_num, cuda_stream); + total_boxes += keep_num; + } else { + NMS(&_keep_nms, &proposals_slice, keep_num, keep_data, nms_threshold, post_nms_top_n, cuda_stream); + auto keep_nms_data = (const int*)_keep_nms.data(); + auto keep_nms_num = _keep_nms.valid_size(); + gather_box(&_boxes_out, &proposals_slice, keep_nms_data, keep_nms_num, cuda_stream); + gather_score(&_scores_out, &sorted_scores_slice, keep_nms_data, keep_nms_num, cuda_stream); + } + + cudaMemcpyAsync((OpDataType*)rpn_rois->mutable_data() + total_boxes * 5, + (const OpDataType*)_boxes_out.data(), + sizeof(OpDataType) * _boxes_out.valid_size(), + cudaMemcpyDefault, + cuda_stream); + cudaMemcpyAsync((OpDataType*)rpn_roi_probs->mutable_data() + total_boxes, + (const OpDataType*)_scores_out.data(), + sizeof(OpDataType) * _scores_out.valid_size(), + cudaMemcpyDefault, + cuda_stream); + total_boxes += _keep_nms.valid_size(); + seq_offset.push_back(total_boxes); + } + rpn_rois->reshape(std::vector{total_boxes, 5, 1, 1}); + rpn_roi_probs->reshape(std::vector{total_boxes, 1, 1, 1}); + rpn_rois->set_seq_offset({seq_offset}); + + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + +template class SaberGenerateProposals; +DEFINE_OP_TEMPLATE(SaberGenerateProposals, GenerateProposalsParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberGenerateProposals, GenerateProposalsParam, NV, AK_HALF); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu index 75e99003a..ca9865c25 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_gru.cu @@ -5,8 +5,19 @@ namespace anakin { namespace saber { -static int round_up(int k, int c) { - return ((k + c - 1) / c) * c; +static void cudnn_gemm(cublasHandle_t handle, const bool TransA, + const bool TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (!TransA/* == CblasNoTrans*/) ? K : M; + int ldb = (!TransB/* == CblasNoTrans*/) ? N : K; + cublasOperation_t cuTransA = + (!TransA/* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (!TransB/* == CblasNoTrans*/) ? 
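// cudnn_gemm (a thin cublasSgemm wrapper, despite the name) computes the row-major
// product C[M][N] = alpha * A[M][K] * B[K][N] + beta * C by exploiting
// (A * B)^T = B^T * A^T: a row-major matrix is the same buffer read column-major as
// its transpose, so passing the operands swapped as
//   cublasSgemm(handle, opB, opA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N);
// makes column-major cuBLAS produce C^T, which is exactly C laid out row-major.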
CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(handle, cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } template @@ -123,8 +134,10 @@ SaberStatus SaberGru::dispatch(\ Shape shape_whr({1, batch_size, 1, _hidden_size}); utils::try_expand_tensor(_temp_whr,shape_whr); - _gemm_wx(seq_sum, 3 * _hidden_size, _word_size, 1.f, x_data, 0.f, weights_i2h, - static_cast(_temp_wx.mutable_data()), _ctx->get_compute_stream()); +// _gemm_wx(seq_sum, 3 * _hidden_size, _word_size, 1.f, x_data, 0.f, weights_i2h, +// static_cast(_temp_wx.mutable_data()), _ctx->get_compute_stream()); + + cudnn_gemm(_handle,false,false,seq_sum, 3 * _hidden_size, _word_size,1.f, x_data,weights_i2h,0.f,static_cast(_temp_wx.mutable_data())); const OpDataType* b_r = weights_bias + r_offset * _hidden_size; const OpDataType* b_z = weights_bias + z_offset * _hidden_size; @@ -175,22 +188,25 @@ SaberStatus SaberGru::dispatch(\ OpDataType* w_h_r = static_cast(_temp_wh.mutable_data()) + 0 * _hidden_size; OpDataType* w_h_z = static_cast(_temp_wh.mutable_data()) + 1 * _hidden_size; - _gemm_wh_2(emit_word_length, 2 * _hidden_size, _hidden_size, 1.f, hidden_in, 0.f, - weights_h2h + _hidden_size * _hidden_size, static_cast( _temp_wh.mutable_data()), - _ctx->get_compute_stream()); +// _gemm_wh_2(emit_word_length, 2 * _hidden_size, _hidden_size, 1.f, hidden_in, 0.f, +// weights_h2h + _hidden_size * _hidden_size, static_cast( _temp_wh.mutable_data()), +// _ctx->get_compute_stream()); + cudnn_gemm(_handle,false,false,emit_word_length, 2 * _hidden_size, _hidden_size, 1.f, hidden_in, + weights_h2h + _hidden_size * _hidden_size,0.f, static_cast( _temp_wh.mutable_data())); const OpDataType *w_o = weights_h2h; const int block_dim = 512; - const int grid_dim = round_up(emit_word_length * _hidden_size, block_dim); + const int grid_dim = utils::div_up(emit_word_length * _hidden_size, block_dim); cal_reset_kernel << < grid_dim, block_dim, 0 , _ctx->get_compute_stream() >> > ( w_x_r, w_h_r , b_r, _hidden_size, emit_word_length, hidden_out, hidden_in, param.gate_activity); - _gemm_wh_o(emit_word_length, _hidden_size, _hidden_size, 1.f, hidden_out, 0.f, w_o, - static_cast(_temp_whr.mutable_data()), _ctx->get_compute_stream()); +// _gemm_wh_o(emit_word_length, _hidden_size, _hidden_size, 1.f, hidden_out, 0.f, w_o, +// static_cast(_temp_whr.mutable_data()), _ctx->get_compute_stream()); + cudnn_gemm(_handle,false,false,emit_word_length, _hidden_size, _hidden_size,1.f,hidden_out, w_o,0.f,static_cast(_temp_whr.mutable_data())); cal_final_kernel << < grid_dim, block_dim, 0 , _ctx->get_compute_stream() >> > ( @@ -201,14 +217,17 @@ SaberStatus SaberGru::dispatch(\ OpDataType* w_h_z = static_cast(_temp_wh.mutable_data()) + z_offset * _hidden_size; OpDataType* w_h_o = static_cast(_temp_wh.mutable_data()) + o_offset * _hidden_size; - _gemm_wh_2(emit_word_length, 3 * _hidden_size, _hidden_size, 1.f, hidden_in, 0.f, - static_cast(_temp_weights_h2h.data()), static_cast( _temp_wh.mutable_data()), - _ctx->get_compute_stream()); +// _gemm_wh_2(emit_word_length, 3 * _hidden_size, _hidden_size, 1.f, hidden_in, 0.f, +// static_cast(_temp_weights_h2h.data()), static_cast( _temp_wh.mutable_data()), +// _ctx->get_compute_stream()); + + cudnn_gemm(_handle,false,false,emit_word_length, 3 * _hidden_size, _hidden_size, 1.f,hidden_in, + static_cast(_temp_weights_h2h.data()),0.f,static_cast( _temp_wh.mutable_data())); const OpDataType *w_o = weights_h2h; const int block_dim = 512; - const int grid_dim = round_up(emit_word_length * _hidden_size, block_dim); + 
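// Grid-size fix in this hunk: the old helper round_up(k, c) = ((k + c - 1) / c) * c
// returns k rounded up to a multiple of c, which is an element count rather than a
// block count. Assuming utils::div_up is the usual ceiling division, the new code
// launches div_up(1000, 512) = 2 blocks where round_up would have launched 1024, for
// emit_word_length * _hidden_size = 1000 and block_dim = 512.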
const int grid_dim = utils::div_up(emit_word_length * _hidden_size, block_dim); cal_cudnn_kernel<< < grid_dim, block_dim, 0 , _ctx->get_compute_stream() >> >( w_x_r, w_x_z, w_x_o, w_h_r, w_h_z, w_h_o,b_r, b_z, b_o,_hidden_size, emit_word_length, hidden_out, hidden_in); diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_lstm.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_lstm.cu index 4dd591672..f9a84d202 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_lstm.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_lstm.cu @@ -276,7 +276,7 @@ SaberLstm::dispatch_batch( const int block_dim=512; - const int grid_dim=round_up(emit_word_length*_aligned_hidden_size,block_dim); + const int grid_dim=utils::div_up(emit_word_length*_aligned_hidden_size,block_dim); if (param.gate_activity == Active_sigmoid && param.cell_activity == Active_tanh diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_lstmp.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_lstmp.cu new file mode 100644 index 000000000..d5589ea26 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_lstmp.cu @@ -0,0 +1,161 @@ +#include "saber/funcs/impl/cuda/saber_lstmp.h" +#include "saber/core/tensor_op.h" +#include "cuda_inline_activation.h" +#include "cuda_utils.h" +namespace anakin { + +namespace saber { + +static void cudnn_gemm(cublasHandle_t handle, const bool TransA, + const bool TransB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (!TransA/* == CblasNoTrans*/) ? K : M; + int ldb = (!TransB/* == CblasNoTrans*/) ? N : K; + cublasOperation_t cuTransA = + (!TransA/* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (!TransB/* == CblasNoTrans*/) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(handle, cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +template +__global__ void kernel_lstm_with_peephole( + const Dtype* w_x, const Dtype* b_i, const Dtype* b_f, const Dtype* b_c, const Dtype* b_o, + const Dtype* w_ci, const Dtype* w_cf, const Dtype* w_co, Dtype* cell, const int hidden_size, + const int batch_size, + Dtype* output) { + + + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int batch_id = thread_id / hidden_size; + const int tid = thread_id % hidden_size; + + if (tid < hidden_size && batch_id < batch_size) { + const int emit_wx_offset = batch_id * hidden_size * 4; + const Dtype* w_x_i = w_x + emit_wx_offset; + const Dtype* w_x_f = w_x_i + hidden_size ; + const Dtype* w_x_c = w_x_f + hidden_size; + const Dtype* w_x_o = w_x_c + hidden_size; + Dtype* gate_h_p = output + batch_id * hidden_size; + Dtype* gate_c_p = cell + batch_id * hidden_size; + if(first_iter){ + const Dtype gate_i = Sigmoid(w_x_i[tid] + b_i[tid]); + const Dtype gate_f = Sigmoid(w_x_f[tid] + b_f[tid]); + + const Dtype gate_c_s = Tanh(w_x_c[tid] + b_c[tid]); + const Dtype gate_c = gate_i * gate_c_s; + const Dtype gate_o = Sigmoid(w_x_o[tid] + b_o[tid] + gate_c * w_co[tid]); + gate_c_p[tid] = gate_c; + gate_h_p[tid] = gate_o * Tanh(gate_c); + }else{ + const Dtype c_1 = gate_c_p[tid]; + const Dtype gate_i = Sigmoid(w_x_i[tid] + b_i[tid] + w_ci[tid] * c_1); + const Dtype gate_f = Sigmoid(w_x_f[tid] + b_f[tid] + w_cf[tid] * c_1); + + const Dtype gate_c_s = Tanh(w_x_c[tid] + b_c[tid]); + const Dtype gate_c = gate_f * c_1 + gate_i * gate_c_s; + const Dtype gate_o = Sigmoid(w_x_o[tid] + b_o[tid] + gate_c * w_co[tid]); + gate_c_p[tid] = gate_c; + gate_h_p[tid] = gate_o * Tanh(gate_c); + } + } +} + +template +void cal_lstm_batch(int emit_word_id_size, Dtype* temp_wx, + const Dtype* weight_peephole, + Dtype* hout, Dtype* inner_cell, const Dtype* b_i_in, const Dtype* b_f_in, const Dtype* b_c_in, + const Dtype* b_o_in, int hidden_size,cudaStream_t cuda_stream){ + const int block_dim=256; + const int grid_dim=utils::div_up(emit_word_id_size*hidden_size,block_dim); + const Dtype* wc_i=weight_peephole; + const Dtype* wc_f=weight_peephole+hidden_size; + const Dtype* wc_o=weight_peephole+2*hidden_size; + kernel_lstm_with_peephole<<>>(temp_wx,b_i_in,b_f_in,b_c_in,b_o_in,wc_i,wc_f,wc_o,inner_cell,hidden_size,emit_word_id_size,hout); + +}; + +template +__global__ void kernel_vTanh(Dtype* data,int count){ + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if(thread_id +static inline void vTanh(Dtype* data,int count,cudaStream_t cuda_stream){ + kernel_vTanh<<>>(data,count); +} + + +template<> +SaberStatus +SaberLstmp::dispatch( + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param) { + auto offset_vec = inputs[0]->get_seq_offset(); + CHECK_EQ(offset_vec.size(), 1); + auto offset = offset_vec[0]; + CHECK_EQ(offset.size(), 2); + const int skip_num = param.skip_num; + CHECK_GT(skip_num, 1); + int word_num = inputs[0]->num(); + int word_dim = inputs[0]->channel(); + int iter_num = utils::div_up(word_num, skip_num); + + utils::try_expand_tensor(_wx_tensor,word_num*4*_inner_hidden_dim); + utils::try_expand_tensor(_temp_hidden_tensor,skip_num*_inner_hidden_dim); + utils::try_expand_tensor(_temp_cell_tensor,skip_num*_inner_hidden_dim); + + float* wx_ptr = static_cast(_wx_tensor.mutable_data()); + const float* x_ptr = static_cast(inputs[0]->data()); + const float* weights_x_ptr 
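// Cell update implemented by kernel_lstm_with_peephole above, written out per element:
//   i     = sigmoid(Wx_i x + b_i + w_ci * c_prev)
//   f     = sigmoid(Wx_f x + b_f + w_cf * c_prev)
//   c_hat = tanh(Wx_c x + b_c)
//   c     = f * c_prev + i * c_hat
//   o     = sigmoid(Wx_o x + b_o + w_co * c)
//   h     = o * tanh(c)            (all products element-wise)
// On the first iteration c_prev is zero, so the peephole terms on i and f drop out and
// c reduces to i * c_hat, which is the first_iter branch. The surrounding LSTMP
// dispatch precomputes Wx * x for the whole sequence with one gemm, then for each
// chunk of skip_num words adds the recurrent term, runs this kernel, projects the
// hidden state through the projection weights, and applies tanh to the projection.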
= static_cast(param.weight()->data()); + const float* weights_h_ptr = weights_x_ptr + word_dim * _inner_hidden_dim * 4; + const float* weights_project_ptr = weights_h_ptr + _output_hidden_dim * _inner_hidden_dim * 4; + const float* weights_bias_ptr = static_cast(param.bias()->data()); + const float* weights_bias_i_ptr = weights_bias_ptr; + const float* weights_bias_f_ptr = weights_bias_i_ptr + _inner_hidden_dim; + const float* weights_bias_c_ptr = weights_bias_f_ptr + _inner_hidden_dim; + const float* weights_bias_o_ptr = weights_bias_c_ptr + _inner_hidden_dim; + const float* weights_peephole_ptr = weights_bias_ptr + _inner_hidden_dim * 4; + float* output_ptr = static_cast(outputs[0]->mutable_data()); + float* temp_hidden_out = static_cast(_temp_hidden_tensor.mutable_data()); + float* temp_cell_out = static_cast(_temp_cell_tensor.mutable_data()); + + cudaStream_t stream=_ctx->get_compute_stream(); + cudnn_gemm(_handle,false, false, word_num, 4*_inner_hidden_dim, word_dim, 1.f, x_ptr, weights_x_ptr, 0.f, wx_ptr); + + for (int i = 0; i < iter_num; i++) { + const int run_batch_dim=(i==(iter_num-1))?(word_num-skip_num*i):skip_num; + float* wx_iter = wx_ptr + i * skip_num * 4 * _inner_hidden_dim; + if(i>=1){ + float* hidden_in = output_ptr + (i - 1) * skip_num * _output_hidden_dim; + cudnn_gemm(_handle,false, false, run_batch_dim, 4*_inner_hidden_dim, _output_hidden_dim, 1.f, hidden_in, weights_h_ptr, + 1.f, wx_iter); + + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, temp_hidden_out, temp_cell_out,weights_bias_i_ptr,weights_bias_f_ptr,weights_bias_c_ptr,weights_bias_o_ptr,_inner_hidden_dim,stream); + + }else{ + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, temp_hidden_out, temp_cell_out,weights_bias_i_ptr,weights_bias_f_ptr,weights_bias_c_ptr,weights_bias_o_ptr,_inner_hidden_dim,stream); + } + + float* hidden_out = output_ptr + i * skip_num * _output_hidden_dim; + cudnn_gemm(_handle,false,false,run_batch_dim,_output_hidden_dim,_inner_hidden_dim,1.f,temp_hidden_out,weights_project_ptr,0.f,hidden_out); + vTanh(hidden_out,run_batch_dim*_output_hidden_dim,stream); + } + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; + +}; + + +DEFINE_OP_TEMPLATE(SaberLstmp, LstmParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberLstmp, LstmParam, NV, AK_INT8); +} +} + diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_match_matrix.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_match_matrix.cu index 283d49589..b777fd02b 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_match_matrix.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_match_matrix.cu @@ -67,7 +67,7 @@ SaberStatus SaberMatchMatrix::dispatch( \ int len_l = offset_l[1] - offset_l[0]; int len_r = offset_r[offset_r.size() - 1]; - + int batch = offset_l.size() - 1; const OpDataType *input_l = (const OpDataType*)inputs[0]->data(); const OpDataType *input_r = (const OpDataType*)inputs[1]->data(); @@ -76,18 +76,39 @@ SaberStatus SaberMatchMatrix::dispatch( \ OpDataType* input_l_transform_reorganize = (OpDataType*)_input_l_transform_reorganize.mutable_data(); OpDataType* output_tmp = (OpDataType*)_output_tmp.mutable_data(); OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); - _gemm_l_transform.init(true, true, dim_t * dim_in, len_l, dim_in, *(this->_ctx)); - _gemm_l_transform.dispatch(1.0f, 0.f, weight_data, input_l, input_l_transform); - for (int i = 0; i < dim_t; i++) { - int offset = i * dim_in * len_l; - gpu_transpose(_handle, - input_l_transform + offset, - dim_in, - len_l, - 
input_l_transform_reorganize + offset); + if (param.is_l_same) { + _gemm_l_transform.init(true, true, dim_t * dim_in, len_l, dim_in, *(this->_ctx)); + _gemm_l_transform.dispatch(1.0f, 0.f, weight_data, input_l, input_l_transform); + for (int i = 0; i < dim_t; i++) { + int offset = i * dim_in * len_l; + gpu_transpose(_handle, + input_l_transform + offset, + dim_in, + len_l, + input_l_transform_reorganize + offset); + } + _gemm_r_transform.init(false, true, len_r, dim_t * len_l, dim_in, *(this->_ctx)); + _gemm_r_transform.dispatch(1.0f, 0.f, input_r, input_l_transform_reorganize, output_tmp); + } else { + _gemm_l_transform.init(true, true, dim_t * dim_in, len_l, dim_in, *(this->_ctx)); + + for (int i = 0; i < batch; i++) { + auto tmp_input_l = input_l + i * len_l * dim_in; + auto tmp_input_r = input_r + offset_r[i] * dim_in; + + _gemm_l_transform.dispatch(1.0f, 0.f, weight_data, tmp_input_l, input_l_transform); + for (int j = 0; j < dim_t; j++) { + int offset = j * dim_in * len_l; + gpu_transpose(_handle, + input_l_transform + offset, + dim_in, + len_l, + input_l_transform_reorganize + offset); + } + _gemm_r_transform.init(false, true, offset_r[i+1] - offset_r[i], dim_t * len_l, dim_in, *(this->_ctx)); + _gemm_r_transform.dispatch(1.0f, 0.f, tmp_input_r, input_l_transform_reorganize, output_tmp + offset_r[i]*dim_t * len_l); + } } - _gemm_r_transform.init(false, true, len_r, dim_t * len_l, dim_in, *(this->_ctx)); - _gemm_r_transform.dispatch(1.0f, 0.f, input_r, input_l_transform_reorganize, output_tmp); int max_len_r = 0; for (int i = 0; i < offset_r.size() - 1; i++) { int cur_len = offset_r[i+1] - offset_r[i]; diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_mean.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_mean.cu new file mode 100644 index 000000000..41e4c3281 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_mean.cu @@ -0,0 +1,86 @@ +#include "saber/funcs/impl/cuda/saber_mean.h" + +namespace anakin { +namespace saber { + +template +__global__ void mean_kernel(const dtype* input, dtype* output, const int count) { + + int tid = threadIdx.x; + int n_id = threadIdx.x + blockIdx.x * blockDim.x; + int thread_num = blockDim.x * gridDim.x; + extern __shared__ dtype sdata[]; + if (n_id==0) output[0] = (dtype)0.0; + dtype sum = (dtype)0.0; + for (int thread = n_id; thread < count; thread += thread_num) { + sum += input[thread]; + } + sdata[tid] = sum; + __syncthreads(); + + int powOf2 = blockDim.x; + if (powOf2 & (powOf2-1)) { + // thread block is not pow of 2. + while (powOf2 & (powOf2-1)) { + powOf2 &= (powOf2-1); + } + // find a num which is pow of 2. + if (tid >= powOf2) { + sdata[tid - powOf2] += sdata[tid]; + } + __syncthreads(); + } + for (unsigned int i = powOf2 >> 1; i > 0; i>>=1) { + if ( tid < i) { + sdata[tid] += sdata[tid + i]; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + sdata[0] /= count; + atomicAdd(&output[0], sdata[0]); + } +} + +//compute a mean of input tensor's all elements. 
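// Reduction strategy in mean_kernel above: each thread first accumulates a
// grid-strided partial sum into shared memory. If blockDim.x is not a power of two,
// the tail above the largest power of two is folded in first -- e.g. for
// blockDim.x = 96 the highest set bit gives powOf2 = 64, so threads 64..95 add into
// slots 0..31 -- and the usual tree reduction then halves the active range
// 64 -> 32 -> ... -> 1. Thread 0 finally divides the block sum by count and
// atomicAdd-s it into output[0], so the partial means of all blocks accumulate into
// a single scalar.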
+template +SaberStatus SaberMean::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + MeanParam& param) { + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + const OpDataType* input_ptr = (const OpDataType*)inputs[0]->data(); + OpDataType* output_ptr = (OpDataType*)outputs[0]->mutable_data(); + int count = inputs[0]->valid_size(); + int thread_num; + int grid; + unsigned int blockSize; + if (count < CUDA_NUM_THREADS) { + thread_num = count; + grid = 1; + blockSize = count; + } else { + thread_num = CUDA_NUM_THREADS; + if (CUDA_GET_BLOCKS(count) >= 128) + grid = 64; + else + grid = CUDA_GET_BLOCKS(count); + blockSize = CUDA_NUM_THREADS; + } + + mean_kernel<<>>( + input_ptr, + output_ptr, + count + ); + + CUDA_POST_KERNEL_CHECK; + + return SaberSuccess; +} + +template class SaberMean; +DEFINE_OP_TEMPLATE(SaberMean, MeanParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberMean, MeanParam, NV, AK_INT8); + +} // namespace saber. +} // namespace anakin. \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_normalize.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_normalize.cu index f627780c3..3a1178002 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_normalize.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_normalize.cu @@ -5,6 +5,105 @@ namespace anakin{ namespace saber{ +template +__global__ void group_normalize_kernel(const dtype* in_data, const dtype* scale, + const dtype* bias, int n, int c, int h, int w, int group, + int group_size, float eps, dtype* out_data, dtype* out_mean, + dtype* out_var){ + + __shared__ dtype block_sums[thread_number]; + __shared__ dtype block_squares[thread_number]; + int group_index = blockIdx.x; + int thread_index = threadIdx.x; + block_squares[thread_index] = 0; + block_sums[thread_index] = 0; + __syncthreads(); + + int batch_index = group_index / group; + int inner_group_index = group_index % group; + int real_channel = (c - inner_group_index * group_size) >= group_size ? 
+ group_size : c - inner_group_index * group_size; + int compute_size = real_channel * w * h; + int group_start_ind = inner_group_index * group_size + batch_index * c; + int group_start_num = group_start_ind * h * w; + for (int i = thread_index; i < compute_size; i += thread_number){ + block_sums[thread_index] += in_data[group_start_num + i]; + block_squares[thread_index] += in_data[group_start_num + i] * in_data[group_start_num + i]; + } + __syncthreads(); + //reduce + int activate = thread_number / 2; + //this assume thread number be 2^n + while (activate >= 64){ + if (thread_index < activate){ + block_sums[thread_index] += block_sums[thread_index + activate]; + block_squares[thread_index] += block_squares[thread_index + activate]; + } + __syncthreads(); + activate >>= 1; + } + + if (activate >= 32){ + if (thread_index < 32){ + block_sums[thread_index] += block_sums[thread_index + 32]; + block_squares[thread_index] += block_squares[thread_index + 32]; + } + } + if (activate >= 16){ + if (thread_index < 16){ + block_sums[thread_index] += block_sums[thread_index + 16]; + block_squares[thread_index] += block_squares[thread_index + 16]; + } + } + if (activate >= 8){ + if (thread_index < 8){ + block_sums[thread_index] += block_sums[thread_index + 8]; + block_squares[thread_index] += block_squares[thread_index + 8]; + } + } + if (activate >= 4){ + if (thread_index < 4){ + block_sums[thread_index] += block_sums[thread_index + 4]; + block_squares[thread_index] += block_squares[thread_index + 4]; + } + } + if (activate >= 2){ + if (thread_index < 2){ + block_sums[thread_index] += block_sums[thread_index + 2]; + block_squares[thread_index] += block_squares[thread_index + 2]; + } + } + if (activate >= 1){ + if (thread_index < 1){ + block_sums[thread_index] += block_sums[thread_index + 1]; + block_squares[thread_index] += block_squares[thread_index + 1]; + } + } + + dtype group_mean = block_sums[0] / compute_size; + dtype group_var = block_squares[0] / compute_size - group_mean * group_mean; + dtype group_var_inv = 1 / sqrt(group_var + eps); + for (int i = thread_index; i < compute_size; i += thread_number){ + int c_index = i / (h * w); + dtype dest_val = (in_data[group_start_num + i] - group_mean) * group_var_inv; + if (scale){ + dest_val *= scale[group_start_ind + c_index]; + } + if (bias){ + dest_val *= bias[group_start_ind + c_index]; + } + out_data[group_start_num + i] = dest_val; + } + if (out_mean){ + out_mean[group_index] = group_mean; + } + if (out_var){ + out_var[group_index] = group_var; + } + +} + + template __global__ void normalize_kernel_no_across_spatial(const int size_in_channel, const int n,\ const int channels,const Dtype* scale, const Dtype* bottom_data, Dtype* top_data, const float eps, const int p){ @@ -233,6 +332,40 @@ SaberStatus SaberNormalize::dispatch(\ cudaStream_t stream = this->_ctx->get_compute_stream(); const float* src = static_cast(inputs[0]->data()); float* dst = static_cast(outputs[0]->mutable_data()); + + const float eps = param.eps; + int n = inputs[0] -> num(); + int c = inputs[0] -> channel(); + int h = inputs[0] -> height(); + int w = inputs[0] -> width(); + + if (param.group > 0){ + float* scale = nullptr; + float* bias = nullptr; + float* out_mean = nullptr; + float* out_var = nullptr; + int group_size = (c - 1) / param.group + 1; + if (param.has_scale){ + scale = static_cast(param.scale->data()); + } + if (param.has_bias){ + bias = static_cast(param.bias->data()); + } + if (outputs.size() > 1){ + out_mean = static_cast(outputs[1]->data()); + } + if 
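// group_normalize_kernel above assigns one block per (image, group): it accumulates
// sum(x) and sum(x^2) over that group's channels * H * W elements, takes
// mean = sum / n and var = sum_sq / n - mean^2, and writes
// (x - mean) / sqrt(var + eps), optionally modulated by the per-channel scale/bias.
// The rounding group_size = (c - 1) / group + 1 means the last group can be narrower:
// with c = 10 channels and group = 4, group_size = 3 and the final group covers only
// 10 - 3 * 3 = 1 channel, which is what the real_channel clamp accounts for.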
(outputs.size() > 2){ + out_var = static_cast(outputs[2]->data()); + } + + int blocks = n * param.group; + group_normalize_kernel + <<>> + (src, scale, bias, n, c, h, w, param.group, group_size, eps, + dst, out_mean, out_var); + return SaberSuccess; + + } if (!param.across_spatial) { int num=inputs[0]->num(); int size_in_channel = inputs[0]->width() * inputs[0]->height(); @@ -292,7 +425,6 @@ SaberStatus SaberNormalize::dispatch(\ #else //compute norm and result individually //! compute square root - const float eps = param.eps; float pw = 0.5f; if (param.p == 1) { pw = 1.f; diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_one_hot.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_one_hot.cu new file mode 100644 index 000000000..ea4be9dec --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_one_hot.cu @@ -0,0 +1,57 @@ + +#include "saber/funcs/impl/cuda/saber_one_hot.h" + +namespace anakin { + +namespace saber { + +template <> +SaberStatus SaberOneHot::create( + const std::vector *>& inputs, + std::vector *>& outputs, + OneHotParam& param, Context& ctx) { + return SaberSuccess; +} + +template <> +SaberStatus SaberOneHot::init( + const std::vector *>& inputs, + std::vector *>& outputs, + OneHotParam& param, Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} +__global__ void fill_one_hot_kernel(const float* in_ptr, + float* out_ptr, const int dim, const int depth) { + + CUDA_KERNEL_LOOP(tid, dim) { + out_ptr[tid * depth + (int)in_ptr[tid]] = 1.0; + } +} +template <> +SaberStatus SaberOneHot::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + OneHotParam& param) { + + auto stream = _ctx->get_compute_stream(); + const float* input_ptr = (const float*)inputs[0]->data(); + float* output_ptr = (float*)outputs[0]->mutable_data(); + int _depth = param.depth; + int dims = inputs[0]->valid_size(); + cudaMemsetAsync(output_ptr, + 0, + outputs[0]->valid_size() * outputs[0]->get_dtype_size(), + stream); + fill_one_hot_kernel<<>>( + input_ptr, output_ptr, dims, _depth); + return SaberSuccess; +} + +template class SaberOneHot; +DEFINE_OP_TEMPLATE(SaberOneHot, OneHotParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberOneHot, OneHotParam, NV, AK_INT8); + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_pixel_shuffle.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_pixel_shuffle.cu new file mode 100644 index 000000000..e8766d8d7 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_pixel_shuffle.cu @@ -0,0 +1,67 @@ +#include "saber/funcs/impl/cuda/saber_pixel_shuffle.h" + +namespace anakin{ +namespace saber{ + + +template +__global__ void ker_permute_fwd(Dtype * out_data, const int num_axes,\ + const int count, const int * permute_order,\ + const int * new_steps, const int * old_steps,\ + const Dtype* in_data) +{ + CUDA_KERNEL_LOOP(tid, count){ + int org_idx = tid; + int in_idx = 0; + #pragma unroll + for (int i = 0; i < num_axes; i++) { + int order = permute_order[i]; + int new_step = new_steps[i]; + int old_step = old_steps[order]; + in_idx += (org_idx / new_step) * old_step; + org_idx %= new_step; + } + out_data[tid] = in_data[in_idx]; + } +} + + + +template <> +SaberStatus SaberPixelShuffle::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m){ + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + const float* in_data = static_cast(inputs[0]->data()); + float* out_data = static_cast(outputs[0]->mutable_data()); + + const int* permute_order = 
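// fill_one_hot_kernel above relies on the output being zeroed first (the
// cudaMemsetAsync in dispatch); each thread then scatters a single 1.0 at
// out_ptr[tid * depth + (int)in_ptr[tid]], i.e. the label value stored as float picks
// the column. For depth = 5 and inputs {2, 0, 3} the three output rows become
// {0,0,1,0,0}, {1,0,0,0,0} and {0,0,0,1,0}.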
static_cast(_permute_order.data()); + const int* new_steps = static_cast(_out_step.data()); + const int* old_steps = static_cast(_in_step.data()); + + int count = outputs[0]->valid_size(); + + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()){ + ker_permute_fwd\ + <<>>(\ + out_data, _axes, count, permute_order, \ + new_steps, old_steps, in_data); + } else { + ker_permute_fwd\ + <<>>(\ + out_data, _axes, count, permute_order, \ + new_steps, old_steps, in_data); + } + +} + + + + + +} + +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_pooling.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_pooling.cu new file mode 100644 index 000000000..720df4b25 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_pooling.cu @@ -0,0 +1,234 @@ + +#include "saber/funcs/impl/cuda/saber_pooling.h" +#include "saber/funcs/impl/cuda/vender_pooling.h" +#include "saber/funcs/calibrate.h" +#include "saber/core/tensor_op.h" +#include + +namespace anakin { +namespace saber { + +template <> +SaberStatus SaberPooling::create( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, Context &ctx) { + _impl->create(inputs, outputs, param, ctx); + return SaberSuccess; +} + +template <> +SaberStatus SaberPooling::init( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, Context &ctx) { + + this->_ctx = &ctx; + _impl = new VenderPooling; + _impl->init(inputs, outputs, param, ctx); + return create(inputs, outputs, param, ctx); +} +template <> +SaberStatus SaberPooling::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m) { + _impl->dispatch(inputs, outputs, param); + return SaberSuccess; +} + +union Reg{ + unsigned int idata; + char b[4]; +}; + +__global__ void pool_s8s8_max_c4(const int nthreads, + const void* const in_data, const int channels, + const int height, const int width, const int out_height, + const int out_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + void* const out_data, float place_holder, float trans_scale) { + + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % out_width; + const int ph = (index / out_width) % out_height; + const int c = (index / out_width / out_height) % channels; + const int n = index / out_width / out_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + unsigned int maxval = 0x80808080; // this is magic + const unsigned int* in_slice = + (const unsigned int*)(in_data); + int offset = (n * channels + c) * height * width; + in_slice += offset; + unsigned int *out = (unsigned int*)out_data; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + unsigned int read_in = in_slice[h * width + w]; + asm volatile (" vmax4.s32.s32.s32 %0, %1, %2, %0;" + : "=r"(maxval) : "r"(maxval), "r"(read_in)); + } + } + + out[index] = maxval; + } +} +__global__ void pool_s8s8_avrg_c4(const int nthreads, + const void* const in_data, const int channels, + const int height, const int width, const int out_height, + const int out_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + void* const out_data, float avg_1, float trans_scale) { + + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % 
out_width; + const int ph = (index / out_width) % out_height; + const int c = (index / out_width / out_height) % channels; + const int n = index / out_width / out_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + Reg reg; + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + const unsigned int* in_slice = + (const unsigned int*)(in_data); + int offset = (n * channels + c) * height * width; + in_slice += offset; + unsigned int *out = (unsigned int*)out_data; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + reg.idata = in_slice[h * width + w]; + sum0 += reg.b[0]; + sum1 += reg.b[1]; + sum2 += reg.b[2]; + sum3 += reg.b[3]; + } + } + float sum0f = (float)sum0 * avg_1; + float sum1f = (float)sum1 * avg_1; + float sum2f = (float)sum2 * avg_1; + float sum3f = (float)sum3 * avg_1; + reg.b[0] = static_cast(sum0f); + reg.b[1] = static_cast(sum1f); + reg.b[2] = static_cast(sum2f); + reg.b[3] = static_cast(sum3f); +// printf("%x\n", reg.idata); + out[index] = reg.idata; + } +} + +template <> +SaberStatus SaberPooling::create( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, Context &ctx) { + if (inputs[0]->get_dtype() == AK_FLOAT) { + Shape in_shape = inputs[0]->valid_shape(); + _int8_input.re_alloc(in_shape, AK_INT8); + _int8_input.set_scale(inputs[0]->get_scale()); + _int8_input.set_layout(Layout_NCHW_C4); + } + if (outputs[0]->get_dtype() == AK_FLOAT) { + Shape out_shape = outputs[0]->valid_shape(); + _int8_output.re_alloc(out_shape, AK_INT8); + _int8_output.set_scale(outputs[0]->get_scale()); + _int8_output.set_layout(Layout_NCHW_C4); + } + return SaberSuccess; +} + +template <> +SaberStatus SaberPooling::init( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, Context &ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberPooling::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m) { + + CHECK_GE(inputs[0]->get_scale().size(), 1) << "not found scale factor!!!"; + CHECK_GE(outputs[0]->get_scale().size(), 1) << "not found scale factor!!!"; + CHECK_EQ(inputs[0]->channel() % 4, 0) << "not a multipler of 4"; + + float in_scale = inputs[0]->get_scale()[0]; + float out_scale = outputs[0]->get_scale()[0]; + int count = outputs[0]->valid_size() / 4; + int channels = inputs[0]->channel() / 4; + int height = inputs[0]->height(); + int width = inputs[0]->width(); + int out_height = outputs[0]->height(); + int out_width = outputs[0]->width(); + int stride_h = param.stride_h; + int stride_w = param.stride_w; + int pad_h = param.pad_h; + int pad_w = param.pad_w; + int window_h = param.window_h; + int window_w = param.window_w; + auto stream = _ctx->get_compute_stream(); + + const void* in_data = nullptr; + void* out_data = nullptr; + + if (inputs[0]->get_dtype() == AK_FLOAT) { + conv_calibrate_fp32_int8_c4(_int8_input, *inputs[0], in_scale, *(this->_ctx)); + in_data = _int8_input.data(); + } else { + in_data = inputs[0]->data(); + } + + if (outputs[0]->get_dtype() == AK_FLOAT) { + out_data = _int8_output.mutable_data(); + } else { + out_data = outputs[0]->mutable_data(); + } + + float kernel_size = window_h * window_w; + kernel_size = 1.f / kernel_size; + switch (param.pooling_type) { + case Pooling_max: + pool_s8s8_max_c4 << < 
CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, + 0, stream >> > (count, + in_data, channels, height, width, + out_height, out_width, window_h, window_w, + stride_h, stride_w, pad_h, pad_w, out_data, + kernel_size, in_scale / out_scale); + break; + case Pooling_average_include_padding: + pool_s8s8_avrg_c4 << < CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, + 0, stream >> > (count, + in_data, channels, height, width, + out_height, out_width, window_h, window_w, + stride_h, stride_w, pad_h, pad_w, out_data, + kernel_size, in_scale / out_scale); + break; + default: + LOG(FATAL) << "not support yet!!!" << param.pooling_type; + break; + } + if (outputs[0]->get_dtype() == AK_FLOAT) { + calibrate_int8_c4_fp32(*outputs[0], _int8_output, out_scale, *_ctx); + } + return SaberSuccess; +} + +DEFINE_OP_TEMPLATE(SaberPooling, PoolingParam, NV, AK_HALF); + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_ps_roi_pooling.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_ps_roi_pooling.cu new file mode 100644 index 000000000..79712ed5b --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_ps_roi_pooling.cu @@ -0,0 +1,295 @@ +#include "saber/funcs/impl/cuda/saber_ps_roi_pooling.h" +#include "saber/core/tensor_op.h" +#include +#include + +namespace anakin { + +namespace saber { + +/* + * crop rois and resize to [crop_height, crop_width] from in_data + * in_data shape: [pooled_h * pooled_w * c, im_h, im_w] + * rois shape: [num_rois, 4] + * out_data: [pooled_h * pooled_w * c, num_rois, crop_height, crop_width] + */ +template +__global__ void crop_and_resize_kernel( + const Dtype* in_data, + const Dtype* rois, + Dtype* out_data, + int num_rois, + int im_h, int im_w, + int crop_height, int crop_width, + int count, + int method, + float extra_value){ + + CUDA_KERNEL_LOOP(index, count){ + int temp_ind = index; + int cur_w = temp_ind % crop_width; + temp_ind /= crop_width; + int cur_h = temp_ind % crop_height; + temp_ind /= crop_height; + int cur_n = temp_ind % num_rois; + int cur_c = temp_ind / num_rois; + + const Dtype* rois_data = rois + cur_n * 4; + + float y1 = rois_data[0] * (im_h - 1); + float x1 = rois_data[1] * (im_w - 1); + float y2 = rois_data[2] * (im_h - 1); + float x2 = rois_data[3] * (im_w - 1); + + float height_scale = crop_height > 1 ? (y2 - y1)/(crop_height - 1) : 0; + float width_scale = crop_width > 1 ? (x2 - x1)/(crop_width - 1) : 0; + + float in_y = crop_height > 1 ? y1 + cur_h * height_scale : (y1 + y2)/2; + + if ( in_y < 0 || in_y > im_h - 1){ + out_data[index] = extra_value; + continue; + } + + float in_x = crop_width > 1 ? 
x1 + cur_w * width_scale : (x1 + x2)/2; + if ( in_x < 0 || in_x > im_w - 1){ + out_data[index] = extra_value; + continue; + } + + const Dtype* im_data = in_data + cur_c * im_h * im_w; + + //resize method 0 means bilinear + if (method == 0){ + int top_y = floor(in_y); + int bot_y = ceil(in_y); + float y_lerp = in_y - top_y; + + int left_x = floor(in_x); + int right_x = ceil(in_x); + float x_lerp = in_x - left_x; + + Dtype top_left = im_data[top_y*im_w + left_x]; + Dtype top_right = im_data[top_y*im_w + right_x]; + Dtype bot_left = im_data[bot_y*im_w + left_x]; + Dtype bot_right = im_data[bot_y*im_w + right_x]; + float top = top_left + (top_right - top_left) * y_lerp; + float bot = bot_left + (bot_right - bot_left) * y_lerp; + out_data[index] = top + (bot - top) * x_lerp; + } else { + //else method means nearest + int closest_x = round(in_x); + int closest_y = round(in_y); + out_data[index] = im_data[closest_y*im_w + closest_x]; + } + } + +} + +template +__global__ void crop_global_pooling_kernel(const Dtype* in_data, Dtype* out_data, + int pooled_size, int channel, int num_rois, int crop_height, int crop_width, + int count){ + CUDA_KERNEL_LOOP(index, count){ + int cur_n = index / channel; + int cur_c = index % channel; + int crop_size = crop_height * crop_width; + Dtype sum = 0; + for (int i = 0; i < crop_size; ++i){ + Dtype tmp_sum = 0; + for (int j = 0; j < pooled_size; ++j){ + tmp_sum += in_data[(j * num_rois + cur_n) * crop_size + i]; + } + sum += tmp_sum / pooled_size; + } + out_data[index] = sum /crop_size; + } +} + +template +__global__ void crop_no_global_pooling_kernel(const Dtype* in_data, Dtype* out_data, + int pooled_height, int pooled_width, int channel, int num_rois, int crop_height, int crop_width, + int count){ + CUDA_KERNEL_LOOP(index, count){ + int cur_pw = index % pooled_width; + index /= pooled_width; + int cur_cw = index % crop_width; + index /= crop_width; + int cur_ph = index % pooled_height; + index /= pooled_height; + int cur_ch = index % crop_height; + index /= crop_height; + int cur_c = index % channel; + int cur_n = index / channel; + + int in_index = ((((cur_ph * pooled_width + cur_pw) * channel + + cur_c) * num_rois + cur_n) * crop_height + cur_ch) * crop_width + cur_cw; + out_data[index] = in_data[in_index]; + } +} + +//for tf, it has no batch_ind +template +__global__ void psroi_pool_kernel_no_batchind(const Dtype* in_data, const Dtype* rois, Dtype* out_data, + int in_n, int in_c, int in_h, int in_w, int o_c, int o_h, int o_w, + int pooled_h, int pooled_w, float spatial_scale, int count){ + + CUDA_KERNEL_LOOP(index, count){ + int temp_ind = index; + int cur_w = temp_ind % o_w; + temp_ind /= o_w; + int cur_h = temp_ind % o_h; + temp_ind /= o_h; + int cur_c = temp_ind % o_c; + int cur_n = temp_ind / o_c; + + const Dtype* rois_data = rois + cur_n * 4; + + int roi_x0 = fminf(fmaxf(rois_data[0] * spatial_scale, 0), in_w-1); + int roi_y0 = fminf(fmaxf(rois_data[1] * spatial_scale, 0), in_h-1); + int roi_x1 = fminf(fmaxf(rois_data[2] * spatial_scale, 0), in_w-1); + int roi_y1 = fminf(fmaxf(rois_data[3] * spatial_scale, 0), in_h-1); + + int roi_h = roi_y1 - roi_y0 + 1; + int roi_w = roi_x1 - roi_x0 + 1; + + Dtype bin_w = static_cast(roi_w) / pooled_w; + Dtype bin_h = static_cast(roi_h) / pooled_h; + + int ws = roi_x0 + bin_w * cur_w; + int we = ceil(roi_x0 + bin_w * (cur_w + 1)); + int ys = roi_y0 + bin_h * cur_h; + int ye = ceil(roi_y0 + bin_h * (cur_h + 1)); + + int c_index = (cur_h * pooled_w + cur_w) * o_c + cur_c; + + const Dtype* offset_in_data = in_data + 
c_index * in_w * in_h; + + Dtype sum = 0; + + for (int y = ys; y < ye; ++y){ + for (int w = ws; w < we; ++w){ + sum += offset_in_data[y * in_w + w]; + } + } + sum /= (ye - ys) * (we - ws); + + //tf is set to `hwc` format, here we set `chw` format + out_data[index] = sum; + + } + +} + +//for caffe, it has batchind +template +__global__ void psroi_pool_kernel_with_batchind(const Dtype* in_data, const Dtype* rois, Dtype* out_data, + int in_n, int in_c, int in_h, int in_w, int o_c, int o_h, int o_w, + int pooled_h, int pooled_w, float spatial_scale, int count){ + + CUDA_KERNEL_LOOP(index, count){ + int temp_ind = index; + int cur_w = temp_ind % o_w; + temp_ind /= o_w; + int cur_h = temp_ind % o_h; + temp_ind /= o_h; + int cur_c = temp_ind % o_c; + int cur_n = temp_ind / o_c; + + const Dtype* rois_data = rois + cur_n * 5; + + int batch = rois_data[0]; + Dtype roi_x0 = rois_data[1] * spatial_scale; + Dtype roi_y0 = rois_data[2] * spatial_scale; + Dtype roi_x1 = (rois_data[3] + 1) * spatial_scale; + Dtype roi_y1 = (rois_data[4] + 1) * spatial_scale; + + Dtype roi_h = roi_y1 - roi_y0; + Dtype roi_w = roi_x1 - roi_x0; + + Dtype bin_w = roi_w / pooled_w; + Dtype bin_h = roi_h / pooled_h; + + int ws = roi_x0 + bin_w * cur_w; + int we = ceil(roi_x0 + bin_w * (cur_w + 1)); + int ys = roi_y0 + bin_h * cur_h; + int ye = ceil(roi_y0 + bin_h * (cur_h + 1)); + + ws = fminf(fmaxf(ws, 0), in_w); + we = fminf(fmaxf(we, 0), in_w); + ys = fminf(fmaxf(ys, 0), in_h); + ye = fminf(fmaxf(ye, 0), in_h); + + int c_index = (cur_h * pooled_w + cur_w) * o_c + cur_c; + + const Dtype* offset_in_data = in_data + (batch * in_c + c_index) * in_w * in_h; + + Dtype sum = 0.f; + + for (int y = ys; y < ye; ++y){ + for (int w = ws; w < we; ++w){ + sum += offset_in_data[y * in_w + w]; + } + } + sum /= (ye - ys) * (we - ws); + + out_data[index] = sum; + + } + +} + +template +SaberStatus SaberPsRoiPool::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + PsRoiPoolParam& param) { + + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + const OpDataType* in_rois = (const OpDataType*)inputs[1]->data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); + OpDataType* inter_data = (OpDataType*)_crop_data.mutable_data(); + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + + int num_rois = inputs[1] -> num(); + int out_n = outputs[0]->num(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int in_n = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + + int crop_width = param.crop_width / param.pooled_width; + int crop_height = param.crop_height / param.pooled_height; + + int crop_count = _crop_data.valid_size(); + int pool_count = outputs[0]->valid_size(); + int pooled_size = param.pooled_height * param.pooled_width; + + crop_and_resize_kernel\ + <<>>(\ + in_data, in_rois, inter_data, num_rois, in_h, in_w, + crop_height, crop_width, crop_count, param.method, + param.extra_value); + if (param.global_pooling){ + crop_global_pooling_kernel\ + <<>>(\ + inter_data, out_data, pooled_size, out_c, + num_rois, crop_height, crop_width, pool_count); + } else { + crop_no_global_pooling_kernel\ + <<>>\ + (inter_data, out_data, param.pooled_height, param.pooled_width, + out_c, num_rois, crop_height, crop_width, pool_count); + } + + return SaberSuccess; + +} + +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_reduce.cu 
b/saber/funcs/impl/cuda/base/cuda_c/saber_reduce.cu new file mode 100644 index 000000000..52142a54e --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_reduce.cu @@ -0,0 +1,451 @@ + +#include "saber/funcs/impl/cuda/saber_reduce.h" +#include "saber/funcs/impl/cuda/vender_reduce.h" +namespace anakin { +namespace saber { +namespace { +template +class ReOp { +public: + __device__ + static float compute(float a, float b) { + return -1.f; + } +}; +template <> +__device__ +float ReOp::compute(float a, float b) { + return ((a > b) ? a : b); +} + +template <> +__device__ +float ReOp::compute(float a, float b) { + return ((a > b) ? b : a); +} + +template <> +__device__ +float ReOp::compute(float a, float b) { + return a + b; +} + +template <> +__device__ +float ReOp::compute(float a, float b) { + return a + b; +} + +template <> +__device__ +float ReOp::compute(float a, float b) { + return a * b; +} + +template +class IndexCompute { +public: + __device__ + static int input_idx(const int* dims, + const int* odims, + int out_idx); +}; + +template <> +__device__ +int IndexCompute<4>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int i1 = (out_idx % out_stride[0]) / out_stride[1]; + int i2 = (out_idx % out_stride[1]) / out_stride[2]; + int i3 = (out_idx % out_stride[2]) / out_stride[3]; + int idx = i0 * in_stride[0] + + i1 * in_stride[1] + + i2 * in_stride[2] + + i3 * in_stride[3]; + return idx; +} + +template <> +__device__ +int IndexCompute<3>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int i1 = (out_idx % out_stride[0]) / out_stride[1]; + int i2 = (out_idx % out_stride[1]) / out_stride[2]; + int idx = i0 * in_stride[0] + + i1 * in_stride[1] + + i2 * in_stride[2]; + return idx; +} + +template <> +__device__ +int IndexCompute<2>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int i1 = (out_idx % out_stride[0]) / out_stride[1]; + int idx = i0 * in_stride[0] + + i1 * in_stride[1]; + return idx; +} + +template <> +__device__ +int IndexCompute<1>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int idx = i0 * in_stride[0]; + return idx; +} + +// if you are reading this, there are still a lot +// optimize here to do, This class is the right class +// to make parallel reduction. +// the compute function can run inside one block, +// try to use shuffle instruction here. +// int tdim is the threads num of one block. +template +class ReduceCompute{ +public: + __device__ + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float* in_data, int in_idx) { + return 0; + } +}; + +template +class ReduceCompute<1, tdim, type> { +public: + __device__ + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + +// int tid = threadIdx.x; + float res = in_data[in_idx]; + int idx = in_idx + in_stride[rdims[0]]; + // here is the reduction op. 
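+ // Single reduced axis: starting from in_idx, this folds dims[rdims[0]] elements
+ // that sit in_stride[rdims[0]] apart, combining them with ReOp<type>::compute.
+ // The fold is done serially by one thread (tdim is currently unused), so all of
+ // the parallelism comes from the caller launching one block per output element.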
+ for (int i = 1; i < dims[rdims[0]]; ++i) { + res = ReOp::compute(res, in_data[idx]); + idx += in_stride[rdims[0]]; + } + return res; + } +}; + +template +class ReduceCompute<2, tdim, type> { +public: + __device__ + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + + float res0 = 0.f; + int idx0 = in_idx; + for (int i = 0; i < dims[rdims[0]]; ++i) { + float res1 = in_data[idx0]; + int idx1 = idx0 + in_stride[rdims[1]]; + for (int j = 1; j < dims[rdims[1]]; ++j) { + res1 = ReOp::compute(res1, in_data[idx1]); + idx1 += in_stride[rdims[1]]; + } + idx0 += in_stride[rdims[0]]; + if (i == 0) { + res0 = res1; + } else { + res0 = ReOp::compute(res0, res1); + } + } + return res0; + } +}; + +template +class ReduceCompute<3, tdim, type> { +public: + __device__ + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + + float res0 = 0.f; + int idx0 = in_idx; + for (int i = 0; i < dims[rdims[0]]; ++i) { + float res1 = 0.f; + int idx1 = idx0; + for (int j = 0; j < dims[rdims[1]]; ++j) { + float res2 = in_data[idx1]; + int idx2 = idx1 + in_stride[rdims[2]]; + for (int k = 1; k < dims[rdims[2]]; ++k) { + res2 = ReOp::compute(res2, in_data[idx2]); + idx2 += in_stride[rdims[2]]; + } + if (j == 0) { + res1 = res2; + } else { + res1 = ReOp::compute(res1, res2); + } + idx1 += in_stride[rdims[1]]; + } + if (i == 0) { + res0 = res1; + } else { + res0 = ReOp::compute(res0, res1); + } + idx0 += in_stride[rdims[0]]; + } + return res0; + } +}; + +template +class ReduceCompute<4, tdim, type> { +public: + __device__ + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + + float res0 = 0.f; + int idx0 = in_idx; + for (int i = 0; i < dims[rdims[0]]; ++i) { + float res1 = 0.f; + int idx1 = idx0; + for (int j = 0; j < dims[rdims[1]]; ++j) { + float res2 = 0.f; + int idx2 = idx1; + for (int k = 0; k < dims[rdims[2]]; ++k) { + float res3 = in_data[idx2]; + int idx3 = idx2 + in_stride[rdims[3]]; + for (int u = 0; u < dims[rdims[3]]; ++u) { + res3 = ReOp::compute(res3, in_data[idx3]); + idx3 += in_stride[rdims[3]]; + } + if (k == 0) { + res2 = res3; + } else { + res2 = ReOp::compute(res2, res3); + } + idx2 += in_stride[rdims[2]]; + } + if (j == 0) { + res1 = res2; + } else { + res1 = ReOp::compute(res1, res2); + } + idx1 += in_stride[rdims[1]]; + } + if (i == 0) { + res0 = res1; + } else { + res0 = ReOp::compute(res0, res1); + } + idx0 += in_stride[rdims[0]]; + } + return res0; + } +}; + +template +__global__ void reduce( + const dtype* src, + dtype* dst, + const int* rdim, + const int* dims, + const int* i_stride, + const int* o_stride, int out_size) { + int reduce_size = 1; + for (int i = 0; i < rDim; ++i) { + reduce_size *= dims[rdim[i]]; + } + float reduce_size_1 = 1.f / ((float)reduce_size); + int bid = blockIdx.x; + + int out_idx = bid; + //init; + int in_idx = IndexCompute::input_idx(i_stride, o_stride, out_idx); + float res = ReduceCompute::compute( + dims, rdim, i_stride, src, in_idx); + dst[out_idx] = res; + if (Reduce_avg == type) { + dst[out_idx] *= reduce_size_1; + } +} + +__global__ +void reduce_unknow( + const float* src, + float* dst, + const int* rdim, + const int* dims, + const int* i_stride, + const int* o_stride, int out_size) {return;} + +template +__global__ void reduce_all( + const dtype* src, + dtype* dst, + const int* rdim, + const int* dims, + const int* i_stride, + const int* o_stride, 
+ int out_size) { + + int reduce_size = 1; + for (int i = 0; i < rDim; ++i) { + reduce_size *= dims[rdim[i]]; + } + float reduce_size_1 = 1.f / ((float)reduce_size); + //init; + float res = src[0]; + for (int i = 1; i < reduce_size; ++i) { + res = ReOp::compute(res, src[i]); + } + dst[0] = res; + if (Reduce_avg == type) { + dst[0] *= reduce_size_1; + } +} +} + +#define REG_REDUCE_TYPE_KERNEL(REDUCE_TYPE) \ + _kernel_direct_map[REDUCE_TYPE] = { \ + {reduce_unknow}, \ + {reduce_unknow, \ + reduce_all}, \ + {reduce_unknow, \ + reduce, \ + reduce_all}, \ + {reduce_unknow, \ + reduce, \ + reduce, \ + reduce_all}, \ + {reduce_unknow, \ + reduce, \ + reduce, \ + reduce, \ + reduce_all}} + +template +void async_copy_to_buffer(Buffer &buffer, + dtype* data, unsigned long size, cudaStream_t stream) { + buffer.re_alloc(size * sizeof(dtype)); + cudaMemcpyAsync(buffer.get_data_mutable(), data, + size * sizeof(dtype), cudaMemcpyHostToDevice, stream); +} + +template <> +SaberStatus SaberReduce::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param, Context& ctx) { + this->_ctx = &ctx; + + if (_template_reduction) { + auto stream = _ctx->get_compute_stream(); + + auto i_stride = inputs[0]->get_stride(); + auto o_stride = outputs[0]->get_stride(); + std::vector ndim(inputs[0]->valid_shape()); + async_copy_to_buffer(_rdim_b, + param.reduce_dim.data(), + param.reduce_dim.size(), stream); + async_copy_to_buffer(_ndim_b, + inputs[0]->valid_shape().data(), + inputs[0]->valid_shape().size(), stream); + async_copy_to_buffer(_i_stride_b, + i_stride.data(), i_stride.size(), stream); + async_copy_to_buffer(_o_stride_b, + o_stride.data(), o_stride.size(), stream); + return SaberSuccess; + + } else { + return _impl->create(inputs, outputs, param, ctx); + } +} + +template <> +SaberStatus SaberReduce::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param, Context& ctx) { + + this->_ctx = &ctx; + + if (_template_reduction) { + REG_REDUCE_TYPE_KERNEL(Reduce_avg); + REG_REDUCE_TYPE_KERNEL(Reduce_min); + REG_REDUCE_TYPE_KERNEL(Reduce_max); + REG_REDUCE_TYPE_KERNEL(Reduce_sum); + REG_REDUCE_TYPE_KERNEL(Reduce_prod); + } else { + _impl = new VenderReduce; + _impl->init(inputs, outputs, param, ctx); + } + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberReduce::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param) { + + if (_template_reduction) { + int out_size = outputs[0]->valid_size(); + _kernel_direct_map[param.reduce_type] + [inputs[0]->dims()] + [param.reduce_dim.size()] << < out_size, 1, + 0, _ctx->get_compute_stream() >> > ( + (const float *) inputs[0]->data(), + (float *) outputs[0]->mutable_data(), + (const int *) _rdim_b.get_data(), + (const int *) _ndim_b.get_data(), + (const int *) _i_stride_b.get_data(), + (const int *) _o_stride_b.get_data(), + outputs[0]->valid_size()); + return SaberSuccess; + } else { + return _impl->dispatch(inputs, outputs, param); + } + +} + +template class SaberReduce; +DEFINE_OP_TEMPLATE(SaberReduce, ReduceParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberReduce, ReduceParam, NV, AK_INT8); + +} // namespace saber. +} // namespace anakin. 
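+// Illustrative dispatch sketch (a comment only, not part of the op): the registered
+// kernel table is indexed as
+//   _kernel_direct_map[param.reduce_type][inputs[0]->dims()][param.reduce_dim.size()]
+// and launched with one block per output element. Assuming the template parameter
+// order reduce<dtype, type, nDim, rDim> (the order is reconstructed here, since the
+// angle-bracket arguments are not visible above), a direct launch for a 4-D tensor
+// reduced over one axis would look like:
+//
+//   reduce<float, Reduce_max, 4, 1><<<out_size, 1, 0, _ctx->get_compute_stream()>>>(
+//       (const float*)inputs[0]->data(), (float*)outputs[0]->mutable_data(),
+//       (const int*)_rdim_b.get_data(), (const int*)_ndim_b.get_data(),
+//       (const int*)_i_stride_b.get_data(), (const int*)_o_stride_b.get_data(),
+//       outputs[0]->valid_size());
+//
+// where _rdim_b/_ndim_b and the stride buffers are the device copies staged by
+// async_copy_to_buffer() in create().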
diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_reduce_min.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_reduce_min.cu new file mode 100644 index 000000000..607c5c7c1 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_reduce_min.cu @@ -0,0 +1,297 @@ +#include "saber/funcs/impl/cuda/saber_reduce_min.h" + +namespace anakin { +namespace saber { + +/** + * @brief reduce tensor acorrding to the given reduce dim. + * e.g. + * input tensor with shape [5, 2, 10, 4] (rank = 4, how many dimentions does a tensor have.) + * and the reduce dim may have the following forms: + * 1) reduce_dim = None, no reduce dim. It means that reduce all dimentions [default] + * output's shape [1, 1, 1, 1]. + * 2) reduce_dim = x, x is the dimention we want to reduce. + * output's shape: + * x = 0, for example, the shape will be [1, 2, 10, 4] if keep_dim is true, otherwise it will be [2*10*4, 1, 1, 1]. + * x = 2, for example, the shape will be [5, 2, 1, 4] if keep_dim is true, otherwise it will be [5*2*4, 1, 1, 1]. + * and so on. + * 3) reduce_dim = [x, y], It will reduce two dimetions x and y. + * output's shape: + * reduce_dim = [0, 1], for example, the shape will be [1, 1, 10 ,4] or [10*4, 1, 1, 1] and so on. + * Notes: + * if reduce_dim[i] < 0: + * do + * reduce_dim[i] += rank. + * + * @tparam OpDtype + * @param inputs + * @param outputs + * @param param + * @return SaberStatus + */ + + //This function is used to implement atioMin based on CAS function. +// __device__ float atomicMin(float* address, float val) { +// unsigned long long int* address_as_ull = (unsigned long long int*)address; +// unsigned long long int old = *address_as_ull, assumed; +// do{ +// assumed = old; +// old = atomicCAS(address_as_ull, assumed, __float_as_longlong( +// fminf(val, __longlong_as_float(assumed)))); + +// }while(assumed != old); +// return __longlong_as_float(old); +// } + + __device__ double atomicMin(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do{ + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong( + fmin(val, __longlong_as_double(assumed)))); + + }while(assumed != old); + return __longlong_as_double(old); + } + + __device__ double atomicMin(float* address, float val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do{ + assumed = old; + old = atomicCAS(address_as_ull, assumed, __float_as_int( + fminf(val, __int_as_float(assumed)))); + + }while(assumed != old); + return __longlong_as_double(old); + } + +//thread num: CHW +template +__global__ void kernel_reduce_n(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in, const int count) { + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int thread_num = blockDim.x * gridDim.x; + int feature_map = height_in * width_in; //HW + int size = channel_in * feature_map;// CHW + int c_id = tid / feature_map; + int feature_map_inner_index = tid % feature_map; + dtype min = src[tid]; + for (int n = 1; n < num_in; ++n) { + dtype tmp = src[n * size + c_id * feature_map + feature_map_inner_index]; + min = tmp < min ? 
tmp : min; + } + dst[tid] = min; +} + +//thread num:NHW +template +__global__ void kernel_reduce_c(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in, const int count) { + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int thread_num = blockDim.x * gridDim.x; + int feature_map = height_in * width_in; + int size = channel_in * feature_map; + for (int i = tid; i < count; i += thread_num) { + int n_id = i / feature_map; + int inner_index = i % feature_map; + dtype min = src[n_id * size + inner_index]; + for (int c = 1; c < channel_in; ++c) { + dtype tmp = src[n_id * size + c * feature_map + inner_index]; + min = tmp < min? tmp : min; + } + dst[n_id * feature_map + inner_index] = min; // Is data_index same to tid/i?. + } + +} + +//thread num: NCW +template +__global__ void kernel_reduce_h(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in, const int count) { + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int thread_num = blockDim.x * gridDim.x; + int feature_map = height_in * width_in; //CW + int cw_size = channel_in * width_in; //CW + int size = channel_in * feature_map; //CHW + for (int i = tid; i < count; i += thread_num) { + int n_id = i / cw_size; + int c_id = (i / width_in) % channel_in; + int inner_index = i % width_in; + int data_index = n_id * size + c_id * feature_map + inner_index; + dtype min = src[data_index]; + for (int h = 1; h < height_in; ++h) { + dtype tmp = src[data_index + h * width_in]; + min = tmp < min? tmp : min; + } + dst[n_id * cw_size + c_id * width_in + inner_index] = min; // Is data_index same to tid/i?. + } +} + +//thread num: NCH +template +__global__ void kernel_reduce_w(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in, const int count) { + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int thread_num = blockDim.x * gridDim.x; + int ch_size = channel_in * height_in; //CH + int size = ch_size * width_in; //CHW + int feature_map = height_in * width_in; //HW + for (int i = tid; i < count; i += thread_num) { + int n_id = i / ch_size; + int c_id = (i / height_in) % channel_in; + int inner_index = i % height_in; + int data_index = n_id * size + c_id * feature_map + inner_index * width_in; + dtype min = src[data_index]; + for (int w = 1; w < width_in; ++w) { + dtype tmp = src[data_index + w]; + min = tmp < min? tmp : min; + } + dst[n_id * ch_size + c_id * height_in + inner_index] = min; + } +} + +//reduce all. +template +__global__ void kernel_reduce_nchw(const dtype* src, dtype* dst, const int count) { + + int n_id = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x; + int thread_num = blockDim.x * gridDim.x; + dst[0] = src[n_id]; + extern __shared__ dtype s[]; + dtype min = src[n_id]; + for (int i = n_id; i < count; i += thread_num) { + min = src[i] < min ? src[i] : min; + } + s[tid] = min; + __syncthreads(); + + int powOf2 = blockDim.x; + if (powOf2 & (powOf2 - 1)) { + //block threads are not pow of 2. + while (powOf2 & (powOf2 - 1)) { + powOf2 &= powOf2 - 1; + } // it'll end when it find pow of 2. + if (tid >= powOf2) { + s[tid - powOf2] = s[tid - powOf2] < s[tid]? s[tid - powOf2] : s[tid]; + } + __syncthreads(); + } + for (int i = powOf2>>1; i > 0; i>>=1) { + if (tid < i) { + s[tid] = s[tid] < s[tid + i]? 
s[tid] : s[tid + i]; + } + __syncthreads(); + } + if (threadIdx.x == 0) { + //double tmp = s[] + atomicMin(&dst[0], s[threadIdx.x]); + } +} + +template +SaberStatus SaberReduceMin::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ReduceMinParam& param) { + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + const OpDataType* input_ptr = (const OpDataType*)inputs[0]->data(); + OpDataType* output_ptr = (OpDataType*)outputs[0]->mutable_data(); + int count = outputs[0]->valid_size(); + + if (_reduce_dim.empty()) { + // reduce_all + int count_all = inputs[0]->valid_size(); + int grid, thread_num; + if (count_all < CUDA_NUM_THREADS) { + thread_num = count_all; + grid = 1; + }else { + thread_num = CUDA_NUM_THREADS; + if (CUDA_GET_BLOCKS(count) >= 128) //This is to avoid share memory blowing up. + grid = 64; + else + grid = CUDA_GET_BLOCKS(count); + } + int sharedSize = thread_num * 4; + kernel_reduce_nchw<<>>( + input_ptr, output_ptr, count_all); + }else if (_reduce_dim.size() == 1) { + if (_reduce_dim[0] == 0) { + //reduce n + kernel_reduce_n<<>>( + input_ptr, output_ptr, _num, _channel, _height, _width, count); + } + if (_reduce_dim[0] == 1) { + //reduce c + kernel_reduce_c<<>>( + input_ptr, output_ptr, _num, _channel, _height, _width, count); + } + if (_reduce_dim[0] == 2) { + //reduce h + kernel_reduce_h<<>>( + input_ptr, output_ptr, _num, _channel, _height, _width, count); + } + if (_reduce_dim[0] == 3) { + //reduce h + kernel_reduce_w<<>>( + input_ptr, output_ptr, _num, _channel, _height, _width, count); + } + } else if (_reduce_dim.size() == 2) { + //only consecutive reduce dim? [0,1] [1, 2], not [0, 2]? + if (_reduce_dim[0] == 0 && _reduce_dim[1] == 1) { + //reduce n, c. reduce n first. + _tensor_tmp.reshape(std::vector({1, _channel, _height, _width})); + int count_n = _tensor_tmp.valid_size(); + int count_nc = count_n / _tensor_tmp.channel(); + OpDataType* tmp_out = (OpDataType*)_tensor_tmp.mutable_data(); + kernel_reduce_n<<>>( + input_ptr, tmp_out, _num, _channel, _height, _width, count_n); + + kernel_reduce_c<<>>( + tmp_out, output_ptr, 1, _channel, _height, _width, count_nc); + }else if (_reduce_dim[0] == 1 && _reduce_dim[1] == 2) { + //reduce c. h. reduce c first. + _tensor_tmp.reshape(std::vector({_num, 1, _height, _width})); + int count_c = _tensor_tmp.valid_size(); + int count_ch = count_c / _tensor_tmp.height(); + OpDataType* tmp_out = (OpDataType*)_tensor_tmp.mutable_data(); + kernel_reduce_c<<>>( + input_ptr, tmp_out, _num, _channel, _height, _width, count_c); + + kernel_reduce_h<<>>( + tmp_out, output_ptr, _num, 1, _height, _width, count_ch); + }else if (_reduce_dim[0] == 2 && _reduce_dim[1] == 3) { + //reduce h, w. reduce h first. + _tensor_tmp.reshape(std::vector({_num, _channel, 1, _width})); + int count_h = _tensor_tmp.valid_size(); + int count_hw = count_h / _tensor_tmp.width(); + OpDataType* tmp_out = (OpDataType*)_tensor_tmp.mutable_data(); + kernel_reduce_h<<>>( + input_ptr, tmp_out, _num, _channel, _height, _width, count_h); + + kernel_reduce_w<<>>( + tmp_out, output_ptr, _num, _channel, 1, _width, count_hw); + }else { + LOG(FATAL) <<"[reduce_min] invalid reduce_dim!!!"; + } + }else { + LOG(FATAL) << "[reduce_min]Reducing size over than 2 is not support!!"; + } + + CUDA_POST_KERNEL_CHECK; + + return SaberSuccess; +} + +template class SaberReduceMin; +DEFINE_OP_TEMPLATE(SaberReduceMin, ReduceMinParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberReduceMin, ReduceMinParam, NV, AK_INT8); + +} // namespace saber. +} // namespace anakin. 
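+// Illustrative host-side reference (a sketch for checking results, not used by the
+// op): it computes the same per-position minimum over the channel axis as
+// kernel_reduce_c, so the device output for reduce_dim = {1} can be compared
+// against it elementwise.
+//
+//   static void reduce_min_c_ref(const float* src, float* dst,
+//                                int n, int c, int h, int w) {
+//       int hw = h * w;                      // size of one feature map
+//       for (int i = 0; i < n; ++i) {
+//           for (int j = 0; j < hw; ++j) {
+//               float m = src[i * c * hw + j];
+//               for (int k = 1; k < c; ++k) {
+//                   float v = src[(i * c + k) * hw + j];
+//                   m = v < m ? v : m;
+//               }
+//               dst[i * hw + j] = m;
+//           }
+//       }
+//   }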
diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu index 30b69cc93..851ad9aa2 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_resize.cu @@ -6,7 +6,7 @@ namespace anakin{ namespace saber{ template -__global__ void resize_bilinear_2d_kernel(const int wout, const int hout, +__global__ static void resize_bilinear_custom_kernel(const int wout, const int hout, const int num,const int channels, const int dst_stride_w, const int dst_stride_h, @@ -90,6 +90,203 @@ __global__ void resize_bilinear_2d_kernel(const int wout, const int hout, } } +template +__global__ static void resize_bilinear_no_align_kernel(const int wout, const int hout, + const int num,const int channels, + const int dst_stride_w, + const int dst_stride_h, + const int dst_stride_c, + const int dst_stride_batch, + const int win, const int hin, + const int src_stride_w, + const int src_stride_h, + const int src_stride_c, + const int src_stride_batch, + const float scale_w, const float scale_h, + const dtype* src, dtype* dst) +{ + + int dst_w = blockIdx.x * blockDim.x + threadIdx.x; + int dst_h = blockIdx.y * blockDim.y + threadIdx.y; + + if (dst_w < wout && dst_h < hout){ + float scale_w_new = (float)win / wout; + float scale_h_new = (float)hin / hout; + float fh = scale_h_new * (dst_h + 0.5) - 0.5; + float fw = scale_w_new * (dst_w + 0.5) - 0.5; + fh = fh < 0 ? 0 : fh; + fw = fw < 0 ? 0 : fw; + const int src_h = int(fh); + const int src_w = int(fw); + int w_id = src_w < win - 1 ? 1 : 0; + int h_id = src_h < hin -1 ? 1 : 0; + int w = src_w + w_id; + int h = src_h + h_id; + + fh -= src_h; + fw -= src_w; + const float w_h0 = 1.0f - fh; + const float w_w0 = 1.0f - fw; + const float w_h1 = fh; + const float w_w1 = fw; + + float w_00 = w_h0 * w_w0; + float w_01 = w_h0 * w_w1; + float w_10 = w_h1 * w_w0; + float w_11 = w_h1 * w_w1; + + for (int i = 0; i < num; ++i) { + int src_batch_idx = i * src_stride_batch; + + int hl = src_h * src_stride_h; + int hh = h * src_stride_h; + int wl = src_w * src_stride_w; + int wh = w * src_stride_w; + + int src_indexTL = src_batch_idx + hl + wl; + int src_indexTR = src_batch_idx + hl + wh; + int src_indexBL = src_batch_idx + hh + wl; + int src_indexBR = src_batch_idx + hh + wh; + + int dst_index = i * dst_stride_batch + dst_w * dst_stride_w + dst_h * dst_stride_h; + + for (int j = 0; j < channels; ++j) { + dtype tl = src[src_indexTL]; + dtype tr = src[src_indexTR];//w > win? 0 : + dtype bl = src[src_indexBL];//h > hin? 0 : + dtype br = src[src_indexBR];//(w > win || h > hin)? 
0 : + + dst[dst_index] = static_cast(w_00 * tl + w_01 * tr + w_10 * bl + w_11 * br); + src_indexBR += src_stride_c; + src_indexBL += src_stride_c; + src_indexTR += src_stride_c; + src_indexTL += src_stride_c; + dst_index += dst_stride_c; + } + } + } +} + +template +__global__ static void resize_bilinear_align_kernel(const int wout, const int hout, + const int num,const int channels, + const int dst_stride_w, + const int dst_stride_h, + const int dst_stride_c, + const int dst_stride_batch, + const int win, const int hin, + const int src_stride_w, + const int src_stride_h, + const int src_stride_c, + const int src_stride_batch, + const float scale_w, const float scale_h, + const dtype* src, dtype* dst) +{ + + int dst_w = blockIdx.x * blockDim.x + threadIdx.x; + int dst_h = blockIdx.y * blockDim.y + threadIdx.y; + + if (dst_w < wout && dst_h < hout){ + + float scale_w_new = (float)(win - 1) / (wout - 1); + float scale_h_new = (float)(hin - 1) / (hout - 1); + float fh = scale_h_new * dst_h; + float fw = scale_w_new * dst_w; + const int src_h = int(fh); + const int src_w = int(fw); + int w_id = src_w < win - 1 ? 1 : 0; + int h_id = src_h < hin -1 ? 1 : 0; + int w = src_w + w_id; + int h = src_h + h_id; + fh -= src_h; + fw -= src_w; + const float w_h0 = 1.0f - fh; + const float w_w0 = 1.0f - fw; + const float w_h1 = fh; + const float w_w1 = fw; + + float w_00 = w_h0 * w_w0; + float w_01 = w_h0 * w_w1; + float w_10 = w_h1 * w_w0; + float w_11 = w_h1 * w_w1; + + for (int i = 0; i < num; ++i) { + int src_batch_idx = i * src_stride_batch; + + int hl = src_h * src_stride_h; + int hh = h * src_stride_h; + int wl = src_w * src_stride_w; + int wh = w * src_stride_w; + + int src_indexTL = src_batch_idx + hl + wl; + int src_indexTR = src_batch_idx + hl + wh; + int src_indexBL = src_batch_idx + hh + wl; + int src_indexBR = src_batch_idx + hh + wh; + + int dst_index = i * dst_stride_batch + dst_w * dst_stride_w + dst_h * dst_stride_h; + + for (int j = 0; j < channels; ++j) { + dtype tl = src[src_indexTL]; + dtype tr = src[src_indexTR];//w > win? 0 : + dtype bl = src[src_indexBL];//h > hin? 0 : + dtype br = src[src_indexBR];//(w > win || h > hin)? 0 : + + dst[dst_index] = static_cast(w_00 * tl + w_01 * tr + w_10 * bl + w_11 * br); + src_indexBR += src_stride_c; + src_indexBL += src_stride_c; + src_indexTR += src_stride_c; + src_indexTL += src_stride_c; + dst_index += dst_stride_c; + } + } + } +} + +template +__global__ static void resize_nearest_kernel(const int wout, const int hout, + const int num,const int channels, + const int dst_stride_w, + const int dst_stride_h, + const int dst_stride_c, + const int dst_stride_batch, + const int win, const int hin, + const int src_stride_w, + const int src_stride_h, + const int src_stride_c, + const int src_stride_batch, + const float scale_w, const float scale_h, + const dtype* src, dtype* dst) +{ + + int dst_w = blockIdx.x * blockDim.x + threadIdx.x; + int dst_h = blockIdx.y * blockDim.y + threadIdx.y; + + if (dst_w < wout && dst_h < hout){ + + float scale_w_new = (float)(win - 1) / (wout - 1); + float scale_h_new = (float)(hin - 1) / (hout - 1); + + int fh = static_cast(scale_h_new * dst_h + 0.5); + int fw = static_cast(scale_w_new * dst_w + 0.5); + fh = fh < 0 ? 0 : fh; + fw = fw < 0 ? 
0 : fw; + const int src_h = fh; + const int src_w = fw; + + for (int i = 0; i < num; ++i) { + int src_index = i * src_stride_batch + src_h * src_stride_h + src_w * src_stride_w; + int dst_index = i * dst_stride_batch + dst_w * dst_stride_w + dst_h * dst_stride_h; + + for (int j = 0; j < channels; ++j) { + + dst[dst_index] = src[src_index]; + src_index += src_stride_c; + dst_index += dst_stride_c; + } + } + } +} + template SaberStatus SaberResize::dispatch(\ @@ -106,6 +303,13 @@ SaberStatus SaberResize::dispatch(\ int c_out = outputs[0]->channel(); int n_out = outputs[0]->num(); + if (inputs.size() > 1) { + int* out_size_data = static_cast(inputs[1]->data()); + h_out = out_size_data[0]; + w_out = out_size_data[1]; + outputs[0]->reshape(Shape({n_out, c_out, h_out, w_out})); + } + int w_in = inputs[0]->width(); int h_in = inputs[0]->height(); int c_in = inputs[0]->channel(); @@ -140,7 +344,15 @@ SaberStatus SaberResize::dispatch(\ } else { dst_real_shape = outputs[0]->shape(); } - + float scale_w = 0.f; + float scale_h = 0.f; + if (param.out_width != -1 && param.out_height != -1){ + scale_w = (float)param.out_width / w_in; + scale_h = (float)param.out_height / h_in; + } else { + scale_w = param.width_scale; + scale_h = param.height_scale; + } int src_stride_w = src_real_shape.count(width_idx + 1);//inputs[0]->count_valid(width_idx + 1, dims); int src_stride_h = src_real_shape.count(height_idx + 1);//inputs[0]->count_valid(height_idx + 1, dims); int src_stride_channel = src_real_shape.count(channel_idx + 1);//inputs[0]->count_valid(channel_idx + 1, dims); @@ -149,13 +361,38 @@ SaberStatus SaberResize::dispatch(\ int dst_stride_h = dst_real_shape.count(height_idx + 1);//outputs[0]->count(height_idx + 1, dims); int dst_stride_channel = dst_real_shape.count(channel_idx + 1);//outputs[0]->count(channel_idx + 1, dims); int dst_stride_batch = dst_real_shape.count(num_idx + 1);//outputs[0]->count(num_idx + 1, dims); - resize_bilinear_2d_kernel<<>>( - w_out, h_out, n_out, c_out, - dst_stride_w, dst_stride_h, dst_stride_channel, dst_stride_batch, - w_in, h_in, - src_stride_w, src_stride_h, src_stride_channel, src_stride_batch, - 1 / param.width_scale, 1 / param.height_scale, - (const OpDataType*)inputs[0]->data(), (OpDataType*)outputs[0]->mutable_data()); + switch (param.resize_type){ + case BILINEAR_ALIGN: + resize_bilinear_align_kernel<<>>(w_out, h_out, n_out, c_out, \ + dst_stride_w, dst_stride_h, dst_stride_channel, dst_stride_batch, \ + w_in, h_in, src_stride_w, src_stride_h, src_stride_channel, src_stride_batch, \ + 1 / scale_w, 1 / scale_h, \ + (const OpDataType*)inputs[0]->data(), (OpDataType*)outputs[0]->mutable_data());; + break; + case BILINEAR_NO_ALIGN: + resize_bilinear_no_align_kernel<<>>(w_out, h_out, n_out, c_out, \ + dst_stride_w, dst_stride_h, dst_stride_channel, dst_stride_batch, \ + w_in, h_in, src_stride_w, src_stride_h, src_stride_channel, src_stride_batch, \ + 1 / scale_w, 1 / scale_h, \ + (const OpDataType*)inputs[0]->data(), (OpDataType*)outputs[0]->mutable_data());; + break; + case RESIZE_CUSTOM: + resize_bilinear_custom_kernel<<>>(w_out, h_out, n_out, c_out, \ + dst_stride_w, dst_stride_h, dst_stride_channel, dst_stride_batch, \ + w_in, h_in, src_stride_w, src_stride_h, src_stride_channel, src_stride_batch, \ + 1 / scale_w, 1 / scale_h, \ + (const OpDataType*)inputs[0]->data(), (OpDataType*)outputs[0]->mutable_data());; + break; + case NEAREST_ALIGN: + resize_nearest_kernel<<>>(w_out, h_out, n_out, c_out, \ + dst_stride_w, dst_stride_h, dst_stride_channel, dst_stride_batch, \ 
+ w_in, h_in, src_stride_w, src_stride_h, src_stride_channel, src_stride_batch, \ + 1 / scale_w, 1 / scale_h, \ + (const OpDataType*)inputs[0]->data(), (OpDataType*)outputs[0]->mutable_data());; + break; + default: + LOG(FATAL) << "Unimply resize type: " << (int)param.resize_type; + } //outputs[0]->record_event(stream); return SaberSuccess; @@ -165,4 +402,4 @@ template class SaberResize; DEFINE_OP_TEMPLATE(SaberResize, ResizeParam, NV, AK_HALF); } //namespace anakin -} //namespace +} //namespace diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_input.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_input.cu index 2c6674733..5ec2edd17 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_input.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_input.cu @@ -8,73 +8,79 @@ namespace saber { template SaberStatus SaberReverseInput::init(const std::vector& inputs, - std::vector& outputs, - EmptyParam ¶m, - Context &ctx) { - this->_ctx=&ctx; - for(int i=0;i& outputs, + EmptyParam& param, + Context& ctx) { + this->_ctx = &ctx; + + for (int i = 0; i < inputs.size(); ++i) { _offset_map_vec.push_back(*new Tensor()); _offset_map_vec[i].set_dtype(AK_INT32); _offset_map_cu_vec.push_back(*new OpTensor()); _offset_map_cu_vec[i].set_dtype(AK_INT32); } - return create(inputs,outputs,param,ctx); + return create(inputs, outputs, param, ctx); }; template SaberStatus SaberReverseInput::create(const std::vector& inputs, - std::vector& outputs, - EmptyParam ¶m, - Context &ctx) { - if(this->_ctx=&ctx){ - this->_ctx=&ctx; + std::vector& outputs, + EmptyParam& param, + Context& ctx) { + if (this->_ctx = &ctx) { + this->_ctx = &ctx; } + return SaberSuccess; }; -static inline int round_up(int k, int c) { - return ((k + c - 1) / c) * c; -} - template -__global__ static void ker_reverse_input(const Dtype* in,Dtype* out,int length,int* offset){ - int tid=blockIdx.x*blockDim.x+threadIdx.x; - if(tid SaberStatus SaberReverseInput::dispatch(const std::vector& inputs, - std::vector& outputs, - EmptyParam ¶m) { - int input_size=inputs.size(); - - cudaStream_t stream=this->_ctx->get_compute_stream(); - for(int input_id=0;input_id> offset_vec=inputs[input_id]->get_seq_offset(); - std::vector offset=offset_vec[offset_vec.size()-1]; - int word_sum=offset[offset.size()-1]; - utils::try_expand_tensor(_offset_map_vec[input_id],word_sum); - utils::try_expand_tensor(_offset_map_cu_vec[input_id],word_sum); - int* offset_map_ptr= static_cast(_offset_map_vec[input_id].mutable_data()); - int* offset_map_cu_ptr= static_cast(_offset_map_cu_vec[input_id].mutable_data()); - for(int sequence_id=0;sequence_id& outputs, + EmptyParam& param) { + int input_size = inputs.size(); + + cudaStream_t stream = this->_ctx->get_compute_stream(); + + for (int input_id = 0; input_id < input_size; ++input_id) { + std::vector> offset_vec = inputs[input_id]->get_seq_offset(); + std::vector offset = offset_vec[offset_vec.size() - 1]; + int word_sum = offset[offset.size() - 1]; + utils::try_expand_tensor(_offset_map_vec[input_id], word_sum); + utils::try_expand_tensor(_offset_map_cu_vec[input_id], word_sum); + int* offset_map_ptr = static_cast(_offset_map_vec[input_id].mutable_data()); + int* offset_map_cu_ptr = static_cast(_offset_map_cu_vec[input_id].mutable_data()); + + for (int sequence_id = 0; sequence_id < offset.size() - 1; sequence_id++) { + int start = offset[sequence_id]; + int end = offset[sequence_id + 1] - 1; + + for (int index = 0; index <= end - start; index++) { + offset_map_ptr[end - index] = start + index; } } - 
CUDA_CHECK(cudaMemcpyAsync(offset_map_cu_ptr,offset_map_ptr, sizeof(int)*word_sum,cudaMemcpyHostToDevice,stream)); - int block_dim=256; - if(word_sum(inputs[input_id]->data()); - OpDataType* out=static_cast(outputs[input_id]->mutable_data()); - ker_reverse_input<<>>(in,out,word_sum,offset_map_cu_ptr); + + int grid_dim = utils::div_up(word_sum, block_dim); + const OpDataType* in = static_cast(inputs[input_id]->data()); + OpDataType* out = static_cast(outputs[input_id]->mutable_data()); + ker_reverse_input <<< grid_dim, block_dim, 0, stream>>>(in, out, word_sum, offset_map_cu_ptr); } return SaberSuccess; diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_sequence.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_sequence.cu index a0bb556b2..f04d0e573 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_sequence.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_reverse_sequence.cu @@ -8,78 +8,84 @@ namespace saber { template SaberStatus SaberReverseSequence::init(const std::vector& inputs, - std::vector& outputs, - EmptyParam ¶m, - Context &ctx) { - this->_ctx=&ctx; + std::vector& outputs, + EmptyParam& param, + Context& ctx) { + this->_ctx = &ctx; - return create(inputs,outputs,param,ctx); + return create(inputs, outputs, param, ctx); }; template SaberStatus SaberReverseSequence::create(const std::vector& inputs, - std::vector& outputs, - EmptyParam ¶m, - Context &ctx) { - if(this->_ctx=&ctx){ - this->_ctx=&ctx; + std::vector& outputs, + EmptyParam& param, + Context& ctx) { + if (this->_ctx = &ctx) { + this->_ctx = &ctx; } - int input_size=inputs.size(); - CHECK_EQ(input_size,1)<<"only support one input now"; + + int input_size = inputs.size(); + CHECK_EQ(input_size, 1) << "only support one input now"; return SaberSuccess; }; -static inline int round_up(int k, int c) { - return ((k + c - 1) / c); -} template -__global__ static void ker_reverse_sequence(const Dtype* in,Dtype* out,int length,int word_size,int* offset){ - int tid=blockIdx.x*blockDim.x+threadIdx.x; - if(tid SaberStatus SaberReverseSequence::dispatch(const std::vector& inputs, - std::vector& outputs, - EmptyParam ¶m) { - int input_size=inputs.size(); - CHECK_EQ(input_size,1)<<"only support one input now"; + std::vector& outputs, + EmptyParam& param) { + int input_size = inputs.size(); + CHECK_EQ(input_size, 1) << "only support one input now"; - cudaStream_t stream=this->_ctx->get_compute_stream(); - std::vector> offset_vec=inputs[0]->get_seq_offset(); - std::vector offset=offset_vec[offset_vec.size()-1]; + cudaStream_t stream = this->_ctx->get_compute_stream(); + std::vector> offset_vec = inputs[0]->get_seq_offset(); + std::vector offset = offset_vec[offset_vec.size() - 1]; - int batch_size=offset.size()-1; - int word_size=inputs[0]->valid_shape()[1]; - int word_sum=offset[batch_size]; + int batch_size = offset.size() - 1; + int word_size = inputs[0]->valid_shape()[1]; + int word_sum = offset[batch_size]; - utils::try_expand_tensor(_offset_map,word_sum); - utils::try_expand_tensor(_offset_map_cu,word_sum); - int* offset_map_ptr= static_cast(_offset_map.mutable_data()); - int* offset_map_cu_ptr= static_cast(_offset_map_cu.mutable_data()); + utils::try_expand_tensor(_offset_map, word_sum); + utils::try_expand_tensor(_offset_map_cu, word_sum); + int* offset_map_ptr = static_cast(_offset_map.mutable_data()); + int* offset_map_cu_ptr = static_cast(_offset_map_cu.mutable_data()); for (int i = 0; i < batch_size; i++) { int seq_len = offset[i + 1] - offset[i]; - int start_word_id=offset[i]; + int start_word_id = 
offset[i];
+        for (int j = 0; j < seq_len; j++) {
-            offset_map_ptr[start_word_id+seq_len-1-j]=start_word_id+j;
+            offset_map_ptr[start_word_id + seq_len - 1 - j] = start_word_id + j;
         }
     }
-    CUDA_CHECK(cudaMemcpyAsync(offset_map_cu_ptr,offset_map_ptr, sizeof(int)*word_sum,cudaMemcpyHostToDevice,stream));
-    int tid_sum=word_sum*word_size;
-    int block_dim=256;
-    if(tid_sum(inputs[0]->data());
-    OpDataType* out=static_cast<OpDataType*>(outputs[0]->mutable_data());
-    ker_reverse_sequence<<>>(in,out,tid_sum,word_size,offset_map_cu_ptr);
+
+    int grid_dim = utils::div_up(tid_sum, block_dim);
+    const OpDataType* in = static_cast<const OpDataType*>(inputs[0]->data());
+    OpDataType* out = static_cast<OpDataType*>(outputs[0]->mutable_data());
+    ker_reverse_sequence <<< grid_dim, block_dim, 0, stream>>>(in, out, tid_sum, word_size,
+            offset_map_cu_ptr);
     return SaberSuccess;
diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_roi_align.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_roi_align.cu
new file mode 100644
index 000000000..4917758d2
--- /dev/null
+++ b/saber/funcs/impl/cuda/base/cuda_c/saber_roi_align.cu
@@ -0,0 +1,145 @@
+#include "saber/funcs/impl/cuda/saber_roi_align.h"
+#include "saber/core/tensor_op.h"
+// #include "cuda_fp16.h"
+// #include 
+
+namespace anakin {
+
+namespace saber {
+
+//The Bilinear interpolation
+template <typename dtype>
+__device__ dtype BilinearInterpolate(const dtype* input_data, const int height,
+                                     const int width, dtype y, dtype x) {
+    if (y < -1.0 || y > height || x < -1.0 || x > width) {
+        return 0;
+    }
+    y = y <= 0 ? 0 : y;
+    x = x <= 0 ? 0 : x;
+    int y_low = static_cast<int>(y);
+    int x_low = static_cast<int>(x);
+    int y_high;
+    int x_high;
+    if (y_low >= height - 1) {
+        y_high = y_low = height - 1;
+        y = static_cast<dtype>(y_low);
+    } else {
+        y_high = y_low + 1;
+    }
+    if (x_low >= width - 1) {
+        x_high = x_low = width - 1;
+        x = static_cast<dtype>(x_low);
+    } else {
+        x_high = x_low + 1;
+    }
+    dtype ly = y - y_low, lx = x - x_low;
+    dtype hy = 1. - ly, hx = 1. - lx;
+
+    dtype v1 = input_data[y_low * width + x_low];
+    dtype v2 = input_data[y_low * width + x_high];
+    dtype v3 = input_data[y_high * width + x_low];
+    dtype v4 = input_data[y_high * width + x_high];
+    dtype w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+    dtype val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+    return val;
+}
+
+
+template <typename dtype>
+__global__ void kernel_roi_align(const dtype* src,
+                                 const dtype* input_rois,
+                                 dtype* dst,
+                                 const int in_n_stride,
+                                 const int in_c_stride,
+                                 const int in_h_stride,
+                                 const int in_w_stride,
+                                 const int out_n_stride,
+                                 const int out_c_stride,
+                                 const int out_h_stride,
+                                 const int out_w_stride,
+                                 const int in_c,
+                                 const int in_h,
+                                 const int in_w,
+                                 const int pooled_height,
+                                 const int pooled_width,
+                                 const int sampling_ratio,
+                                 const int kROISize,
+                                 const int num_threads,
+                                 const dtype spatial_scale) {
+    CUDA_KERNEL_LOOP(tid, num_threads) {
+        int n = tid / out_n_stride;
+        int c = (tid / out_c_stride) % in_c;
+        int ph = (tid / pooled_width) % pooled_height;
+        int pw = tid % pooled_width;
+
+        const dtype* offset_input_rois = input_rois + n * kROISize;
+        int roi_batch_id = offset_input_rois[0];
+        dtype roi_xmin = offset_input_rois[1] * spatial_scale;
+        dtype roi_ymin = offset_input_rois[2] * spatial_scale;
+        dtype roi_xmax = offset_input_rois[3] * spatial_scale;
+        dtype roi_ymax = offset_input_rois[4] * spatial_scale;
+
+        dtype roi_width = fmaxf(roi_xmax - roi_xmin, 1.0f);
+        dtype roi_height = fmaxf(roi_ymax - roi_ymin, 1.0f);
+        dtype bin_size_h = static_cast<dtype>(roi_height) / static_cast<dtype>(pooled_height);
+        dtype bin_size_w = static_cast<dtype>(roi_width) / static_cast<dtype>(pooled_width);
+
+        const dtype* offset_src = src + roi_batch_id * in_n_stride + c * in_c_stride;
+        int roi_bin_grid_h = sampling_ratio > 0? sampling_ratio : ceil(roi_height / pooled_height);
+        int roi_bin_grid_w = sampling_ratio > 0? sampling_ratio : ceil(roi_width / pooled_width);
+        const int sample_count = roi_bin_grid_h * roi_bin_grid_w;
+        dtype val = 0;
+        for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
+            dtype y = roi_ymin + ph * bin_size_h +
+                      static_cast<dtype>(iy + 0.5f) * bin_size_h / static_cast<dtype>(roi_bin_grid_h);
+            for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
+                dtype x = roi_xmin + pw * bin_size_w +
+                          static_cast<dtype>(ix + 0.5f) * bin_size_w / static_cast<dtype>(roi_bin_grid_w);
+                dtype tmp = BilinearInterpolate(offset_src, in_h, in_w, y, x);
+                val += tmp;
+            }
+        }
+        val /= sample_count;
+        dst[tid] = val;
+    }
+}
+
+template <DataType OpDtype>
+SaberStatus SaberRoiAlign<NV, OpDtype>::dispatch(\
+    const std::vector<Tensor<NV> *>& inputs, \
+    std::vector<Tensor<NV> *>& outputs, \
+    RoiAlignParam<NV>& param) {
+
+    const OpDataType* in_data = (const OpDataType*)inputs[0]->data();
+    const OpDataType* in_rois = (const OpDataType*)inputs[1]->data();
+    OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data();
+    cudaStream_t cuda_stream = this->_ctx->get_compute_stream();
+    int count = outputs[0]->valid_size();
+    int out_n = outputs[0]->num();
+    int out_c = outputs[0]->channel();
+    int out_h = outputs[0]->height();
+    int out_w = outputs[0]->width();
+    int in_n = inputs[0]->num();
+    int in_c = inputs[0]->channel();
+    int in_h = inputs[0]->height();
+    int in_w = inputs[0]->width();
+
+    if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) {
+        kernel_roi_align<OpDataType>\
+                <<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, cuda_stream>>>(\
+                in_data, in_rois, out_data, \
+                _in_n_stride, _in_c_stride, _in_h_stride, _in_w_stride,\
+                _out_n_stride, _out_c_stride, _out_h_stride, _out_w_stride,\
+                in_c, in_h, in_w,
+                param.pooled_height, param.pooled_width, param.sampling_ratio, \
+                _kROISize, count, param.spatial_scale);
+    }
+    return SaberSuccess;
+}
+
+template class SaberRoiAlign<NV, AK_FLOAT>;
+DEFINE_OP_TEMPLATE(SaberRoiAlign, RoiAlignParam, NV, AK_HALF);
+DEFINE_OP_TEMPLATE(SaberRoiAlign, RoiAlignParam, NV, AK_INT8);
+}
+}
diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu
index 454f0d481..c869bd4b3 100755
--- a/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu
+++ b/saber/funcs/impl/cuda/base/cuda_c/saber_scale.cu
@@ -69,6 +69,7 @@ SaberStatus SaberScale::dispatch( \
     }
     CUDA_POST_KERNEL_CHECK;
+    outputs[0]->set_seq_offset(inputs[0]->get_seq_offset());
     return SaberSuccess;
 }
diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_concat.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_concat.cu
new file mode 100644
index 000000000..04b16acbd
--- /dev/null
+++ b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_concat.cu
@@ -0,0 +1,139 @@
+#include "saber/funcs/impl/cuda/saber_sequence_concat.h"
+#include "saber/core/tensor_op.h"
+#define BUILD_DEV __device__
+
+namespace anakin{
+namespace saber{
+
+template <typename Dtype>
+__global__ void ker_sequence_concat_fwd(Dtype * out_data,
+                                        const uint64_t* in_locate_data,
+                                        const int* o2i_map,
+                                        const int* o2i_w_map,
+                                        const int seq_num,
+                                        const int emb_size,
+                                        const int count) {
+    CUDA_KERNEL_LOOP(tid, count) {
+        int emb_id = tid % emb_size;
+        int word_id = tid / emb_size;
+        int input_id = o2i_map[word_id];
+        int cur_word_id = o2i_w_map[word_id];
+        const Dtype* in_data = (const Dtype*)(in_locate_data[input_id]);
+        out_data[tid] = in_data[cur_word_id * emb_size + emb_id];
+    }
+}
+
+
+template <>
+SaberStatus SaberSequenceConcat<NV, AK_FLOAT>::create( \
+    const std::vector<Tensor<NV>*>& inputs,
+    std::vector<Tensor<NV>*>& outputs,
+    SequenceConcatParam<NV>& param, Context<NV>& ctx) {
+
+    this->_ctx = &ctx;
+    return SaberSuccess;
+}
+
+template <>
+SaberStatus SaberSequenceConcat<NV, AK_FLOAT>::init( \
+    const std::vector<Tensor<NV>*>& 
inputs, + std::vector*>& outputs, + SequenceConcatParam& param, Context& ctx) { + int out_num = 0; + for (int i = 0; i < inputs.size(); i++) { + out_num += inputs[i]->num(); + } + Shape shape({out_num, 1, 1, 1}, Layout_NCHW); + _out2in_map_tensor.re_alloc(shape, AK_INT32); + _out2in_word_map_tensor.re_alloc(shape, AK_INT32); + + int in_num = inputs.size(); + Shape in_locate_shape({in_num, 1, 1, 1}, Layout_NCHW); + _in_locate_tensor.re_alloc(in_locate_shape, AK_UINT64); + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberSequenceConcat::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam& param) { +/* + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + const int emb_size = inputs[0]->valid_size() / inputs[0]->num(); + float *output_data = (float*)outputs[0]->mutable_data(); + for (int i = 0; i < seq_num; i++) { + for (int j = 0; j < inputs.size(); j++) { + size_t cur_len = inputs[j]->get_seq_offset()[0][i+1] - inputs[j]->get_seq_offset()[0][i]; + + const OpDataType *input_data = (const OpDataType*)inputs[j]->data() + inputs[j]->get_seq_offset()[0][i] * emb_size; + cudaMemcpyAsync(output_data, input_data, sizeof(OpDataType) * cur_len * emb_size, cudaMemcpyDeviceToDevice, cuda_stream); + output_data += cur_len * emb_size; + } + } +*/ + + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + const int emb_size = inputs[0]->valid_size() / inputs[0]->num(); + for (int i = 1; i < inputs.size(); i++) { + int cur_emb_size = inputs[i]->valid_size() / inputs[i]->num(); + int cur_seq_num = inputs[i]->get_seq_offset()[0].size() - 1; + CHECK_EQ(emb_size, cur_emb_size) << "sequence concat emb size must be the same"; + CHECK_EQ(seq_num, cur_seq_num) << "sequence concat seq num must be the same"; + } + + float *out_data = (float*)outputs[0]->mutable_data(); + std::vector in_locate_vec; + for (int i = 0; i < inputs.size(); i++) { + //in_locate_vec.push_back(static_cast(inputs[i]->data())); + in_locate_vec.push_back((uint64_t)(inputs[i]->data())); + } + std::vector out2in_map; + std::vector out2in_word_map; + for (int i = 0; i < seq_num; i++) { + for (int j = 0; j < inputs.size(); j++) { + auto offset = inputs[j]->get_seq_offset()[0]; + int cur_len = offset[i+1] - offset[i]; + for (int k = 0; k < cur_len; k++) { + out2in_map.push_back(j); + out2in_word_map.push_back(offset[i] + k); + } + } + } + int word_num = out2in_map.size(); + Shape o2i_map_shape({word_num, 1, 1, 1}, Layout_NCHW); + _out2in_map_tensor.reshape(o2i_map_shape); + _out2in_word_map_tensor.reshape(o2i_map_shape); + + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + int* gpu_o2i_map_data = (int *)_out2in_map_tensor.mutable_data(); + int* gpu_o2i_w_map_data = (int *)_out2in_word_map_tensor.mutable_data(); + uint64_t* gpu_in_locate_data = (uint64_t*)_in_locate_tensor.mutable_data(); + + cudaMemcpyAsync(gpu_o2i_map_data, &out2in_map[0], sizeof(int) * out2in_map.size(), cudaMemcpyHostToDevice, cuda_stream); + cudaMemcpyAsync(gpu_o2i_w_map_data, &out2in_word_map[0], sizeof(int) * out2in_word_map.size(), cudaMemcpyHostToDevice, cuda_stream); + cudaMemcpyAsync(gpu_in_locate_data, &in_locate_vec[0], sizeof(uint64_t) * in_locate_vec.size(), cudaMemcpyHostToDevice, cuda_stream); + + + int count = inputs[0]->valid_size(); + for (int i = 1; i < inputs.size(); i++) { + count += inputs[i]->valid_size(); + } + ker_sequence_concat_fwd + <<>>( + out_data, gpu_in_locate_data, 
gpu_o2i_map_data, gpu_o2i_w_map_data, + seq_num, emb_size, count); + + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + +template class SaberSequenceConcat; +DEFINE_OP_TEMPLATE(SaberSequenceConcat, SequenceConcatParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequenceConcat, SequenceConcatParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_depadding.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_depadding.cu new file mode 100644 index 000000000..799d8a842 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_depadding.cu @@ -0,0 +1,92 @@ +#include "saber/funcs/impl/cuda/saber_sequence_depadding.h" +#include "saber/core/tensor_op.h" +#define BUILD_DEV __device__ + +namespace anakin{ +namespace saber{ + +template +__global__ void ker_sequence_depadding_fwd(Dtype * out_data, + const Dtype* in_data, + const int* seq_id_map, + const int seq_num, + const int max_len, + const int emb_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = seq_id_map[word_id]; + out_data[tid] = in_data[seq_id * emb_size + emb_id]; + } +} + +template +SaberStatus SaberSequenceDePadding::create( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam& param, Context& ctx) { + + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberSequenceDePadding::init( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam& param, Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberSequenceDePadding::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam& param) { + + const OpDataType *in_data = (const OpDataType*)inputs[0]->data(); + OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); + + const int count = outputs[0]->valid_size(); + + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + int max_len = inputs[0]->get_seq_offset()[0][1]; + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + int emb_size = inputs[0]->count_valid(1, inputs[0]->dims()); + + auto src_seq_offset = inputs[1]->get_seq_offset()[0]; + auto pad_seq_offset = inputs[0]->get_seq_offset()[0]; + std::vector seq_id_map; + for (int i = 0;i < seq_num; i++) { + int cur_len = src_seq_offset[i+1] - src_seq_offset[i]; + for (int j = 0; j < cur_len; j++) { + seq_id_map.push_back(i * max_len + j); + } + } + int map_size = seq_id_map.size(); + _seq_id_map.reshape(Shape({map_size, 1, 1, 1}, Layout_NCHW)); + int* seq_id_map_data = (int*)_seq_id_map.mutable_data(); + cudaMemcpyAsync(seq_id_map_data, &seq_id_map[0], sizeof(int) * seq_id_map.size(), cudaMemcpyHostToDevice, cuda_stream); + + ker_sequence_depadding_fwd<<>>(out_data, + in_data, + seq_id_map_data, + seq_num, + max_len, + emb_size, + count); + + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + + +template class SaberSequenceDePadding; +DEFINE_OP_TEMPLATE(SaberSequenceDePadding, SequenceDePaddingParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequenceDePadding, SequenceDePaddingParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_padding.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_padding.cu new file mode 100644 index 000000000..858566cda --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_padding.cu @@ -0,0 +1,89 @@ +#include "saber/funcs/impl/cuda/saber_sequence_padding.h" +#include 
"saber/core/tensor_op.h" +#define BUILD_DEV __device__ + +namespace anakin{ +namespace saber{ + +template +__global__ void ker_sequence_padding_fwd(Dtype * out_data, + const Dtype* in_data, + const int* offset, + const int seq_num, + const int max_len, + const int emb_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = word_id / max_len; + int word_id_in_seq = word_id % max_len; + int cur_len = offset[seq_id + 1] - offset[seq_id]; + if (word_id_in_seq < cur_len) { + out_data[tid] = in_data[(offset[seq_id] + word_id_in_seq) * emb_size + emb_id]; + } else { + out_data[tid] = 0.f; + } + } +} + +template +SaberStatus SaberSequencePadding::create( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam& param, Context& ctx) { + + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberSequencePadding::init( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam& param, Context& ctx) { + + this->_ctx = &ctx; + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + Shape offset_shape({seq_num + 1, 1, 1, 1}, Layout_NCHW); + _in_seq_offset.re_alloc(offset_shape, AK_INT32); + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberSequencePadding::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam& param) { + + const OpDataType *in_data = (const OpDataType*)inputs[0]->data(); + OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); + + const int count = outputs[0]->valid_size(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + int max_len = outputs[0]->get_seq_offset()[0][1]; + int seq_num = outputs[0]->get_seq_offset()[0].size() - 1; + int emb_size = inputs[0]->count_valid(1, inputs[0]->dims()); + _in_seq_offset.reshape(Shape({seq_num+1, 1, 1, 1}, Layout_NCHW)); + int* offset_data = (int*)_in_seq_offset.mutable_data(); + auto in_seq_offset = inputs[0]->get_seq_offset()[0]; + cudaMemcpyAsync(offset_data, &in_seq_offset[0], sizeof(int) * in_seq_offset.size(), cudaMemcpyHostToDevice, cuda_stream); + + ker_sequence_padding_fwd<<>>(out_data, + in_data, + offset_data, + seq_num, + max_len, + emb_size, + count); + + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + + +template class SaberSequencePadding; +DEFINE_OP_TEMPLATE(SaberSequencePadding, SequencePaddingParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequencePadding, SequencePaddingParam, NV, AK_INT8); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_pool_concat.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_pool_concat.cu new file mode 100644 index 000000000..41be897c5 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_sequence_pool_concat.cu @@ -0,0 +1,100 @@ + +#include "core/common.h" +#include "saber/funcs/impl/cuda/saber_sequence_pool_concat.h" +#include "saber/saber_funcs_param.h" + +namespace anakin { +namespace saber { + +template <> +SaberStatus SaberSequencePoolConcat::create( + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, Context& ctx) { + if (inputs[0]->get_seq_offset().size() > 0 && inputs[0]->get_seq_offset()[0].size() > 0) { + auto offset = inputs[0]->get_seq_offset()[0]; + auto stream = _ctx->get_compute_stream(); + + _offset_buffer.re_alloc(offset.size() * sizeof(float)); + cudaMemcpyAsync(_offset_buffer.get_data_mutable(), offset.data(), + offset.size() * sizeof(float), cudaMemcpyHostToDevice, stream); + } + 
return SaberSuccess; +} + +template <> +SaberStatus SaberSequencePoolConcat::init( + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, Context& ctx) { + + _ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +__global__ +void sequence_pool_sum_concat(const float* input_data, + float* output_data, const int* offset, int n_total, int xdim) { + + int bid = blockIdx.x; + int tid = threadIdx.x; + int gid = bid * blockDim.x + tid; + int n_idx = gid / xdim; + int feature_num; + int x_idx = gid % xdim; + if (n_idx < n_total) { + feature_num = offset[n_idx + 1] - offset[n_idx]; + float* out_data = output_data + n_idx * xdim; + const float* in_data = input_data + offset[n_idx] * xdim; + float res = 0.f; + for (int i = 0; i < feature_num; ++i) { + res += in_data[x_idx]; + in_data += xdim; + } +// printf("gid = %d, feature_num = %d, n_idx = %d, xdim = %d feature_num = %d idx = %d\n", gid, feature_num, n_idx, xdim, feature_num, offset[n_idx] * xdim); + out_data[x_idx] = res; + } +} + +template <> +SaberStatus SaberSequencePoolConcat::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param) { + + CHECK_GE(inputs[0]->get_seq_offset().size(), 1); + auto offset = inputs[0]->get_seq_offset()[0]; + CHECK_GE(offset.size(), 1); + auto stream = _ctx->get_compute_stream(); + + int slot_num = param.slot_num; + int batch = (offset.size() - 1) / slot_num; + int xdim = outputs[0]->valid_size(); + CHECK_EQ((xdim % slot_num), 0) << "some data is wrong!!!" << xdim << " " << slot_num; + CHECK_GE(batch, 1); + xdim /= slot_num; + xdim /= batch; + int count = slot_num * batch * xdim; + + const float* in_data = (const float*)inputs[0]->data(); + float* out_data = (float*)outputs[0]->mutable_data(); + const int* offset_data = (const int*)_offset_buffer.get_data(); + switch (param.sequence_pool_param.sequence_pool_type) { + case Sequence_pool_sum: + sequence_pool_sum_concat<<>> ( + in_data, out_data, offset_data, slot_num * batch, xdim); + break; + default: + LOG(FATAL) << "not implemented yet!!!"; + break; + } + //cudaDeviceSynchronize(); + + return SaberSuccess; +} + +template class SaberSequencePoolConcat; +DEFINE_OP_TEMPLATE(SaberSequencePoolConcat, SequencePoolConcatParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequencePoolConcat, SequencePoolConcatParam, NV, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_slice_v2.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_slice_v2.cu new file mode 100644 index 000000000..fdfcd4f03 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_slice_v2.cu @@ -0,0 +1,119 @@ +#include "saber/funcs/impl/cuda/saber_slice_v2.h" + +namespace anakin{ + +namespace saber{ + +template +__global__ void slice_v2_impl_cuda(const int count, const dtype* in_data, + const int* in_stride_data, + const int* out_shape_data, + const int* starts_data, + const int* axes_data, + const int dims, + const int start_size, + const int in_outer_stride, + const int out_outer_stride, + const int inner, + dtype* out_data) { + CUDA_KERNEL_LOOP(tid, count) { + int inner_id = tid % inner; + int out_id = tid / out_outer_stride; + int in_offset = inner_id + out_id * in_outer_stride; + int new_i = tid / inner; + for (int k = start_size - 1; k >= 0; k--) { + int axes_id = axes_data[k]; + int cur_id = new_i % out_shape_data[axes_id]; + in_offset += (cur_id + starts_data[k]) * in_stride_data[axes_id]; + new_i /= out_shape_data[axes_id]; + } + + out_data[tid] = in_data[in_offset]; + } +} + 
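+// A rough host-side sketch of the index mapping used by slice_v2_impl_cuda
+// (illustrative only; the names mirror the kernel arguments, with axes/starts/
+// out_shape/in_stride standing in for the corresponding device arrays):
+//
+//   for (int tid = 0; tid < count; ++tid) {
+//       int in_offset = tid % inner + (tid / out_outer_stride) * in_outer_stride;
+//       int new_i = tid / inner;
+//       for (int k = start_size - 1; k >= 0; --k) {
+//           int axis = axes[k];
+//           in_offset += (new_i % out_shape[axis] + starts[k]) * in_stride[axis];
+//           new_i /= out_shape[axis];
+//       }
+//       out[tid] = in[in_offset];
+//   }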
+template +SaberStatus SaberSliceV2::create(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m, + Context &ctx) { + auto starts = param.starts; + auto ends = param.ends; + auto axes = param.axes; + CHECK_EQ(axes.size(), starts.size()) << "the size of axes and starts are not equal "; + CHECK_EQ(ends.size(), starts.size()) << "the size of starts and ends are not valid"; + std::vector starts_h; + std::vector ends_h; + starts_h.resize(starts.size()); + ends_h.resize(ends.size()); + Shape output_shape = inputs[0]->valid_shape(); + for (int i = 0; i < starts.size(); i++) { + int dim_value = output_shape[axes[i]]; + int start = starts[i] < 0 ? starts[i] + dim_value : starts[i]; + int end = ends[i] < 0 ? ends[i] + dim_value : ends[i]; + start = std::max(start, 0); + start = std::min(start, dim_value); + end = std::max(end, 0); + end = std::min(end, dim_value); + output_shape[axes[i]] = end - start; + starts_h[i] = start; + ends_h[i] = end; + } + auto in_stride = inputs[0]->get_stride(); + auto out_stride = outputs[0]->get_stride(); + Shape stride_shape({inputs[0]->dims(), 1, 1, 1}, Layout_NCHW); + _in_stride_d.re_alloc(stride_shape, AK_INT32); + _out_shape_d.re_alloc(stride_shape, AK_INT32); + int starts_size = param.starts.size(); + Shape start_shape({starts_size, 1, 1, 1}, Layout_NCHW); + _starts_d.re_alloc(start_shape, AK_INT32); + _axes_d.re_alloc(start_shape, AK_INT32); + int* in_stride_data = (int*)_in_stride_d.mutable_data(); + int* out_shape_data = (int*)_out_shape_d.mutable_data(); + int* starts_data = (int*)_starts_d.mutable_data(); + int* axes_data = (int*)_axes_d.mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + cudaMemcpyAsync(in_stride_data, &in_stride[0], sizeof(int) * in_stride.size(), cudaMemcpyHostToDevice, cuda_stream); + cudaMemcpyAsync(out_shape_data, &output_shape[0], sizeof(int) * output_shape.size() , cudaMemcpyHostToDevice, cuda_stream); + cudaMemcpyAsync(starts_data, &starts_h[0], sizeof(int) * starts_size, + cudaMemcpyHostToDevice, cuda_stream); + cudaMemcpyAsync(axes_data, ¶m.axes[0], sizeof(int) * starts_size, + cudaMemcpyHostToDevice, cuda_stream); + return SaberSuccess; +} + + +template +SaberStatus SaberSliceV2::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + SliceV2Param& param) { + + cudaStream_t stream = this->_ctx->get_compute_stream(); + //! 
inputs only has one tensor + Shape shape_in = inputs[0]->valid_shape(); + + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); + int* in_stride_data = (int*)_in_stride_d.mutable_data(); + int* out_shape_data = (int*)_out_shape_d.mutable_data(); + int* starts_data = (int*)_starts_d.mutable_data(); + int* axes_data = (int*)_axes_d.mutable_data(); + const int count = outputs[0]->valid_size(); + int inner = inputs[0]->count_valid(param.axes.back() + 1, inputs[0]->dims()); + int out_outer_stride = outputs[0]->count_valid(param.axes[0], outputs[0]->dims()); + int in_outer_stride = inputs[0]->count_valid(param.axes[0], inputs[0]->dims()); + int start_size = param.starts.size(); + slice_v2_impl_cuda<<>>( + count, in_data, in_stride_data, out_shape_data, + starts_data, axes_data, inputs[0]->dims(), start_size, + in_outer_stride, out_outer_stride, + inner, out_data); + return SaberSuccess; + +} +DEFINE_OP_TEMPLATE(SaberSliceV2, SliceV2Param, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSliceV2, SliceV2Param, NV, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_soft_sign.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_soft_sign.cu new file mode 100644 index 000000000..da6461f89 --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_soft_sign.cu @@ -0,0 +1,43 @@ +#include "saber/funcs/impl/cuda/saber_soft_sign.h" +#include "cuda_fp16.h" + +namespace anakin{ +namespace saber{ + +template +__global__ void ker_soft_sign_fwd(Dtype * out_data, + const Dtype* in_data, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + Dtype in_var = in_data[tid]; + Dtype in_abs = in_var > 0 ? in_var : -in_var; + out_data[tid] = in_var / (in_abs + (Dtype)1.f); + } +} + +template +SaberStatus SaberSoftSign::dispatch( \ + const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam& param) { + + const OpDataType *in_data = (const OpDataType*)inputs[0]->data(); + OpDataType *out_data = (OpDataType*)outputs[0]->mutable_data(); + + const int count = inputs[0]->valid_size(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + + //y = x / (x + 1) + ker_soft_sign_fwd + <<>>( + out_data, in_data, count); + + CUDA_POST_KERNEL_CHECK; + return SaberSuccess; +} + +template class SaberSoftSign; +DEFINE_OP_TEMPLATE(SaberSoftSign, SoftSignParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberSoftSign, SoftSignParam, NV, AK_HALF); +} +} diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu index 3dc92608f..3f8e827e2 100755 --- a/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_softmax.cu @@ -293,19 +293,80 @@ __global__ void sharemem_softmax_roi_kernel(int total_size, \ } } -template -SaberStatus SaberSoftmax::dispatch(\ - const std::vector& inputs, \ - std::vector& outputs, \ +template <> +SaberStatus SaberSoftmax::create( + const std::vector *>& inputs, + std::vector *>& outputs, + SoftmaxParam& param, Context& ctx) { + + //! 
compute size + Shape shape_in = inputs[0]->valid_shape(); + Shape shape_out = outputs[0]->valid_shape(); + CHECK_EQ(shape_in == shape_out, true) << "valid shapes must be the same"; + _outer_num = inputs[0]->count_valid(0, param.axis); + _inner_num = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); + _axis_size = shape_in[param.axis]; + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, API::get_device_id()); + size_t sharedmem_size = deviceProp.sharedMemPerBlock; + _max_dimsize = sharedmem_size / sizeof(float) / CUDA_NUM_THREADS; + + Shape sh_tmp({1, 1, 1, _outer_num * _inner_num}); + if (_axis_size > _max_dimsize){ + //! re_alloc device memory + _max_data.reshape(sh_tmp); + _sum_data.reshape(sh_tmp); + } + + //! CHECK whether the input or output tensor is with continuous buffer or not + _is_continue_buf = outputs[0]->is_continue_mem() && inputs[0]->is_continue_mem(); + _dims = shape_in.size(); + if (!_is_continue_buf) { + Shape sh_input_real_stride = inputs[0]->get_stride(); + Shape sh_output_real_stride = outputs[0]->get_stride(); + + //! re_alloc device memory + Shape sh({1, 1, 1, _dims}); + _valid_shape.reshape(sh); + _input_stride.reshape(sh); + _output_stride.reshape(sh); + + CUDA_CHECK(cudaMemcpy(_valid_shape.mutable_data(), inputs[0]->valid_shape().data(), \ + sizeof(int) * _dims, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(_input_stride.mutable_data(), sh_input_real_stride.data(), \ + sizeof(int) * _dims, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(_output_stride.mutable_data(), sh_output_real_stride.data(), \ + sizeof(int) * _dims, cudaMemcpyHostToDevice)); + } + return SaberSuccess; +} + +template <> +SaberStatus SaberSoftmax::init( + const std::vector *>& inputs, + std::vector *>& outputs, + SoftmaxParam& param, Context& ctx) { + + //! get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + + +template <> +SaberStatus SaberSoftmax::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ SoftmaxParam& param) { cudaStream_t stream = this->_ctx->get_compute_stream(); //! inputs only has one tensor int total_threads = this->_inner_num * this->_outer_num; - const OpDataType* data_in = (const OpDataType* )inputs[0]->data(); - OpDataType* data_out = (OpDataType*)outputs[0]->mutable_data(); - OpDataType* max_data = (OpDataType*)this->_max_data.mutable_data(); - OpDataType* sum_data = (OpDataType*)this->_sum_data.mutable_data(); + const float* data_in = (const float* )inputs[0]->data(); + float* data_out = (float*)outputs[0]->mutable_data(); + float* max_data = (float*)this->_max_data.mutable_data(); + float* sum_data = (float*)this->_sum_data.mutable_data(); const int* valid_shape = (const int*)_valid_shape.data(); const int* input_stride = (const int*)_input_stride.data(); const int* output_stride = (const int*)_output_stride.data(); @@ -313,25 +374,25 @@ SaberStatus SaberSoftmax::dispatch(\ if (_is_continue_buf) { //! softmax kernel without roi if (this->_axis_size <= _max_dimsize){ - int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(OpDataType); - sharemem_softmax_kernel\ + int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(float); + sharemem_softmax_kernel\ <<>>( total_threads, data_in, data_out, this->_inner_num, this->_outer_num, this->_axis_size); } else { //! 
firstly, get maximum data - OpDataType min_data = std::numeric_limits::min(); - softmax_max_kernel\ + float min_data = std::numeric_limits::min(); + softmax_max_kernel\ <<>>( total_threads, data_in, max_data, min_data, \ this->_inner_num, this->_outer_num, this->_axis_size); //! then, compute exp and sum data - softmax_sub_exp_sum_kernel + softmax_sub_exp_sum_kernel <<>>( total_threads, data_in, data_out, max_data, sum_data, \ this->_inner_num, this->_outer_num, this->_axis_size); //! lastly, compute divided output - softmax_divid_output_kernel\ + softmax_divid_output_kernel\ <<>>( total_threads, data_out, sum_data, \ this->_inner_num, this->_outer_num, this->_axis_size); @@ -339,28 +400,28 @@ SaberStatus SaberSoftmax::dispatch(\ } else { //! softmax kernel with roi if (this->_axis_size <= _max_dimsize){ - int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(OpDataType); - sharemem_softmax_roi_kernel\ + int sharemem_size = this->_axis_size * CUDA_NUM_THREADS * sizeof(float); + sharemem_softmax_roi_kernel\ <<>>( total_threads, data_in, data_out, input_stride, output_stride, valid_shape, \ param.axis, _axis_size, _dims); } else { //! firstly, get maximum data - OpDataType min_data = std::numeric_limits::min(); - softmax_max_roi_kernel\ + float min_data = std::numeric_limits::min(); + softmax_max_roi_kernel\ <<>>( total_threads, data_in, max_data, min_data, \ input_stride, output_stride, valid_shape, \ param.axis, _axis_size, _dims); //! then, compute exp and sum data - softmax_sub_exp_sum_roi_kernel + softmax_sub_exp_sum_roi_kernel <<>>( total_threads, data_in, data_out, max_data, sum_data, \ input_stride, output_stride, valid_shape, \ param.axis, _axis_size, _dims); //! lastly, compute divided output - softmax_divid_output_roi_kernel\ + softmax_divid_output_roi_kernel\ <<>>( total_threads, data_out, sum_data, \ input_stride, output_stride, valid_shape, \ @@ -368,11 +429,41 @@ SaberStatus SaberSoftmax::dispatch(\ } } - //outputs[0]->record_event(stream); return SaberSuccess; } + +// ============================================= int8 +template <> +SaberStatus SaberSoftmax::create( + const std::vector *>& inputs, + std::vector *>& outputs, + SoftmaxParam& param, Context& ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberSoftmax::init( + const std::vector *>& inputs, + std::vector *>& outputs, + SoftmaxParam& param, Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberSoftmax::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + SoftmaxParam& param) { + + return SaberSuccess; +} + +template class SaberSoftmax; +template class SaberSoftmax; DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, NV, AK_HALF); -DEFINE_OP_TEMPLATE(SaberSoftmax, SoftmaxParam, NV, AK_INT8); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/cuda/base/cuda_c/saber_yolo_box.cu b/saber/funcs/impl/cuda/base/cuda_c/saber_yolo_box.cu new file mode 100644 index 000000000..6e7e2a6fa --- /dev/null +++ b/saber/funcs/impl/cuda/base/cuda_c/saber_yolo_box.cu @@ -0,0 +1,171 @@ + +#include "saber/funcs/impl/cuda/saber_yolo_box.h" + +namespace anakin { +namespace saber { + +namespace { +__device__ +inline float sigmoid(float x) { + return 1.f / (1.f + std::exp(-x)); +} +__device__ +inline void get_yolo_box(float* box, const float* x, const int* anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride, + int img_height, int img_width) { + + box[0] = (i + sigmoid(x[index])) 
* img_width / grid_size; + box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size; +} +__device__ +inline int get_entry_index(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} +__device__ +inline void calc_detection_box(float* boxes, float* box, const int box_idx, + const int img_height, + const int img_width) { + + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : static_cast(img_height - 1); +} +__device__ +inline void calc_label_score(float* scores, const float* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} +} + +__global__ void ker_yolo_box(const float* input, const float* imgsize, float* boxes, + float* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + float box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, grid_num, 4); + float conf = sigmoid(input[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, grid_num, 0); + get_yolo_box(box, input, anchors, l, k, j, h, input_size, box_idx, + grid_num, img_height, img_width); + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + calc_detection_box(boxes, box, box_idx, img_height, img_width); + + int label_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + calc_label_score(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template <> +SaberStatus SaberYoloBox::create( + const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam& param, Context& ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberYoloBox::init( + const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam& param, Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberYoloBox::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam& param) { + + auto* input = inputs[0]; + auto* img_size = inputs[1]; + auto* boxes = outputs[0]; + auto* scores 
= outputs[1]; + + auto anchors = param.anchors; + int class_num = param.class_num; + float conf_thresh = param.conf_thresh; + int downsample_ratio = param.downsample_ratio; + + const int n = input->num(); + const int h = input->height(); + const int w = input->width(); + const int box_num = boxes->valid_shape()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + Buffer _anchors_buf; + _anchors_buf.re_alloc(sizeof(int) * anchors.size()); + + cudaMemcpyAsync(_anchors_buf.get_data_mutable(), anchors.data(), + sizeof(int) * anchors.size(), cudaMemcpyHostToDevice, _ctx->get_compute_stream()); + + const float* input_data = (const float*)input->data(); + const float* imgsize_data = (const float*)img_size->data(); + float* boxes_data = (float*)boxes->mutable_data(); + float* scores_data =(float*)scores->mutable_data(); + + int grid_dim = (n * box_num + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; + + ker_yolo_box<<get_compute_stream()>>>( + input_data, imgsize_data, boxes_data, scores_data, conf_thresh, + (const int*)_anchors_buf.get_data(), n, h, w, an_num, class_num, box_num, input_size); + + return SaberSuccess; +} + +template class SaberYoloBox; +DEFINE_OP_TEMPLATE(SaberYoloBox, YoloBoxParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberYoloBox, YoloBoxParam, NV, AK_INT8); + +} // namespace saber. +} // namespace anakin. diff --git a/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu b/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu index d0a103c51..92fea4dcf 100644 --- a/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu +++ b/saber/funcs/impl/cuda/base/cuda_c/tensor_op_cuda.cu @@ -221,13 +221,16 @@ double tensor_mean_value(Tensor& tensor, typename Tensor::API::strea tensor.set_shape(tensor.shape()); tvalid.copy_from(tensor); tensor.set_shape(valid_shape); + tvalid.set_scale(tensor.get_scale()); return tensor_mean_value(tvalid, stream); } template<> double tensor_mean_value_valid(Tensor& tensor, typename Tensor::API::stream_t stream) { - Tensor tvalid(tensor.valid_shape()); + Tensor tvalid; + tvalid.re_alloc(tensor.valid_shape(), tensor.get_dtype()); tvalid.copy_from(tensor); + tvalid.set_scale(tensor.get_scale()); return tensor_mean_value(tvalid, stream); } #endif diff --git a/saber/funcs/impl/cuda/cuda_utils.h b/saber/funcs/impl/cuda/cuda_utils.h index a6246e4e4..6c7f32b88 100644 --- a/saber/funcs/impl/cuda/cuda_utils.h +++ b/saber/funcs/impl/cuda/cuda_utils.h @@ -141,11 +141,11 @@ class SeqSortedseqTranseUtil { int target_word_id = 0; std::vector length_vec_cnt = length_vec; - + int last_batch_size = batch_size; for (int word_id_in_seq = 0; word_id_in_seq < max_len; word_id_in_seq++) { _emit_offset_vec[word_id_in_seq] = target_word_id; - for (int batch_id = 0; batch_id < batch_size; batch_id++) { + for (int batch_id = 0; batch_id < last_batch_size; batch_id++) { int old_batch_id = _length_index[batch_id]; if (length_vec_cnt[old_batch_id] > 0) { @@ -157,10 +157,11 @@ class SeqSortedseqTranseUtil { int old_word_id = offset_vec[old_batch_id] + inner_word_id_in_seq; _map_vec[old_word_id] = target_word_id; + // printf("map %d -> %d\n",old_word_id,target_word_id); length_vec_cnt[old_batch_id]--; target_word_id++; } else { - + last_batch_size--; break; } } diff --git a/saber/funcs/impl/cuda/cudnn_helper.h b/saber/funcs/impl/cuda/cudnn_helper.h index 357a8e23e..8a4a74e53 100644 --- a/saber/funcs/impl/cuda/cudnn_helper.h +++ b/saber/funcs/impl/cuda/cudnn_helper.h @@ -156,24 +156,6 @@ class cudnnTypeWrapper { return &v; } }; -template -inline void 
createReduceTensorDesc(cudnnReduceTensorDescriptor_t* desc) { - CUDNN_CHECK(cudnnCreateReduceTensorDescriptor(desc)); -} - -template -inline void setReduceTensorDesc(cudnnReduceTensorDescriptor_t* desc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType) { - CUDNN_CHECK(cudnnSetReduceTensorDescriptor(*desc, - reduceTensorOp, - cudnnTypeWrapper::type, - reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType)); -} template inline void createTensorDesc(cudnnTensorDescriptor_t* desc) { diff --git a/saber/funcs/impl/cuda/reorder.h b/saber/funcs/impl/cuda/reorder.h new file mode 100644 index 000000000..e9e990dce --- /dev/null +++ b/saber/funcs/impl/cuda/reorder.h @@ -0,0 +1,26 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_REORDER_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_REORDER_H + +#include "saber/core/common.h" +#include "saber/core/tensor.h" +#include "saber/core/context.h" + +namespace anakin { +namespace saber { + +template +SaberStatus convert_nchw_to_nchwc4( + Tensor &out_tensor, + const Tensor &in_tensor, + Context ctx); + +template +SaberStatus convert_nchwc4_to_nchw( + Tensor &out_tensor, + const Tensor &in_tensor, + Context ctx); + +} +} + +#endif \ No newline at end of file diff --git a/saber/funcs/impl/cuda/saber_activation.h b/saber/funcs/impl/cuda/saber_activation.h index cb62040be..ec68e61a7 100644 --- a/saber/funcs/impl/cuda/saber_activation.h +++ b/saber/funcs/impl/cuda/saber_activation.h @@ -30,24 +30,21 @@ class SaberActivation : public: typedef typename DataTrait::Dtype OpDataType; SaberActivation() = default; - ~SaberActivation() {} + ~SaberActivation() = default; virtual SaberStatus init(const std::vector *>& inputs, std::vector *>& outputs, - ActivationParam& param, Context& ctx) { - this->_ctx = &ctx; - return SaberSuccess; - } + ActivationParam& param, Context& ctx); virtual SaberStatus create(const std::vector *>& inputs, std::vector *>& outputs, - ActivationParam& param, Context &ctx) { - return SaberSuccess; - } + ActivationParam& param, Context &ctx); virtual SaberStatus dispatch(const std::vector*>& inputs, std::vector*>& outputs, ActivationParam& param); +private: + Tensor _int8_input; }; } diff --git a/saber/funcs/impl/cuda/saber_aligned_mat_mul.cpp b/saber/funcs/impl/cuda/saber_aligned_mat_mul.cpp new file mode 100644 index 000000000..b71d5d23c --- /dev/null +++ b/saber/funcs/impl/cuda/saber_aligned_mat_mul.cpp @@ -0,0 +1,52 @@ +#include "saber/funcs/impl/cuda/saber_aligned_mat_mul.h" + +namespace anakin { + +namespace saber { + +template +SaberStatus SaberAlignedMatMul::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + AlignedMatMulParam ¶m) { + + + cudaStream_t stream = this->_ctx->get_compute_stream(); + const OpDataType* X = (const OpDataType*)inputs[0]->data(); + const OpDataType* Y = (const OpDataType*)inputs[1]->data(); + OpDataType* out = (OpDataType*)outputs[0]->mutable_data(); + auto seq_offset_x = inputs[0]->get_seq_offset()[0]; + auto seq_offset_y = inputs[1]->get_seq_offset()[0]; + CHECK_EQ(seq_offset_x.size(), seq_offset_y.size()) << "AlignedMatMul inputs have different seq num"; + int seq_num = seq_offset_x.size() - 1; + int inner_A = inputs[0]->count_valid(1, inputs[0]->dims()); + int inner_B = inputs[1]->count_valid(1, inputs[1]->dims()); + int batch_A = seq_offset_x[1]; + int batch_B = seq_offset_y[1]; + int M = param.is_transpose_X ? inner_A : batch_A; + int N = param.is_transpose_Y ? 
batch_B: inner_B; + int K_A = param.is_transpose_X ? batch_A : inner_A; + int K_B = param.is_transpose_Y ? inner_B : batch_B; + CHECK_EQ(K_A, K_B) << "mat mul two inputs K is not equal"; + int K = K_A; + _kernel = saber_find_fast_sass_gemm(param.is_transpose_X, param.is_transpose_Y, M, N, K); + + //should add batch gemm here + for (int b = 0; b < seq_num; b++) { + _kernel(M, N, K, param.scale, + X + b * M * K, + 0.f, + Y + b * K * N, + out + b * M * N, stream); + } + // print_tensor(*outputs[0]); + return SaberSuccess; +} + +template class SaberAlignedMatMul; +DEFINE_OP_TEMPLATE(SaberAlignedMatMul, AlignedMatMulParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(SaberAlignedMatMul, AlignedMatMulParam, NV, AK_INT8); + +} // namespace saber; + +} // namespace anakin; diff --git a/saber/funcs/impl/cuda/saber_fake_quantize_abs_max.h b/saber/funcs/impl/cuda/saber_aligned_mat_mul.h similarity index 52% rename from saber/funcs/impl/cuda/saber_fake_quantize_abs_max.h rename to saber/funcs/impl/cuda/saber_aligned_mat_mul.h index 85017047d..47a5549f0 100644 --- a/saber/funcs/impl/cuda/saber_fake_quantize_abs_max.h +++ b/saber/funcs/impl/cuda/saber_aligned_mat_mul.h @@ -13,55 +13,55 @@ limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_FAKE_DQUANTIZE_ABS_MAX_H -#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_FAKE_DQUANTIZE_ABS_MAX_H - -#include "saber/funcs/impl/impl_fake_quantize_abs_max.h" +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_ALIGNED_MAT_MUL_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_ALIGNED_MAT_MUL_H +#include "saber/funcs/impl/impl_aligned_mat_mul.h" +#include "sass_funcs.h" namespace anakin{ namespace saber{ template -class SaberFakeQuantizeAbsMax: public ImplBase > { +class SaberAlignedMatMul: public ImplBase > { public: typedef typename DataTrait::Dtype OpDataType; - SaberFakeQuantizeAbsMax() {} - ~SaberFakeQuantizeAbsMax() {} + SaberAlignedMatMul() {} + + ~SaberAlignedMatMul() {} virtual SaberStatus init(const std::vector *>& inputs, std::vector *>& outputs, - FakeQuantizeAbsMaxParam ¶m, - Context &ctx); + AlignedMatMulParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); + } virtual SaberStatus create(const std::vector *>& inputs, std::vector *>& outputs, - FakeQuantizeAbsMaxParam &crop_param, - Context &ctx); + AlignedMatMulParam ¶m, + Context &ctx) { + return SaberSuccess; + } virtual SaberStatus dispatch(const std::vector *>& inputs, std::vector *>& outputs, - FakeQuantizeAbsMaxParam ¶m); + AlignedMatMulParam ¶m); private: - Tensor _max_abs; - cudnnHandle_t _handle; - cudnnReduceTensorDescriptor_t _reduce_tensor_descs; - cudnnTensorDescriptor_t _input_descs; - cudnnTensorDescriptor_t _output_descs; - size_t _workspaceSizeInBytes; - void *_workspace; - size_t _indices_size; - void *_indices; + + std::function _kernel; }; -template class SaberFakeQuantizeAbsMax; - -} //namespace saber +} //namespace saber. } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_FAKE_QUANTIZE_ABS_MAX_H +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_ALIGNED_MAT_MUL_H diff --git a/saber/funcs/impl/cuda/saber_anchor_generator.h b/saber/funcs/impl/cuda/saber_anchor_generator.h new file mode 100644 index 000000000..e89d69bdd --- /dev/null +++ b/saber/funcs/impl/cuda/saber_anchor_generator.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ANCHOR_GENERATOR_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ANCHOR_GENERATOR_H + +#include "saber/funcs/impl/impl_anchor_generator.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberAnchorGenerator: public ImplBase > { + +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberAnchorGenerator() {} + ~SaberAnchorGenerator() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + AnchorGeneratorParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + AnchorGeneratorParam ¶m, + Context &ctx) { + Shape shape_aspect({1, (int)(param.aspect_ratios.size()), 1, 1}, Layout_NCHW); + Shape shape_anchor_sizes({1, (int)(param.anchor_sizes.size()), 1, 1}, Layout_NCHW); + _aspect_ratios.reshape(shape_aspect); + _anchor_sizes.reshape(shape_anchor_sizes); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + cudaMemcpyAsync((float*)(_aspect_ratios.mutable_data()), + ¶m.aspect_ratios[0], + sizeof(float) * param.aspect_ratios.size(), + cudaMemcpyHostToDevice, cuda_stream); + cudaMemcpyAsync((float*)(_anchor_sizes.mutable_data()), + ¶m.anchor_sizes[0], + sizeof(float) * param.anchor_sizes.size(), + cudaMemcpyHostToDevice, + cuda_stream); + CHECK_EQ(param.stride.size(), 2) << "anchor generator stride size must be equal to 2"; + CHECK_EQ(param.variances.size(), 4) << "anchor generator variances size must be equal to 4"; + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + AnchorGeneratorParam ¶m); + +private: + Tensor _aspect_ratios; + Tensor _anchor_sizes; +}; + +template class SaberAnchorGenerator; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ANCHOR_GENERATOR_H diff --git a/saber/funcs/impl/cuda/saber_arithmetic.h b/saber/funcs/impl/cuda/saber_arithmetic.h new file mode 100644 index 000000000..936a26d61 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_arithmetic.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ARITHMETIC_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ARITHMETIC_H + +#include "saber/funcs/impl/impl_arithmetic.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberArithmetic : + public ImplBase< + NV, OpDtype, + ArithmeticParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberArithmetic() = default; + ~SaberArithmetic() = default; + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ArithmeticParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ArithmeticParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam& param); +private: + Tensor word_id_to_seq_id; + Tensor offset_tensor_0; + Tensor offset_tensor_1; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ARITHMETIC_H diff --git a/saber/funcs/impl/cuda/saber_attention_padding_mask.h b/saber/funcs/impl/cuda/saber_attention_padding_mask.h new file mode 100644 index 000000000..d9c169702 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_attention_padding_mask.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ATTENTION_PADDING_MASK_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ATTENTION_PADDING_MASK_H + +#include "saber/funcs/impl/impl_attention_padding_mask.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberAttentionPaddingMask : + public ImplBase< + NV, OpDtype, + AttentionPaddingMaskParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberAttentionPaddingMask() = default; + ~SaberAttentionPaddingMask() = default; + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + AttentionPaddingMaskParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + AttentionPaddingMaskParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam& param); +private: + Tensor _src_offset; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ATTENTION_PADDING_MASK_H diff --git a/saber/funcs/impl/cuda/saber_box_clip.h b/saber/funcs/impl/cuda/saber_box_clip.h new file mode 100644 index 000000000..8fa541479 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_box_clip.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_BOX_CLIP_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_BOX_CLIP_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_box_clip.h" +#include "saber/core/tensor.h" + +namespace anakin { + +namespace saber { + +template +class SaberBoxClip : \ + public ImplBase < + NV, + OpDtype, + EmptyParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberBoxClip() = default; + ~SaberBoxClip() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + EmptyParam& param, Context& ctx) { + // get context + this->_ctx = &ctx; + cuda_seq_offset.re_alloc(Shape({1, 1, 1, 1}), AK_FLOAT); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + EmptyParam& param, Context& ctx) { + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + EmptyParam& param)override; + +private: + Tensor cuda_seq_offset; +}; + +} //namespace saber + +} //namespace anakin +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_BOX_CLIP_H diff --git a/saber/funcs/impl/cuda/saber_box_coder.h b/saber/funcs/impl/cuda/saber_box_coder.h new file mode 100644 index 000000000..049397a35 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_box_coder.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_BOX_CODER_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_BOX_CODER_H +#include "anakin_config.h" +#include "saber/funcs/impl/impl_box_coder.h" +#include "saber/core/tensor.h" +namespace anakin { + +namespace saber { + +template +class SaberBoxCoder : \ + public ImplBase < + NV, + OpDtype, + BoxCoderParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberBoxCoder() = default; + ~SaberBoxCoder() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + BoxCoderParam& param, Context& ctx) { + //get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + BoxCoderParam& param, Context& ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + BoxCoderParam& param)override; + +private: +}; +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_BOX_CODER_H diff --git a/saber/funcs/impl/cuda/saber_cast.h b/saber/funcs/impl/cuda/saber_cast.h index c79530577..b4725d476 100644 --- a/saber/funcs/impl/cuda/saber_cast.h +++ b/saber/funcs/impl/cuda/saber_cast.h @@ -51,13 +51,13 @@ class SaberCast : \ _inDtype = param.in_type; _outDtype = param.out_type; if(_inDtype != 1 && _inDtype !=5){// AK_FLOAT AK_INT32 - LOG(FATAL) << "Cast not impl other type: " << _inDtype; + //LOG(FATAL) << "Cast not impl other type: " << _inDtype; } if(_outDtype != 1 && _outDtype !=5){ - LOG(FATAL) << "Cast not impl other type: " << _outDtype; + //LOG(FATAL) << "Cast not impl other type: " << _outDtype; } - CHECK_EQ(_inDtype, inputs[0]->get_dtype()) << "inputs data type should be same with param.in_type"; - CHECK_EQ(_outDtype, outputs[0]->get_dtype()) << "outputs data type should be same with param.out_type"; + //CHECK_EQ(_inDtype, inputs[0]->get_dtype()) << "inputs data type should be same with param.in_type"; + //CHECK_EQ(_outDtype, outputs[0]->get_dtype()) << "outputs data type should be same with param.out_type"; return SaberSuccess; } diff --git a/saber/funcs/impl/cuda/saber_concat.h b/saber/funcs/impl/cuda/saber_concat.h index a774ce096..07c88734c 100644 --- a/saber/funcs/impl/cuda/saber_concat.h +++ b/saber/funcs/impl/cuda/saber_concat.h @@ -34,21 +34,12 @@ class SaberConcat : virtual SaberStatus init(const std::vector *>& inputs, std::vector *>& outputs, ConcatParam& param, - Context &ctx) { - // get context - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); - } + Context &ctx); virtual SaberStatus create(const std::vector *>& inputs, std::vector *>& outputs, ConcatParam& param, - Context& ctx) { - - _num_concats = inputs[0]->count_valid(0, param.axis); - _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); - return SaberSuccess; - } + Context& ctx); virtual SaberStatus dispatch(const std::vector *>& inputs, std::vector *>& outputs, @@ -57,6 +48,8 @@ class SaberConcat : private: int _num_concats; int _concat_input_size; + std::vector> _input_v; + Tensor _output; }; } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_conv.cpp b/saber/funcs/impl/cuda/saber_conv.cpp index bac8b63ad..2c8d97a74 100644 --- a/saber/funcs/impl/cuda/saber_conv.cpp +++ b/saber/funcs/impl/cuda/saber_conv.cpp @@ -7,68 +7,120 @@ #include "saber/funcs/impl/cuda/saber_conv_gemmlike.h" #include "saber/funcs/impl/cuda/saber_conv_winograd.h" #include "saber/funcs/impl/cuda/vender_conv.h" +#include 
"saber/core/tensor_op.h" +#include "saber/funcs/debug.h" namespace anakin { namespace saber { +template <> +void SaberConv2D::find_fastest_alg( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) { + + int generate_arch = Env::cur_env()[_ctx->get_device_id()]._info._generate_arch; + bool arch_check = (generate_arch == 50) || (generate_arch == 61); + + bool use_k1s1p0 = arch_check; + bool use_k3s1 = arch_check; + bool use_direct = arch_check; + bool use_depthwise = true; + + use_k1s1p0 = use_k1s1p0 && (param.weight()->height() == 1); + use_k1s1p0 = use_k1s1p0 && (param.weight()->width() == 1); + use_k1s1p0 = use_k1s1p0 && (param.pad_h == 0); + use_k1s1p0 = use_k1s1p0 && (param.pad_w == 0); + use_k1s1p0 = use_k1s1p0 && (param.stride_h == 1); + use_k1s1p0 = use_k1s1p0 && (param.stride_w == 1); + use_k1s1p0 = use_k1s1p0 && (param.dilation_h == 1); + use_k1s1p0 = use_k1s1p0 && (param.dilation_w == 1); + use_k1s1p0 = use_k1s1p0 && (param.group == 1); + use_k1s1p0 = use_k1s1p0 && (param.bias()->valid_size() > 0); + + use_k3s1 = use_k3s1 && (param.stride_h == 1); + use_k3s1 = use_k3s1 && (param.stride_w == 1); + use_k3s1 = use_k3s1 && (param.weight()->height() == 3); + use_k3s1 = use_k3s1 && (param.weight()->width() == 3); + use_k3s1 = use_k3s1 && (param.dilation_h == 1); + use_k3s1 = use_k3s1 && (param.dilation_w == 1); + use_k3s1 = use_k3s1 && (param.group == 1); + + use_direct = use_direct && (param.group == 1); + use_direct = use_direct && (inputs[0]->height() > 8); + use_direct = use_direct && (inputs[0]->width() > 8); + + use_depthwise = use_depthwise && (param.group == inputs[0]->channel()); + use_depthwise = use_depthwise && (param.group == outputs[0]->channel()); + + if (use_k1s1p0) { + _kernel_alg = K_k1s1p0; + } else if (use_k3s1) { + _kernel_alg = K_k3s1; + } else if (use_direct) { + _kernel_alg = K_direct; + } else if (use_depthwise) { + _kernel_alg = K_depthwise; + } else { + _kernel_alg = K_vender; + } +} + +template <> +SaberStatus SaberConv2D::create( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) { + KernelAlg last_alg = _kernel_alg; + find_fastest_alg(inputs, outputs, param, ctx); + if (_kernel_alg != last_alg) { + // bad case. 
+ if (_impl != nullptr) { + delete _impl; + } + if (_kernel_alg == K_direct) { +// LOG(INFO) << "change to use direct!!!"; + _impl = new SaberDirectConv; + return _impl->init(inputs, outputs, param, ctx); + } else if (_kernel_alg == K_vender) { +// LOG(INFO) << "change to use vender!!!!"; + _impl = new VenderConv2D; + dynamic_cast *>( + this->_impl)->load_origin_weight(_origin_weight, ctx); + return _impl->init(inputs, outputs, param, ctx); + } else { + LOG(FATAL) << "this situation should not happened!!"; + } + + } + if (_impl != nullptr) { + return _impl->create(inputs, outputs, param, ctx); + } else { + return SaberUnImplError; + } +} + template <> SaberStatus SaberConv2D::init(const std::vector *>& inputs, std::vector *>& outputs, ConvParam& param, Context &ctx) { this->_ctx = &ctx; - int generate_arch = Env::cur_env()[_ctx->get_device_id()]._info._generate_arch; - bool arch_check = (generate_arch == 50) || (generate_arch == 61); +// LOG(INFO) << "only copy once!!!"; + _origin_weight.re_alloc(param.weight()->valid_shape(), param.weight()->get_dtype()); + _origin_weight.async_copy_from(*param.weight(), ctx.get_compute_stream()); if (_impl == nullptr) { - bool use_k1s1p0 = arch_check; - use_k1s1p0 = use_k1s1p0 && (param.weight()->height() == 1); - use_k1s1p0 = use_k1s1p0 && (param.weight()->width() == 1); - use_k1s1p0 = use_k1s1p0 && (param.pad_h == 0); - use_k1s1p0 = use_k1s1p0 && (param.pad_w == 0); - use_k1s1p0 = use_k1s1p0 && (param.stride_h == 1); - use_k1s1p0 = use_k1s1p0 && (param.stride_w == 1); - use_k1s1p0 = use_k1s1p0 && (param.dilation_h == 1); - use_k1s1p0 = use_k1s1p0 && (param.dilation_w == 1); - use_k1s1p0 = use_k1s1p0 && (param.group == 1); - use_k1s1p0 = use_k1s1p0 && (param.bias()->valid_size() > 0); - if (arch_check && use_k1s1p0) { + find_fastest_alg(inputs, outputs, param, ctx); + + if (_kernel_alg == K_k1s1p0) { _impl = new SaberGemmLikeConv; - } else if (arch_check && param.stride_h == 1 && - param.stride_w == 1 && - param.weight()->height() == 3 && - param.weight()->width() == 3 && - param.dilation_h == 1 && - param.dilation_w == 1 && - param.group == 1) { + } else if (_kernel_alg == K_k3s1) { this->_impl = new SaberWinogradConv; - } else if (arch_check && param.group == 1) { - //TODO [zs] This will be a good feature to check if the kernel is out performance of cudnn!!!! - //TODO this will remove the bad case of saber - //TODO Better to extract this as a function, whose template is a specify Conv, return(bool) if faster than cudnn -// SaberDirectConv temp; -// VenderConv2D vender_temp; -// temp.init(inputs, outputs, param, ctx); -// vender_temp.init(inputs, outputs, param, ctx); -// SaberTimer s_t, v_t; -// temp.dispatch(inputs, outputs, param); -// s_t.start(ctx); -// for (int i = 0; i < 10; ++i) { -// temp.dispatch(inputs, outputs, param); -// } -// s_t.end(ctx); -// v_t.start(ctx); -// for (int i = 0; i < 10; ++i) { -// vender_temp.dispatch(inputs, outputs, param); -// } -// v_t.end(ctx); -// if (v_t.get_average_ms() < s_t.get_average_ms()) { -// _use_vender = true; -// this->_impl = new VenderConv2D; -// } else { -// _impl = new SaberDirectConv; -// } + } else if (_kernel_alg == K_direct) { + _impl = new SaberDirectConv; - } else if (param.group == inputs[0]->channel() && param.group == outputs[0]->channel()) { + } else if (_kernel_alg == K_depthwise) { + _impl = new SaberDepthWiseConv; } else { // I will never fail!!! 
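
The hunk above replaces the inline heuristics that used to live in init() with a single find_fastest_alg() that is re-run from create(), so the chosen kernel can change when shapes change and the implementation object is rebuilt; when the re-selection lands on K_vender, the new VenderConv2D is fed the copy saved in _origin_weight at init() time rather than the current param.weight(). A minimal standalone sketch of that selection order follows. ConvDesc, KernelAlg and pick_conv_kernel are illustrative names only, not Anakin API, and arch_check stands in for the generate_arch == 50 || generate_arch == 61 test used above.

// Sketch only: restates the priority order of find_fastest_alg for the FP32 NV path.
struct ConvDesc {
    int kernel_h, kernel_w;
    int stride_h, stride_w;
    int pad_h, pad_w;
    int dilation_h, dilation_w;
    int group, in_channels, out_channels;
    int in_h, in_w;
    bool has_bias;
};

enum class KernelAlg { k1s1p0, k3s1, direct, depthwise, vender };

KernelAlg pick_conv_kernel(const ConvDesc& d, bool arch_check) {
    // 1x1, stride 1, no padding/dilation, single group, with bias -> gemm-like kernel
    bool k1s1p0 = arch_check && d.kernel_h == 1 && d.kernel_w == 1
                  && d.pad_h == 0 && d.pad_w == 0
                  && d.stride_h == 1 && d.stride_w == 1
                  && d.dilation_h == 1 && d.dilation_w == 1
                  && d.group == 1 && d.has_bias;
    // 3x3, stride 1, no dilation, single group -> Winograd kernel
    bool k3s1 = arch_check && d.kernel_h == 3 && d.kernel_w == 3
                && d.stride_h == 1 && d.stride_w == 1
                && d.dilation_h == 1 && d.dilation_w == 1
                && d.group == 1;
    // single group on inputs larger than 8x8 -> direct kernel
    bool direct = arch_check && d.group == 1 && d.in_h > 8 && d.in_w > 8;
    // group == input channels == output channels -> depthwise kernel
    bool depthwise = d.group == d.in_channels && d.group == d.out_channels;

    if (k1s1p0)    return KernelAlg::k1s1p0;
    if (k3s1)      return KernelAlg::k3s1;
    if (direct)    return KernelAlg::direct;
    if (depthwise) return KernelAlg::depthwise;
    return KernelAlg::vender;  // fall back to the cuDNN ("vender") implementation
}
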
@@ -79,6 +131,17 @@ SaberStatus SaberConv2D::init(const std::vector *>& inp this->_impl->init(inputs, outputs, param, ctx); return create(inputs, outputs, param, ctx); } +template <> +SaberStatus SaberConv2D::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + if (_impl != nullptr) { + return _impl->dispatch(inputs, outputs, param); + } else { + return SaberUnImplError; + } +} template <> SaberStatus SaberConv2D::trans_weights(Tensor &target_weights, @@ -123,42 +186,14 @@ SaberStatus SaberConv2D::trans_weights(Tensor &target_weights, _extern_trans = true; return SaberSuccess; } - -template <> -SaberStatus SaberConv2D::init( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context &ctx) { - - _impl = new SaberDirectConv; - - _impl->init(inputs, outputs, param, ctx); - return create(inputs, outputs, param, ctx); -} - -template <> -SaberStatus SaberConv2D::init( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context &ctx) { - - return SaberUnImplError; -} - template <> SaberStatus SaberConv2D::trans_weights(Tensor &target_weights, - Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int stride_w, int group) { - return SaberUnImplError; -} - -template <> -SaberStatus SaberConv2D::trans_weights(Tensor &target_weights, - Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int stride_w, int group) { - return SaberUnImplError; - -} + Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, + int stride_h, int stride_w, int group) { + return SaberSuccess; +}; +DEFINE_OP_TEMPLATE(SaberConv2D, ConvParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberConv2D, ConvParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/saber_conv.h b/saber/funcs/impl/cuda/saber_conv.h index 9289373d2..e3febdb0e 100644 --- a/saber/funcs/impl/cuda/saber_conv.h +++ b/saber/funcs/impl/cuda/saber_conv.h @@ -30,6 +30,7 @@ class SaberConv2D : public ImplBase< public: typedef typename DataTrait::Dtype OpDataType; typedef ImplBase > Impl_t; + SaberConv2D() = default; ~SaberConv2D() { if (_impl != nullptr) { @@ -37,38 +38,49 @@ class SaberConv2D : public ImplBase< } } - virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context &ctx); + SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) override; - virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context& ctx) { - if (_impl != nullptr) { - return _impl->create(inputs, outputs, param, ctx); - } else { - return SaberUnImplError; - } - } + SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) override; - virtual SaberStatus dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - ConvParam& param) { - if (_impl != nullptr) { - return _impl->dispatch(inputs, outputs, param); - } else { - return SaberUnImplError; - } - } + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param) override; SaberStatus trans_weights(Tensor &target_weights, Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, int stride_h, int stride_w, int group); private: + + std::vector *> _in_data_tensor; + std::vector *> _out_data_tensor; + Tensor int8_input; + Tensor int8_output; Impl_t* _impl{nullptr}; bool _extern_trans{false}; bool 
_use_vender{false}; + float _in_scale{0.f}; + float _out_scale{0.f}; + bool _scale_per_k{false}; + bool _output_int8{false}; + + Tensor _origin_weight; + enum KernelAlg{ + K_unknown = 0, + K_k1s1p0 = 1, + K_k3s1 = 2, + K_direct = 3, + K_depthwise = 4, + K_vender = 5 + }; + void find_fastest_alg(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param, Context &ctx); + KernelAlg _kernel_alg{K_unknown}; }; } diff --git a/saber/funcs/impl/cuda/saber_conv_depthwise.cpp b/saber/funcs/impl/cuda/saber_conv_depthwise.cpp index d36a9efba..278a5517d 100644 --- a/saber/funcs/impl/cuda/saber_conv_depthwise.cpp +++ b/saber/funcs/impl/cuda/saber_conv_depthwise.cpp @@ -5,13 +5,25 @@ namespace anakin { namespace saber { -template -SaberStatus saber_depthwise_conv_act(const dtype* input, dtype* output, \ +template +SaberStatus saber_depthwise_conv_act(const float* input, float* output, \ int num, int cin, int hin, int win, int hout, int wout, \ int kw, int kh, int stride_w, int stride_h, \ - int pad_h, int pad_w, const dtype* weights, const dtype* bias, \ + int pad_h, int pad_w, const float* weights, const float* bias, \ cudaStream_t stream); +template +SaberStatus saber_depthwise_conv_act_s8_s8(const void* input, void* output, + int num, int cin, int hin, int win, int hout, int wout, + int kw, int kh, int stride_w, int stride_h, int pad_w, int pad_h, float alpha, + const void* weights, const float* bias, cudaStream_t stream); + +template +SaberStatus saber_depthwise_conv_act_s8_f32(const void* input, void* output, + int num, int cin, int hin, int win, int hout, int wout, + int kw, int kh, int stride_w, int stride_h, int pad_w, int pad_h, float alpha, + const void* weights, const float* bias, cudaStream_t stream); + template <> SaberStatus SaberDepthWiseConv::init( const std::vector *>& inputs, @@ -30,22 +42,12 @@ SaberStatus SaberDepthWiseConv::init( if (param.activation_param.has_active) { if (param.activation_param.active == Active_relu) { - if (param.bias()->size() > 0) { - dispatch_func = saber_depthwise_conv_act; - } else { - dispatch_func = saber_depthwise_conv_act; - } + dispatch_func = saber_depthwise_conv_act; } else { - if (param.bias()->size() > 0) { - dispatch_func = saber_depthwise_conv_act; - } else { - dispatch_func = saber_depthwise_conv_act; - } + dispatch_func = saber_depthwise_conv_act; } - } else if (param.bias()->size() > 0) { - dispatch_func = saber_depthwise_conv_act; } else { - dispatch_func = saber_depthwise_conv_act; + dispatch_func = saber_depthwise_conv_act; } return SaberSuccess; } @@ -76,26 +78,9 @@ SaberStatus SaberDepthWiseConv::dispatch( if (this->_saber_act != nullptr) { this->_saber_act->dispatch(outputs, outputs, param.activation_param); } - CUDA_CHECK(cudaGetLastError()); return SaberSuccess; } -template <> -SaberStatus SaberDepthWiseConv::init( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context &ctx) { - return SaberUnImplError; -} - -template <> -SaberStatus SaberDepthWiseConv::dispatch( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param) { - return SaberUnImplError; -} - template <> SaberStatus SaberDepthWiseConv::init( diff --git a/saber/funcs/impl/cuda/saber_conv_depthwise.h b/saber/funcs/impl/cuda/saber_conv_depthwise.h index f75a8884b..9ae7a3428 100644 --- a/saber/funcs/impl/cuda/saber_conv_depthwise.h +++ b/saber/funcs/impl/cuda/saber_conv_depthwise.h @@ -76,6 +76,12 @@ class SaberDepthWiseConv : public ImplBase< int, int, int, int, int, int, const float*, const float*, 
cudaStream_t)> dispatch_func; + + std::function dispatch_func_s8; }; } diff --git a/saber/funcs/impl/cuda/saber_conv_direct.cpp b/saber/funcs/impl/cuda/saber_conv_direct.cpp index 92af89a49..5d682597f 100644 --- a/saber/funcs/impl/cuda/saber_conv_direct.cpp +++ b/saber/funcs/impl/cuda/saber_conv_direct.cpp @@ -2,6 +2,7 @@ #include "saber/funcs/impl/cuda/saber_conv_direct.h" #include "saber/funcs/calibrate.h" #include "saber_conv.h" +#include "saber/core/tensor_op.h" namespace anakin { namespace saber { @@ -26,7 +27,7 @@ SaberStatus SaberDirectConv::init( this->_ctx = &ctx; _use_saber_act = param.activation_param.has_active && !(param.activation_param.active == Active_relu - && param.activation_param.negative_slope == 0.f); + && fabsf(param.activation_param.negative_slope) < 1e-6f); _use_saber_act = _use_saber_act || (param.bias()->valid_size() == 0 && param.activation_param.has_active); if (param.activation_param.has_active) { @@ -111,15 +112,11 @@ SaberStatus SaberDirectConv::dispatch( CUDA_CHECK(cudaGetLastError()); return SaberSuccess; } - template <> SaberStatus SaberDirectConv::create( const std::vector *>& inputs, std::vector *>& outputs, - ConvParam& param, Context &ctx){ - LOG(INFO) << "conv int8 create" - << " input tensor dtype: " << (inputs[0]->get_dtype() == AK_FLOAT ? "AK_FLOAT" : "AK_INT8") - << " output tensor dtype: " << (outputs[0]->get_dtype() == AK_FLOAT ? "AK_FLOAT" : "AK_INT8"); + ConvParam& param, Context &ctx) { return SaberSuccess; } @@ -127,26 +124,22 @@ template <> SaberStatus SaberDirectConv::init( const std::vector *>& inputs, std::vector *>& outputs, - ConvParam& param, Context &ctx){ - LOG(INFO) << "conv int8 init" - << " input tensor dtype: " << (inputs[0]->get_dtype() == AK_FLOAT ? "AK_FLOAT" : "AK_INT8") - << " output tensor dtype: " << (outputs[0]->get_dtype() == AK_FLOAT ? "AK_FLOAT" : "AK_INT8"); - return SaberSuccess; -} + ConvParam& param, Context &ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} template <> SaberStatus SaberDirectConv::dispatch( const std::vector *>& inputs, std::vector *>& outputs, ConvParam& param) { - LOG(INFO) << "conv int8 dispatch" - << " input tensor dtype: " << (inputs[0]->get_dtype() == AK_FLOAT ? "AK_FLOAT" : "AK_INT8") - << " output tensor dtype: " << (outputs[0]->get_dtype() == AK_FLOAT ? 
"AK_FLOAT" : "AK_INT8"); + return SaberSuccess; } - template <> SaberStatus SaberDirectConv::init( const std::vector *>& inputs, @@ -163,5 +156,5 @@ SaberStatus SaberDirectConv::dispatch( return SaberUnImplError; } -} -} +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/cuda/saber_conv_direct.h b/saber/funcs/impl/cuda/saber_conv_direct.h index 6d96f0d10..eceb5bb2d 100644 --- a/saber/funcs/impl/cuda/saber_conv_direct.h +++ b/saber/funcs/impl/cuda/saber_conv_direct.h @@ -51,6 +51,7 @@ class SaberDirectConv : public ImplBase< private: bool _use_saber_act{false}; SaberActivation *_saber_act{nullptr}; + float _in_scale{0.f}; //we use this func holder only when input and output datatype is float; std::function dispatch_func; + + std::function int8_dispatch_func; }; } diff --git a/saber/funcs/impl/cuda/saber_conv_eltwise.cpp b/saber/funcs/impl/cuda/saber_conv_eltwise.cpp index 05209c4a1..6591eba93 100644 --- a/saber/funcs/impl/cuda/saber_conv_eltwise.cpp +++ b/saber/funcs/impl/cuda/saber_conv_eltwise.cpp @@ -2,10 +2,12 @@ #include "saber/funcs/impl/cuda/saber_conv.h" #include "saber/funcs/impl/cuda/saber_eltwise.h" #include "saber/funcs/impl/cuda/saber_conv_eltwise.h" -#include "sass_funcs.h" +#include "saber/funcs/impl/cuda/vender_conv.h" #include "saber/funcs/calibrate.h" #include "saber_conv_eltwise.h" -#include "saber/funcs/impl/cuda/vender_conv.h" +#include "sass_funcs.h" +#include "saber/funcs/debug.h" + namespace anakin { namespace saber { @@ -113,14 +115,14 @@ SaberStatus SaberConvEltwise::dispatch( (const float*)inputs[0]->data(), (const float*)param.conv_param.weight()->data(), chout, chin, hin, win, bias_data, - this->_ctx->get_compute_stream(),1.f, 1.f); + this->_ctx->get_compute_stream(), 1.f, 1.f); } else { conv_gemm_k1s1p0(num, in_stride, out_stride, (float*)outputs[0]->mutable_data(), (const float*)inputs[0]->data(), (const float*)param.conv_param.weight()->data(), chout, chin, hin, win, bias_data, - this->_ctx->get_compute_stream(),1.f, 1.f); + this->_ctx->get_compute_stream(), 1.f, 1.f); } } else { if (param.conv_param.activation_param.has_active) { @@ -207,20 +209,21 @@ SaberStatus SaberConvEltwise::trans_weights( } if (target_weights.valid_size() > 0) { conv_trans_weights(target_weights, - stride_h, stride_w, group, true, nullptr, dilation_h, dilation_w); + stride_h, stride_w, group, true, nullptr, dilation_h, dilation_w); } _extern_trans = true; return SaberSuccess; } + template <> -SaberStatus SaberConvEltwise::trans_weights( +SaberStatus SaberConvEltwise::trans_weights( Tensor &target_weights, Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, int stride_h, int stride_w, int group) { return SaberSuccess; } template <> -SaberStatus SaberConvEltwise::trans_weights( +SaberStatus SaberConvEltwise::trans_weights( Tensor &target_weights, Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, int stride_h, int stride_w, int group) { @@ -228,7 +231,7 @@ SaberStatus SaberConvEltwise::trans_weights( } template class SaberConvEltwise; -DEFINE_OP_TEMPLATE(SaberConvEltwise, ConvEltwiseParam, NV, AK_HALF); DEFINE_OP_TEMPLATE(SaberConvEltwise, ConvEltwiseParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberConvEltwise, ConvEltwiseParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/saber_conv_eltwise.h b/saber/funcs/impl/cuda/saber_conv_eltwise.h index dd9619980..0c3507e12 100644 --- a/saber/funcs/impl/cuda/saber_conv_eltwise.h +++ b/saber/funcs/impl/cuda/saber_conv_eltwise.h @@ -16,16 +16,16 @@ #ifndef 
ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_ELTWISE_H #define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_ELTWISE_H -#include #include "saber/funcs/impl/impl_conv_eltwise.h" -#include "sass_funcs.h" #include "saber/funcs/impl/cuda/saber_conv.h" #include "saber/funcs/impl/cuda/saber_eltwise.h" +#include "saber/funcs/impl/cuda/saber_conv_gemmlike.h" #include "saber/funcs/funcs_utils.h" +#include "sass_funcs.h" +#include -namespace anakin{ - -namespace saber{ +namespace anakin { +namespace saber { template class SaberConvEltwise : public ImplBase< @@ -34,10 +34,10 @@ class SaberConvEltwise : public ImplBase< typedef typename DataTrait::Dtype OpDataType; typedef ImplBase > Impl_conv_t; typedef ImplBase > Impl_eltwise_t; + typedef ImplBase > Impl_t; - SaberConvEltwise() {} - - ~SaberConvEltwise() {} + SaberConvEltwise() = default; + ~SaberConvEltwise() = default; /** * [Create description] Init all cudnn resource here @@ -48,21 +48,21 @@ class SaberConvEltwise : public ImplBase< * @param param [conv parameters] */ virtual SaberStatus init(const std::vector *>& inputs, - std::vector *>& outputs, - ConvEltwiseParam& param, Context& ctx); + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); virtual SaberStatus create(const std::vector *>& inputs, - std::vector *>& outputs, - ConvEltwiseParam& param, Context& ctx); + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); //call cudnnConvolutionForward here virtual SaberStatus dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam& param); + std::vector*>& outputs, + ConvEltwiseParam& param); SaberStatus trans_weights(Tensor &target_weights, Tensor &target_bias, - int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int stride_w, int group); + int pad_h, int pad_w, int dilation_h, int dilation_w, + int stride_h, int stride_w, int group); private: bool _extern_trans{false}; @@ -76,6 +76,15 @@ class SaberConvEltwise : public ImplBase< std::vector *> _inner_tensor_v; int _kernel_height{0}; int _kernel_width{0}; + std::vector *> _in_data_tensor; + std::vector *> _out_data_tensor; + Tensor int8_input; + Tensor int8_output; + SaberGemmLikeConv *_impl; + float _in_scale{0.f}; + float _out_scale{0.f}; + bool _scale_per_k{false}; + bool _output_int8{false}; std::function : public ImplBase< } - #endif //ANAKIN_SABER_FUNCS_SABER_CONV2D_H diff --git a/saber/funcs/impl/cuda/saber_conv_gemmlike.cpp b/saber/funcs/impl/cuda/saber_conv_gemmlike.cpp index 9d628f8b6..c75fcbc05 100644 --- a/saber/funcs/impl/cuda/saber_conv_gemmlike.cpp +++ b/saber/funcs/impl/cuda/saber_conv_gemmlike.cpp @@ -5,6 +5,36 @@ namespace anakin { namespace saber { + +template <> +SaberStatus SaberGemmLikeConv::create( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberGemmLikeConv::init( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) { + this->_ctx = &ctx; + _use_act = param.activation_param.has_active + && !(param.activation_param.active == Active_relu + && fabsf(param.activation_param.negative_slope) < 1e-6f); + _use_act = _use_act || + (param.bias()->valid_size() == 0 && param.activation_param.has_active); + if (param.activation_param.has_active) { + if (_use_act) { + _saber_act = new SaberActivation; + _saber_act->init(inputs, outputs, param.activation_param, ctx); + } + } + return create(inputs, outputs, param, ctx); +} + template <> SaberStatus SaberGemmLikeConv::dispatch( 
const std::vector *>& inputs, @@ -26,10 +56,8 @@ SaberStatus SaberGemmLikeConv::dispatch( bias_data = (const float*)param.bias()->data(); } - if (param.activation_param.has_active) - { - if (param.activation_param.active == Active_relu) - { + if (param.activation_param.has_active) { + if (!_use_act) { conv_gemm_k1s1p0(num, in_stride, out_stride, (float*)outputs[0]->mutable_data(), (const float*)inputs[0]->data(), @@ -39,7 +67,7 @@ SaberStatus SaberGemmLikeConv::dispatch( CUDA_CHECK(cudaGetLastError()); return SaberSuccess; } - } + } conv_gemm_k1s1p0(num, in_stride, out_stride, (float*)outputs[0]->mutable_data(), @@ -47,6 +75,7 @@ SaberStatus SaberGemmLikeConv::dispatch( (const float*)param.weight()->data(), chout, chin, hin, win, bias_data, this->_ctx->get_compute_stream(), 1.f, 0.f); + if (this->_saber_act != nullptr) { this->_saber_act->dispatch(outputs, outputs, param.activation_param); } @@ -55,18 +84,29 @@ SaberStatus SaberGemmLikeConv::dispatch( } template <> -SaberStatus SaberGemmLikeConv::dispatch( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param) { +SaberStatus SaberGemmLikeConv::create( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) { + return SaberSuccess; } template <> -SaberStatus SaberGemmLikeConv::dispatch( +SaberStatus SaberGemmLikeConv::init( + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx) { + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberGemmLikeConv::dispatch( const std::vector *>& inputs, std::vector *>& outputs, ConvParam& param) { + return SaberSuccess; } diff --git a/saber/funcs/impl/cuda/saber_conv_gemmlike.h b/saber/funcs/impl/cuda/saber_conv_gemmlike.h index 7158b135c..2a24feeec 100644 --- a/saber/funcs/impl/cuda/saber_conv_gemmlike.h +++ b/saber/funcs/impl/cuda/saber_conv_gemmlike.h @@ -16,11 +16,11 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_GEMMLIKE_H #define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_CONV_GEMMLIKE_H -#include #include "saber/funcs/impl/impl_conv.h" -#include "sass_funcs.h" #include "saber/funcs/impl/cuda/saber_activation.h" #include "saber/funcs/funcs_utils.h" +#include "sass_funcs.h" +#include namespace anakin{ @@ -39,35 +39,25 @@ class SaberGemmLikeConv : public ImplBase< virtual SaberStatus init(const std::vector *>& inputs, std::vector *>& outputs, - ConvParam& param, Context &ctx) - { - this->_ctx = &ctx; - if (param.activation_param.has_active) - { - if (param.activation_param.active != Active_relu) - { - _saber_act = new SaberActivation; - _saber_act->init(inputs, outputs, param.activation_param, ctx); - } - } - - return create(inputs, outputs, param, ctx); - } + ConvParam& param, Context &ctx); virtual SaberStatus create(const std::vector *>& inputs, std::vector *>& outputs, - ConvParam& param, Context& ctx) - { - if (_saber_act != nullptr) - _saber_act->create(outputs, outputs, param.activation_param, ctx); - } + ConvParam& param, Context& ctx); virtual SaberStatus dispatch(const std::vector*>& inputs, std::vector*>& outputs, ConvParam& param); + void set_act(bool use_act) { + _use_act = use_act; + } private: SaberActivation *_saber_act{nullptr}; + bool _use_act{false}; + std::function _int8_func; }; } diff --git a/saber/funcs/impl/cuda/saber_conv_pooling.cpp b/saber/funcs/impl/cuda/saber_conv_pooling.cpp index 0d99724c4..36317f332 100644 --- a/saber/funcs/impl/cuda/saber_conv_pooling.cpp +++ b/saber/funcs/impl/cuda/saber_conv_pooling.cpp @@ -186,23 +186,22 @@ SaberStatus 
SaberConv2DPooling::trans_weights(Tensor &target_w return SaberSuccess; } + template <> -SaberStatus SaberConv2DPooling::trans_weights(Tensor &target_weights, +SaberStatus SaberConv2DPooling::trans_weights(Tensor &target_weights, Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, int stride_h, int stride_w, int group) { return SaberUnImplError; } - template <> -SaberStatus SaberConv2DPooling::trans_weights(Tensor &target_weights, - Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int stride_w, int group) { +SaberStatus SaberConv2DPooling::trans_weights(Tensor &target_weights, + Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, + int stride_h, int stride_w, int group) { return SaberUnImplError; } - template class SaberConv2DPooling; -DEFINE_OP_TEMPLATE(SaberConv2DPooling, ConvPoolingParam, NV, AK_HALF); DEFINE_OP_TEMPLATE(SaberConv2DPooling, ConvPoolingParam, NV, AK_INT8); +DEFINE_OP_TEMPLATE(SaberConv2DPooling, ConvPoolingParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/saber_conv_pooling.h b/saber/funcs/impl/cuda/saber_conv_pooling.h index ef945202c..8fd2e3337 100644 --- a/saber/funcs/impl/cuda/saber_conv_pooling.h +++ b/saber/funcs/impl/cuda/saber_conv_pooling.h @@ -20,8 +20,9 @@ #include "saber/funcs/impl/impl_conv_pooling.h" #include "saber/funcs/impl/cuda/saber_conv.h" #include "saber/funcs/impl/cuda/vender_pooling.h" -#include "sass_funcs.h" +#include "saber/funcs/impl/cuda/saber_pooling.h" #include "saber/funcs/funcs_utils.h" +#include "sass_funcs.h" namespace anakin { @@ -67,6 +68,7 @@ class SaberConv2DPooling : public ImplBase< int _kernel_height{0}; int _kernel_width{0}; VenderPooling _pool; + SaberPooling _saber_pool; SaberConv2D _conv; Shape _inner_shape; Tensor _inner_tensor; diff --git a/saber/funcs/impl/cuda/saber_conv_winograd.cpp b/saber/funcs/impl/cuda/saber_conv_winograd.cpp index 9a2fd3431..479297068 100644 --- a/saber/funcs/impl/cuda/saber_conv_winograd.cpp +++ b/saber/funcs/impl/cuda/saber_conv_winograd.cpp @@ -21,7 +21,7 @@ SaberStatus SaberWinogradConv::dispatch( if (param.activation_param.has_active) { - if (param.activation_param.active == Active_relu) + if (!_use_saber_act) { winograd_conv_relu((const float *) inputs[0]->data(), (float *) outputs[0]->mutable_data(), diff --git a/saber/funcs/impl/cuda/saber_conv_winograd.h b/saber/funcs/impl/cuda/saber_conv_winograd.h index 1f9bd08fc..81aca2057 100644 --- a/saber/funcs/impl/cuda/saber_conv_winograd.h +++ b/saber/funcs/impl/cuda/saber_conv_winograd.h @@ -42,15 +42,17 @@ class SaberWinogradConv : public ImplBase< ConvParam& param, Context &ctx) { this->_ctx = &ctx; - if (param.activation_param.has_active) - { - if (param.activation_param.active != Active_relu) - { - _saber_act = new SaberActivation; + _use_saber_act = param.activation_param.has_active + && !(param.activation_param.active == Active_relu + && fabsf(param.activation_param.negative_slope) < 1e-6f); + _use_saber_act = _use_saber_act || + (param.bias()->valid_size() == 0 && param.activation_param.has_active); + if (param.activation_param.has_active) { + if (_use_saber_act) { + _saber_act = new SaberActivation; _saber_act->init(inputs, outputs, param.activation_param, ctx); } - } - + } return create(inputs, outputs, param, ctx); } @@ -68,6 +70,7 @@ class SaberWinogradConv : public ImplBase< private: SaberActivation *_saber_act{nullptr}; + bool _use_saber_act{false}; }; } diff --git a/saber/funcs/impl/cuda/saber_cos_sim.h 
b/saber/funcs/impl/cuda/saber_cos_sim.h new file mode 100644 index 000000000..0b389960d --- /dev/null +++ b/saber/funcs/impl/cuda/saber_cos_sim.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_COS_SIM_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_COS_SIM_H + +#include "saber/funcs/impl/impl_cos_sim.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberCosSim : + public ImplBase< + NV, OpDtype, + CosSimParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberCosSim() = default; + ~SaberCosSim() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + CosSimParam& param, Context& ctx) { + this->_ctx = &ctx; + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + CosSimParam& param, Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam& param); +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_COSSIM_H diff --git a/saber/funcs/impl/cuda/saber_detection_output.h b/saber/funcs/impl/cuda/saber_detection_output.h index 65f351f52..67581ca9a 100644 --- a/saber/funcs/impl/cuda/saber_detection_output.h +++ b/saber/funcs/impl/cuda/saber_detection_output.h @@ -30,15 +30,16 @@ class SaberDetectionOutput : \ DetectionOutputParam > { public: - typedef typename DataTrait::Dtype dtype; SaberDetectionOutput() = default; ~SaberDetectionOutput() { if (_bbox_cpu_data) { fast_free(_bbox_cpu_data); + _bbox_cpu_data = nullptr; } if (_conf_cpu_data) { fast_free(_conf_cpu_data); + _conf_cpu_data = nullptr; } } @@ -53,36 +54,57 @@ class SaberDetectionOutput : \ virtual SaberStatus create(const std::vector *>& inputs, std::vector *>& outputs, DetectionOutputParam& param, Context &ctx) { - - //! inputs[0]: location map, dims = 4 {N, boxes * 4, 1, 1} - //! inputs[1]: confidence map, dims = 4 {N, classes * boxes, 1, 1} - //! inputs[2]: prior boxes, dims = 4 {1, 1, 2, boxes * 4(xmin, ymin, xmax, ymax)} + _shared_loc = param.share_location; Shape sh_loc = inputs[0]->valid_shape(); Shape sh_conf = inputs[1]->valid_shape(); - Shape sh_box = inputs[2]->valid_shape(); - //! shape {1, 1, 2, boxes * 4(xmin, ymin, xmax, ymax)}, boxes = size / 2 / 4 - //! layout must be 4 dims, the priors is in the last dim - _num_priors = sh_box.count() / 8; - int num = inputs[0]->num(); - if (param.class_num == 0) { - _num_classes = inputs[1]->valid_size() / (num * _num_priors); - } else { - _num_classes = param.class_num; - } - if (param.share_location) { + Shape sh_box; + + //fixme, only support{xmin, ymin, xmax, ymax} style box + if (_shared_loc) { + //! for one stage detector + //! inputs[0]: location map, {N, boxes * 4} + //! inputs[1]: confidence map, ssd: {N, classes, boxes}, yolov3: {N, boxes, classes} + //! optional, ssd has 3 inputs, the last inputs is priorbox + //! 
inputs[2]: prior boxes, dims = 4 {1, 2, boxes * 4(xmin, ymin, xmax, ymax)} + CHECK_GE(inputs.size(), 2) << "detection_output op must has 2 inputs at least"; + bool is_ssd = inputs.size() > 2; + if (is_ssd) { + sh_box = inputs[2]->valid_shape(); + } + //! boxes = sh_loc / 4 + _num_priors = sh_loc.count() / 4; + if (param.class_num <= 0) { + _num_classes = sh_conf.count() / _num_priors; + } else { + _num_classes = param.class_num; + } _num_loc_classes = 1; + if (is_ssd) { + _bbox_preds.reshape(sh_loc); + _conf_permute.reshape(sh_conf); + } + } else { + //! for two stage detector + //! inputs[0]: tensor with offset, location, {M, C, 4} + //! inputs[1]: tensor with offset, confidence, {M, C} + CHECK_EQ(sh_loc[0], sh_conf[0]) << "boxes number must be the same"; + _num_priors = sh_loc[0]; + if (param.class_num <= 0) { + _num_classes = sh_conf.count() / _num_priors; + } else { + _num_classes = param.class_num; + } _num_loc_classes = _num_classes; _bbox_permute.reshape(sh_loc); + _conf_permute.reshape(sh_conf); } - _bbox_preds.reshape(sh_loc); - _conf_permute.reshape(sh_conf); + CHECK_EQ(_num_priors * _num_loc_classes * 4, sh_loc.count()) << \ + "Number of boxes must match number of location predictions."; + CHECK_EQ(_num_priors * _num_classes, sh_conf.count()) << \ + "Number of boxes must match number of confidence predictions."; - CHECK_EQ(_num_priors * _num_loc_classes * 4, sh_loc.count() / sh_loc.num()) << \ - "Number of priors must match number of location predictions."; - CHECK_EQ(_num_priors * _num_classes, sh_conf.count() / sh_conf.num()) << \ - "Number of priors must match number of confidence predictions."; if (_conf_cpu_data != nullptr) { fast_free(_conf_cpu_data); @@ -90,8 +112,8 @@ class SaberDetectionOutput : \ if (_bbox_cpu_data != nullptr) { fast_free(_bbox_cpu_data); } - _conf_cpu_data = (dtype*)fast_malloc(sizeof(dtype) * sh_conf.count()); - _bbox_cpu_data = (dtype*)fast_malloc(sizeof(dtype) * sh_loc.count()); + _conf_cpu_data = (float*)fast_malloc(sizeof(float) * sh_conf.count()); + _bbox_cpu_data = (float*)fast_malloc(sizeof(float) * sh_loc.count()); return SaberSuccess; } @@ -105,11 +127,12 @@ class SaberDetectionOutput : \ int _num_classes; int _num_loc_classes; int _num_priors; + bool _shared_loc{true}; Tensor _bbox_preds; Tensor _bbox_permute; Tensor _conf_permute; - dtype* _bbox_cpu_data{nullptr}; - dtype* _conf_cpu_data{nullptr}; + float* _bbox_cpu_data{nullptr}; + float* _conf_cpu_data{nullptr}; }; template class SaberDetectionOutput; } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_eltwise.h b/saber/funcs/impl/cuda/saber_eltwise.h index 4e302192e..e01d44099 100644 --- a/saber/funcs/impl/cuda/saber_eltwise.h +++ b/saber/funcs/impl/cuda/saber_eltwise.h @@ -43,10 +43,6 @@ class SaberEltwise: this->_ctx = &ctx; CHECK_GE(outputs.size(), 1) << "outputs size has to == 1"; CHECK_GE(inputs.size(), 2) << "input size has to >= 2"; - CHECK(!(inputs.size() > 2 - && param.operation == Eltwise_sum)) << - "not support input size>2 and operation==Eltwise_sum, size = " << inputs.size() << ",activation = " - << param.operation; _with_relu = param.has_eltwise && param.activation_param.active == Active_relu; _other_activation = param.has_eltwise && param.activation_param.active != Active_relu && param.activation_param.active != Active_unknow; @@ -54,6 +50,17 @@ class SaberEltwise: if (_other_activation) { SABER_CHECK(_saber_activation.init(inputs, outputs, param.activation_param, ctx)); } + int input_num = inputs.size(); + Shape coeff_shape({input_num, 1, 1, 1}, Layout_NCHW); + 
if (param.operation == Eltwise_sum) { + _coeff_d.re_alloc(coeff_shape, AK_FLOAT); + + OpDataType* coeff_data = (OpDataType*)_coeff_d.mutable_data(); + cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); + cudaMemcpyAsync(coeff_data, ¶m.coeff[0], sizeof(OpDataType) * input_num, cudaMemcpyHostToDevice, cuda_stream); + } + _inputs_d.re_alloc(coeff_shape, AK_UINT64); + return create(inputs, outputs, param, ctx); } @@ -80,6 +87,8 @@ class SaberEltwise: bool _with_relu; bool _other_activation; SaberActivation _saber_activation; + Tensor _coeff_d; + Tensor _inputs_d; }; diff --git a/saber/funcs/impl/cuda/saber_fc.h b/saber/funcs/impl/cuda/saber_fc.h index 7b32706ee..4fc71ed97 100644 --- a/saber/funcs/impl/cuda/saber_fc.h +++ b/saber/funcs/impl/cuda/saber_fc.h @@ -15,7 +15,7 @@ #define ANAKIN_SABER_FUNCS_CUDA_SABER_FC_H #include "saber/funcs/impl/impl_fc.h" -#include "sass_funcs.h" +#include "saber/funcs/gemm.h" namespace anakin{ @@ -28,52 +28,34 @@ class SaberFc: public ImplBase > { typedef typename DataTrait::Dtype OpDataType; SaberFc() = default; - ~SaberFc() {} + ~SaberFc() { + delete _gemm; + } virtual SaberStatus init(const std::vector *>& inputs, std::vector *>& outputs, - FcParam& param, Context& ctx){ - // get context - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); - } + FcParam& param, Context& ctx); virtual SaberStatus create(const std::vector *>& inputs, std::vector *>& outputs, - FcParam& param, Context& ctx){ - - if (!(&ctx == this->_ctx)) { - this->_ctx = &ctx; - } - - Shape shape_out = inputs[0]->valid_shape(); - _M = inputs[0]->count_valid(0, param.axis); - _K = inputs[0]->count_valid(param.axis, inputs[0]->dims()); - _N = param.num_output; - if (_N <= 0) { - int weight_size = param.weights->valid_size(); - _N = weight_size / _K; - } - //! 
weights dims must be in h and w - _flag_trans_weights = param.is_transpose_weights; - _kernel = saber_find_fast_sass_gemm(false, !_flag_trans_weights, _M, _N, _K); - return SaberSuccess; - } + FcParam& param, Context& ctx); virtual SaberStatus dispatch(const std::vector *>& inputs, std::vector *>& outputs, FcParam& param); - private: + + MatrixFunc *_gemm{nullptr}; + MatrixFunc *_gemm_s8f32{nullptr}; +// Gemm _gemm; bool _flag_trans_weights{false}; int _M; int _K; int _N; bool _is_continue_buf{true}; - std::function _kernel; + Tensor _inner_tensor; + Tensor _trans_weight; }; } //namespace saber diff --git a/saber/funcs/impl/cuda/saber_gemm.cpp b/saber/funcs/impl/cuda/saber_gemm.cpp index a510866a1..c6b48f611 100644 --- a/saber/funcs/impl/cuda/saber_gemm.cpp +++ b/saber/funcs/impl/cuda/saber_gemm.cpp @@ -1,6 +1,6 @@ #include "saber/funcs/impl/cuda/saber_gemm.h" - +#include "sass_funcs.h" namespace anakin { namespace saber { diff --git a/saber/funcs/impl/cuda/saber_gemm.h b/saber/funcs/impl/cuda/saber_gemm.h index 58208e653..900c55a82 100644 --- a/saber/funcs/impl/cuda/saber_gemm.h +++ b/saber/funcs/impl/cuda/saber_gemm.h @@ -1,15 +1,16 @@ -#ifndef SABER_FUNCS_IMPL_CUDA_SABER_GEMM_H -#define SABER_FUNCS_IMPL_CUDA_SABER_GEMM_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GEMM_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GEMM_H #include "saber/funcs/gemm.h" -#include "sass_funcs.h" + namespace anakin { namespace saber { template -class Gemm { +class Gemm + : public MatrixFunc{ public: Gemm() = default; ~Gemm() {} diff --git a/saber/funcs/impl/cuda/saber_generate_proposals.h b/saber/funcs/impl/cuda/saber_generate_proposals.h new file mode 100644 index 000000000..7450dde1f --- /dev/null +++ b/saber/funcs/impl/cuda/saber_generate_proposals.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GENERATE_PROPOSALS_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GENERATE_PROPOSALS_H + +#include "saber/funcs/impl/impl_generate_proposals.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberGenerateProposals : + public ImplBase< + NV, OpDtype, + GenerateProposalsParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberGenerateProposals() = default; + ~SaberGenerateProposals() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + GenerateProposalsParam& param, Context& ctx) { + this->_ctx = &ctx; + auto scores = inputs[0]; + auto bbox_deltas = inputs[1]; + Shape scores_shape = scores->valid_shape(); + Shape scores_swap_shape({scores_shape[0], + scores_shape[2], + scores_shape[3], + scores_shape[1]}, Layout_NCHW); + + Shape bbox_deltas_shape = bbox_deltas->valid_shape(); + Shape bbox_deltas_swap_shape({bbox_deltas_shape[0], + bbox_deltas_shape[2], + bbox_deltas_shape[3], + bbox_deltas_shape[1]}, Layout_NCHW); + _scores_swap.reshape(scores_swap_shape); + _bbox_deltas_swap.reshape(bbox_deltas_swap_shape); + _scores_index.reshape(inputs[0]->valid_shape()); + _sorted_scores.reshape(inputs[0]->valid_shape()); + _sorted_index.reshape(inputs[0]->valid_shape()); + _sorted_index.set_dtype(AK_INT32); + + int batch_size = inputs[0]->num(); + _proposals.reshape(std::vector{batch_size, param.pre_nms_top_n, 4, 1}); + _keep_num.reshape(std::vector{batch_size, 1, 1, 1}); + _keep_num.set_dtype(AK_INT32); + _keep.reshape(std::vector{batch_size, param.pre_nms_top_n, 1, 1}); + _keep.set_dtype(AK_INT32); + _keep_nms.reshape(std::vector{1, param.pre_nms_top_n, 1, 1}); + _boxes_out.reshape(std::vector{param.pre_nms_top_n, 5, 1, 1}); + _scores_out.reshape(std::vector{param.pre_nms_top_n, 1, 1, 1}); + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + GenerateProposalsParam& param, Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam& param); +private: + Tensor _scores_swap; + Tensor _bbox_deltas_swap; + Tensor _scores_index; + Tensor _sorted_scores; + Tensor _sorted_index; + Tensor _proposals; + Tensor _keep_num; + Tensor _keep; + Tensor _keep_nms; + Tensor _boxes_out; + Tensor _scores_out; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_GENERATE_PROPOSALS_H diff --git a/saber/funcs/impl/cuda/saber_gru.h b/saber/funcs/impl/cuda/saber_gru.h index 2eac66db4..882b2d3a4 100644 --- a/saber/funcs/impl/cuda/saber_gru.h +++ b/saber/funcs/impl/cuda/saber_gru.h @@ -33,9 +33,11 @@ class SaberGru: public ImplBase < typedef typename DataTrait::Dtype OpDataType; typedef Tensor OpTensor; - SaberGru() {} + SaberGru():_handle(NULL) {} ~SaberGru() { - + if (_handle != NULL) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } } virtual SaberStatus init(const std::vector& inputs, \ @@ -96,6 +98,10 @@ class SaberGru: public ImplBase < // cudaDeviceSynchronize(); } + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); return create(inputs, outputs, param, ctx); } @@ -104,7 +110,16 @@ class SaberGru: public ImplBase < GruParam& param, Context& ctx) { if (!(&ctx == this->_ctx)) { + if (_handle != NULL) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } + this->_ctx = &ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + 
CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); } std::vector> offset_vec=inputs[0]->get_seq_offset(); @@ -127,6 +142,7 @@ class SaberGru: public ImplBase < GruParam & param); private: + cublasHandle_t _handle; /** * for hw2seq diff --git a/saber/funcs/impl/cuda/saber_lstm.h b/saber/funcs/impl/cuda/saber_lstm.h index 5b5dee5af..2cfa81373 100644 --- a/saber/funcs/impl/cuda/saber_lstm.h +++ b/saber/funcs/impl/cuda/saber_lstm.h @@ -22,13 +22,10 @@ namespace anakin { namespace saber { -static int round_up(int k, int c) { - return ((k + c - 1) / c) * c; -} template class SaberLstm: public ImplBase < - NV, OpDtype,LstmParam > { + NV, OpDtype, LstmParam > { public: typedef typename DataTrait::Dtype OpDataType; @@ -43,14 +40,16 @@ class SaberLstm: public ImplBase < LstmParam & param, Context& ctx) { this->_ctx = &ctx; - if(param.with_peephole){ - _hidden_size=param.bias()->valid_size()/7; - }else{ - _hidden_size=param.bias()->valid_size()/4; + + if (param.with_peephole) { + _hidden_size = param.bias()->valid_size() / 7; + } else { + _hidden_size = param.bias()->valid_size() / 4; } - _word_size=(param.weight()->valid_size()-_hidden_size*_hidden_size*4)/_hidden_size/4; + + _word_size = (param.weight()->valid_size() - _hidden_size * _hidden_size * 4) / _hidden_size / 4; //TODO:add round_up to saber_util - _aligned_hidden_size=round_up(_hidden_size,32); + _aligned_hidden_size = utils::round_up(_hidden_size, 32); _seq_util = SeqSortedseqTranseUtil(param.is_reverse); @@ -103,15 +102,15 @@ class SaberLstm: public ImplBase < SaberStatus dispatch_batch( - const std::vector < Tensor* >& inputs, - std::vector < Tensor* >& outputs, - LstmParam < NV >& param); + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param); SaberStatus dispatch_once( - const std::vector < Tensor* >& inputs, - std::vector < Tensor* >& outputs, - LstmParam < NV >& param); + const std::vector < Tensor* >& inputs, + std::vector < Tensor* >& outputs, + LstmParam < NV >& param); }; diff --git a/saber/funcs/impl/cuda/saber_lstmp.h b/saber/funcs/impl/cuda/saber_lstmp.h new file mode 100644 index 000000000..2fa4115b1 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_lstmp.h @@ -0,0 +1,83 @@ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LSTMP_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LSTMP_H +#include "saber/funcs/impl/impl_lstmp.h" +#include "sass_funcs.h" +namespace anakin { + +namespace saber { + +template +class SaberLstmp : public ImplBase < + NV, OpDtype, LstmParam > { + +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberLstmp() {} + + ~SaberLstmp() { + + } + + virtual SaberStatus init(const std::vector *>& inputs, \ + std::vector *>& outputs, \ + LstmParam& param, Context& ctx) { + + this->_ctx = &ctx; + _inner_hidden_dim = param.cell_dim; + _output_hidden_dim = param.project_dim; + CHECK_GT(param.cell_dim,0); + CHECK_GT(param.project_dim,0); + + CHECK_EQ(inputs.size(), 1) << "only support input size = 1"; + CHECK_EQ(outputs.size(), 1) << "only support outputs size = 1"; + CHECK_EQ(param.init_hidden() == nullptr, true) << "only support param.init_hidden() == nullptr"; + CHECK_EQ(param.num_layers, 1) << "only support param.num_layers==1"; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, \ + std::vector *>& 
outputs, \ + LstmParam& param, Context& ctx) { + if (!(&ctx == this->_ctx)) { + if (_handle != NULL) { + CUBLAS_CHECK(cublasDestroy(_handle)); + } + + this->_ctx = &ctx; + + cudaStream_t cuda_stream; + cuda_stream = ctx.get_compute_stream(); + CUBLAS_CHECK(cublasCreate(&_handle)); + CUBLAS_CHECK(cublasSetStream(_handle, cuda_stream)); + } + + return SaberSuccess; + } + + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + LstmParam& param); + +private: + + cublasHandle_t _handle; + + Tensor _wx_tensor; + Tensor _temp_hidden_tensor; + Tensor _temp_cell_tensor; + int _output_hidden_dim; + int _inner_hidden_dim; + + +}; +} +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_LSTMP_H diff --git a/saber/funcs/impl/cuda/saber_mat_mul.cpp b/saber/funcs/impl/cuda/saber_mat_mul.cpp index d972494d9..03ddc4318 100644 --- a/saber/funcs/impl/cuda/saber_mat_mul.cpp +++ b/saber/funcs/impl/cuda/saber_mat_mul.cpp @@ -18,7 +18,7 @@ SaberStatus SaberMatMul::dispatch( //should add batch gemm here for (int b = 0; b < param._b; b++) { - _kernel(param._m, param._n, param._k, 1.f, + _kernel(param._m, param._n, param._k, param._scale, X + b * param._m * param._k, 0.f, Y + b * param._k * param._n, diff --git a/saber/funcs/impl/cuda/saber_mean.h b/saber/funcs/impl/cuda/saber_mean.h new file mode 100644 index 000000000..b22e63ae7 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_mean.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_MEAN_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_MEAN_H + +#include "saber/funcs/impl/impl_mean.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberMean : + public ImplBase< + NV, OpDtype, + MeanParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberMean() {} + ~SaberMean() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + MeanParam& param, Context& ctx) { + + this->_ctx = &ctx; + create(inputs, outputs, param, ctx); + + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + MeanParam& param, Context &ctx) { + + _num_out = outputs[0]->num(); + _c_out = outputs[0]->channel(); + _h_out = outputs[0]->height(); + _w_out = outputs[0]->width(); + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + MeanParam& param); + +private: + int _num_out; + int _c_out; + int _h_out; + int _w_out; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_MATCH_MATRIX_H diff --git a/saber/funcs/impl/cuda/saber_one_hot.h b/saber/funcs/impl/cuda/saber_one_hot.h new file mode 100644 index 000000000..adea462e7 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_one_hot.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ONE_HOT_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ONE_HOT_H + +#include "saber/funcs/impl/impl_one_hot.h" +#include "saber/core/data_traits.h" + +namespace anakin { + +namespace saber { + +template +class SaberOneHot: \ + public ImplBase < + NV, OpDtype, + OneHotParam> { + +public: + + SaberOneHot() = default; + + ~SaberOneHot() = default; + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + OneHotParam& param, + Context& ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + OneHotParam& param, + Context& ctx) override; + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + OneHotParam& param) override; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ONE_HOT_H diff --git a/saber/funcs/impl/cuda/saber_pixel_shuffle.h b/saber/funcs/impl/cuda/saber_pixel_shuffle.h new file mode 100644 index 000000000..84f6a0327 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_pixel_shuffle.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PIXEL_SHUFFLE_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PIXEL_SHUFFLE_H + +#include "saber/funcs/impl/impl_pixel_shuffle.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberPixelShuffle:\ + public ImplBase< + NV, + OpDtype, + PixelShuffleParam> { + +public: + + SaberPixelShuffle() {} + + ~SaberPixelShuffle() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m, + Context &ctx){ + + return create(inputs, outputs, param, ctx); + } + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m, + Context &ctx){ + this -> _ctx = &ctx; + + _axes = inputs[0]->valid_shape().size() + 2; + Shape in_sh = inputs[0]->valid_shape(); + int new_c = in_sh.channel()/(param.rw * param.rh); + Shape in_new_sh; + Shape out_new_sh; + std::vector order; + in_new_sh.push_back(in_sh.num()); + out_new_sh.push_back(in_sh.num()); + if (param.channel_first){ + in_new_sh.push_back(new_c); + in_new_sh.push_back(param.rh); + in_new_sh.push_back(param.rw); + in_new_sh.push_back(in_sh.height()); + in_new_sh.push_back(in_sh.width()); + order = std::vector({0, 1, 4, 2, 5, 3}); + out_new_sh.push_back(new_c); + out_new_sh.push_back(in_sh.height()); + out_new_sh.push_back(param.rh); + out_new_sh.push_back(in_sh.width()); + out_new_sh.push_back(param.rw); + + } else { + in_new_sh.push_back(in_sh.height()); + in_new_sh.push_back(in_sh.width()); + in_new_sh.push_back(param.rh); + in_new_sh.push_back(param.rw); + in_new_sh.push_back(new_c); + order = std::vector({0, 1, 3, 2, 4, 5}); + out_new_sh.push_back(in_sh.height()); + out_new_sh.push_back(param.rh); + out_new_sh.push_back(in_sh.width()); + out_new_sh.push_back(param.rw); + out_new_sh.push_back(new_c); + } + Shape in_step = in_new_sh.get_stride(); + Shape out_step = out_new_sh.get_stride(); + + _permute_order.reshape(Shape({6, 1, 1, 1})); + _in_step.reshape(Shape({in_step.dims(), 1, 1, 1})); + _out_step.reshape(Shape({out_step.dims(), 1, 1, 1})); + + cudaMemcpy(_permute_order.mutable_data(), order.data(), + sizeof(int) * order.size(), cudaMemcpyHostToDevice); + cudaMemcpy(_in_step.mutable_data(), in_step.data(), + sizeof(int) * _in_step.size(), cudaMemcpyHostToDevice); + cudaMemcpy(_out_step.mutable_data(), out_step.data(), + sizeof(int) * _out_step.size(), cudaMemcpyHostToDevice); + + return SaberSuccess; + } + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m); + +private: + int _axes; + Tensor _permute_order; + Tensor _in_step; + Tensor _out_step; +}; + +template class SaberPixelShuffle; +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PixelShuffle_H diff --git a/saber/funcs/impl/cuda/saber_pooling.h b/saber/funcs/impl/cuda/saber_pooling.h index b99fbf829..499df1def 100644 --- a/saber/funcs/impl/cuda/saber_pooling.h +++ b/saber/funcs/impl/cuda/saber_pooling.h @@ -29,43 +29,36 @@ namespace saber{ template class SaberPooling:\ - public ImplBase< - NV,OpDtype, - PoolingParam> { - - public: - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - - SaberPooling(){} - - ~SaberPooling() {} - - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m, - Context &ctx) override { - - return SaberUnImplError; - - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m, - Context &ctx) override { - - return SaberUnImplError; - 
- } - - //call cudnnConvolutionForward here - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, - PoolingParam ¶m) { - - return SaberUnImplError; - } +public ImplBase< + NV, OpDtype, + PoolingParam> { +typedef ImplBase > Impl_t; +public: + + SaberPooling() = default; + + ~SaberPooling() { + delete _impl; + } + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, + Context &ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m, + Context &ctx) override; + + //call cudnnConvolutionForward here + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam ¶m) override; +private: + Tensor _int8_input; + Tensor _int8_output; + Impl_t* _impl{nullptr}; }; } diff --git a/saber/funcs/impl/cuda/saber_proposal_img_scale_to_cam_coords.h b/saber/funcs/impl/cuda/saber_proposal_img_scale_to_cam_coords.h index 90e23d749..a9191e396 100644 --- a/saber/funcs/impl/cuda/saber_proposal_img_scale_to_cam_coords.h +++ b/saber/funcs/impl/cuda/saber_proposal_img_scale_to_cam_coords.h @@ -36,22 +36,23 @@ class SaberProposalImgScaleToCamCoords:\ public: - SaberProposalImgScaleToCamCoords() - : _rois_boxes_data_host_tensor(NULL) - , _im_info_data_host_tensor(NULL) - , _cam2d_data_host_tensor(NULL) - , _prj_h_pred_data_host_tensor(NULL) - , _real_h_pred_data_host_tensor(NULL) - , _size3d_h_pred_data_host_tensor(NULL) - , _size3d_w_pred_data_host_tensor(NULL) - , _size3d_l_pred_data_host_tensor(NULL) - , _orien3d_sin_pred_data_host_tensor(NULL) - , _orien3d_cos_pred_data_host_tensor(NULL) - , _trunc_ratio_pred_data_host_tensor(NULL) - , _img_info_data_host_tensor(NULL) - , _cam_coords_data_host_tensor(NULL) - , _has_inited(false) - {} +// SaberProposalImgScaleToCamCoords() +// : _rois_boxes_data_host_tensor(NULL) +// , _im_info_data_host_tensor(NULL) +// , _cam2d_data_host_tensor(NULL) +// , _prj_h_pred_data_host_tensor(NULL) +// , _real_h_pred_data_host_tensor(NULL) +// , _size3d_h_pred_data_host_tensor(NULL) +// , _size3d_w_pred_data_host_tensor(NULL) +// , _size3d_l_pred_data_host_tensor(NULL) +// , _orien3d_sin_pred_data_host_tensor(NULL) +// , _orien3d_cos_pred_data_host_tensor(NULL) +// , _trunc_ratio_pred_data_host_tensor(NULL) +// , _img_info_data_host_tensor(NULL) +// , _cam_coords_data_host_tensor(NULL) +// , _has_inited(false) +// {} + SaberProposalImgScaleToCamCoords() = default; ~SaberProposalImgScaleToCamCoords() { if (_rois_boxes_data_host_tensor != NULL) { @@ -126,81 +127,81 @@ class SaberProposalImgScaleToCamCoords:\ std::vector*>& outputs, ProposalImgScaleToCamCoordsParam ¶m) override; private: - int num_class_; + int num_class_{0}; std::vector sub_class_num_class_; std::vector sub_class_bottom_idx_; std::vector sub_class_num_class_pre_sum_; - int total_sub_class_num_; + int total_sub_class_num_{0}; ProposalImgScaleToCamCoords_NormType prj_h_norm_type_; - bool has_size3d_and_orien3d_; + bool has_size3d_and_orien3d_{false}; // with trunc ratio - bool with_trunc_ratio_; - ProposalImgScaleToCamCoords_OrienType orien_type_; + bool with_trunc_ratio_{false}; + ProposalImgScaleToCamCoords_OrienType orien_type_{ProposalImgScaleToCamCoords_OrienType_PI}; std::set cls_ids_zero_size3d_w_; std::set cls_ids_zero_size3d_l_; std::set cls_ids_zero_orien3d_; - bool cmp_pts_corner_3d_; - bool cmp_pts_corner_2d_; - int num_top_channels_; - int size3d_h_bottom_idx_; - int size3d_w_bottom_idx_; - int size3d_l_bottom_idx_; - int orien3d_sin_bottom_idx_; - int 
orien3d_cos_bottom_idx_; - int trunc_ratio_bottom_idx_; - int cam_info_idx_st_in_im_info_; - bool need_ctr_2d_norm_; + bool cmp_pts_corner_3d_{false}; + bool cmp_pts_corner_2d_{false}; + int num_top_channels_{0}; + int size3d_h_bottom_idx_{0}; + int size3d_w_bottom_idx_{0}; + int size3d_l_bottom_idx_{0}; + int orien3d_sin_bottom_idx_{0}; + int orien3d_cos_bottom_idx_{0}; + int trunc_ratio_bottom_idx_{0}; + int cam_info_idx_st_in_im_info_{0}; + bool need_ctr_2d_norm_{false}; std::vector ctr_2d_means_; std::vector ctr_2d_stds_; - bool need_prj_h_norm_; + bool need_prj_h_norm_{false}; std::vector prj_h_means_; std::vector prj_h_stds_; - bool need_real_h_norm_; + bool need_real_h_norm_{false}; std::vector real_h_means_; std::vector real_h_stds_; - bool need_real_w_norm_; + bool need_real_w_norm_{false}; std::vector real_w_means_; std::vector real_w_stds_; - bool need_real_l_norm_; + bool need_real_l_norm_{false}; std::vector real_l_means_; std::vector real_l_stds_; - bool need_sin_norm_; + bool need_sin_norm_{false}; std::vector sin_means_; std::vector sin_stds_; - bool need_cos_norm_; + bool need_cos_norm_{false}; std::vector cos_means_; std::vector cos_stds_; - bool has_scale_offset_info_; - float im_width_scale_; - float im_height_scale_; - float cords_offset_x_; - float cords_offset_y_; - bool bbox_size_add_one_; + bool has_scale_offset_info_{false}; + float im_width_scale_{0.f}; + float im_height_scale_{0.f}; + float cords_offset_x_{0.f}; + float cords_offset_y_{0.f}; + bool bbox_size_add_one_{false}; // rotate coords by pitch - bool rotate_coords_by_pitch_; + bool rotate_coords_by_pitch_{false}; // whether regress ph rh as whole - bool regress_ph_rh_as_whole_; - bool need_real_h_norm_dps_; + bool regress_ph_rh_as_whole_{false}; + bool need_real_h_norm_dps_{false}; std::vector real_h_means_dps_; std::vector real_h_stds_dps_; - Tensor* _rois_boxes_data_host_tensor; - Tensor* _im_info_data_host_tensor; - Tensor* _cam2d_data_host_tensor; - Tensor* _prj_h_pred_data_host_tensor; - Tensor* _real_h_pred_data_host_tensor; - Tensor* _size3d_h_pred_data_host_tensor; - Tensor* _size3d_w_pred_data_host_tensor; - Tensor* _size3d_l_pred_data_host_tensor; - Tensor* _orien3d_sin_pred_data_host_tensor; - Tensor* _orien3d_cos_pred_data_host_tensor; - Tensor* _trunc_ratio_pred_data_host_tensor; - Tensor* _img_info_data_host_tensor; + Tensor* _rois_boxes_data_host_tensor{nullptr}; + Tensor* _im_info_data_host_tensor{nullptr}; + Tensor* _cam2d_data_host_tensor{nullptr}; + Tensor* _prj_h_pred_data_host_tensor{nullptr}; + Tensor* _real_h_pred_data_host_tensor{nullptr}; + Tensor* _size3d_h_pred_data_host_tensor{nullptr}; + Tensor* _size3d_w_pred_data_host_tensor{nullptr}; + Tensor* _size3d_l_pred_data_host_tensor{nullptr}; + Tensor* _orien3d_sin_pred_data_host_tensor{nullptr}; + Tensor* _orien3d_cos_pred_data_host_tensor{nullptr}; + Tensor* _trunc_ratio_pred_data_host_tensor{nullptr}; + Tensor* _img_info_data_host_tensor{nullptr}; std::vector *> _sub_class_datas_host_tensor_v; //output - Tensor* _cam_coords_data_host_tensor; - bool _has_inited; + Tensor* _cam_coords_data_host_tensor{nullptr}; + bool _has_inited{false}; }; } diff --git a/saber/funcs/impl/cuda/saber_ps_roi_pooling.h b/saber/funcs/impl/cuda/saber_ps_roi_pooling.h new file mode 100644 index 000000000..0405443ea --- /dev/null +++ b/saber/funcs/impl/cuda/saber_ps_roi_pooling.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PS_ROI_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_PS_ROI_POOLING_H + +#include "saber/funcs/impl/impl_ps_roi_pooling.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberPsRoiPool: + public ImplBase> { + +public: + + typedef typename DataTrait::Dtype OpDataType; + + SaberPsRoiPool() + {} + + ~SaberPsRoiPool() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PsRoiPoolParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + + + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PsRoiPoolParam ¶m, + Context &ctx) { + Shape inter_shape = inputs[0]->shape(); + int oc = outputs[0]->channel(); + int num = outputs[0]->num(); + inter_shape.set_num(param.pooled_height * param.pooled_width * oc); + inter_shape.set_channel(num); + inter_shape.set_width(param.crop_height); + inter_shape.set_height(param.crop_width); + _crop_data.re_alloc(inter_shape, OpDtype); + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PsRoiPoolParam ¶m); + +private: + Tensor _crop_data; + + +}; +template class SaberPsRoiPool; +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ROI_POOL_H diff --git a/saber/funcs/impl/cuda/saber_rcnn_proposal.cpp b/saber/funcs/impl/cuda/saber_rcnn_proposal.cpp index 86c5c5601..f939d1ab6 100644 --- a/saber/funcs/impl/cuda/saber_rcnn_proposal.cpp +++ b/saber/funcs/impl/cuda/saber_rcnn_proposal.cpp @@ -55,7 +55,6 @@ SaberStatus SaberRCNNProposal::dispatch( ProposalParam& param) { cudaStream_t cuda_stream = this->_ctx->get_compute_stream(); - float input_height = this->im_height_, input_width = this->im_width_; float min_size_w_cur = this->min_size_w_; float min_size_h_cur = this->min_size_h_; std::vector im_width_scale = std::vector(1, this->read_width_scale_); @@ -66,8 +65,8 @@ SaberStatus SaberRCNNProposal::dispatch( _img_info_glue.set_extern_tensor(inputs.back()); const float* img_info_data = (const float*)_img_info_glue.host_data(_ctx); - input_width = img_info_data[0]; - input_height = img_info_data[1]; + float input_width = img_info_data[0]; + float input_height = img_info_data[1]; CHECK_GT(input_width, 0); CHECK_GT(input_height, 0); im_width_scale.clear(); @@ -85,7 +84,7 @@ SaberStatus SaberRCNNProposal::dispatch( float bsz01 = this->bbox_size_add_one_ ? float(1.0) : float(0.0); - float min_size_mode_and_else_or = true; + bool min_size_mode_and_else_or = true; if (this->min_size_mode_ == DetectionOutputSSD_HEIGHT_OR_WIDTH) { min_size_mode_and_else_or = false; } else { diff --git a/saber/funcs/impl/cuda/saber_reduce.h b/saber/funcs/impl/cuda/saber_reduce.h new file mode 100644 index 000000000..a0624b7d2 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_reduce.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REDUCE_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REDUCE_H + +#include "saber/funcs/impl/impl_reduce.h" +#include +#include + +namespace anakin{ + +namespace saber{ + +template +class SaberReduce : + public ImplBase< + NV, OpDtype, + ReduceParam > { +public: + typedef ImplBase > Impl_t; + SaberReduce() = default; + ~SaberReduce() { + delete _impl; + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param); + +private: + Buffer _rdim_b; + Buffer _ndim_b; + Buffer _i_stride_b; + Buffer _o_stride_b; + Impl_t* _impl{nullptr}; + typedef void reduce_kernel( + const float*, float*, const int*, const int*, + const int*, const int*, int); + std::map>> _kernel_direct_map; + bool _template_reduction{false}; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REDUCE_H diff --git a/saber/funcs/impl/cuda/saber_reduce_min.h b/saber/funcs/impl/cuda/saber_reduce_min.h new file mode 100644 index 000000000..0531a2a73 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_reduce_min.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REDUCE_MIN_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_REDUCE_MIN_H + +#include "saber/funcs/impl/impl_reduce_min.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberReduceMin : + public ImplBase< + NV, OpDtype, + ReduceMinParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberReduceMin() {} + ~SaberReduceMin() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceMinParam& param, Context& ctx) { + + this->_ctx = &ctx; + create(inputs, outputs, param, ctx); + + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceMinParam& param, Context &ctx) { + + _num = inputs[0]->num(); + _channel = inputs[0]->channel(); + _height = inputs[0]->height(); + _width = inputs[0]->width(); + _rank = inputs[0]->valid_shape().size(); + if (!param.reduce_dim.empty()) { + //reduce dim isn't empty + + for (int i = 0; i < param.reduce_dim.size(); ++i) { + if (param.reduce_dim[i] < 0) { + _reduce_dim.push_back(param.reduce_dim[i] + _rank); + }else { + _reduce_dim.push_back(param.reduce_dim[i]); + } + } + } + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ReduceMinParam& param); + +private: + int _rank; // dimetions + int _num; + int _channel; + int _height; + int _width; + std::vector _reduce_dim; + Tensor _tensor_tmp; + +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_MATCH_MATRIX_H diff --git a/saber/funcs/impl/cuda/saber_roi_align.h b/saber/funcs/impl/cuda/saber_roi_align.h new file mode 100644 index 000000000..1d91cd3b1 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_roi_align.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ROI_ALIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ROI_ALIGN_H + +#include "saber/funcs/impl/impl_roi_align.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberRoiAlign: + public ImplBase> { + +public: + + typedef typename DataTrait::Dtype OpDataType; + + SaberRoiAlign() + {} + + ~SaberRoiAlign() { + + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + RoiAlignParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + create(inputs, outputs, param, ctx); + + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + RoiAlignParam ¶m, + Context &ctx) { + + Shape out_stride = outputs[0]->get_stride(); + Shape in_stride = inputs[0]->get_stride(); + _in_n_stride = in_stride[0]; + _in_c_stride = in_stride[1]; + _in_h_stride = in_stride[2]; + _in_w_stride = in_stride[3]; + _out_n_stride = out_stride[0]; + _out_c_stride = out_stride[1]; + _out_h_stride = out_stride[2]; + _out_w_stride = out_stride[3]; + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + RoiAlignParam ¶m); + +private: + int _in_n_stride; + int _in_c_stride; + int _in_h_stride; + int _in_w_stride; + int _out_n_stride; + int _out_c_stride; + int _out_h_stride; + int _out_w_stride; + const int _kROISize = 5; +}; + +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_ROI_POOL_H diff --git a/saber/funcs/impl/cuda/saber_rois_anchor_feature.h b/saber/funcs/impl/cuda/saber_rois_anchor_feature.h index 6162c36b4..61e6bcb45 100644 --- a/saber/funcs/impl/cuda/saber_rois_anchor_feature.h +++ b/saber/funcs/impl/cuda/saber_rois_anchor_feature.h @@ -25,8 +25,8 @@ class SaberRoisAnchorFeature : public ImplBase < NV, OpDtype, RoisAnchorFeatureParam > { public: - SaberRoisAnchorFeature() {} - ~SaberRoisAnchorFeature() {} + SaberRoisAnchorFeature() = default; + ~SaberRoisAnchorFeature() = default; virtual SaberStatus init(const std::vector*> &inputs, std::vector*> &outputs, @@ -44,19 +44,19 @@ class SaberRoisAnchorFeature : public ImplBase < RoisAnchorFeatureParam& param) override; private: bool _has_inited{false}; - int num_anchors_; - int num_top_iou_anchor_; - int min_num_top_iou_anchor_; - float iou_thr_; + int num_anchors_{0}; + int num_top_iou_anchor_{0}; + int min_num_top_iou_anchor_{0}; + float iou_thr_{0.f}; std::vector anchor_width_; std::vector anchor_height_; std::vector anchor_area_; - bool ft_ratio_h_; - bool ft_ratio_w_; - bool ft_log_ratio_h_; - bool ft_log_ratio_w_; - int num_ft_per_anchor_; - bool bbox_size_add_one_; + bool ft_ratio_h_{false}; + bool ft_ratio_w_{false}; + bool ft_log_ratio_h_{false}; + bool ft_log_ratio_w_{false}; + int num_ft_per_anchor_{0}; + bool bbox_size_add_one_{false}; Tensor bottom; Tensor top; }; diff --git a/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.cpp b/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.cpp index 452ce2394..cef11f03f 100644 --- a/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.cpp +++ b/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.cpp @@ -21,7 +21,7 @@ SaberStatus SaberRPNProposalSSD::create( CHECK_EQ(1, this->heat_map_b_vec_.size()); if (outputs.size() == 0) { - CHECK_GT(this->num_class_, 0); + CHECK_GT(this->num_class_, 0); } num_anchors_ = this->anchor_x1_vec_.size(); @@ -70,7 +70,6 @@ SaberStatus SaberRPNProposalSSD::dispatch( std::vector*> &outputs, ProposalParam& param) { - float input_height = this->im_height_, input_width = this->im_width_; float min_size_w_cur = 
this->min_size_w_; float min_size_h_cur = this->min_size_h_; std::vector im_width_scale = std::vector(1, this->read_width_scale_); @@ -80,8 +79,8 @@ SaberStatus SaberRPNProposalSSD::dispatch( CHECK_EQ(inputs.back()->count(1, inputs.back()->dims()), 6); _img_info_glue.set_extern_tensor(inputs.back()); const float* img_info_data = (const float*)_img_info_glue.host_data(_ctx); - input_width = img_info_data[0]; - input_height = img_info_data[1]; + float input_width = img_info_data[0]; + float input_height = img_info_data[1]; CHECK_GT(input_width, 0); CHECK_GT(input_height, 0); im_width_scale.clear(); @@ -99,7 +98,7 @@ SaberStatus SaberRPNProposalSSD::dispatch( float bsz01 = this->bbox_size_add_one_ ? float(1.0) : float(0.0); - float min_size_mode_and_else_or = true; + bool min_size_mode_and_else_or = true; if (this->min_size_mode_ == DetectionOutputSSD_HEIGHT_OR_WIDTH) { min_size_mode_and_else_or = false; } else { diff --git a/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.h b/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.h index 72d892ab8..60da6fd0a 100644 --- a/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.h +++ b/saber/funcs/impl/cuda/saber_rpn_proposal_ssd.h @@ -34,11 +34,7 @@ class SaberRPNProposalSSD : public ImplROIOutputSSD < public: - SaberRPNProposalSSD() - : box_dev_nms_(NULL) - , boxes_dev_len(0) - , mask_dev_nms_(NULL) - {} + SaberRPNProposalSSD() = default; ~SaberRPNProposalSSD() { if (box_dev_nms_ != NULL) { @@ -63,10 +59,10 @@ class SaberRPNProposalSSD : public ImplROIOutputSSD < ProposalParam ¶m); private: - int num_rpns_; - int num_anchors_; - bool has_img_info_; - int rois_dim_; + int num_rpns_{0}; + int num_anchors_{0}; + bool has_img_info_{false}; + int rois_dim_{0}; // ADD CPU TENSORS PGlue, Tensor > _img_info_glue; @@ -78,9 +74,9 @@ class SaberRPNProposalSSD : public ImplROIOutputSSD < PGlue, Tensor > idx_sm_; //caffe pyramid_layers.hpp:615 - float* box_dev_nms_; - unsigned long long* mask_dev_nms_; - int boxes_dev_len; + float* box_dev_nms_{nullptr}; + unsigned long long* mask_dev_nms_{nullptr}; + int boxes_dev_len{0}; //caffe pyramid_layers.hpp:618 }; diff --git a/saber/funcs/impl/cuda/saber_sequence_concat.h b/saber/funcs/impl/cuda/saber_sequence_concat.h new file mode 100644 index 000000000..59f4f3cc5 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_sequence_concat.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_CONCAT_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_CONCAT_H + +#include "saber/funcs/impl/impl_sequence_concat.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSequenceConcat : + public ImplBase< + NV, OpDtype, + SequenceConcatParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberSequenceConcat() = default; + ~SaberSequenceConcat() = default; + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + SequenceConcatParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + SequenceConcatParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam& param); +private: + Tensor _out2in_map_tensor; + Tensor _out2in_word_map_tensor; + Tensor _in_locate_tensor; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_CONCAT_H diff --git a/saber/funcs/impl/cuda/saber_sequence_depadding.h b/saber/funcs/impl/cuda/saber_sequence_depadding.h new file mode 100644 index 000000000..3e9fdce24 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_sequence_depadding.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_DEPADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_DEPADDING_H + +#include "saber/funcs/impl/impl_sequence_depadding.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSequenceDePadding : + public ImplBase< + NV, OpDtype, + SequenceDePaddingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberSequenceDePadding() = default; + ~SaberSequenceDePadding() = default; + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + SequenceDePaddingParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + SequenceDePaddingParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam& param); +private: + Tensor _seq_id_map; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_DEPADDING_H diff --git a/saber/funcs/impl/cuda/saber_sequence_padding.h b/saber/funcs/impl/cuda/saber_sequence_padding.h new file mode 100644 index 000000000..1cfa70013 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_sequence_padding.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_PADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_PADDING_H + +#include "saber/funcs/impl/impl_sequence_padding.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSequencePadding : + public ImplBase< + NV, OpDtype, + SequencePaddingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberSequencePadding() = default; + ~SaberSequencePadding() = default; + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + SequencePaddingParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + SequencePaddingParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam& param); +private: + Tensor _in_seq_offset; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_PADDING_H diff --git a/saber/funcs/impl/cuda/saber_sequence_pool_concat.h b/saber/funcs/impl/cuda/saber_sequence_pool_concat.h new file mode 100644 index 000000000..241d5e256 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_sequence_pool_concat.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_POOL_CONCAT_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SEQUENCE_POOL_CONCAT_H + +#include "saber/funcs/impl/impl_sequence_pool_concat.h" +#include "saber/saber_funcs_param.h" +#include +#include + +namespace anakin { +namespace saber { + +template +class SaberSequencePoolConcat : + public ImplBase < NV, OpDtype, SequencePoolConcatParam > { + +public: + + SaberSequencePoolConcat() = default; + + ~SaberSequencePoolConcat() {} + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, + Context& ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, + Context& ctx) override; + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param) override; + +private: + Buffer _offset_buffer; +}; + +} +} + +#endif diff --git a/saber/funcs/impl/cuda/saber_slice_v2.h b/saber/funcs/impl/cuda/saber_slice_v2.h new file mode 100644 index 000000000..c10fa4ec5 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_slice_v2.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_CUDA_SABER_SLICE_V2_H +#define ANAKIN_SABER_FUNCS_CUDA_SABER_SLICE_V2_H + +#include "saber/funcs/impl/impl_slice_v2.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSliceV2: + public ImplBase> { + +public: + + typedef typename DataTrait::Dtype OpDataType; + + SaberSliceV2() = default; + ~SaberSliceV2() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m, + Context &ctx) { + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m, + Context &ctx); + + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m); + +private: + Tensor _starts_d; + Tensor _in_stride_d; + Tensor _out_shape_d; + Tensor _axes_d; + +}; +template class SaberSliceV2; +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_CUDA_SABER_SLICE_H diff --git a/saber/funcs/impl/cuda/saber_soft_sign.h b/saber/funcs/impl/cuda/saber_soft_sign.h new file mode 100644 index 000000000..468e2e735 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_soft_sign.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SOFT_SIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SOFT_SIGN_H + +#include "saber/funcs/impl/impl_soft_sign.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSoftSign : + public ImplBase< + NV, OpDtype, + SoftSignParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberSoftSign() = default; + ~SaberSoftSign() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + SoftSignParam& param, Context& ctx) { + this->_ctx = &ctx; + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + SoftSignParam& param, Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam& param); +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_SOFT_SIGN_H diff --git a/saber/funcs/impl/cuda/saber_softmax.h b/saber/funcs/impl/cuda/saber_softmax.h index 46108cb6e..f40005414 100644 --- a/saber/funcs/impl/cuda/saber_softmax.h +++ b/saber/funcs/impl/cuda/saber_softmax.h @@ -28,10 +28,6 @@ class SaberSoftmax: { public: typedef TargetWrapper API; - typedef Tensor DataTensor_in; - typedef Tensor DataTensor_out; - typedef Tensor OpTensor; - typedef typename DataTrait::Dtype OpDataType; SaberSoftmax() = default; @@ -44,63 +40,16 @@ class SaberSoftmax: * @param param * @param ctx */ - virtual SaberStatus init(const std::vector& inputs, - std::vector& outputs, - SoftmaxParam& param, Context& ctx) { - - //! get context - this->_ctx = &ctx; - return create(inputs, outputs, param, ctx); - } - - virtual SaberStatus create(const std::vector& inputs, - std::vector& outputs, - SoftmaxParam& param, Context& ctx) { - //! compute size - Shape shape_in = inputs[0]->valid_shape(); - Shape shape_out = outputs[0]->valid_shape(); - CHECK_EQ(shape_in == shape_out, true) << "valid shapes must be the same"; - _outer_num = inputs[0]->count_valid(0, param.axis); - _inner_num = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); - _axis_size = shape_in[param.axis]; - - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, API::get_device_id()); - size_t sharedmem_size = deviceProp.sharedMemPerBlock; - _max_dimsize = sharedmem_size / sizeof(OpDataType) / CUDA_NUM_THREADS; - - Shape sh_tmp({1, 1, 1, _outer_num * _inner_num}); - if (_axis_size > _max_dimsize){ - //! re_alloc device memory - _max_data.reshape(sh_tmp); - _sum_data.reshape(sh_tmp); - } - - //! CHECK whether the input or output tensor is with continuous buffer or not - _is_continue_buf = outputs[0]->is_continue_mem() && inputs[0]->is_continue_mem(); - _dims = shape_in.size(); - if (!_is_continue_buf) { - Shape sh_input_real_stride = inputs[0]->get_stride(); - Shape sh_output_real_stride = outputs[0]->get_stride(); - - //! 
re_alloc device memory - Shape sh({1, 1, 1, _dims}); - _valid_shape.reshape(sh); - _input_stride.reshape(sh); - _output_stride.reshape(sh); - - CUDA_CHECK(cudaMemcpy(_valid_shape.mutable_data(), inputs[0]->valid_shape().data(), \ - sizeof(int) * _dims, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(_input_stride.mutable_data(), sh_input_real_stride.data(), \ - sizeof(int) * _dims, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(_output_stride.mutable_data(), sh_output_real_stride.data(), \ - sizeof(int) * _dims, cudaMemcpyHostToDevice)); - } - return SaberSuccess; - } - - virtual SaberStatus dispatch(const std::vector& inputs, - std::vector& outputs, + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + SoftmaxParam& param, Context& ctx) override; + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + SoftmaxParam& param, Context& ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, SoftmaxParam& param); private: @@ -120,7 +69,6 @@ class SaberSoftmax: Tensor _max_data; Tensor _sum_data; }; -template class SaberSoftmax; } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/cuda/saber_yolo_box.h b/saber/funcs/impl/cuda/saber_yolo_box.h new file mode 100644 index 000000000..4f443fb42 --- /dev/null +++ b/saber/funcs/impl/cuda/saber_yolo_box.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_YOLO_BOX_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_YOLO_BOX_H + +#include "saber/funcs/impl/impl_yolo_box.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberYoloBox : + public ImplBase> { + +public: + + SaberYoloBox() = default; + ~SaberYoloBox() = default; + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam ¶m, + Context &ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam ¶m, + Context &ctx) override; + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam ¶m) override; + +private: +}; +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_YOLO_BOX_H diff --git a/saber/funcs/impl/cuda/vender_activation.h b/saber/funcs/impl/cuda/vender_activation.h index d60ded642..1f41bd48c 100644 --- a/saber/funcs/impl/cuda/vender_activation.h +++ b/saber/funcs/impl/cuda/vender_activation.h @@ -52,6 +52,9 @@ class VenderActivation : public ImplBase< ActivationParam& param, Context& ctx) { this->_ctx = &ctx; + if (param.active == Active_gelu || param.active == Active_swish) { + return SaberUnImplError; + } cudaStream_t cuda_stream; cuda_stream = ctx.get_compute_stream(); @@ -70,7 +73,7 @@ class VenderActivation : public ImplBase< virtual SaberStatus create(const std::vector *>& inputs, std::vector *>& outputs, ActivationParam& param, Context& ctx) { - if (param.active == Active_prelu || param.active == Active_stanh) { + if (param.active == Active_prelu || param.active == Active_stanh || param.active == Active_swish) { return SaberUnImplError; } if (!(&ctx == this->_ctx)) { @@ -119,7 +122,7 @@ class VenderActivation : public ImplBase< std::vector *>& outputs, ActivationParam& param) { - if (param.active == Active_prelu || param.active == Active_stanh) { + if (param.active == Active_prelu || param.active == Active_stanh || param.active == Active_gelu || param.active == Active_swish) { return SaberUnImplError; } const InDataType *in_data = (const InDataType *) inputs[0]->data(); diff --git a/saber/funcs/impl/cuda/vender_conv.cpp b/saber/funcs/impl/cuda/vender_conv.cpp index ed7286752..0d11a73ec 100644 --- a/saber/funcs/impl/cuda/vender_conv.cpp +++ b/saber/funcs/impl/cuda/vender_conv.cpp @@ -23,6 +23,10 @@ SaberStatus VenderConv2D::\ CUDNN_CHECK(cudnnCreate(&_handle)); CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); } + const Tensor *conv_weight = param.weight(); + if (_use_origin_weight) { + conv_weight = &_origin_weight; + } int input_num = inputs[0]->num(); int input_channel = inputs[0]->channel(); @@ -31,13 +35,13 @@ SaberStatus VenderConv2D::\ int output_channel = outputs[0]->channel(); int output_height = outputs[0]->height(); int output_width = outputs[0]->width(); - int kernel_h = param.weight()->height(); - int kernel_w = param.weight()->width(); + int kernel_h = conv_weight->height(); + int kernel_w = conv_weight->width(); int filter_dim_a[] = {output_channel, input_channel / param.group, kernel_h, kernel_w}; cudnn::setNDFilterDesc(&_filter_desc, - param.weight()->dims(), filter_dim_a, CUDNN_TENSOR_NCHW); + conv_weight->dims(), filter_dim_a, CUDNN_TENSOR_NCHW); Shape in_stride = inputs[0]->get_stride(); Shape out_stride = outputs[0]->get_stride(); @@ -48,18 +52,18 @@ SaberStatus VenderConv2D::\ output_height, output_width}; cudnn::setTensorNdDesc(&_input_descs, - inputs[0]->dims(), dim_a, &in_stride[0]); + inputs[0]->dims(), dim_a, &in_stride[0]); cudnn::setTensorNdDesc(&_output_descs, - 
outputs[0]->dims(), dim_b, &out_stride[0]); + outputs[0]->dims(), dim_b, &out_stride[0]); int pad_a[] = {param.pad_h, param.pad_w}; int filter_stride_a[] = {param.stride_h, param.stride_w}; int dilation_a[] = {param.dilation_h, param.dilation_w}; cudnn::setConvolutionNdDesc(&_conv_descs, - inputs[0]->dims() - 2, pad_a, - filter_stride_a, dilation_a); + inputs[0]->dims() - 2, pad_a, filter_stride_a, dilation_a); + if (param.activation_param.has_active && !_with_saber_act) { cudnn::set_activation_des(&_active_descs, param.activation_param.active); } @@ -98,7 +102,7 @@ SaberStatus VenderConv2D::\ int dim_bias[] = {1, output_channel, 1, 1}; int stride_bias[] = {output_channel, 1, 1, 1}; cudnn::setTensorNdDesc(&_bias_desc, - 4, dim_bias, stride_bias); + 4, dim_bias, stride_bias); } return SaberSuccess; } @@ -155,54 +159,58 @@ SaberStatus VenderConv2D::dispatch( std::vector*>& outputs, ConvParam& param) { + const Tensor *conv_weight = param.weight(); + if (_use_origin_weight) { + conv_weight = &_origin_weight; + } + const float* in_data = (const float*)inputs[0]->data(); float* out_data = (float*)outputs[0]->mutable_data(); - const float* weight_data = (const float*) param.weight()->data(); + const float* weight_data = (const float*) conv_weight->data(); if (param.activation_param.has_active && !_with_saber_act) { if (param.bias()->size() > 0) { const float * bias_data = (const float*)param.bias()->data(); CUDNN_CHECK(cudnnConvolutionBiasActivationForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - &_beta, - _output_descs, out_data, - _bias_desc, bias_data, - _active_descs, _output_descs, out_data)); + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + _filter_desc, weight_data, + _conv_descs, _fwd_algo, + _workspace, _workspace_fwd_sizes, + &_beta, _output_descs, + out_data, _bias_desc, bias_data, + _active_descs, _output_descs, out_data)); } else { CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - &_beta, - _output_descs, out_data)); + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + _filter_desc, weight_data, + _conv_descs, _fwd_algo, + _workspace, _workspace_fwd_sizes, + &_beta, _output_descs, out_data)); CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data, - &_beta, - _output_descs, out_data)); + cudnn::cudnnTypeWrapper::kOne(), + _output_descs, out_data, + &_beta, _output_descs, out_data)); } } else { CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - &_beta, - _output_descs, out_data)); + cudnn::cudnnTypeWrapper::kOne(), + _input_descs, in_data, + _filter_desc, weight_data, + _conv_descs, _fwd_algo, + _workspace, _workspace_fwd_sizes, + &_beta, _output_descs, out_data)); if (param.bias()->size() > 0) { // add up bias. 
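            // Note: cudnnConvolutionForward has no fused bias term, so the bias
            // (set up in create() as a 1 x C x 1 x 1 descriptor) is broadcast-added
            // across the N, H and W dimensions of the output by cudnnAddTensor below.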
const float *bias_data = (const float *) param.bias()->data(); CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data)); + cudnn::cudnnTypeWrapper::kOne(), + _bias_desc, bias_data, + cudnn::cudnnTypeWrapper::kOne(), + _output_descs, out_data)); } } if (_with_saber_act) { @@ -217,321 +225,23 @@ SaberStatus VenderConv2D::trans_weights(Tensor &target_weights return SaberUnImplError; } -// INT8 part -template <> -SaberStatus VenderConv2D::\ - create(const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context& ctx) { - - if (&ctx != this->_ctx) { - if (_handle != NULL) { - CUDNN_CHECK(cudnnDestroy(_handle)); - } - - this->_ctx = &ctx; - - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - } - - int input_num = inputs[0]->num(); - int input_channel = inputs[0]->channel(); - int input_height = inputs[0]->height(); - int input_width = inputs[0]->width(); - int output_channel = outputs[0]->channel(); - int output_height = outputs[0]->height(); - int output_width = outputs[0]->width(); - int in_size = inputs[0]->valid_size(); - int out_size = outputs[0]->valid_size(); - - // ====== int8 conv, the input channel must be a multiple of 4 - CHECK_EQ(input_channel % 4, 0); - - int kernel_h = param.weight()->height(); - int kernel_w = param.weight()->width(); - - int filter_dim_a[] = {output_channel, - input_channel, - kernel_h, kernel_w}; - - CUDNN_CHECK(cudnnSetFilterNdDescriptor(_filter_desc, CUDNN_DATA_INT8x4, - CUDNN_TENSOR_NCHW_VECT_C, - 4, filter_dim_a)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_input_descs, - CUDNN_TENSOR_NCHW_VECT_C, - CUDNN_DATA_INT8x4, - input_num, input_channel, - input_height, input_width)); - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_output_descs, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - input_num, output_channel, - output_height, output_width)); - - int pad_a[] = {param.pad_h, param.pad_w}; - int filter_stride_a[] = {param.stride_h, param.stride_w}; - int dilation_a[] = {param.dilation_h, param.dilation_w}; - - cudnn::setConvolutionNdDesc(&_conv_descs, - 2, pad_a, - filter_stride_a, dilation_a); - - if(param.activation_param.has_active) { - cudnn::set_activation_des(&_active_descs, param.activation_param.active); - } - - // true: use tensor core - // false: disable tensor core - cudnn::set_group_count(&_conv_descs, param.group); - - // Get fastest implement of cudnn - _fwd_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( - _handle, _input_descs, _filter_desc, _conv_descs, _output_descs, - _fwd_algo, &_workspace_fwd_sizes)); - - if (_workspace_fwd_sizes > _workspaceSizeInBytes) { - _workspaceSizeInBytes = _workspace_fwd_sizes; - - if (_workspaceData != NULL) { - cudaFree(_workspaceData); - } - - cudaMalloc(&_workspaceData, _workspaceSizeInBytes); - _workspace = reinterpret_cast(_workspaceData); - } - - if (param.bias()->size() > 0) { - int dim_bias[] = {1, output_channel, 1, 1}; - int stride_bias[] = {output_channel, 1, 1, 1}; - CUDNN_CHECK(cudnnSetTensor4dDescriptor(_bias_desc, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - 1, output_channel, 1, 1)); - } - - if (inputs[0]->get_dtype() == AK_FLOAT) { - int8_input.re_alloc(inputs[0]->valid_shape(), AK_INT8); - int8_input.set_layout(Layout_NCHW_C4); - } - - if (outputs[0]->get_dtype() == AK_INT8) { - if (outputs[0]->get_layout() != 
Layout_NCHW_C4) { - LOG(ERROR) << "output layout must be NCHW_C4 for nv gpu"; - } - int8_output.re_alloc(outputs[0]->valid_shape(), AK_FLOAT); - int8_output.set_layout(Layout_NCHW); - } - - return SaberSuccess; -} template <> -SaberStatus VenderConv2D::trans_weights(Tensor &target_weights, +SaberStatus VenderConv2D::trans_weights(Tensor &target_weights, Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, int stride_h, int stride_w, int group) { - if (target_weights.valid_size() == 0) { - return SaberSuccess; - } - if (target_weights.channel() % 4 == 0 && target_weights.num() % 4 == 0) { - // prepare int8 memory - Tensor weights_fp32_host; - Tensor weights_int8_host; - weights_fp32_host.re_alloc(target_weights.valid_shape(), AK_FLOAT); - weights_int8_host.re_alloc(target_weights.valid_shape(), AK_INT8); - weights_int8_host.set_layout(Layout_NCHW_C4); - weights_fp32_host.copy_from(target_weights); - convert_weights_to_nchw_c4_host(weights_int8_host, weights_fp32_host, *_ctx); - // Open this will be an inplace trans - - target_weights.set_dtype(AK_INT8); - target_weights.re_alloc(target_weights.valid_shape(), AK_INT8); - target_weights.set_layout(Layout_NCHW_C4); - target_weights.copy_from(weights_int8_host); - target_weights.set_scale(weights_int8_host.get_scale()); - if (target_bias.valid_size() > 0) { - Tensor bias_fp32_host; - Tensor bias_int32_host; - bias_fp32_host.re_alloc(target_bias.valid_shape(), AK_FLOAT); - bias_int32_host.re_alloc(target_bias.valid_shape(), AK_FLOAT); - bias_fp32_host.copy_from(target_bias); - convert_bias_host(bias_int32_host, bias_fp32_host, _in_scale, - target_weights.get_scale(), *_ctx); - target_bias.copy_from(bias_int32_host); - } - } - return SaberSuccess; -} -template <> -SaberStatus VenderConv2D::\ - init(const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context& ctx) { - - this->_ctx = &ctx; - bool use_int8 = true; - use_int8 &= ((inputs[0]->channel() % 4) == 0); - use_int8 &= ((outputs[0]->channel() % 4) == 0); - // INT8 only support Active relu - use_int8 &= ((!param.activation_param.has_active) - || (param.activation_param.active == Active_relu)); - - if (!use_int8) { - return SaberInvalidValue; - } else { - if (inputs[0]->get_scale().size() == 1) { - _in_scale = inputs[0]->get_scale()[0]; - } else { - LOG(FATAL) << "scale now support static calibrate only!!"; - } - } - - // ---- init cudnn resources ---- - _workspaceSizeInBytes = 0; - _workspaceData = NULL; - _workspace_fwd_sizes = 0; - // ---- get cuda resources ---- - cudaStream_t cuda_stream; - cuda_stream = ctx.get_compute_stream(); - - CUDNN_CHECK(cudnnCreate(&_handle)); - CUDNN_CHECK(cudnnSetStream(_handle, cuda_stream)); - - _workspace = NULL; - int in_channels = inputs[0]->channel(); - - // ---- create cudnn Descs ---- - cudnn::createFilterDesc(&_filter_desc); - cudnn::createTensorDesc(&_input_descs); - cudnn::createTensorDesc(&_output_descs); - cudnn::createConvolutionDesc(&_conv_descs); - if (param.activation_param.has_active) { - cudnn::create_activation_des(&_active_descs); - } - if (param.bias()->size() > 0) { - cudnn::createTensorDesc(&_bias_desc); - } - - cudnnCreateTensorDescriptor(&_input_nchw_descs); - cudnnCreateTensorDescriptor(&_output_nchw_descs); - - return create(inputs, outputs, param, ctx); -} - -template <> -SaberStatus VenderConv2D::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvParam& param) { -// LOG(INFO) << "conv int8 dispatch" -// << " input tensor dtype: " << (inputs[0]->get_dtype() == 
AK_FLOAT ? "AK_FLOAT" : "AK_INT8") -// << " output tensor dtype: " << (outputs[0]->get_dtype() == AK_FLOAT ? "AK_FLOAT" : "AK_INT8"); - const void* in_data = nullptr; - void* out_data = nullptr; - float in_scale = 0.f; - - if (inputs[0]->get_dtype() == AK_FLOAT) { - if (inputs[0]->get_scale().size() == 1) { - in_scale = inputs[0]->get_scale()[0]; - } else { - LOG(FATAL) << "scale now support static calibrate only!!"; - } - conv_calibrate_fp32_int8_c4(int8_input, *inputs[0], in_scale, *(this->_ctx)); - in_data = (const void *)int8_input.data(); - } else { - in_data = (const void*)inputs[0]->data(); - } - - if (outputs[0]->get_dtype() == AK_INT8) { - if (outputs[0]->get_layout() != Layout_NCHW_C4) { - LOG(ERROR) << "output layout must be NCHW_C4 for nv gpu"; - } - out_data = (void*)int8_output.mutable_data(); -// outputs[0]->set_layout(Layout_NCHW_C4); - } else { - out_data = (void*)outputs[0]->mutable_data(); - } - - const void* weight_data = (const void*) param.weight()->data(); - - const float* weights_scale = (const float*)param.weight()->get_scale_data(); - if (param.activation_param.has_active) { - if (param.bias()->valid_size() > 0) { - const void *bias_data = (const void *) param.bias()->data(); - CUDNN_CHECK(cudnnConvolutionBiasActivationForward( - _handle, cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data, - _bias_desc, bias_data, - _active_descs, _output_descs, out_data)); - } else { - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - - CUDNN_CHECK(cudnnActivationForward(_handle, _active_descs, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - } - } else { - CUDNN_CHECK(cudnnConvolutionForward(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _input_descs, in_data, - _filter_desc, weight_data, - _conv_descs, _fwd_algo, _workspace, _workspace_fwd_sizes, - cudnn::cudnnTypeWrapper::kZero(), - _output_descs, out_data)); - if (param.bias()->size() > 0) { - // add up bias. - const void *bias_data = (const void *) param.bias()->data(); - CUDNN_CHECK(cudnnAddTensor(_handle, - cudnn::cudnnTypeWrapper::kOne(), - _bias_desc, bias_data, - cudnn::cudnnTypeWrapper::kOne(), - _output_descs, out_data)); - } - } - if (outputs[0]->get_dtype() == AK_FLOAT) { - conv_calibrate_int32_fp32( - *outputs[0], *outputs[0], in_scale, weights_scale, *_ctx); - } else if (outputs[0]->get_dtype() == AK_INT8) { - // TODO THIS CAN BE A LOT OF WASTE OF PERF. - conv_calibrate_int32_fp32( - int8_output, int8_output, in_scale, weights_scale, *_ctx); - - std::vector out_scale_v = outputs[0]->get_scale(); - if (out_scale_v.size() != 1) { - LOG(FATAL) << "out scale set error, only support 1 scale for now!!! 
scale = " - << out_scale_v.size(); - } - float out_scale = out_scale_v[0]; - conv_calibrate_fp32_int8_c4(*outputs[0], int8_output, out_scale, *_ctx); - } - return SaberSuccess; + return SaberUnImplError; } template <> -SaberStatus VenderConv2D::trans_weights(Tensor &target_weights, - Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int stride_w, int group) { +SaberStatus VenderConv2D::trans_weights(Tensor &target_weights, + Tensor &target_bias, int pad_h, int pad_w, int dilation_h, int dilation_w, + int stride_h, int stride_w, int group) { return SaberUnImplError; } template class VenderConv2D; -template class VenderConv2D; +DEFINE_OP_TEMPLATE(VenderConv2D, ConvParam, NV, AK_INT8); DEFINE_OP_TEMPLATE(VenderConv2D, ConvParam, NV, AK_HALF); } } diff --git a/saber/funcs/impl/cuda/vender_conv.h b/saber/funcs/impl/cuda/vender_conv.h index a9a21f55e..b55a3c5c9 100644 --- a/saber/funcs/impl/cuda/vender_conv.h +++ b/saber/funcs/impl/cuda/vender_conv.h @@ -110,6 +110,15 @@ class VenderConv2D : public ImplBase< void set_beta(float beta) { _beta = beta; } + + template + void load_origin_weight(Tensor_h &origin_weight, Context &ctx) { + // run this function before init!!! + _origin_weight.re_alloc(origin_weight.valid_shape(), origin_weight.get_dtype()); + _origin_weight.async_copy_from(origin_weight, ctx.get_compute_stream()); + _use_origin_weight = true; + } + private: cudnnHandle_t _handle; cudnnConvolutionFwdAlgo_t _fwd_algo; @@ -137,8 +146,8 @@ class VenderConv2D : public ImplBase< bool _with_saber_act{false}; SaberActivation *_saber_act{nullptr}; float _in_scale; - Tensor int8_input; - Tensor int8_output; + Tensor _origin_weight; + bool _use_origin_weight{false}; }; diff --git a/saber/funcs/impl/cuda/vender_gemm.cpp b/saber/funcs/impl/cuda/vender_gemm.cpp index 3d6995fca..256df1eef 100644 --- a/saber/funcs/impl/cuda/vender_gemm.cpp +++ b/saber/funcs/impl/cuda/vender_gemm.cpp @@ -9,7 +9,7 @@ SaberStatus Gemm::init(const bool trans_a, const const int m, const int n, const int k, Context ctx) { - if (!(ctx == this->_ctx)) { + if ((!(ctx == this->_ctx)) || (_handle == nullptr)) { if (_handle != NULL) { CUBLAS_CHECK(cublasDestroy(_handle)); } @@ -49,7 +49,7 @@ template<> SaberStatus Gemm::init(const bool trans_a, const bool trans_b, const int m, const int n, const int k, Context ctx) { - if (!(ctx == this->_ctx)) { + if ((!(ctx == this->_ctx)) || (_handle == nullptr)) { if (_handle != NULL) { CUBLAS_CHECK(cublasDestroy(_handle)); } @@ -79,10 +79,24 @@ SaberStatus Gemm::dispatch( CHECK(ptr_a != nullptr); CHECK(ptr_b != nullptr); CHECK(ptr_c != nullptr); - - CUBLAS_CHECK(cublasSgemmEx(_handle, cu_trans_b, cu_trans_a, - _n, _m, _k, &alpha, ptr_b, CUDA_R_8I, _ldb, ptr_a, - CUDA_R_8I, _lda, &beta, ptr_c, CUDA_R_32F, _ldc)); + int generate_arch = Env::cur_env()[_ctx.get_device_id()]._info._generate_arch; + bool arch_check = generate_arch == 61; + if (arch_check) { +#if __CUDACC_VER_MAJOR__ >= 9 + CUBLAS_CHECK(cublasGemmEx(_handle, cu_trans_b, cu_trans_a, + _n, _m, _k, &alpha, ptr_b, CUDA_R_8I, _ldb, ptr_a, + CUDA_R_8I, _lda, &beta, ptr_c, CUDA_R_32F, _ldc, + CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); +#else + CUBLAS_CHECK(cublasSgemmEx(_handle, cu_trans_b, cu_trans_a, + _n, _m, _k, &alpha, ptr_b, CUDA_R_8I, _ldb, ptr_a, + CUDA_R_8I, _lda, &beta, ptr_c, CUDA_R_32F, _ldc)); +#endif + } else { + CUBLAS_CHECK(cublasSgemmEx(_handle, cu_trans_b, cu_trans_a, + _n, _m, _k, &alpha, ptr_b, CUDA_R_8I, _ldb, ptr_a, + CUDA_R_8I, _lda, &beta, ptr_c, CUDA_R_32F, _ldc)); + } return 
SaberSuccess; } @@ -91,7 +105,7 @@ SaberStatus Gemv::init(const bool trans, const in const int incx, const int incy, Context ctx) { - if (!(ctx == this->_ctx)) { + if ((!(ctx == this->_ctx)) || (_handle == nullptr)) { if (_handle != NULL) { CUBLAS_CHECK(cublasDestroy(_handle)); } diff --git a/saber/funcs/impl/cuda/vender_gemm.h b/saber/funcs/impl/cuda/vender_gemm.h index 70e8e8078..bd28a3d1b 100644 --- a/saber/funcs/impl/cuda/vender_gemm.h +++ b/saber/funcs/impl/cuda/vender_gemm.h @@ -1,6 +1,6 @@ -#ifndef SABER_FUNCS_IMPL_CUDA_VENDER_GEMM_H -#define SABER_FUNCS_IMPL_CUDA_VENDER_GEMM_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_GEMM_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_GEMM_H #include "saber/core/tensor.h" #include "saber/funcs/gemm.h" @@ -10,7 +10,8 @@ namespace saber { template -class Gemm { +class Gemm + : public MatrixFunc{ public: Gemm() = default; diff --git a/saber/funcs/impl/cuda/vender_pooling.h b/saber/funcs/impl/cuda/vender_pooling.h index 201cc6bed..fa5818985 100644 --- a/saber/funcs/impl/cuda/vender_pooling.h +++ b/saber/funcs/impl/cuda/vender_pooling.h @@ -33,8 +33,8 @@ class VenderPooling:\ typedef Tensor DataTensor_in; typedef Tensor DataTensor_out; typedef Tensor OpTensor; - - VenderPooling() : _handle(NULL) {} + + VenderPooling() : _handle(NULL), _input_descs(NULL), _output_descs(NULL), _pooling_descs(NULL) {} ~VenderPooling() { if (_handle != NULL) { diff --git a/saber/funcs/impl/cuda/vender_reduce.cpp b/saber/funcs/impl/cuda/vender_reduce.cpp new file mode 100644 index 000000000..bdc2882c2 --- /dev/null +++ b/saber/funcs/impl/cuda/vender_reduce.cpp @@ -0,0 +1,134 @@ + +#include "saber/funcs/impl/cuda/vender_reduce.h" +#include "saber/funcs/impl/cuda/cudnn_helper.h" +#include "saber/funcs/debug.h" + +namespace anakin { +namespace saber { + +template <> +SaberStatus VenderReduce::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param, Context& ctx) { + + if (&ctx != this->_ctx) { + if (_handle != NULL) { + CUDNN_CHECK(cudnnDestroy(_handle)); + } + this->_ctx = &ctx; + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, ctx.get_compute_stream())); + } + + int input_num = inputs[0]->num(); + int input_channel = inputs[0]->channel(); + int input_height = inputs[0]->height(); + int input_width = inputs[0]->width(); + int output_num = outputs[0]->num(); + int output_channel = outputs[0]->channel(); + int output_height = outputs[0]->height(); + int output_width = outputs[0]->width(); + + Shape in_stride = inputs[0]->get_stride(); + Shape out_stride = outputs[0]->get_stride(); + + int dim_a[] = {input_num, input_channel, + input_height, input_width}; + int dim_b[] = {output_num, output_channel, + output_height, output_width}; + + cudnn::setTensorNdDesc(&_input_descs, + inputs[0]->dims(), dim_a, &in_stride[0]); + + cudnn::setTensorNdDesc(&_output_descs, + outputs[0]->dims(), dim_b, &out_stride[0]); + + // todo add the parameters. 
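    // The switch below maps each Anakin reduce_type onto the matching
    // cudnnReduceTensorOp_t (min/max/add/avg/mul). The reduction is computed in
    // FP32 with NaN propagation disabled and no index output; the scratch space
    // it needs is then queried with cudnnGetReductionWorkspaceSize and
    // (re)allocated with cudaMalloc.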
+ + cudnnReduceTensorOp_t _reduce_tensor_op = CUDNN_REDUCE_TENSOR_MIN; + switch (param.reduce_type) { + case Reduce_min: + _reduce_tensor_op = CUDNN_REDUCE_TENSOR_MIN; + break; + case Reduce_max: + _reduce_tensor_op = CUDNN_REDUCE_TENSOR_MAX; + break; + case Reduce_sum: + _reduce_tensor_op = CUDNN_REDUCE_TENSOR_ADD; + break; + case Reduce_avg: + _reduce_tensor_op = CUDNN_REDUCE_TENSOR_AVG; + break; + case Reduce_prod: + _reduce_tensor_op = CUDNN_REDUCE_TENSOR_MUL; + break; + default: + LOG(FATAL) << "param reduce_type is unknown!!!!"; + break; + } + + cudnnDataType_t _reduce_tensor_comp_type = CUDNN_DATA_FLOAT; + cudnnNanPropagation_t _reduce_tensor_nan_opt = CUDNN_NOT_PROPAGATE_NAN; + cudnnReduceTensorIndices_t _reduce_tensor_indices = CUDNN_REDUCE_TENSOR_NO_INDICES; + cudnnIndicesType_t _reduce_tensor_indices_type = CUDNN_32BIT_INDICES; + + CUDNN_CHECK(cudnnSetReduceTensorDescriptor(_reduce_descs, + _reduce_tensor_op, + _reduce_tensor_comp_type, + _reduce_tensor_nan_opt, + _reduce_tensor_indices, + _reduce_tensor_indices_type)); + + CUDNN_CHECK(cudnnGetReductionWorkspaceSize( + _handle, _reduce_descs, _input_descs, _output_descs, + &_workspace_fwd_sizes)); + + if (_workspace != NULL) { + cudaFree(_workspace); + } + cudaMalloc(&_workspace, _workspace_fwd_sizes); + + return SaberSuccess; +} + +template <> +SaberStatus VenderReduce::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param, Context& ctx) { + + this->_ctx = &ctx; + CUDNN_CHECK(cudnnCreate(&_handle)); + CUDNN_CHECK(cudnnSetStream(_handle, ctx.get_compute_stream())); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&_input_descs)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&_output_descs)); + CUDNN_CHECK(cudnnCreateReduceTensorDescriptor(&_reduce_descs)); + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderReduce::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param) { + + const void * in_data = inputs[0]->data(); + void* out_data = outputs[0]->mutable_data(); + float alpha = param.coeff;// should be 1 for default impl. + float beta = 0.f; + CUDNN_CHECK(cudnnReduceTensor(_handle, _reduce_descs, + nullptr, 0, + _workspace, _workspace_fwd_sizes, + &alpha, _input_descs, in_data, + &beta, _output_descs, out_data)); + return SaberSuccess; +} + +template class VenderReduce; +DEFINE_OP_TEMPLATE(VenderReduce, ReduceParam, NV, AK_HALF); +DEFINE_OP_TEMPLATE(VenderReduce, ReduceParam, NV, AK_INT8); + +} // namespace saber. +} // namespace anakin. \ No newline at end of file diff --git a/saber/funcs/impl/cuda/vender_reduce.h b/saber/funcs/impl/cuda/vender_reduce.h new file mode 100644 index 000000000..a1ef68ade --- /dev/null +++ b/saber/funcs/impl/cuda/vender_reduce.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_REDUCE_H +#define ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_REDUCE_H + +#include "saber/funcs/impl/impl_reduce.h" +#include +#include + +namespace anakin{ + +namespace saber{ + +template +class VenderReduce : + public ImplBase< + NV, OpDtype, + ReduceParam > { +public: + VenderReduce() = default; + ~VenderReduce() { + CUDNN_CHECK(cudnnDestroy(_handle)); + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_input_descs)); + CUDNN_CHECK(cudnnDestroyTensorDescriptor(_output_descs)); + CUDNN_CHECK(cudnnDestroyReduceTensorDescriptor(_reduce_descs)); + cudaFree(_workspace); + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param); + +private: + cudnnHandle_t _handle{nullptr}; + cudnnTensorDescriptor_t _input_descs{nullptr}; + cudnnTensorDescriptor_t _output_descs{nullptr}; + cudnnReduceTensorDescriptor_t _reduce_descs{nullptr}; + size_t _workspace_fwd_sizes{0}; + void *_workspace{nullptr}; // aliases into _workspaceData +}; +} +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_VENDER_REDUCE_H diff --git a/saber/funcs/impl/detection_helper.cpp b/saber/funcs/impl/detection_helper.cpp index bc00e38a2..142260e02 100644 --- a/saber/funcs/impl/detection_helper.cpp +++ b/saber/funcs/impl/detection_helper.cpp @@ -109,26 +109,22 @@ void apply_nms_fast(const dtype* bboxes, const dtype* scores, int num, } template -void nms_detect(const dtype* bbox_cpu_data, const dtype* conf_cpu_data, std::vector& result, - \ - int batch_num, int class_num, int num_priors, int background_id, \ +void nms_detect(const dtype* bbox_cpu_data, const dtype* conf_cpu_data, std::vector& result, \ + const std::vector& priors, int class_num, int background_id, \ int keep_topk, int nms_topk, float conf_thresh, float nms_thresh, \ float nms_eta, bool share_location) { int num_kept = 0; std::vector>> all_indices; - - for (int i = 0; i < batch_num; ++i) { + long long conf_offset = 0; + long long bbox_offset = 0; + for (int i = 0; i < priors.size(); ++i) { std::map> indices; int num_det = 0; - const int conf_idx = i * class_num * num_priors; - int bbox_idx; + int num_priors = priors[i]; - if (share_location) { - bbox_idx = i * num_priors * 4; - } else { - bbox_idx = conf_idx * 4; - } + int conf_idx = class_num * conf_offset; + int bbox_idx = share_location? bbox_offset * 4 : bbox_offset * 4 * class_num; for (int c = 0; c < class_num; ++c) { if (c == background_id) { @@ -182,6 +178,8 @@ void nms_detect(const dtype* bbox_cpu_data, const dtype* conf_cpu_data, std::vec all_indices.push_back(indices); num_kept += num_det; } + conf_offset += num_priors; + bbox_offset += num_priors; } if (num_kept == 0) { @@ -193,15 +191,12 @@ void nms_detect(const dtype* bbox_cpu_data, const dtype* conf_cpu_data, std::vec int count = 0; - for (int i = 0; i < batch_num; ++i) { - const int conf_idx = i * class_num * num_priors; - int bbox_idx; - - if (share_location) { - bbox_idx = i * num_priors * 4; - } else { - bbox_idx = conf_idx * 4; - } + conf_offset = 0; + bbox_offset = 0; + for (int i = 0; i < priors.size(); ++i) { + int num_priors = priors[i]; + int conf_idx = class_num * conf_offset; + int bbox_idx = share_location? 
bbox_offset * 4 : bbox_offset * 4 * class_num; for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) { int label = it->first; @@ -227,6 +222,8 @@ void nms_detect(const dtype* bbox_cpu_data, const dtype* conf_cpu_data, std::vec ++count; } } + conf_offset += num_priors; + bbox_offset += num_priors; } } @@ -238,7 +235,7 @@ template void apply_nms_fast(const float* bboxes, const float* scores, int num, template void nms_detect(const float* bbox_cpu_data, const float* conf_cpu_data, std::vector& result, \ - int batch_num, int class_num, int num_priors, int background_id, \ + const std::vector& priors, int class_num, int background_id, \ int keep_topk, int nms_topk, float conf_thresh, float nms_thresh, float nms_eta, bool share_location); diff --git a/saber/funcs/impl/detection_helper.h b/saber/funcs/impl/detection_helper.h index da8f56236..c6a705a67 100644 --- a/saber/funcs/impl/detection_helper.h +++ b/saber/funcs/impl/detection_helper.h @@ -32,10 +32,14 @@ void apply_nms_fast(const dtype* bboxes, const dtype* scores, int num, float score_threshold, float nms_threshold, float eta, int top_k, std::vector* indices); +//! for one stage: +//! boxes number in each batch is the same +//! for two stage: +//! boxes number is compute by offset in loc or conf tensor template void nms_detect(const dtype* bbox_cpu_data, const dtype* conf_cpu_data, std::vector& result, \ - int batch_num, int class_num, int num_priors, int background_id, \ + const std::vector& priors, int class_num, int background_id, \ int keep_topk, int nms_topk, float conf_thresh, float nms_thresh, float nms_eta, bool share_location); diff --git a/saber/funcs/impl/impl_fake_quantize_abs_max.h b/saber/funcs/impl/impl_aligned_mat_mul.h similarity index 75% rename from saber/funcs/impl/impl_fake_quantize_abs_max.h rename to saber/funcs/impl/impl_aligned_mat_mul.h index 923e110bf..dd142dc6d 100644 --- a/saber/funcs/impl/impl_fake_quantize_abs_max.h +++ b/saber/funcs/impl/impl_aligned_mat_mul.h @@ -13,17 +13,17 @@ limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_IMPL_FAKE_QUANTIZE_ABS_MAX_H -#define ANAKIN_SABER_FUNCS_IMPL_FAKE_QUANTIZE_ABS_MAX_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_ALIGNED_MAT_MUL_H +#define ANAKIN_SABER_FUNCS_IMPL_ALIGNED_MAT_MUL_H #include "saber/funcs/impl/impl_macro.h" namespace anakin{ namespace saber{ -DEFINE_OP_CLASS(FakeQuantizeAbsMax, FakeQuantizeAbsMaxParam); +DEFINE_OP_CLASS(AlignedMatMul, AlignedMatMulParam); } } -#endif //ANAKIN_SABER_FUNCS_IMPL_FAKE_QUANTIZE_ABS_MAX_H +#endif //ANAKIN_SABER_FUNCS_IMPL_ALIGNED_MAT_MUL_H diff --git a/saber/funcs/impl/impl_anchor_generator.h b/saber/funcs/impl/impl_anchor_generator.h new file mode 100644 index 000000000..09a33d5fc --- /dev/null +++ b/saber/funcs/impl/impl_anchor_generator.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
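The detection_helper change in this hunk replaces the fixed (batch_num, num_priors) pair with a per-image `priors` vector, matching the comment added in detection_helper.h: one-stage detectors pass the same box count for every image, while two-stage detectors derive the counts from tensor offsets. Two hedged call-site sketches (thresholds, top-k values and offset numbers are placeholders):

    // Usage sketches for the reworked nms_detect interface; requires
    // "saber/funcs/impl/detection_helper.h".
    #include <vector>
    using anakin::saber::nms_detect;

    // One stage: every image carries the same number of prior boxes.
    void run_one_stage(const float* bbox, const float* conf,
                       std::vector<float>& result,
                       int batch, int boxes_per_image, int class_num) {
        std::vector<int> priors(batch, boxes_per_image);
        nms_detect(bbox, conf, result, priors, class_num,
                   /*background_id=*/0, /*keep_topk=*/100, /*nms_topk=*/400,
                   /*conf_thresh=*/0.05f, /*nms_thresh=*/0.45f,
                   /*nms_eta=*/1.f, /*share_location=*/true);
    }

    // Two stage: per-image box counts come from an offset vector such as
    // {0, 37, 82, 100} taken from the loc/conf tensor.
    void run_two_stage(const float* bbox, const float* conf,
                       std::vector<float>& result,
                       const std::vector<int>& offset, int class_num) {
        std::vector<int> priors;
        for (size_t i = 1; i < offset.size(); ++i) {
            priors.push_back(offset[i] - offset[i - 1]);
        }
        nms_detect(bbox, conf, result, priors, class_num, 0, 100, 400,
                   0.05f, 0.45f, 1.f, /*share_location=*/false);
    }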
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_ANCHOR_GENERATOR_H +#define ANAKIN_SABER_FUNCS_IMPL_ANCHOR_GENERATOR_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(AnchorGenerator, AnchorGeneratorParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_ANCHOR_GENERATOR_H diff --git a/saber/funcs/impl/impl_arithmetic.h b/saber/funcs/impl/impl_arithmetic.h new file mode 100644 index 000000000..b8308f4f1 --- /dev/null +++ b/saber/funcs/impl/impl_arithmetic.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_ARITHMETIC_H +#define ANAKIN_SABER_FUNCS_IMPL_ARITHMETIC_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(Arithmetic, ArithmeticParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_ARITHMETIC_H diff --git a/saber/funcs/impl/impl_attention_padding_mask.h b/saber/funcs/impl/impl_attention_padding_mask.h new file mode 100644 index 000000000..c7e8c4fb9 --- /dev/null +++ b/saber/funcs/impl/impl_attention_padding_mask.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_ATTENTION_PADDING_MASK_H +#define ANAKIN_SABER_FUNCS_IMPL_ATTENTION_PADDING_MASK_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(AttentionPaddingMask, AttentionPaddingMaskParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_ATTENTION_PADDING_MASK_H diff --git a/saber/funcs/impl/impl_base.h b/saber/funcs/impl/impl_base.h index 91571e532..752cf5212 100644 --- a/saber/funcs/impl/impl_base.h +++ b/saber/funcs/impl/impl_base.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_IMPL_BASE_IMPL_H @@ -18,6 +18,9 @@ #include "saber/core/context.h" #include "saber/core/tensor.h" +#if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG) +#include "saber/funcs/timer.h" +#endif namespace anakin { namespace saber { @@ -48,12 +51,99 @@ class ImplBase { Param ¶m) { return SaberUnImplError; } + void set_op_name(const char* name){_op_name = name;} + const char* get_op_name() { return _op_name.c_str();} protected: Param* _param; Context* _ctx; + std::string _op_name; +#if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG) + saber::SaberTimer _timer; + saber::SaberTimer _trans_timer; +#endif +}; +#if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG) +struct GOPS{ + float ts; + float ops; + GOPS operator+(const GOPS& right) { + GOPS out; + out.ts = this->ts + right.ts; + out.ops = this->ops + right.ops; + return out; + } +}; + +class OpTimer { +public: + static std::map& ops() { + static std::map* _timer = new std::map(); + return *_timer; + } + // Adds a timer type. + static void add_timer(const std::string& type, GOPS ts) { + std::map& _timer = ops(); + if (_timer.count(type) < 1) { + _timer[type] = ts; + } else { + GOPS tn = _timer[type] + ts; + _timer[type] = tn; + } + } + + static void clear_timer() { + std::map& _timer = ops(); + _timer.clear(); + } + + static GOPS get_timer(const std::string type) { + std::map& _timer = ops(); + if (_timer.count(type) < 1) { + LOG(ERROR) << "unknow type: " << type.c_str(); + return {0.f, 0.f}; + } + return _timer[type]; + } + + static void print_timer() { + std::map& _timer = ops(); + GOPS to = get_timer("total"); + if (to.ts <= 0.f) { + to.ts = 1.f; + } + for (auto& it : _timer) { + printf("op: %s, timer: %f, GOPS: %f, percent: %f%%\n", + it.first.c_str(), it.second.ts, 1e-6f * it.second.ops / it.second.ts, 100.f * it.second.ts / to.ts); + } + } + template + static void print_timer(Context const& ctx) { + + float cpu_freq_cur = ctx.get_mode() == SABER_POWER_HIGH \ + ? Env::cur_env()[0]._info._max_frequence : \ + Env::cur_env()[0]._info._min_frequence; + float cpu_ca_theory = cpu_freq_cur * 8.0f / 1000; + int th_num = ctx.get_threads(); + float cpus_ops = th_num * cpu_ca_theory; + + std::map& _timer = ops(); + GOPS to = get_timer("total"); + if (to.ts <= 0.f) { + to.ts = 1.f; + } + for (auto& it : _timer) { + printf("op: %s, timer: %f, GOPS: %f, percent: %f%%, cpu potential: %f%%\n", + it.first.c_str(), it.second.ts, 1e-6f * it.second.ops / it.second.ts, 100.f * it.second.ts / to.ts, + 1e-6f * it.second.ops / it.second.ts / cpus_ops * 100); + } + } + +private: + OpTimer() {} }; +#endif } } #endif //ANAKIN_SABER_FUNCS_IMPL_BASE_IMPL_H diff --git a/saber/funcs/impl/impl_box_clip.h b/saber/funcs/impl/impl_box_clip.h new file mode 100644 index 000000000..41a572263 --- /dev/null +++ b/saber/funcs/impl/impl_box_clip.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
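The impl_base.h hunk above adds per-op naming plus the GOPS/OpTimer helpers that aggregate time and work per operator type when ENABLE_OP_TIMER or ENABLE_DEBUG is defined. A hedged usage sketch (op names and numbers are invented; ts appears to be milliseconds, which is what the 1e-6f * ops / ts GOPS formula implies):

    // Illustrative use of the OpTimer/GOPS helpers added above.
    #if defined(ENABLE_OP_TIMER) || defined(ENABLE_DEBUG)
    using anakin::saber::OpTimer;
    using anakin::saber::GOPS;

    void profile_example() {
        // Accumulate per-op time (ms) and work (ops); repeated keys are summed.
        OpTimer::add_timer("conv",  GOPS{1.8f, 3.2e9f});
        OpTimer::add_timer("pool",  GOPS{0.4f, 1.1e8f});
        // "total" drives the percentage column printed by print_timer().
        OpTimer::add_timer("total", GOPS{2.2f, 3.31e9f});

        OpTimer::print_timer();   // time, GOPS and percent for each op
        OpTimer::clear_timer();   // reset between runs
    }
    #endif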
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_IMPL_BOX_CLIP_H +#define ANAKIN_SABER_FUNCS_IMPL_IMPL_BOX_CLIP_H +#include "saber/funcs/impl/impl_macro.h" +namespace anakin { + +namespace saber { + +DEFINE_OP_CLASS(BoxClip, EmptyParam); + +} +} +#endif //ANAKIN_IMPL_BOX_CLIP_H diff --git a/saber/funcs/impl/impl_coord2patch.h b/saber/funcs/impl/impl_coord2patch.h new file mode 100644 index 000000000..4e1e99478 --- /dev/null +++ b/saber/funcs/impl/impl_coord2patch.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_COORD2PATCH_H +#define ANAKIN_SABER_FUNCS_IMPL_COORD2PATCH_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(Coord2Patch, Coord2PatchParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_COORD2PATCH_H diff --git a/saber/funcs/impl/impl_cos_sim.h b/saber/funcs/impl/impl_cos_sim.h new file mode 100644 index 000000000..60f29a7a7 --- /dev/null +++ b/saber/funcs/impl/impl_cos_sim.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_COS_SIM_H +#define ANAKIN_SABER_FUNCS_IMPL_COS_SIM_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(CosSim, CosSimParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SOFT_SIGN_H diff --git a/saber/funcs/impl/impl_generate_proposals.h b/saber/funcs/impl/impl_generate_proposals.h new file mode 100644 index 000000000..e95914abb --- /dev/null +++ b/saber/funcs/impl/impl_generate_proposals.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_GENERATE_PROPOSALS_H +#define ANAKIN_SABER_FUNCS_IMPL_GENERATE_PROPOSALS_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(GenerateProposals, GenerateProposalsParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_GENERATE_PROPOSALS_H diff --git a/saber/funcs/impl/impl_lstmp.h b/saber/funcs/impl/impl_lstmp.h new file mode 100644 index 000000000..042b12350 --- /dev/null +++ b/saber/funcs/impl/impl_lstmp.h @@ -0,0 +1,11 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_IMPL_LSTMP_H +#define ANAKIN_SABER_FUNCS_IMPL_IMPL_LSTMP_H +#include "saber/funcs/impl/impl_macro.h" +namespace anakin { +namespace saber { + +DEFINE_OP_CLASS(Lstmp, LstmParam); + +} +} +#endif //ANAKIN_IMPL_LSTMP_H diff --git a/saber/funcs/impl/impl_mean.h b/saber/funcs/impl/impl_mean.h new file mode 100644 index 000000000..0bf950c00 --- /dev/null +++ b/saber/funcs/impl/impl_mean.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_MEAN_H +#define ANAKIN_SABER_FUNCS_IMPL_MEAN_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(Mean, MeanParam); +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_MEAN_H diff --git a/saber/funcs/impl/impl_one_hot.h b/saber/funcs/impl/impl_one_hot.h new file mode 100644 index 000000000..a6dfc92b1 --- /dev/null +++ b/saber/funcs/impl/impl_one_hot.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_ONE_HOT_H +#define ANAKIN_SABER_FUNCS_IMPL_ONE_HOT_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(OneHot, OneHotParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_ONE_HOT_H diff --git a/saber/funcs/impl/arm/impl/utils_arm.h b/saber/funcs/impl/impl_pad2d.h similarity index 66% rename from saber/funcs/impl/arm/impl/utils_arm.h rename to saber/funcs/impl/impl_pad2d.h index f7a2e782e..8de4c69ea 100644 --- a/saber/funcs/impl/arm/impl/utils_arm.h +++ b/saber/funcs/impl/impl_pad2d.h @@ -1,9 +1,10 @@ /* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -11,7 +12,18 @@ See the License for the specific language governing permissions and limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H -#define ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H -#endif //ANAKIN_SABER_FUNCS_ARM_IMPL_UTILS_ARM_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_PAD2D_H +#define ANAKIN_SABER_FUNCS_IMPL_PAD2D_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + + DEFINE_OP_CLASS(Pad2D, Pad2DParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_PAD2D_H diff --git a/saber/funcs/impl/impl_pixel_shuffle.h b/saber/funcs/impl/impl_pixel_shuffle.h new file mode 100644 index 000000000..8b2d2082c --- /dev/null +++ b/saber/funcs/impl/impl_pixel_shuffle.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_PIXEL_SHUFFLE_H +#define ANAKIN_SABER_FUNCS_IMPL_PIXEL_SHUFFLE_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(PixelShuffle, PixelShuffleParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_PIXEL_SHUFFLE_H diff --git a/saber/funcs/impl/impl_product_quant_embedding_with_vsum.h b/saber/funcs/impl/impl_product_quant_embedding_with_vsum.h new file mode 100644 index 000000000..eef403aa0 --- /dev/null +++ b/saber/funcs/impl/impl_product_quant_embedding_with_vsum.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_SABER_FUNCS_IMPL_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(ProductQuantEmbeddingWithVsum, ProductQuantEmbeddingWithVsumParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_QUANTEMBEDDINGWITHVSUM_H diff --git a/saber/funcs/impl/impl_proposal.h b/saber/funcs/impl/impl_proposal.h new file mode 100644 index 000000000..191c47738 --- /dev/null +++ b/saber/funcs/impl/impl_proposal.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 Baidu, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_IMPL_PROPOSAL_H +#define ANAKIN_SABER_FUNCS_IMPL_IMPL_PROPOSAL_H + +#include "saber/funcs/impl/impl_macro.h" + +namespace anakin { + +namespace saber { + +DEFINE_OP_CLASS(Proposal, ProposalParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_PROPOSAL_H diff --git a/saber/funcs/impl/impl_ps_roi_pooling.h b/saber/funcs/impl/impl_ps_roi_pooling.h new file mode 100644 index 000000000..b426a654b --- /dev/null +++ b/saber/funcs/impl/impl_ps_roi_pooling.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_PS_ROI_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_PS_ROI_POOLING_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(PsRoiPool, PsRoiPoolParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_PS_ROI_POOLING_H diff --git a/saber/funcs/impl/impl_pyramid_hash_quant_embedding_with_vsum.h b/saber/funcs/impl/impl_pyramid_hash_quant_embedding_with_vsum.h new file mode 100644 index 000000000..2ec84c57c --- /dev/null +++ b/saber/funcs/impl/impl_pyramid_hash_quant_embedding_with_vsum.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_SABER_FUNCS_IMPL_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(PyramidHashQuantEmbeddingWithVsum, PyramidHashQuantEmbeddingParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SOFT_SIGN_H diff --git a/saber/funcs/impl/impl_reduce.h b/saber/funcs/impl/impl_reduce.h new file mode 100644 index 000000000..73bd80e3e --- /dev/null +++ b/saber/funcs/impl/impl_reduce.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_REDUCE_H +#define ANAKIN_SABER_FUNCS_IMPL_REDUCE_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(Reduce, ReduceParam); +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_REDUCE_H diff --git a/saber/funcs/impl/impl_reduce_min.h b/saber/funcs/impl/impl_reduce_min.h new file mode 100644 index 000000000..d8b93cb48 --- /dev/null +++ b/saber/funcs/impl/impl_reduce_min.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_REDUCE_MIN_H +#define ANAKIN_SABER_FUNCS_IMPL_REDUCE_MIN_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(ReduceMin, ReduceMinParam); +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_MEAN_H diff --git a/saber/funcs/impl/impl_roi_align.h b/saber/funcs/impl/impl_roi_align.h new file mode 100644 index 000000000..a74fb2bee --- /dev/null +++ b/saber/funcs/impl/impl_roi_align.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_ROIALIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_ROIALIGN_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(RoiAlign, RoiAlignParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_ROIPOOLING_H diff --git a/saber/funcs/impl/impl_roi_output_ssd.h b/saber/funcs/impl/impl_roi_output_ssd.h index 866f81041..e0b5e2c69 100644 --- a/saber/funcs/impl/impl_roi_output_ssd.h +++ b/saber/funcs/impl/impl_roi_output_ssd.h @@ -46,7 +46,7 @@ class ImplROIOutputSSD : public ImplBase < nms_add_score_(false), num_class_(-1), do_bbox_norm_(false), read_height_offset_(0), atrs_reg_norm_idx_st_(-1), has_cam3d_(false), bbox_size_add_one_(false), zero_anchor_center_(false), kpts_classify_width_(-1), kpts_do_norm_(false), has_spmp_(false), spmp_dim_sum_(-1), - cam3d_bottom_idx_(-1), use_target_type_rcnn_(false), show_time_(false), + cam3d_bottom_idx_(-1), use_target_type_rcnn_(false), kpts_reg_as_classify_(false), kpts_classify_height_(-1), atrs_do_norm_(false), has_ftrs_(false), nms_among_classes_(false), channel_per_scale_(false), has_kpts_(false), kpts_exist_bottom_idx_(-1), kpts_reg_bottom_idx_(-1), @@ -213,7 +213,6 @@ class ImplROIOutputSSD : public ImplBase < } time_get_bbox_ = time_total_ = time_nms_ = 0; - show_time_ = ((getenv("SHOW_TIME") != NULL) && (getenv("SHOW_TIME")[0] == '1')); refine_out_of_map_bbox_ = detection_output_ssd_param.refine_out_of_map_bbox; std::copy(detection_output_ssd_param.class_indexes.begin(), detection_output_ssd_param.class_indexes.end(), @@ -621,7 +620,6 @@ class ImplROIOutputSSD : public ImplBase < OpDataType im_height_; bool rpn_proposal_output_score_; bool regress_agnostic_; - bool show_time_; OpDataType time_get_bbox_, time_total_, time_nms_, time_bbox_to_blob_; OpDataType allow_border_; OpDataType allow_border_ratio_; diff --git a/saber/funcs/impl/impl_seq_concat_seq_pool_soft_sign.h b/saber/funcs/impl/impl_seq_concat_seq_pool_soft_sign.h new file mode 100644 index 000000000..f0eaebf77 --- /dev/null +++ b/saber/funcs/impl/impl_seq_concat_seq_pool_soft_sign.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SeqConcatSeqPoolSoftSign, SeqConcatSeqPoolSoftSignParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H diff --git a/saber/funcs/impl/impl_sequence_concat.h b/saber/funcs/impl/impl_sequence_concat.h new file mode 100644 index 000000000..0dda29cd6 --- /dev/null +++ b/saber/funcs/impl/impl_sequence_concat.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_CONCAT_H +#define ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_CONCAT_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SequenceConcat, SequenceConcatParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_CONCAT_H diff --git a/saber/funcs/impl/impl_sequence_depadding.h b/saber/funcs/impl/impl_sequence_depadding.h new file mode 100644 index 000000000..41e9253de --- /dev/null +++ b/saber/funcs/impl/impl_sequence_depadding.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_DEPADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_DEPADDING_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SequenceDePadding, SequenceDePaddingParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_DEPADDING_H diff --git a/saber/funcs/impl/impl_sequence_padding.h b/saber/funcs/impl/impl_sequence_padding.h new file mode 100644 index 000000000..adf93d368 --- /dev/null +++ b/saber/funcs/impl/impl_sequence_padding.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_PADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_PADDING_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SequencePadding, SequencePaddingParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SEQUENCE_PADDING_H diff --git a/saber/funcs/impl/impl_sequence_pool_concat.h b/saber/funcs/impl/impl_sequence_pool_concat.h new file mode 100644 index 000000000..9ca0b7c66 --- /dev/null +++ b/saber/funcs/impl/impl_sequence_pool_concat.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SEQUENCEPOOL_CONCAT_H +#define ANAKIN_SABER_FUNCS_IMPL_SEQUENCEPOOL_CONCAT_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SequencePoolConcat, SequencePoolConcatParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SEQUENCEPOOL_H diff --git a/saber/funcs/impl/impl_slice_v2.h b/saber/funcs/impl/impl_slice_v2.h new file mode 100644 index 000000000..11c53c232 --- /dev/null +++ b/saber/funcs/impl/impl_slice_v2.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SLICE_V2_H +#define ANAKIN_SABER_FUNCS_IMPL_SLICE_V2_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SliceV2, SliceV2Param); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SLICE_V2_H diff --git a/saber/funcs/impl/impl_soft_sign.h b/saber/funcs/impl/impl_soft_sign.h new file mode 100644 index 000000000..ba6e2d577 --- /dev/null +++ b/saber/funcs/impl/impl_soft_sign.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SOFT_SIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_SOFT_SIGN_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SoftSign, SoftSignParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SOFT_SIGN_H diff --git a/saber/funcs/impl/impl_sproposal.h b/saber/funcs/impl/impl_sproposal.h new file mode 100644 index 000000000..c2300b546 --- /dev/null +++ b/saber/funcs/impl/impl_sproposal.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_SPROPOSAL_H +#define ANAKIN_SABER_FUNCS_IMPL_SPROPOSAL_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SProposal, SProposalParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SPROPOSAL_H diff --git a/saber/funcs/impl/impl_sroi_align.h b/saber/funcs/impl/impl_sroi_align.h new file mode 100644 index 000000000..e090f291c --- /dev/null +++ b/saber/funcs/impl/impl_sroi_align.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_IMPL_SROI_ALIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_IMPL_SROI_ALIGN_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(SRoiAlign, SRoiAlignParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_SROIPOOLING_H diff --git a/saber/funcs/impl/impl_yolo_box.h b/saber/funcs/impl/impl_yolo_box.h new file mode 100644 index 000000000..3a1ad3ae2 --- /dev/null +++ b/saber/funcs/impl/impl_yolo_box.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_YOLO_BOX_H +#define ANAKIN_SABER_FUNCS_IMPL_YOLO_BOX_H + +#include "saber/funcs/impl/impl_macro.h" +namespace anakin{ + +namespace saber{ + +DEFINE_OP_CLASS(YoloBox, YoloBoxParam); + +} +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_YOLO_BOX_H diff --git a/saber/funcs/impl/x86/.DS_Store b/saber/funcs/impl/x86/.DS_Store new file mode 100644 index 000000000..d95d48e48 Binary files /dev/null and b/saber/funcs/impl/x86/.DS_Store differ diff --git a/saber/funcs/impl/x86/anakin_thread.h b/saber/funcs/impl/x86/anakin_thread.h index f579b4a8d..889d50004 100644 --- a/saber/funcs/impl/x86/anakin_thread.h +++ b/saber/funcs/impl/x86/anakin_thread.h @@ -24,6 +24,11 @@ #define ANAKIN_THR_OMP 1 #define ANAKIN_THR_TBB 2 +#ifdef USE_SGX +#undef ANAKIN_THR +#define ANAKIN_THR ANAKIN_THR_SEQ +#endif + #if !defined(ANAKIN_THR) #define ANAKIN_THR ANAKIN_THR_OMP #endif @@ -32,9 +37,13 @@ #define ANAKIN_THR_SYNC 1 inline int anakin_get_max_threads() { return 1; } inline int anakin_get_num_threads() { return 1; } +inline int anakin_get_num_procs() { return 1; } +inline void anakin_set_num_threads(int val) {} inline int anakin_get_thread_num() { return 0; } inline int anakin_in_parallel() { return 0; } inline void anakin_thr_barrier() {} +inline void anakin_set_nested(int val) {} +inline void anakin_set_dynamic(int val) {} #elif ANAKIN_THR == ANAKIN_THR_OMP #include @@ -42,11 +51,15 @@ inline void anakin_thr_barrier() {} inline int anakin_get_max_threads() { return omp_get_max_threads(); } inline int anakin_get_num_threads() { return omp_get_num_threads(); } +inline int anakin_get_num_procs() { return omp_get_num_procs(); } +inline void anakin_set_num_threads(int val) { omp_set_num_threads(val); } inline int anakin_get_thread_num() { return omp_get_thread_num(); } inline int anakin_in_parallel() { return omp_in_parallel(); } inline void anakin_thr_barrier() { # pragma omp barrier } +inline void anakin_set_nested(int val) { omp_set_nested(val); } +inline void anakin_set_dynamic(int val) { omp_set_dynamic(val); } #elif ANAKIN_THR == ANAKIN_THR_TBB #include "tbb/parallel_for.h" @@ -74,26 +87,6 @@ namespace saber { inline bool anakin_thr_syncable() { return ANAKIN_THR_SYNC == 1; } -template -inline void balance211(T n, U team, U tid, T &n_start, T &n_end) { - T n_min = 1; - T &n_my = n_end; - if (team <= 1 || n == 0) { - n_start = 0; - n_my = n; - } else if (n_min == 1) { - // team = T1 + T2 - // n = T1*n1 + T2*n2 (n1 - n2 = 1) - T n1 = utils::div_up(n, (T)team); - T n2 = n1 - 1; - T T1 = n - n2 * (T)team; - n_my = (T)tid < T1 ? n1 : n2; - n_start = (T)tid <= T1 ? 
tid * n1 : T1 * n1 + ((T)tid - T1) * n2; - } - - n_end += n_start; -} - } // namespace saber } // namespace anakin diff --git a/saber/funcs/impl/x86/anakin_thread_parallel_nd.h b/saber/funcs/impl/x86/anakin_thread_parallel_nd.h index 2c7c2298d..323a2be7f 100644 --- a/saber/funcs/impl/x86/anakin_thread_parallel_nd.h +++ b/saber/funcs/impl/x86/anakin_thread_parallel_nd.h @@ -17,6 +17,8 @@ #ifndef SABER_FUNCS_IMPL_X86_ANAKIN_THREAD_PARALLEL_ND_H #define SABER_FUNCS_IMPL_X86_ANAKIN_THREAD_PARALLEL_ND_H +#include + /* This header must be included by anakin_thread.hpp only */ /* Functions: @@ -52,6 +54,129 @@ void parallel(int nthr, F f) { #endif } +template +inline void balance211(T n, U team, U tid, T &n_start, T &n_end) { + T n_min = 1; + T &n_my = n_end; + if (team <= 1 || n == 0) { + n_start = 0; + n_my = n; + } else if (n_min == 1) { + // team = T1 + T2 + // n = T1*n1 + T2*n2 (n1 - n2 = 1) + T n1 = (n + (T)team - 1) / (T)team; + T n2 = n1 - 1; + T T1 = n - n2 * (T)team; + n_my = (T)tid < T1 ? n1 : n2; + n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2; + } + + n_end += n_start; +} + +template +inline T nd_iterator_init(T start) { + return start; +} +template +inline T nd_iterator_init(T start, U& x, const W& X, Args&& ... tuple) { + start = nd_iterator_init(start, std::forward(tuple)...); + x = start % X; + return start / X; +} + +inline bool nd_iterator_step() { + return true; +} + +template +inline bool nd_iterator_step(U& x, const W& X, Args&& ... tuple) { + if (nd_iterator_step(std::forward(tuple)...)) { + x = (x + 1) % X; + return x == 0; + } + + return false; +} + +template +inline void parallel_nd(const T0 D0, const T1 D1, F f) { + const size_t work_amount = (size_t)D0 * D1; + + if (work_amount == 0) { + return; + } + + #pragma omp parallel + { + const int ithr = anakin_get_thread_num(); + const int nthr = anakin_get_num_threads(); + size_t start{0}, end{0}; + balance211(work_amount, nthr, ithr, start, end); + T0 d0{0}; + T1 d1{0}; + nd_iterator_init(start, d0, D0, d1, D1); + + for (size_t iwork = start; iwork < end; ++iwork) { + f(d0, d1); + nd_iterator_step(d0, D0, d1, D1); + } + } +} + +template +inline void parallel_nd(const T0 D0, const T1 D1, const T2 D2, F f) { + const size_t work_amount = (size_t)D0 * D1 * D2; + + if (work_amount == 0) { + return; + } + + #pragma omp parallel + { + const int ithr = anakin_get_thread_num(); + const int nthr = anakin_get_num_threads(); + size_t start{0}, end{0}; + balance211(work_amount, nthr, ithr, start, end); + T0 d0{0}; + T1 d1{0}; + T2 d2{0}; + nd_iterator_init(start, d0, D0, d1, D1, d2, D2); + + for (size_t iwork = start; iwork < end; ++iwork) { + f(d0, d1, d2); + nd_iterator_step(d0, D0, d1, D1, d2, D2); + } + } +} + +template +inline bool nd_iterator_jump(U& cur, const U end, W& x, const Y& X) { + U max_jump = end - cur; + U dim_jump = X - x; + + if (dim_jump <= max_jump) { + x = 0; + cur += dim_jump; + return true; + } else { + cur += max_jump; + x += max_jump; + return false; + } +} + +template +inline bool nd_iterator_jump(U& cur, const U end, W& x, const Y& X, + Args&& ... 
tuple) { + if (nd_iterator_jump(cur, end, std::forward(tuple)...)) { + x = (x + 1) % X; + return x == 0; + } + + return false; +} + /* for_nd section */ template @@ -69,10 +194,10 @@ void for_nd(const int ithr, const int nthr, const T0 &D0, const T1 &D1, F f) { balance211(work_amount, nthr, ithr, start, end); T0 d0{0}; T1 d1{0}; - utils::nd_iterator_init(start, d0, D0, d1, D1); + nd_iterator_init(start, d0, D0, d1, D1); for (size_t iwork = start; iwork < end; ++iwork) { f(d0, d1); - utils::nd_iterator_step(d0, D0, d1, D1); + nd_iterator_step(d0, D0, d1, D1); } } @@ -85,10 +210,10 @@ void for_nd(const int ithr, const int nthr, const T0 &D0, const T1 &D1, balance211(work_amount, nthr, ithr, start, end); T0 d0{0}; T1 d1{0}; T2 d2{0}; - utils::nd_iterator_init(start, d0, D0, d1, D1, d2, D2); + nd_iterator_init(start, d0, D0, d1, D1, d2, D2); for (size_t iwork = start; iwork < end; ++iwork) { f(d0, d1, d2); - utils::nd_iterator_step(d0, D0, d1, D1, d2, D2); + nd_iterator_step(d0, D0, d1, D1, d2, D2); } } @@ -101,10 +226,10 @@ void for_nd(const int ithr, const int nthr, const T0 &D0, const T1 &D1, balance211(work_amount, nthr, ithr, start, end); T0 d0{0}; T1 d1{0}; T2 d2{0}; T3 d3{0}; - utils::nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3); + nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3); for (size_t iwork = start; iwork < end; ++iwork) { f(d0, d1, d2, d3); - utils::nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3); + nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3); } } @@ -118,10 +243,10 @@ void for_nd(const int ithr, const int nthr, const T0 &D0, const T1 &D1, balance211(work_amount, nthr, ithr, start, end); T0 d0{0}; T1 d1{0}; T2 d2{0}; T3 d3{0}; T4 d4{0}; - utils::nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); + nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); for (size_t iwork = start; iwork < end; ++iwork) { f(d0, d1, d2, d3, d4); - utils::nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); + nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); } } @@ -135,11 +260,10 @@ void for_nd(const int ithr, const int nthr, const T0 &D0, const T1 &D1, balance211(work_amount, nthr, ithr, start, end); T0 d0{0}; T1 d1{0}; T2 d2{0}; T3 d3{0}; T4 d4{0}; T5 d5{0}; - utils::nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, - d5, D5); + nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); for (size_t iwork = start; iwork < end; ++iwork) { f(d0, d1, d2, d3, d4, d5); - utils::nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); + nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); } } @@ -149,11 +273,11 @@ void for_nd(const int ithr, const int nthr, const T0 &D0, const T1 &D1, template void parallel_nd(Args &&...args) { #if ANAKIN_THR == ANAKIN_THR_SEQ - for_nd(0, 1, utils::forward(args)...); + for_nd(0, 1, std::forward(args)...); #elif ANAKIN_THR == ANAKIN_THR_OMP # pragma omp parallel for_nd(anakin_get_thread_num(), anakin_get_num_threads(), - utils::forward(args)...); + std::forward(args)...); #endif } #else // ANAKIN_THR != ANAKIN_THR_TBB @@ -217,10 +341,10 @@ void parallel_nd(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, template void parallel_nd_in_omp(Args &&...args) { #if ANAKIN_THR == ANAKIN_THR_SEQ - for_nd(0, 1, utils::forward(args)...); + for_nd(0, 1, std::forward(args)...); #elif ANAKIN_THR == ANAKIN_THR_OMP for_nd(anakin_get_thread_num(), anakin_get_num_threads(), - utils::forward(args)...); + std::forward(args)...); #elif ANAKIN_THR == ANAKIN_THR_TBB 
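The balance211 helper moved into this header splits n work items over a team of threads into chunks whose sizes differ by at most one, which is what the `team = T1 + T2, n = T1*n1 + T2*n2 (n1 - n2 = 1)` comment describes. A standalone sketch that reproduces the partition (self-contained, not Anakin code):

    // Standalone illustration of the balance211 partition used by parallel_nd:
    // 10 items over 4 threads come out as [0,3) [3,6) [6,8) [8,10).
    #include <cstdio>

    static void balance_sketch(size_t n, int team, int tid,
                               size_t& start, size_t& end) {
        if (team <= 1 || n == 0) { start = 0; end = n; return; }
        size_t n1 = (n + team - 1) / team;   // ceil(n / team) items per "big" chunk
        size_t n2 = n1 - 1;                  // items per "small" chunk
        size_t T1 = n - n2 * team;           // number of threads that get n1 items
        size_t my = (size_t)tid < T1 ? n1 : n2;
        start = (size_t)tid <= T1 ? tid * n1 : T1 * n1 + (tid - T1) * n2;
        end = start + my;
    }

    int main() {
        for (int tid = 0; tid < 4; ++tid) {
            size_t s, e;
            balance_sketch(10, 4, tid, s, e);
            std::printf("thread %d -> [%zu, %zu)\n", tid, s, e);
        }
        return 0;
    }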
assert(!"unsupported parallel_nd_in_omp()"); #endif diff --git a/saber/funcs/impl/x86/detection_helper.cpp b/saber/funcs/impl/x86/detection_helper.cpp index dd807c563..59f05d8e3 100644 --- a/saber/funcs/impl/x86/detection_helper.cpp +++ b/saber/funcs/impl/x86/detection_helper.cpp @@ -1,4 +1,5 @@ #include "saber/funcs/impl/detection_helper.h" +#include namespace anakin{ namespace saber{ diff --git a/saber/funcs/impl/x86/gemm_u8s8s32x_conv.cpp b/saber/funcs/impl/x86/gemm_u8s8s32x_conv.cpp deleted file mode 100644 index 09f5ca7cc..000000000 --- a/saber/funcs/impl/x86/gemm_u8s8s32x_conv.cpp +++ /dev/null @@ -1,341 +0,0 @@ -#include "saber/funcs/impl/x86/gemm_u8s8s32x_conv.h" -#include "saber/funcs/impl/x86/x86_utils.h" -#include "mkl_cblas.h" -#include "anakin_thread.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -SaberStatus GemmU8S8S32XConv::init(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx) { - ConvParam *conv_param = &(param.conv_param); - this->_ctx = &ctx; - - Tensor *weights_reorder = conv_param->mutable_weight(); - if (weights_reorder == nullptr || weights_reorder->mutable_data() == nullptr) { - return SaberInvalidValue; - } - if (weights_internal_ != nullptr) { - delete weights_internal_; - weights_internal_ = nullptr; - } - weights_internal_ = new Tensor(weights_reorder->shape(), AK_INT8); - weights_internal_->set_scale(weights_reorder->get_scale()); - weight_reorder_oihw2hwio(weights_reorder, weights_internal_); - - return create(inputs, outputs, param, ctx); -} - -SaberStatus GemmU8S8S32XConv::create(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx) { - SaberStatus status = SaberSuccess; - ConvParam *conv_param = &(param.conv_param); - - status = init_conf(jcp, inputs, outputs, param); - if (status != SaberSuccess) { - return status; - } - - Tensor *bias_src = conv_param->mutable_bias(); - if (bias_internal_ != nullptr) { - delete bias_internal_; - bias_internal_ = nullptr; - } - if (bias_src != nullptr) { - bias_internal_ = new Tensor(bias_src->shape(), AK_INT32); - bias_internal_->set_scale(bias_src->get_scale()); - bias_reorder_nchw(*bias_src, *bias_internal_, bias_src->get_scale()); - } - - float scale_in = inputs[0]->get_scale()[0]; - float scale_out = outputs[0]->get_scale()[0]; - auto scale_w = weights_internal_->get_scale(); - std::vector().swap(scale_); - for (int i = 0; i < scale_w.size(); i++) { - this->scale_.push_back((scale_w[i] * scale_in) / scale_out); - } - - return status; -} - -SaberStatus GemmU8S8S32XConv::dispatch(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m) { - ConvParam *conv_param = &(param.conv_param); - const Tensor *bias = conv_param->bias(); - Tensor *wei = conv_param->mutable_weight(); - - CHECK_EQ(inputs[0]->get_dtype(), AK_UINT8) << "only support uint8 input type"; - const unsigned char *ptr_src = reinterpret_cast(inputs[0]->data()); - const char *ptr_weights = reinterpret_cast(weights_internal_->data()); - unsigned char *ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); - const int32_t *ptr_bias = nullptr; - int dst_type_size = type_length(outputs[0]->get_dtype()); - const auto oscale = scale_; - - if (bias_internal_ != nullptr) { - ptr_bias = reinterpret_cast(bias_internal_->data()); - } - - if (((wei->shape())[0] != 1) || ((wei->shape())[1] != 1)) { - wei = weights_internal_; - ptr_weights = reinterpret_cast(wei->data()); - } - - const size_t work_amount = jcp.ngroups * jcp.mb; - const size_t 
src_mb_stride = jcp.ngroups * jcp.ih * jcp.iw * jcp.ic; - const size_t src_g_stride = jcp.ic; - const size_t wei_g_stride = (jcp.is_dw || jcp.ngroups > 1) ? jcp.oc : 0; - const size_t dst_mb_stride = jcp.ngroups * jcp.oh * jcp.ow * jcp.oc; - const size_t dst_g_stride = jcp.oc; - const size_t dst_os_stride = jcp.oc * jcp.ngroups; - const bool do_relu = jcp.with_relu; - - parallel(jcp.nthr, [&](const int ithr, const int nthr) { - unsigned char *col = col_ + (ptrdiff_t) ithr * jcp.im2col_sz; - int32_t *acc = acc_ + (ptrdiff_t) ithr * jcp.os * jcp.oc; - - int n{0}, g{0}; - size_t start = 0, end = 0; - utils::balance211 (work_amount, nthr, ithr, start, end); - utils::nd_iterator_init (start, n, jcp.mb, g, jcp.ngroups); - - for (size_t iwork = start; iwork < end; ++iwork) { - const unsigned char *src = ptr_src + n * src_mb_stride + g * src_g_stride; - const char *wei = ptr_weights + g * wei_g_stride; - unsigned char *dst = ptr_dst + n * dst_mb_stride + g * dst_g_stride; - - if (jcp.need_im2col) { - im2col_u8 (jcp, src, col); - } - - const int M = jcp.oc; - const int K = jcp.ks * jcp.ic; - const int N = jcp.os; - const int8_t off_a = 0, off_b = 0; - const int32_t off_c = 0; - - cblas_gemm_s8u8s32 (CblasColMajor, CblasNoTrans, CblasNoTrans, - CblasFixOffset, M, N, K, 1., wei, M * jcp.ngroups, - off_a, jcp.need_im2col ? col : src, K, off_b, 0., acc, - M, (const int *) &off_c); - - #pragma omp parallel for collapse(2) - for (int os = 0; os < jcp.os; ++os) { - for (int oc = 0; oc < jcp.oc; ++oc) { - size_t acc_off = os * jcp.oc + oc; - - float d = (float) acc[acc_off]; - if (jcp.with_bias) { - d += *(ptr_bias + g * jcp.oc + oc); - } - - d *= oscale[g * jcp.oc + oc]; - if (do_relu) - d = (d < 0) ? 0 : d; - const size_t dst_off = os * dst_os_stride + oc; - dst[dst_off] = (uint8_t) nearbyintf(d); - } - } - - utils::nd_iterator_step (n, jcp.mb, g, jcp.ngroups); - } - }); - - return SaberSuccess; -} - -SaberStatus GemmU8S8S32XConv::init_conf(jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m) { - SaberStatus status = SaberSuccess; - ConvParam *conv_param = &(param.conv_param); - ActivationParam *act_param = &(conv_param->activation_param); - const Tensor *weights = conv_param->weight(); - const Tensor *bias = conv_param->bias(); - Tensor *input = inputs[0]; - Tensor *output = outputs[0]; - Shape src_shape; - Shape dst_shape; - Shape wgt_shape; - - if ((input == nullptr) || - (output == nullptr) || - (weights == nullptr)) { - return SaberInvalidValue; - } - - src_shape = input->shape(); - dst_shape = output->shape(); - wgt_shape = weights->shape(); - - jcp.ngroups = conv_param->group; - jcp.mb = src_shape[0]; - jcp.ih = src_shape[1]; - jcp.iw = src_shape[2]; - jcp.ic = src_shape[3] / jcp.ngroups; - jcp.oh = dst_shape[1]; - jcp.ow = dst_shape[2]; - jcp.oc = dst_shape[3] / jcp.ngroups; - jcp.kh = wgt_shape[2]; - jcp.kw = wgt_shape[3]; - jcp.is = jcp.ih * jcp.iw; - jcp.os = jcp.oh * jcp.ow; - jcp.ks = jcp.kh * jcp.kw; - jcp.im2col_sz = (ptrdiff_t)jcp.ic * jcp.ks * jcp.os; - jcp.need_im2col = !(jcp.oh == jcp.ih && - jcp.ow == jcp.iw && - jcp.ks == 1 && - jcp.ngroups == 1); - jcp.stride_h = conv_param->stride_h; - jcp.stride_w = conv_param->stride_w; - jcp.t_pad = conv_param->pad_h; - jcp.l_pad = conv_param->pad_w; - jcp.b_pad = conv_param->pad_h; - jcp.r_pad = conv_param->pad_w; - jcp.dilate_h = conv_param->dilation_h; - jcp.dilate_w = conv_param->dilation_w; - jcp.rm = conv_param->rm; - jcp.ur_h = 1; - jcp.is_dw = ((wgt_shape[1] == 1) && - (dst_shape[3] == 
src_shape[3])); - - // TODO remove this logic once group convolution enabled - if (jcp.ngroups > 1 && !jcp.is_dw) { - return SaberUnImplError; - } - - jcp.nthr = omp_get_max_threads(); - if (!(jcp.ic == 1 && - jcp.oc == 1 && - jcp.ngroups != 1) && - !(jcp.os / jcp.nthr < 64 && - jcp.mb != 1)) { - jcp.nthr = 1; - } - - jcp.with_bias = (bias != NULL); - jcp.with_relu = conv_param->activation_param.has_active; - if (jcp.with_relu) { - jcp.relu_negative_slope = static_cast(act_param->negative_slope); - } - - size_t col_size = (size_t) jcp.im2col_sz * sizeof (unsigned char); - size_t acc_size = (size_t) jcp.os * jcp.oc * sizeof (int32_t); - acc_ = (int32_t *) zmalloc(acc_size * jcp.nthr, 4096); - if (acc_ == nullptr) { - return SaberOutOfMem; - } - - col_ = (unsigned char *) zmalloc(col_size * jcp.nthr, 4096); - if (col_ == nullptr) { - zfree(acc_); - acc_ = nullptr; - return SaberOutOfMem; - } - memset(col_, 0, col_size * jcp.nthr); - - return SaberSuccess; -} - -SaberStatus GemmU8S8S32XConv::check_conf(const jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m) { - return SaberSuccess; -} - -SaberStatus GemmU8S8S32XConv::im2col_u8(const jit_conv_conf_t &jcp, - const unsigned char* im, - unsigned char* col) { - int num_thr = (jcp.mb != 1) ? omp_get_max_threads() : 1; - MAYBE_UNUSED(num_thr); - #pragma omp parallel for collapse(2) num_threads(num_thr) - for (int oh = 0; oh < jcp.oh; ++oh) { - for (int ow = 0; ow < jcp.ow; ++ow) { - for (int kh = 0; kh < jcp.kh; ++kh) { - const int ih = oh * jcp.stride_h - - jcp.t_pad + kh * jcp.dilate_h; - if (ih < 0 || ih >= jcp.ih) { - continue; - } - - for (int kw = 0; kw < jcp.kw; ++kw) { - const int iw = ow * jcp.stride_w - - jcp.l_pad + kw * jcp.dilate_w; - if (iw < 0 || iw >= jcp.iw) { - continue; - } - - const size_t col_idx = (((oh * jcp.ow + ow) * jcp.kh + kh) * jcp.kw + kw) * - jcp.ic; - const size_t im_idx = (ih * jcp.iw + iw) * jcp.ngroups * jcp.ic; - #pragma omp simd - for (int ic = 0; ic < jcp.ic; ++ic) { - col[col_idx + ic] = im[im_idx + ic]; - } - } - } - } - } - - return SaberSuccess; -} - -SaberStatus GemmU8S8S32XConv::weight_reorder_oihw2hwio(Tensor* in, - Tensor* out) { - if (in == nullptr || out == nullptr) { - LOG(ERROR) << "invalid input or output weight tensor!"; - return SaberInvalidValue; - } - - Shape shape = in->shape(); - int oc_value = shape[0]; - int ic_value = shape[1]; - int kh_value = shape[2]; - int kw_value = shape[3]; - int src_index =0; - int dst_index = 0; - - if ((oc_value == 1) && (ic_value == 1)) { - return SaberSuccess; - } - - int8_t *src = (int8_t *)in->mutable_data(); - int8_t *dst = (int8_t *)out->mutable_data(); - - if ((src == nullptr) || (dst == nullptr)) { - LOG(ERROR) << "invalid input or output weight tensor!"; - return SaberInvalidValue; - } - - #pragma omp parallel for collapse(4) - for (int oc = 0; oc < oc_value; oc++) { - for (int ic = 0; ic < ic_value; ic++) { - for (int kh = 0; kh < kh_value; kh++) { - for (int kw = 0; kw < kw_value; kw++) { - src_index = oc * ic_value * kh_value * kw_value + - ic * kh_value * kw_value + - kh * kw_value + - kw; - dst_index = kh * kw_value * ic_value * oc_value + - kw * ic_value * oc_value + - ic * oc_value + - oc; - dst[dst_index] = src[src_index]; - } - } - } - } - - return SaberSuccess; -} -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/gemm_u8s8s32x_conv.h b/saber/funcs/impl/x86/gemm_u8s8s32x_conv.h deleted file mode 100644 index 8f5388243..000000000 --- 
a/saber/funcs/impl/x86/gemm_u8s8s32x_conv.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_GEMM_U8S8S32X_CONV_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_GEMM_U8S8S32X_CONV_H - -#include "anakin_config.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/impl_macro.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -class GemmU8S8S32XConv : - public ImplBase< - X86, - AK_INT8, - ConvEltwiseParam > { -public: - typedef typename DataTrait::Dtype OpDataType; - - GemmU8S8S32XConv() - : weights_internal_(nullptr), acc_(nullptr), col_(nullptr), - bias_internal_(nullptr), ws_(nullptr), ws_per_thread_(0) { - memset(&jcp, 0, sizeof(jcp)); - } - - ~GemmU8S8S32XConv() { - if (bias_internal_ != nullptr) { - delete bias_internal_; - bias_internal_ = nullptr; - } - if (weights_internal_ != nullptr) { - delete weights_internal_; - weights_internal_ = nullptr; - } - if (ws_ != nullptr) { - delete ws_; - ws_ = nullptr; - } - if (acc_ != nullptr) { - delete acc_; - acc_ = nullptr; - } - if (col_ != nullptr) { - delete col_; - col_ = nullptr; - } - std::vector().swap(scale_); - } - - virtual SaberStatus init(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx); - - virtual SaberStatus create(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx); - - virtual SaberStatus dispatch(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m); - - -private: - Tensor* weights_internal_; - Tensor* bias_internal_; - int *ws_; - size_t ws_per_thread_; - int32_t *acc_; - unsigned char *col_; - jit_conv_conf_t jcp; - - // scale for quantization - std::vector scale_; - - virtual SaberStatus init_conf(jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m); - - virtual SaberStatus check_conf(const jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m); - - virtual SaberStatus im2col_u8(const jit_conv_conf_t &jcp, - const unsigned char * im, - unsigned char * col); - - virtual SaberStatus weight_reorder_oihw2hwio(Tensor* in, - Tensor* out); -}; - -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_GEMM_U8S8S32X_CONV_H diff --git a/saber/funcs/impl/x86/gemm_x8s8s32x_conv.cpp b/saber/funcs/impl/x86/gemm_x8s8s32x_conv.cpp new file mode 100644 index 000000000..72255d88a --- /dev/null +++ b/saber/funcs/impl/x86/gemm_x8s8s32x_conv.cpp @@ -0,0 +1,572 @@ +#include "saber/funcs/impl/x86/gemm_x8s8s32x_conv.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include "mkl_cblas.h" +#include "anakin_thread.h" +#include "debug.h" +namespace anakin { +namespace saber { + +using namespace jit; + +SaberStatus GemmX8S8S32XConv::init(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + 
Context& ctx) { + SaberStatus status = SaberUnImplError; + ConvParam* conv_param = &(param.conv_param); + + this->_ctx = &ctx; + jcp = jit_conv_conf_t(); + + status = check_conf(jcp, inputs, outputs, param); + + if (status != SaberSuccess) { + return status; + } + + status = init_conf(jcp, inputs, outputs, param); + + if (status != SaberSuccess) { + return status; + } + + _acc_tensor.re_alloc(Shape({1, 1, 1, jcp.os* jcp.oc * jcp.nthr}), AK_INT32); + _col_tensor.re_alloc(Shape({1, 1, 1, jcp.im2col_sz * jcp.nthr}), AK_UINT8); + _offset_tensor.re_alloc(Shape({1, 1, 1, 1}), AK_INT32); + return create(inputs, outputs, param, ctx); +} + +SaberStatus GemmX8S8S32XConv::create(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + Context& ctx) { + this->_ctx = &ctx; + ConvParam* conv_param = &(param.conv_param); + auto status = init_conf(jcp, inputs, outputs, param); + + if (status != SaberSuccess) { + return status; + } + + Tensor* weights_orig = conv_param->mutable_weight(); + + if (weights_orig->get_dtype() == AK_FLOAT) { + _weights_scale.re_alloc(weights_orig->valid_shape(), AK_INT8); + utils::ScaleUtils::scale_conv_weights_to_nchw_host(_weights_scale, *conv_param->weight()); + weights_orig = &_weights_scale; + } + + CHECK(weights_orig != nullptr); + + if (weights_internal_ != nullptr) { + delete weights_internal_; + weights_internal_ = nullptr; + } + + weights_internal_ = new Tensor(weights_orig->shape(), AK_INT8); + weights_internal_->set_scale(weights_orig->get_scale()); + weight_reorder_goihw2hwigo(weights_orig, weights_internal_); + + Tensor* bias_src = conv_param->mutable_bias(); + + if (bias_internal_ != nullptr) { + delete bias_internal_; + bias_internal_ = nullptr; + } + + if (bias_src != nullptr && bias_src->valid_size() > 0) { + Tensor* input = inputs[0]; + CHECK_EQ(bias_src->get_dtype(), AK_FLOAT); + bias_internal_ = new Tensor(bias_src->valid_shape(), AK_FLOAT); + auto weights_scale = weights_orig->get_scale(); + float in_scale = 1.f; + CHECK_GT(input->get_scale().size(), 0) << "only support input scale size > 0"; + + if (input->get_scale().size() > 0) { + in_scale = input->get_scale()[0]; + } + + std::vector scale_vec(bias_src->valid_size()); + + if (inputs[0]->get_dtype() == AK_UINT8) { + for (int i = 0; i < bias_src->valid_size(); i++) { + scale_vec[i] = (1.f / (weights_scale[i] * in_scale * (127.f / 255.f))); + } + } else if (inputs[0]->get_dtype() == AK_INT8) { + for (int i = 0; i < bias_src->valid_size(); i++) { + scale_vec[i] = (1.f / (weights_scale[i] * in_scale)); + } + } else { + LOG(FATAL) << "not support input dtype " << inputs[0]->get_dtype(); + } + + bias_internal_->set_scale(scale_vec); + bias_reorder_nchw(*bias_src, *bias_internal_, scale_vec); + } + + utils::try_expand_tensor(_acc_tensor, jcp.os * jcp.oc * jcp.nthr); + fill_tensor_const(_acc_tensor, 0); + acc_ = (int32_t*)_acc_tensor.mutable_data(); + + if (acc_ == nullptr) { + return SaberOutOfMem; + } + + utils::try_expand_tensor(_col_tensor, jcp.im2col_sz * jcp.nthr); + fill_tensor_const(_col_tensor, 0); + col_ = (uint8_t*)_col_tensor.mutable_data(); + + if (col_ == nullptr) { + return SaberOutOfMem; + } + + if (jcp.signed_input) { + utils::try_expand_tensor(_offset_tensor, jcp.ngroups * jcp.oc); + fill_tensor_const(_offset_tensor, 0); + offset_c_ = (int32_t*)_offset_tensor.mutable_data(); + + if (offset_c_ == nullptr) { + return SaberOutOfMem; + } + + compute_c_offset(jcp, reinterpret_cast(weights_internal_->data()), offset_c_); + } else { + utils::try_expand_tensor(_offset_tensor, 
1); + fill_tensor_const(_offset_tensor, 0); + offset_c_ = (int32_t*)_offset_tensor.mutable_data(); + + if (offset_c_ == nullptr) { + return SaberOutOfMem; + } + } + + + float scale_in = inputs[0]->get_scale()[0]; + float scale_out = 1.f; + + if (outputs[0]->get_scale().size() > 0 && outputs[0]->get_dtype() != AK_FLOAT) { + scale_out = outputs[0]->get_scale()[0]; + } + + auto scale_w = weights_internal_->get_scale(); + std::vector().swap(scale_); + + if (inputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_dtype() == AK_INT8) { + for (int i = 0; i < scale_w.size(); i++) { + this->scale_.push_back((scale_w[i] * scale_in) / scale_out); + } + } else if (inputs[0]->get_dtype() == AK_UINT8 && outputs[0]->get_dtype() == AK_UINT8) { + for (int i = 0; i < scale_w.size(); i++) { + this->scale_.push_back((scale_w[i] * scale_in * (127.f / 255.f)) / (scale_out * (127.f / 255.f))); + } + } else if (inputs[0]->get_dtype() == AK_UINT8 && outputs[0]->get_dtype() == AK_INT8) { + for (int i = 0; i < scale_w.size(); i++) { + this->scale_.push_back((scale_w[i] * scale_in * (127.f / 255.f)) / (scale_out)); + } + } else if (inputs[0]->get_dtype() == AK_UINT8 && outputs[0]->get_dtype() == AK_FLOAT) { + for (int i = 0; i < scale_w.size(); i++) { + this->scale_.push_back((scale_w[i] * scale_in * (127.f / 255.f))); + } + } else if (inputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_dtype() == AK_UINT8) { + for (int i = 0; i < scale_w.size(); i++) { + this->scale_.push_back((scale_w[i] * scale_in) / (scale_out * (127.f / 255.f))); + } + } else if (inputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_dtype() == AK_FLOAT) { + for (int i = 0; i < scale_w.size(); i++) { + this->scale_.push_back((scale_w[i] * scale_in)); + } + } else { + LOG(FATAL) << "can`t cal scale for dtype " << inputs[0]->get_dtype() << "," << + outputs[0]->get_dtype(); + } + + return SaberSuccess; +} + +template +SaberStatus GemmX8S8S32XConv::sub_dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param) { + ConvParam* conv_param = &(param.conv_param); + const Tensor* bias = conv_param->bias(); + const Tensor* wei = conv_param->mutable_weight(); + + const float* ptr_bias = nullptr; + const auto oscale = scale_; + auto* ptr_src = reinterpret_cast(inputs[0]->data()); + auto* ptr_weights = reinterpret_cast(weights_internal_->data()); + auto* ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + + if (bias_internal_ != nullptr) { + ptr_bias = reinterpret_cast(bias_internal_->data()); + } + + if (((wei->shape())[0] != 1) || ((wei->shape())[1] != 1)) { + wei = weights_internal_; + ptr_weights = reinterpret_cast(wei->data()); + } + + const size_t work_amount = jcp.ngroups * jcp.mb; + const size_t src_mb_stride = jcp.ngroups * jcp.ih * jcp.iw * jcp.ic; + const size_t src_g_stride = jcp.ic; + const size_t wei_g_stride = (jcp.ngroups > 1) ? 
jcp.oc : 0; + const size_t dst_mb_stride = jcp.ngroups * jcp.oh * jcp.ow * jcp.oc; + const size_t dst_g_stride = jcp.oc; + const size_t dst_os_stride = jcp.oc * jcp.ngroups; + const bool do_relu = jcp.with_relu; + const int32_t ithr = 0; + const int32_t nthr = 1; + // parallel(jcp.nthr, [&](const int32_t ithr, const int32_t nthr) { + auto col = col_ + (ptrdiff_t) ithr * jcp.im2col_sz; + auto acc = acc_ + (ptrdiff_t) ithr * jcp.os * jcp.oc; + + int32_t n = 0, g = 0; + size_t start = 0, end = 0; + balance211(work_amount, nthr, ithr, start, end); + nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups); + + for (auto iwork = start; iwork < end; ++iwork) { + auto src = ptr_src + n * src_mb_stride + g * src_g_stride; + auto wei = ptr_weights + g * wei_g_stride; + auto dst = ptr_dst + n * dst_mb_stride + g * dst_g_stride; + + if (jcp.need_im2col) { + im2col_u8(jcp, (const uint8_t*)src, col); + } + + auto M = jcp.oc; + auto K = jcp.ks * jcp.ic; + auto N = jcp.os; + int8_t offset_a = 0, offset_b = 0; + + if (jcp.signed_input) { + cblas_gemm_s8u8s32(CblasColMajor, CblasNoTrans, CblasNoTrans, + CblasColOffset, M, N, K, 1.f, wei, M * jcp.ngroups, + offset_a, jcp.need_im2col ? col : (const uint8_t*)src, K, offset_b, + 0.f, acc, M, offset_c_ + g * jcp.oc); + } else { + cblas_gemm_s8u8s32(CblasColMajor, CblasNoTrans, CblasNoTrans, + CblasFixOffset, M, N, K, 1.f, wei, M * jcp.ngroups, + offset_a, jcp.need_im2col ? col : (const uint8_t*)src, K, offset_b, + 0.f, acc, M, offset_c_); + } + + + for (auto os = 0; os < jcp.os; ++os) { + for (auto oc = 0; oc < jcp.oc; ++oc) { + auto acc_off = os * jcp.oc + oc; + auto g_oc = g * jcp.oc + oc; + + auto d = (float) acc[acc_off]; + + if (jcp.with_bias) { + d += *(ptr_bias + g_oc); + } + + d *= oscale[g_oc]; + + if (do_relu && d < 0) { + d = 0; + } + + auto dst_off = os * dst_os_stride + oc; + + if (std::is_same::value) { + dst[dst_off] = d; + } else { + dst[dst_off] = (OutputDtype) nearbyintf(d); + } + } + } + + nd_iterator_step(n, jcp.mb, g, jcp.ngroups); + } + + // }); + return SaberSuccess; +} +SaberStatus GemmX8S8S32XConv::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + DLOG(INFO) << "dispatch GemmX8S8S32XConv"; + + if (inputs[0]->get_dtype() == AK_UINT8 && outputs[0]->get_dtype() == AK_FLOAT) { + return this->template sub_dispatch(inputs, outputs, param); + } else if (inputs[0]->get_dtype() == AK_UINT8 && outputs[0]->get_dtype() == AK_UINT8) { + return this->template sub_dispatch(inputs, outputs, param); + } else if (inputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_dtype() == AK_UINT8) { + return this->template sub_dispatch(inputs, outputs, param); + } else if (inputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_dtype() == AK_INT8) { + return this->template sub_dispatch(inputs, outputs, param); + } else if (inputs[0]->get_dtype() == AK_INT8 && outputs[0]->get_dtype() == AK_FLOAT) { + return this->template sub_dispatch(inputs, outputs, param); + } else { + LOG(FATAL) << "not support"; + return SaberSuccess; + } +} + +SaberStatus GemmX8S8S32XConv::check_conf(const jit_conv_conf_t& jcp, + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + ConvParam* conv_param = &(param.conv_param); + ActivationParam* act_param = &(conv_param->activation_param); + Tensor const* weights = conv_param->weight(); + Tensor const* bias = conv_param->bias(); + Tensor const* input = inputs[0]; + Tensor* output = outputs[0]; + Shape src_shape = input->shape(); + Shape dst_shape = output->shape(); + Shape 
wgt_shape = weights->shape(); + auto group = conv_param->group; + + CHECK(input != nullptr); + CHECK(output != nullptr); + CHECK(weights != nullptr); + + if (weights_internal_ != nullptr) { + delete weights_internal_; + weights_internal_ = nullptr; + } + + if (bias_internal_ != nullptr) { + delete bias_internal_; + bias_internal_ = nullptr; + } + + auto ic_check = src_shape[3] % group; + auto oc_check = dst_shape[3] % group; + + if ((group > 1) & ((ic_check + oc_check) > 0)) { + LOG(ERROR) << "invalid input_channel or output_channel"; + return SaberInvalidValue; + } + + return SaberSuccess; +} + +SaberStatus GemmX8S8S32XConv::init_conf(jit_conv_conf_t& jcp, + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + SaberStatus status = SaberSuccess; + ConvParam* conv_param = &(param.conv_param); + ActivationParam* act_param = &(conv_param->activation_param); + Tensor const* weights = conv_param->weight(); + Tensor const* bias = conv_param->bias(); + Tensor const* input = inputs[0]; + Tensor* output = outputs[0]; + Shape src_shape = input->shape(); + Shape dst_shape = output->shape(); + Shape wgt_shape = weights->shape(); + + jcp.signed_input = (input->get_dtype() == AK_INT8) ? true : false; + jcp.ngroups = conv_param->group; + jcp.mb = src_shape[0]; + jcp.ih = src_shape[1]; + jcp.iw = src_shape[2]; + jcp.ic = src_shape[3] / jcp.ngroups; + jcp.oh = dst_shape[1]; + jcp.ow = dst_shape[2]; + jcp.oc = dst_shape[3] / jcp.ngroups; + jcp.kh = wgt_shape[2]; + jcp.kw = wgt_shape[3]; + jcp.is = jcp.ih * jcp.iw; + jcp.os = jcp.oh * jcp.ow; + jcp.ks = jcp.kh * jcp.kw; + jcp.stride_h = conv_param->stride_h; + jcp.stride_w = conv_param->stride_w; + jcp.t_pad = conv_param->pad_h; + jcp.l_pad = conv_param->pad_w; + jcp.b_pad = conv_param->pad_h; + jcp.r_pad = conv_param->pad_w; + jcp.dilate_h = conv_param->dilation_h; + jcp.dilate_w = conv_param->dilation_w; + jcp.rm = conv_param->rm; + jcp.ur_h = 1; + jcp.im2col_sz = (ptrdiff_t)jcp.ic * jcp.ks * jcp.os; + jcp.need_im2col = !(jcp.oh == jcp.ih && + jcp.ow == jcp.iw && + jcp.ks == 1 && + jcp.ngroups == 1 && + jcp.signed_input == false); + + auto mb_ngroup = jcp.mb * jcp.ngroups; + auto omp_max_threads = omp_get_max_threads(); + auto omp_mb_ngroup_threads = mb_ngroup < omp_max_threads ? + mb_ngroup : + omp_max_threads; + + if (jcp.mb != 1) { + jcp.nthr = omp_mb_ngroup_threads; + } else { + jcp.nthr = mb_ngroup > omp_max_threads / 2 ? 
+ omp_mb_ngroup_threads : 1; + } + + im2col_u8_method = 1; + + if (jcp.kh * jcp.kw != 1 && jcp.mb != 1) { + im2col_u8_method = 2; + } + + jcp.with_bias = (bias != NULL && bias->valid_size() > 0); + jcp.with_relu = conv_param->activation_param.has_active; + + if (jcp.with_relu) { + jcp.relu_negative_slope = static_cast(act_param->negative_slope); + } + + return SaberSuccess; +} + +SaberStatus GemmX8S8S32XConv::weight_reorder_goihw2hwigo(Tensor* in, + Tensor* out) { + auto src = reinterpret_cast(in->data()); + auto dst = reinterpret_cast(out->mutable_data()); + + if ((src == nullptr) || (dst == nullptr)) { + LOG(ERROR) << "invalid empty pointer"; + return SaberInvalidValue; + } + + Shape shape = in->shape(); + auto oc_value = shape[0]; + auto ic_value = shape[1]; + auto kh_value = shape[2]; + auto kw_value = shape[3]; + auto src_index = 0, dst_index = 0; + + + for (auto oc = 0; oc < oc_value; oc++) { + for (auto ic = 0; ic < ic_value; ic++) { + for (auto kh = 0; kh < kh_value; kh++) { + for (auto kw = 0; kw < kw_value; kw++) { + src_index = ((oc * ic_value + ic) * kh_value + kh) * kw_value + kw; + dst_index = ((kh * kw_value + kw) * ic_value + ic) * oc_value + oc; + dst[dst_index] = src[src_index]; + } + } + } + } + + + return SaberSuccess; +} + +SaberStatus GemmX8S8S32XConv::compute_c_offset(const jit_conv_conf_t& jcp, + const int8_t* src, + int32_t* dst) { + if (src == nullptr || dst == nullptr) { + LOG(FATAL) << "invalid empty pointer"; + return SaberInvalidValue; + } + + auto g_value = jcp.ngroups; + auto oc_value = jcp.oc; + auto ks_value = jcp.ks; + auto ic_value = jcp.ic; + + auto k_value = ks_value * ic_value, + g_oc_value = g_value * oc_value; + + for (auto k = 0; k < k_value; ++k) { + #pragma omp simd + + for (auto g_oc = 0; g_oc < g_oc_value; ++g_oc) { + auto src_index = k * g_oc_value + g_oc; + dst[g_oc] += -128 * src[src_index]; + } + } + + return SaberSuccess; +} + + +SaberStatus GemmX8S8S32XConv::im2col_u8(const jit_conv_conf_t& jcp, + const unsigned char* im, + unsigned char* col) { + auto jcp_oh = jcp.oh; + auto jcp_ow = jcp.ow; + auto jcp_kh = jcp.kh; + auto jcp_kw = jcp.kw; + auto jcp_t_pad = jcp.t_pad; + auto jcp_l_pad = jcp.l_pad; + auto jcp_stride_h = jcp.stride_h; + auto jcp_stride_w = jcp.stride_w; + auto jcp_ic = jcp.ic; + auto jcp_ngroups = jcp.ngroups; + + switch (im2col_u8_method) { + case 1: + parallel_nd(jcp.oh, jcp.ow, [&](int32_t oh, int32_t ow) { + for (auto kh = 0; kh < jcp.kh; ++kh) { + const auto ih = oh * jcp.stride_h - jcp.t_pad + kh * jcp.dilate_h; + + for (auto kw = 0; kw < jcp.kw; ++kw) { + const auto iw = ow * jcp.stride_w - jcp.l_pad + kw * jcp.dilate_w; + + const size_t col_idx = (((oh * jcp.ow + ow) * jcp.kh + kh) * jcp.kw + kw) * jcp.ic; + const size_t im_idx = (ih * jcp.iw + iw) * jcp.ngroups * jcp.ic; + #pragma omp simd + + for (auto ic = 0; ic < jcp.ic; ++ic) { + if (iw < 0 || iw >= jcp.iw || ih < 0 || ih >= jcp.ih) { + if (jcp.signed_input) { + col[col_idx + ic] = 128; + } else { + col[col_idx + ic] = 0; + } + } else { + col[col_idx + ic] = jcp.signed_input ? 
+ 128 + im[im_idx + ic] : + im[im_idx + ic]; + } + } + } + } + }); + + break; + + case 2: + #pragma omp parallel for collapse(2) num_threads(jcp.nthr) + for (auto oh = 0; oh < jcp.oh; ++oh) { + for (auto ow = 0; ow < jcp.ow; ++ow) { + for (auto kh = 0; kh < jcp.kh; ++kh) { + const auto ih = oh * jcp.stride_h - jcp.t_pad + kh * jcp.dilate_h; + + for (auto kw = 0; kw < jcp.kw; ++kw) { + const auto iw = ow * jcp.stride_w - jcp.l_pad + kw * jcp.dilate_w; + + const auto col_idx = (((oh * jcp.ow + ow) * jcp.kh + kh) * jcp.kw + kw) * jcp.ic; + const auto im_idx = (ih * jcp.iw + iw) * jcp.ngroups * jcp.ic; + #pragma omp simd + + for (auto ic = 0; ic < jcp.ic; ++ic) { + if (iw < 0 || iw >= jcp.iw || ih < 0 || ih >= jcp.ih) { + if (jcp.signed_input) { + col[col_idx + ic] = 128; + } else { + col[col_idx + ic] = 0; + } + } else { + col[col_idx + ic] = jcp.signed_input ? + 128 + im[im_idx + ic] : + im[im_idx + ic]; + } + } + } + } + } + } + + break; + } + + return SaberSuccess; +} + +} // namespace saber +} // namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/x86/gemm_x8s8s32x_conv.h b/saber/funcs/impl/x86/gemm_x8s8s32x_conv.h new file mode 100644 index 000000000..fc4355361 --- /dev/null +++ b/saber/funcs/impl/x86/gemm_x8s8s32x_conv.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_GEMM_X8S8S32X_CONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_GEMM_X8S8S32X_CONV_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_macro.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/funcs/impl/x86/x86_utils.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +class GemmX8S8S32XConv : + public ImplBase < + X86, + AK_INT8, + ConvEltwiseParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + GemmX8S8S32XConv() + : weights_internal_(nullptr), acc_(nullptr), col_(nullptr), + offset_c_(nullptr), bias_internal_(nullptr), ws_per_thread_(0) { + memset(&jcp, 0, sizeof(jcp)); + } + + ~GemmX8S8S32XConv() { + if (bias_internal_ != nullptr) { + delete bias_internal_; + bias_internal_ = nullptr; + } + + if (weights_internal_ != nullptr) { + delete weights_internal_; + weights_internal_ = nullptr; + } + + std::vector().swap(scale_); + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + Context& ctx); + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param); + + template + SaberStatus sub_dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param); +private: + Tensor* weights_internal_{nullptr}; + Tensor* bias_internal_{nullptr}; + size_t ws_per_thread_{0}; + size_t im2col_u8_method{0}; + uint8_t* col_{nullptr}; + int32_t* acc_{nullptr}; + int32_t* offset_c_{nullptr}; + Tensor _weights_scale; + Tensor _acc_tensor; + Tensor _col_tensor; + Tensor _offset_tensor; + + jit_conv_conf_t jcp; + + // scale for quantization + std::vector scale_; + + virtual SaberStatus init_conf(jit_conv_conf_t& jcp, + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param); + + virtual SaberStatus check_conf(const jit_conv_conf_t& jcp, + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param); + + virtual SaberStatus weight_reorder_goihw2hwigo(Tensor* in, + Tensor* out); + + virtual SaberStatus compute_c_offset(const jit_conv_conf_t& jcp, + const int8_t* in, + int32_t* out); + + virtual SaberStatus im2col_u8(const jit_conv_conf_t& jcp, + const unsigned char* im, + unsigned char* col); +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_GEMM_X8S8S32X_CONV_H \ No newline at end of file diff --git a/saber/funcs/impl/x86/intrinsic_gemm.cpp b/saber/funcs/impl/x86/intrinsic_gemm.cpp new file mode 100644 index 000000000..cdb5cc798 --- /dev/null +++ b/saber/funcs/impl/x86/intrinsic_gemm.cpp @@ -0,0 +1,5724 @@ +#include "intrinsic_gemm.h" + +#include +#include +#include +#include +#include +namespace anakin { + +namespace saber { +#if defined(__AVX2__) +inline void block8x8_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + //printf("block8x8_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + const int8_t* pa4 = pa0 + 4 * lda; + const int8_t* pa5 = pa0 + 5 * lda; + const int8_t* pa6 = pa0 + 6 * lda; + const int8_t* pa7 = pa0 + 7 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * 
ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + const int8_t* pb4 = pb0 + 4 * ldb; + const int8_t* pb5 = pb0 + 5 * ldb; + const int8_t* pb6 = pb0 + 6 * ldb; + const int8_t* pb7 = pb0 + 7 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + int* pc4 = c + 4 * ldc; + int* pc5 = c + 5 * ldc; + int* pc6 = c + 6 * ldc; + int* pc7 = c + 7 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma4_l; + __m256i ma5_l; + __m256i ma6_l; + __m256i ma7_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + __m256i ma4_h; + __m256i ma5_h; + __m256i ma6_h; + __m256i ma7_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb2_l; + __m256i mb3_l; + __m256i mb4_l; + __m256i mb5_l; + __m256i mb6_l; + __m256i mb7_l; + __m256i mb0_h; + __m256i mb1_h; + __m256i mb2_h; + __m256i mb3_h; + __m256i mb4_h; + __m256i mb5_h; + __m256i mb6_h; + __m256i mb7_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + __m256i mc8; + __m256i mc9; + __m256i mc10; + __m256i mc11; + __m256i mc12; + __m256i mc13; + __m256i mc14; + __m256i mc15; + + _mm_prefetch((char*) pa0, _MM_HINT_T0); + _mm_prefetch((char*) pa1, _MM_HINT_T0); + _mm_prefetch((char*) pa2, _MM_HINT_T0); + _mm_prefetch((char*) pa3, _MM_HINT_T0); + _mm_prefetch((char*) pa4, _MM_HINT_T0); + _mm_prefetch((char*) pa5, _MM_HINT_T0); + _mm_prefetch((char*) pa6, _MM_HINT_T0); + _mm_prefetch((char*) pa7, _MM_HINT_T0); + + _mm_prefetch((char*) pb0, _MM_HINT_T0); + _mm_prefetch((char*) pb1, _MM_HINT_T0); + _mm_prefetch((char*) pb2, _MM_HINT_T0); + _mm_prefetch((char*) pb3, _MM_HINT_T0); + _mm_prefetch((char*) pb4, _MM_HINT_T0); + _mm_prefetch((char*) pb5, _MM_HINT_T0); + _mm_prefetch((char*) pb6, _MM_HINT_T0); + _mm_prefetch((char*) pb7, _MM_HINT_T0); + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + __m256i sum8 = _mm256_setzero_si256(); + __m256i sum9 = _mm256_setzero_si256(); + __m256i sum10 = _mm256_setzero_si256(); + __m256i sum11 = _mm256_setzero_si256(); + __m256i sum12 = _mm256_setzero_si256(); + __m256i sum13 = _mm256_setzero_si256(); + __m256i sum14 = _mm256_setzero_si256(); + __m256i sum15 = _mm256_setzero_si256(); + + __m256i sum16 = _mm256_setzero_si256(); + __m256i sum17 = _mm256_setzero_si256(); + __m256i sum18 = _mm256_setzero_si256(); + __m256i sum19 = _mm256_setzero_si256(); + __m256i sum20 = _mm256_setzero_si256(); + __m256i sum21 = _mm256_setzero_si256(); + __m256i sum22 = _mm256_setzero_si256(); + __m256i sum23 = _mm256_setzero_si256(); + + __m256i sum24 = _mm256_setzero_si256(); + __m256i sum25 = _mm256_setzero_si256(); + __m256i sum26 = _mm256_setzero_si256(); + __m256i sum27 = _mm256_setzero_si256(); + __m256i sum28 = _mm256_setzero_si256(); + __m256i sum29 = _mm256_setzero_si256(); + __m256i sum30 = _mm256_setzero_si256(); + __m256i sum31 = _mm256_setzero_si256(); + + __m256i sum32 = _mm256_setzero_si256(); + __m256i sum33 = _mm256_setzero_si256(); + __m256i sum34 = _mm256_setzero_si256(); + __m256i sum35 = _mm256_setzero_si256(); + __m256i sum36 = _mm256_setzero_si256(); + __m256i sum37 = _mm256_setzero_si256(); + __m256i sum38 
= _mm256_setzero_si256(); + __m256i sum39 = _mm256_setzero_si256(); + + __m256i sum40 = _mm256_setzero_si256(); + __m256i sum41 = _mm256_setzero_si256(); + __m256i sum42 = _mm256_setzero_si256(); + __m256i sum43 = _mm256_setzero_si256(); + __m256i sum44 = _mm256_setzero_si256(); + __m256i sum45 = _mm256_setzero_si256(); + __m256i sum46 = _mm256_setzero_si256(); + __m256i sum47 = _mm256_setzero_si256(); + + __m256i sum48 = _mm256_setzero_si256(); + __m256i sum49 = _mm256_setzero_si256(); + __m256i sum50 = _mm256_setzero_si256(); + __m256i sum51 = _mm256_setzero_si256(); + __m256i sum52 = _mm256_setzero_si256(); + __m256i sum53 = _mm256_setzero_si256(); + __m256i sum54 = _mm256_setzero_si256(); + __m256i sum55 = _mm256_setzero_si256(); + + __m256i sum56 = _mm256_setzero_si256(); + __m256i sum57 = _mm256_setzero_si256(); + __m256i sum58 = _mm256_setzero_si256(); + __m256i sum59 = _mm256_setzero_si256(); + __m256i sum60 = _mm256_setzero_si256(); + __m256i sum61 = _mm256_setzero_si256(); + __m256i sum62 = _mm256_setzero_si256(); + __m256i sum63 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //the 0 row + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + sum0 = _mm256_add_epi32(mc0, sum0); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + sum1 = _mm256_add_epi32(mc1, sum1); + + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mb2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb2 + 16))); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma0_h, mb2_h)); + sum2 = _mm256_add_epi32(mc2, sum2); + + mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + mb3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb3 + 16))); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma0_h, mb3_h)); + sum3 = _mm256_add_epi32(mc3, sum3); + + mb4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb4)); + mb4_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb4 + 16))); + mc4 = _mm256_madd_epi16(ma0_l, mb4_l); + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma0_h, mb4_h)); + sum4 = _mm256_add_epi32(mc4, sum4); + + mb5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb5)); + mb5_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb5 + 16))); + mc5 = _mm256_madd_epi16(ma0_l, mb5_l); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma0_h, mb5_h)); + sum5 = _mm256_add_epi32(mc5, sum5); + + mb6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb6)); + mb6_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb6 + 16))); + mc6 = _mm256_madd_epi16(ma0_l, mb6_l); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma0_h, mb6_h)); + sum6 = _mm256_add_epi32(mc6, sum6); + + mb7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb7)); + mb7_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb7 + 16))); + mc7 = _mm256_madd_epi16(ma0_l, mb7_l); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma0_h, mb7_h)); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 1 row + 
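// Rows 1-7 below repeat the row-0 pattern: each 32-byte slice of the A row is
// sign-extended into two 16-lane __m256i halves, multiplied against the eight
// B-row vectors already loaded above with _mm256_madd_epi16, and accumulated
// into the per-(row, column) registers (sum8..sum15 for row 1, sum16..sum23
// for row 2, ..., sum56..sum63 for row 7). Each accumulator keeps 8 int32
// partial sums that are horizontally reduced after the k loop.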
ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc8 = _mm256_madd_epi16(ma1_l, mb0_l); + mc8 = _mm256_add_epi32(mc8, _mm256_madd_epi16(ma1_h, mb0_h)); + sum8 = _mm256_add_epi32(mc8, sum8); + + mc9 = _mm256_madd_epi16(ma1_l, mb1_l); + mc9 = _mm256_add_epi32(mc9, _mm256_madd_epi16(ma1_h, mb1_h)); + sum9 = _mm256_add_epi32(mc9, sum9); + + mc10 = _mm256_madd_epi16(ma1_l, mb2_l); + mc10 = _mm256_add_epi32(mc10, _mm256_madd_epi16(ma1_h, mb2_h)); + sum10 = _mm256_add_epi32(mc10, sum10); + + mc11 = _mm256_madd_epi16(ma1_l, mb3_l); + mc11 = _mm256_add_epi32(mc11, _mm256_madd_epi16(ma1_h, mb3_h)); + sum11 = _mm256_add_epi32(mc11, sum11); + + mc12 = _mm256_madd_epi16(ma1_l, mb4_l); + mc12 = _mm256_add_epi32(mc12, _mm256_madd_epi16(ma1_h, mb4_h)); + sum12 = _mm256_add_epi32(mc12, sum12); + + mc13 = _mm256_madd_epi16(ma1_l, mb5_l); + mc13 = _mm256_add_epi32(mc13, _mm256_madd_epi16(ma1_h, mb5_h)); + sum13 = _mm256_add_epi32(mc13, sum13); + + mc14 = _mm256_madd_epi16(ma1_l, mb6_l); + mc14 = _mm256_add_epi32(mc14, _mm256_madd_epi16(ma1_h, mb6_h)); + sum14 = _mm256_add_epi32(mc14, sum14); + + mc15 = _mm256_madd_epi16(ma1_l, mb7_l); + mc15 = _mm256_add_epi32(mc15, _mm256_madd_epi16(ma1_h, mb7_h)); + sum15 = _mm256_add_epi32(mc15, sum15); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc0 = _mm256_madd_epi16(ma2_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma2_h, mb0_h)); + sum16 = _mm256_add_epi32(mc0, sum16); + + mc1 = _mm256_madd_epi16(ma2_l, mb1_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma2_h, mb1_h)); + sum17 = _mm256_add_epi32(mc1, sum17); + + mc2 = _mm256_madd_epi16(ma2_l, mb2_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma2_h, mb2_h)); + sum18 = _mm256_add_epi32(mc2, sum18); + + mc3 = _mm256_madd_epi16(ma2_l, mb3_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma2_h, mb3_h)); + sum19 = _mm256_add_epi32(mc3, sum19); + + mc4 = _mm256_madd_epi16(ma2_l, mb4_l); + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma2_h, mb4_h)); + sum20 = _mm256_add_epi32(mc4, sum20); + + mc5 = _mm256_madd_epi16(ma2_l, mb5_l); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma2_h, mb5_h)); + sum21 = _mm256_add_epi32(mc5, sum21); + + mc6 = _mm256_madd_epi16(ma2_l, mb6_l); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma2_h, mb6_h)); + sum22 = _mm256_add_epi32(mc6, sum22); + + mc7 = _mm256_madd_epi16(ma2_l, mb7_l); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma2_h, mb7_h)); + sum23 = _mm256_add_epi32(mc7, sum23); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc8 = _mm256_madd_epi16(ma3_l, mb0_l); + mc8 = _mm256_add_epi32(mc8, _mm256_madd_epi16(ma3_h, mb0_h)); + sum24 = _mm256_add_epi32(mc8, sum24); + + mc9 = _mm256_madd_epi16(ma3_l, mb1_l); + mc9 = _mm256_add_epi32(mc9, _mm256_madd_epi16(ma3_h, mb1_h)); + sum25 = _mm256_add_epi32(mc9, sum25); + + mc10 = _mm256_madd_epi16(ma3_l, mb2_l); + mc10 = _mm256_add_epi32(mc10, _mm256_madd_epi16(ma3_h, mb2_h)); + sum26 = _mm256_add_epi32(mc10, sum26); + + mc11 = _mm256_madd_epi16(ma3_l, mb3_l); + mc11 = _mm256_add_epi32(mc11, _mm256_madd_epi16(ma3_h, mb3_h)); + sum27 = _mm256_add_epi32(mc11, sum27); + + mc12 = _mm256_madd_epi16(ma3_l, mb4_l); + mc12 = _mm256_add_epi32(mc12, _mm256_madd_epi16(ma3_h, mb4_h)); + sum28 = _mm256_add_epi32(mc12, 
sum28); + + mc13 = _mm256_madd_epi16(ma3_l, mb5_l); + mc13 = _mm256_add_epi32(mc13, _mm256_madd_epi16(ma3_h, mb5_h)); + sum29 = _mm256_add_epi32(mc13, sum29); + + mc14 = _mm256_madd_epi16(ma3_l, mb6_l); + mc14 = _mm256_add_epi32(mc14, _mm256_madd_epi16(ma3_h, mb6_h)); + sum30 = _mm256_add_epi32(mc14, sum30); + + mc15 = _mm256_madd_epi16(ma3_l, mb7_l); + mc15 = _mm256_add_epi32(mc15, _mm256_madd_epi16(ma3_h, mb7_h)); + sum31 = _mm256_add_epi32(mc15, sum31); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + ma4_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa4 + 16))); + + mc0 = _mm256_madd_epi16(ma4_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma4_h, mb0_h)); + sum32 = _mm256_add_epi32(mc0, sum32); + + mc1 = _mm256_madd_epi16(ma4_l, mb1_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma4_h, mb1_h)); + sum33 = _mm256_add_epi32(mc1, sum33); + + mc2 = _mm256_madd_epi16(ma4_l, mb2_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma4_h, mb2_h)); + sum34 = _mm256_add_epi32(mc2, sum34); + + mc3 = _mm256_madd_epi16(ma4_l, mb3_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma4_h, mb3_h)); + sum35 = _mm256_add_epi32(mc3, sum35); + + mc4 = _mm256_madd_epi16(ma4_l, mb4_l); + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma4_h, mb4_h)); + sum36 = _mm256_add_epi32(mc4, sum36); + + mc5 = _mm256_madd_epi16(ma4_l, mb5_l); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma4_h, mb5_h)); + sum37 = _mm256_add_epi32(mc5, sum37); + + mc6 = _mm256_madd_epi16(ma4_l, mb6_l); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma4_h, mb6_h)); + sum38 = _mm256_add_epi32(mc6, sum38); + + mc7 = _mm256_madd_epi16(ma4_l, mb7_l); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma4_h, mb7_h)); + sum39 = _mm256_add_epi32(mc7, sum39); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + ma5_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa5 + 16))); + + mc8 = _mm256_madd_epi16(ma5_l, mb0_l); + mc8 = _mm256_add_epi32(mc8, _mm256_madd_epi16(ma5_h, mb0_h)); + sum40 = _mm256_add_epi32(mc8, sum40); + + mc9 = _mm256_madd_epi16(ma5_l, mb1_l); + mc9 = _mm256_add_epi32(mc9, _mm256_madd_epi16(ma5_h, mb1_h)); + sum41 = _mm256_add_epi32(mc9, sum41); + + mc10 = _mm256_madd_epi16(ma5_l, mb2_l); + mc10 = _mm256_add_epi32(mc10, _mm256_madd_epi16(ma5_h, mb2_h)); + sum42 = _mm256_add_epi32(mc10, sum42); + + mc11 = _mm256_madd_epi16(ma5_l, mb3_l); + mc11 = _mm256_add_epi32(mc11, _mm256_madd_epi16(ma5_h, mb3_h)); + sum43 = _mm256_add_epi32(mc11, sum43); + + mc12 = _mm256_madd_epi16(ma5_l, mb4_l); + mc12 = _mm256_add_epi32(mc12, _mm256_madd_epi16(ma5_h, mb4_h)); + sum44 = _mm256_add_epi32(mc12, sum44); + + mc13 = _mm256_madd_epi16(ma5_l, mb5_l); + mc13 = _mm256_add_epi32(mc13, _mm256_madd_epi16(ma5_h, mb5_h)); + sum45 = _mm256_add_epi32(mc13, sum45); + + mc14 = _mm256_madd_epi16(ma5_l, mb6_l); + mc14 = _mm256_add_epi32(mc14, _mm256_madd_epi16(ma5_h, mb6_h)); + sum46 = _mm256_add_epi32(mc14, sum46); + + mc15 = _mm256_madd_epi16(ma5_l, mb7_l); + mc15 = _mm256_add_epi32(mc15, _mm256_madd_epi16(ma5_h, mb7_h)); + sum47 = _mm256_add_epi32(mc15, sum47); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + ma6_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa6 + 16))); + + mc0 = _mm256_madd_epi16(ma6_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma6_h, mb0_h)); + sum48 = _mm256_add_epi32(mc0, sum48); + + mc1 = _mm256_madd_epi16(ma6_l, mb1_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma6_h, mb1_h)); + sum49 
= _mm256_add_epi32(mc1, sum49); + + mc2 = _mm256_madd_epi16(ma6_l, mb2_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma6_h, mb2_h)); + sum50 = _mm256_add_epi32(mc2, sum50); + + mc3 = _mm256_madd_epi16(ma6_l, mb3_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma6_h, mb3_h)); + sum51 = _mm256_add_epi32(mc3, sum51); + + mc4 = _mm256_madd_epi16(ma6_l, mb4_l); + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma6_h, mb4_h)); + sum52 = _mm256_add_epi32(mc4, sum52); + + mc5 = _mm256_madd_epi16(ma6_l, mb5_l); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma6_h, mb5_h)); + sum53 = _mm256_add_epi32(mc5, sum53); + + mc6 = _mm256_madd_epi16(ma6_l, mb6_l); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma6_h, mb6_h)); + sum54 = _mm256_add_epi32(mc6, sum54); + + mc7 = _mm256_madd_epi16(ma6_l, mb7_l); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma6_h, mb7_h)); + sum55 = _mm256_add_epi32(mc7, sum55); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + ma7_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa7 + 16))); + + mc8 = _mm256_madd_epi16(ma7_l, mb0_l); + mc8 = _mm256_add_epi32(mc8, _mm256_madd_epi16(ma7_h, mb0_h)); + sum56 = _mm256_add_epi32(mc8, sum56); + + mc9 = _mm256_madd_epi16(ma7_l, mb1_l); + mc9 = _mm256_add_epi32(mc9, _mm256_madd_epi16(ma7_h, mb1_h)); + sum57 = _mm256_add_epi32(mc9, sum57); + + mc10 = _mm256_madd_epi16(ma7_l, mb2_l); + mc10 = _mm256_add_epi32(mc10, _mm256_madd_epi16(ma7_h, mb2_h)); + sum58 = _mm256_add_epi32(mc10, sum58); + + mc11 = _mm256_madd_epi16(ma7_l, mb3_l); + mc11 = _mm256_add_epi32(mc11, _mm256_madd_epi16(ma7_h, mb3_h)); + sum59 = _mm256_add_epi32(mc11, sum59); + + mc12 = _mm256_madd_epi16(ma7_l, mb4_l); + mc12 = _mm256_add_epi32(mc12, _mm256_madd_epi16(ma7_h, mb4_h)); + sum60 = _mm256_add_epi32(mc12, sum60); + + mc13 = _mm256_madd_epi16(ma7_l, mb5_l); + mc13 = _mm256_add_epi32(mc13, _mm256_madd_epi16(ma7_h, mb5_h)); + sum61 = _mm256_add_epi32(mc13, sum61); + + mc14 = _mm256_madd_epi16(ma7_l, mb6_l); + mc14 = _mm256_add_epi32(mc14, _mm256_madd_epi16(ma7_h, mb6_h)); + sum62 = _mm256_add_epi32(mc14, sum62); + + mc15 = _mm256_madd_epi16(ma7_l, mb7_l); + mc15 = _mm256_add_epi32(mc15, _mm256_madd_epi16(ma7_h, mb7_h)); + sum63 = _mm256_add_epi32(mc15, sum63); + + _mm_prefetch((char*) pa0 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pa1 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pa2 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pa3 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pa4 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pa5 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pa6 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pa7 + 32, _MM_HINT_T0); + + _mm_prefetch((char*) pb0 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pb1 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pb2 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pb3 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pb4 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pb5 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pb6 + 32, _MM_HINT_T0); + _mm_prefetch((char*) pb7 + 32, _MM_HINT_T0); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + pa4 += 32; + pa5 += 32; + pa6 += 32; + pa7 += 32; + + pb0 += 32; + pb1 += 32; + pb2 += 32; + pb3 += 32; + pb4 += 32; + pb5 += 32; + pb6 += 32; + pb7 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); 
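// 16-element tail (k_leftover & 0x10): the A and B rows are sign-extended once
// into single 16-lane vectors, so each product below takes one
// _mm256_madd_epi16 instead of the low/high pair used in the 32-wide main loop.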
+ mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + mb4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb4)); + mb5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb5)); + mb6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb6)); + mb7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb7)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + mc4 = _mm256_madd_epi16(ma0_l, mb4_l); + mc5 = _mm256_madd_epi16(ma0_l, mb5_l); + mc6 = _mm256_madd_epi16(ma0_l, mb6_l); + mc7 = _mm256_madd_epi16(ma0_l, mb7_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc0 = _mm256_madd_epi16(ma1_l, mb0_l); + mc1 = _mm256_madd_epi16(ma1_l, mb1_l); + mc2 = _mm256_madd_epi16(ma1_l, mb2_l); + mc3 = _mm256_madd_epi16(ma1_l, mb3_l); + mc4 = _mm256_madd_epi16(ma1_l, mb4_l); + mc5 = _mm256_madd_epi16(ma1_l, mb5_l); + mc6 = _mm256_madd_epi16(ma1_l, mb6_l); + mc7 = _mm256_madd_epi16(ma1_l, mb7_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc0 = _mm256_madd_epi16(ma2_l, mb0_l); + mc1 = _mm256_madd_epi16(ma2_l, mb1_l); + mc2 = _mm256_madd_epi16(ma2_l, mb2_l); + mc3 = _mm256_madd_epi16(ma2_l, mb3_l); + mc4 = _mm256_madd_epi16(ma2_l, mb4_l); + mc5 = _mm256_madd_epi16(ma2_l, mb5_l); + mc6 = _mm256_madd_epi16(ma2_l, mb6_l); + mc7 = _mm256_madd_epi16(ma2_l, mb7_l); + + sum16 = _mm256_add_epi32(mc0, sum16); + sum17 = _mm256_add_epi32(mc1, sum17); + sum18 = _mm256_add_epi32(mc2, sum18); + sum19 = _mm256_add_epi32(mc3, sum19); + sum20 = _mm256_add_epi32(mc4, sum20); + sum21 = _mm256_add_epi32(mc5, sum21); + sum22 = _mm256_add_epi32(mc6, sum22); + sum23 = _mm256_add_epi32(mc7, sum23); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc0 = _mm256_madd_epi16(ma3_l, mb0_l); + mc1 = _mm256_madd_epi16(ma3_l, mb1_l); + mc2 = _mm256_madd_epi16(ma3_l, mb2_l); + mc3 = _mm256_madd_epi16(ma3_l, mb3_l); + mc4 = _mm256_madd_epi16(ma3_l, mb4_l); + mc5 = _mm256_madd_epi16(ma3_l, mb5_l); + mc6 = _mm256_madd_epi16(ma3_l, mb6_l); + mc7 = _mm256_madd_epi16(ma3_l, mb7_l); + + sum24 = _mm256_add_epi32(mc0, sum24); + sum25 = _mm256_add_epi32(mc1, sum25); + sum26 = _mm256_add_epi32(mc2, sum26); + sum27 = _mm256_add_epi32(mc3, sum27); + sum28 = _mm256_add_epi32(mc4, sum28); + sum29 = _mm256_add_epi32(mc5, sum29); + sum30 = _mm256_add_epi32(mc6, sum30); + sum31 = _mm256_add_epi32(mc7, sum31); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + + mc0 = _mm256_madd_epi16(ma4_l, mb0_l); + mc1 = _mm256_madd_epi16(ma4_l, mb1_l); + mc2 = _mm256_madd_epi16(ma4_l, mb2_l); + mc3 = _mm256_madd_epi16(ma4_l, mb3_l); + mc4 = _mm256_madd_epi16(ma4_l, mb4_l); + mc5 = _mm256_madd_epi16(ma4_l, mb5_l); + mc6 = _mm256_madd_epi16(ma4_l, mb6_l); + mc7 = 
_mm256_madd_epi16(ma4_l, mb7_l); + + sum32 = _mm256_add_epi32(mc0, sum32); + sum33 = _mm256_add_epi32(mc1, sum33); + sum34 = _mm256_add_epi32(mc2, sum34); + sum35 = _mm256_add_epi32(mc3, sum35); + sum36 = _mm256_add_epi32(mc4, sum36); + sum37 = _mm256_add_epi32(mc5, sum37); + sum38 = _mm256_add_epi32(mc6, sum38); + sum39 = _mm256_add_epi32(mc7, sum39); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + + mc0 = _mm256_madd_epi16(ma5_l, mb0_l); + mc1 = _mm256_madd_epi16(ma5_l, mb1_l); + mc2 = _mm256_madd_epi16(ma5_l, mb2_l); + mc3 = _mm256_madd_epi16(ma5_l, mb3_l); + mc4 = _mm256_madd_epi16(ma5_l, mb4_l); + mc5 = _mm256_madd_epi16(ma5_l, mb5_l); + mc6 = _mm256_madd_epi16(ma5_l, mb6_l); + mc7 = _mm256_madd_epi16(ma5_l, mb7_l); + + sum40 = _mm256_add_epi32(mc0, sum40); + sum41 = _mm256_add_epi32(mc1, sum41); + sum42 = _mm256_add_epi32(mc2, sum42); + sum43 = _mm256_add_epi32(mc3, sum43); + sum44 = _mm256_add_epi32(mc4, sum44); + sum45 = _mm256_add_epi32(mc5, sum45); + sum46 = _mm256_add_epi32(mc6, sum46); + sum47 = _mm256_add_epi32(mc7, sum47); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + + mc0 = _mm256_madd_epi16(ma6_l, mb0_l); + mc1 = _mm256_madd_epi16(ma6_l, mb1_l); + mc2 = _mm256_madd_epi16(ma6_l, mb2_l); + mc3 = _mm256_madd_epi16(ma6_l, mb3_l); + mc4 = _mm256_madd_epi16(ma6_l, mb4_l); + mc5 = _mm256_madd_epi16(ma6_l, mb5_l); + mc6 = _mm256_madd_epi16(ma6_l, mb6_l); + mc7 = _mm256_madd_epi16(ma6_l, mb7_l); + + sum48 = _mm256_add_epi32(mc0, sum48); + sum49 = _mm256_add_epi32(mc1, sum49); + sum50 = _mm256_add_epi32(mc2, sum50); + sum51 = _mm256_add_epi32(mc3, sum51); + sum52 = _mm256_add_epi32(mc4, sum52); + sum53 = _mm256_add_epi32(mc5, sum53); + sum54 = _mm256_add_epi32(mc6, sum54); + sum55 = _mm256_add_epi32(mc7, sum55); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + + mc0 = _mm256_madd_epi16(ma7_l, mb0_l); + mc1 = _mm256_madd_epi16(ma7_l, mb1_l); + mc2 = _mm256_madd_epi16(ma7_l, mb2_l); + mc3 = _mm256_madd_epi16(ma7_l, mb3_l); + mc4 = _mm256_madd_epi16(ma7_l, mb4_l); + mc5 = _mm256_madd_epi16(ma7_l, mb5_l); + mc6 = _mm256_madd_epi16(ma7_l, mb6_l); + mc7 = _mm256_madd_epi16(ma7_l, mb7_l); + + sum56 = _mm256_add_epi32(mc0, sum56); + sum57 = _mm256_add_epi32(mc1, sum57); + sum58 = _mm256_add_epi32(mc2, sum58); + sum59 = _mm256_add_epi32(mc3, sum59); + sum60 = _mm256_add_epi32(mc4, sum60); + sum61 = _mm256_add_epi32(mc5, sum61); + sum62 = _mm256_add_epi32(mc6, sum62); + sum63 = _mm256_add_epi32(mc7, sum63); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + pa4 += 16; + pa5 += 16; + pa6 += 16; + pa7 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + pb4 += 16; + pb5 += 16; + pb6 += 16; + pb7 += 16; + } + + if (0x08 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb2)); + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb3)); + mb4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb4)); + mb5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb5)); + mb6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb6)); + mb7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb7)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + mc3 = 
_mm256_mullo_epi32(ma0_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma0_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma0_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma0_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma0_l, mb7_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc0 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma1_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma1_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma1_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma1_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma1_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma1_l, mb7_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + + mc0 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma2_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma2_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma2_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma2_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma2_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma2_l, mb7_l); + + sum16 = _mm256_add_epi32(mc0, sum16); + sum17 = _mm256_add_epi32(mc1, sum17); + sum18 = _mm256_add_epi32(mc2, sum18); + sum19 = _mm256_add_epi32(mc3, sum19); + sum20 = _mm256_add_epi32(mc4, sum20); + sum21 = _mm256_add_epi32(mc5, sum21); + sum22 = _mm256_add_epi32(mc6, sum22); + sum23 = _mm256_add_epi32(mc7, sum23); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + + mc0 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma3_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma3_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma3_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma3_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma3_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma3_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb7_l); + + sum24 = _mm256_add_epi32(mc0, sum24); + sum25 = _mm256_add_epi32(mc1, sum25); + sum26 = _mm256_add_epi32(mc2, sum26); + sum27 = _mm256_add_epi32(mc3, sum27); + sum28 = _mm256_add_epi32(mc4, sum28); + sum29 = _mm256_add_epi32(mc5, sum29); + sum30 = _mm256_add_epi32(mc6, sum30); + sum31 = _mm256_add_epi32(mc7, sum31); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa4)); + + mc0 = _mm256_mullo_epi32(ma4_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma4_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma4_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma4_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma4_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma4_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma4_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma4_l, mb7_l); + + sum32 = _mm256_add_epi32(mc0, sum32); + sum33 = _mm256_add_epi32(mc1, sum33); + sum34 = _mm256_add_epi32(mc2, sum34); + sum35 = _mm256_add_epi32(mc3, sum35); + sum36 = _mm256_add_epi32(mc4, sum36); + sum37 = _mm256_add_epi32(mc5, sum37); + sum38 = _mm256_add_epi32(mc6, sum38); + sum39 = _mm256_add_epi32(mc7, sum39); + + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa5)); + + 
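// 8-element tail (k_leftover & 0x08): the remaining operands are widened
// directly to int32 with _mm256_cvtepi8_epi32, and the products are formed
// element-wise with _mm256_mullo_epi32, keeping the partial sums in the same
// 8-lane int32 accumulators as the wider paths above.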
mc0 = _mm256_mullo_epi32(ma5_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma5_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma5_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma5_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma5_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma5_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma5_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma5_l, mb7_l); + + sum40 = _mm256_add_epi32(mc0, sum40); + sum41 = _mm256_add_epi32(mc1, sum41); + sum42 = _mm256_add_epi32(mc2, sum42); + sum43 = _mm256_add_epi32(mc3, sum43); + sum44 = _mm256_add_epi32(mc4, sum44); + sum45 = _mm256_add_epi32(mc5, sum45); + sum46 = _mm256_add_epi32(mc6, sum46); + sum47 = _mm256_add_epi32(mc7, sum47); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa6)); + + mc0 = _mm256_mullo_epi32(ma6_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma6_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma6_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma6_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma6_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma6_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma6_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma6_l, mb7_l); + + sum48 = _mm256_add_epi32(mc0, sum48); + sum49 = _mm256_add_epi32(mc1, sum49); + sum50 = _mm256_add_epi32(mc2, sum50); + sum51 = _mm256_add_epi32(mc3, sum51); + sum52 = _mm256_add_epi32(mc4, sum52); + sum53 = _mm256_add_epi32(mc5, sum53); + sum54 = _mm256_add_epi32(mc6, sum54); + sum55 = _mm256_add_epi32(mc7, sum55); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa7)); + + mc0 = _mm256_mullo_epi32(ma7_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma7_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma7_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma7_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma7_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma7_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma7_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma7_l, mb7_l); + + sum56 = _mm256_add_epi32(mc0, sum56); + sum57 = _mm256_add_epi32(mc1, sum57); + sum58 = _mm256_add_epi32(mc2, sum58); + sum59 = _mm256_add_epi32(mc3, sum59); + sum60 = _mm256_add_epi32(mc4, sum60); + sum61 = _mm256_add_epi32(mc5, sum61); + sum62 = _mm256_add_epi32(mc6, sum62); + sum63 = _mm256_add_epi32(mc7, sum63); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + pa4 += 8; + pa5 += 8; + pa6 += 8; + pa7 += 8; + + pb0 += 8; + pb1 += 8; + pb2 += 8; + pb3 += 8; + pb4 += 8; + pb5 += 8; + pb6 += 8; + pb7 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga4[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga5[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga6[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga7[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb4[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb5[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb6[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb7[8] __attribute__((aligned(16))) = {0, 0, 0, 
0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + ga4[i] = pa4[i]; + ga5[i] = pa5[i]; + ga6[i] = pa6[i]; + ga7[i] = pa7[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + gb2[i] = pb2[i]; + gb3[i] = pb3[i]; + gb4[i] = pb4[i]; + gb5[i] = pb5[i]; + gb6[i] = pb6[i]; + gb7[i] = pb7[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb2)); + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb3)); + mb4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb4)); + mb5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb5)); + mb6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb6)); + mb7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb7)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma0_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma0_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma0_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma0_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma0_l, mb7_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc0 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma1_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma1_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma1_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma1_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma1_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma1_l, mb7_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + + mc0 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma2_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma2_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma2_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma2_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma2_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma2_l, mb7_l); + + sum16 = _mm256_add_epi32(mc0, sum16); + sum17 = _mm256_add_epi32(mc1, sum17); + sum18 = _mm256_add_epi32(mc2, sum18); + sum19 = _mm256_add_epi32(mc3, sum19); + sum20 = _mm256_add_epi32(mc4, sum20); + sum21 = _mm256_add_epi32(mc5, sum21); + sum22 = _mm256_add_epi32(mc6, sum22); + sum23 = _mm256_add_epi32(mc7, sum23); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + + mc0 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma3_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma3_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma3_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma3_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma3_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma3_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb7_l); + + sum24 = 
_mm256_add_epi32(mc0, sum24); + sum25 = _mm256_add_epi32(mc1, sum25); + sum26 = _mm256_add_epi32(mc2, sum26); + sum27 = _mm256_add_epi32(mc3, sum27); + sum28 = _mm256_add_epi32(mc4, sum28); + sum29 = _mm256_add_epi32(mc5, sum29); + sum30 = _mm256_add_epi32(mc6, sum30); + sum31 = _mm256_add_epi32(mc7, sum31); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga4)); + + mc0 = _mm256_mullo_epi32(ma4_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma4_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma4_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma4_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma4_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma4_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma4_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma4_l, mb7_l); + + sum32 = _mm256_add_epi32(mc0, sum32); + sum33 = _mm256_add_epi32(mc1, sum33); + sum34 = _mm256_add_epi32(mc2, sum34); + sum35 = _mm256_add_epi32(mc3, sum35); + sum36 = _mm256_add_epi32(mc4, sum36); + sum37 = _mm256_add_epi32(mc5, sum37); + sum38 = _mm256_add_epi32(mc6, sum38); + sum39 = _mm256_add_epi32(mc7, sum39); + + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga5)); + + mc0 = _mm256_mullo_epi32(ma5_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma5_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma5_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma5_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma5_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma5_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma5_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma5_l, mb7_l); + + sum40 = _mm256_add_epi32(mc0, sum40); + sum41 = _mm256_add_epi32(mc1, sum41); + sum42 = _mm256_add_epi32(mc2, sum42); + sum43 = _mm256_add_epi32(mc3, sum43); + sum44 = _mm256_add_epi32(mc4, sum44); + sum45 = _mm256_add_epi32(mc5, sum45); + sum46 = _mm256_add_epi32(mc6, sum46); + sum47 = _mm256_add_epi32(mc7, sum47); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga6)); + + mc0 = _mm256_mullo_epi32(ma6_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma6_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma6_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma6_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma6_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma6_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma6_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma6_l, mb7_l); + + sum48 = _mm256_add_epi32(mc0, sum48); + sum49 = _mm256_add_epi32(mc1, sum49); + sum50 = _mm256_add_epi32(mc2, sum50); + sum51 = _mm256_add_epi32(mc3, sum51); + sum52 = _mm256_add_epi32(mc4, sum52); + sum53 = _mm256_add_epi32(mc5, sum53); + sum54 = _mm256_add_epi32(mc6, sum54); + sum55 = _mm256_add_epi32(mc7, sum55); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga7)); + + mc0 = _mm256_mullo_epi32(ma7_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma7_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma7_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma7_l, mb3_l); + mc4 = _mm256_mullo_epi32(ma7_l, mb4_l); + mc5 = _mm256_mullo_epi32(ma7_l, mb5_l); + mc6 = _mm256_mullo_epi32(ma7_l, mb6_l); + mc7 = _mm256_mullo_epi32(ma7_l, mb7_l); + + sum56 = _mm256_add_epi32(mc0, sum56); + sum57 = _mm256_add_epi32(mc1, sum57); + sum58 = _mm256_add_epi32(mc2, sum58); + sum59 = _mm256_add_epi32(mc3, sum59); + sum60 = _mm256_add_epi32(mc4, sum60); + sum61 = _mm256_add_epi32(mc5, sum61); + sum62 = _mm256_add_epi32(mc6, sum62); + sum63 = _mm256_add_epi32(mc7, sum63); + } + + //store + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum0 = _mm256_hadd_epi32(sum0, sum2); + + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum4 = _mm256_hadd_epi32(sum4, sum6); + + 
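+    // after the hadd tree, sum0 holds the lane partials for columns 0-3 and sum4 those for columns 4-7; the permute2x128/add below folds the two 128-bit halves so sum0's eight lanes become this row's finished dot products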
sum0 = _mm256_add_epi32(_mm256_permute2x128_si256(sum0, sum4, 0x20), + _mm256_permute2x128_si256(sum0, sum4, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + pc0[2] = _mm256_extract_epi32(sum0, 2); + pc0[3] = _mm256_extract_epi32(sum0, 3); + pc0[4] = _mm256_extract_epi32(sum0, 4); + pc0[5] = _mm256_extract_epi32(sum0, 5); + pc0[6] = _mm256_extract_epi32(sum0, 6); + pc0[7] = _mm256_extract_epi32(sum0, 7); + + //the 1 row + sum8 = _mm256_hadd_epi32(sum8, sum9); + sum10 = _mm256_hadd_epi32(sum10, sum11); + sum8 = _mm256_hadd_epi32(sum8, sum10); + + sum12 = _mm256_hadd_epi32(sum12, sum13); + sum14 = _mm256_hadd_epi32(sum14, sum15); + sum12 = _mm256_hadd_epi32(sum12, sum14); + + sum8 = _mm256_add_epi32(_mm256_permute2x128_si256(sum8, sum12, 0x20), + _mm256_permute2x128_si256(sum8, sum12, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum8, 0); + pc1[1] = _mm256_extract_epi32(sum8, 1); + pc1[2] = _mm256_extract_epi32(sum8, 2); + pc1[3] = _mm256_extract_epi32(sum8, 3); + pc1[4] = _mm256_extract_epi32(sum8, 4); + pc1[5] = _mm256_extract_epi32(sum8, 5); + pc1[6] = _mm256_extract_epi32(sum8, 6); + pc1[7] = _mm256_extract_epi32(sum8, 7); + + //the 2 row + sum16 = _mm256_hadd_epi32(sum16, sum17); + sum18 = _mm256_hadd_epi32(sum18, sum19); + sum16 = _mm256_hadd_epi32(sum16, sum18); + + sum20 = _mm256_hadd_epi32(sum20, sum21); + sum22 = _mm256_hadd_epi32(sum22, sum23); + sum20 = _mm256_hadd_epi32(sum20, sum22); + + sum16 = _mm256_add_epi32(_mm256_permute2x128_si256(sum16, sum20, 0x20), + _mm256_permute2x128_si256(sum16, sum20, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum16, 0); + pc2[1] = _mm256_extract_epi32(sum16, 1); + pc2[2] = _mm256_extract_epi32(sum16, 2); + pc2[3] = _mm256_extract_epi32(sum16, 3); + pc2[4] = _mm256_extract_epi32(sum16, 4); + pc2[5] = _mm256_extract_epi32(sum16, 5); + pc2[6] = _mm256_extract_epi32(sum16, 6); + pc2[7] = _mm256_extract_epi32(sum16, 7); + + //the 3 row + sum24 = _mm256_hadd_epi32(sum24, sum25); + sum26 = _mm256_hadd_epi32(sum26, sum27); + sum24 = _mm256_hadd_epi32(sum24, sum26); + + sum28 = _mm256_hadd_epi32(sum28, sum29); + sum30 = _mm256_hadd_epi32(sum30, sum31); + sum28 = _mm256_hadd_epi32(sum28, sum30); + + sum24 = _mm256_add_epi32(_mm256_permute2x128_si256(sum24, sum28, 0x20), + _mm256_permute2x128_si256(sum24, sum28, 0x31)); + + pc3[0] = _mm256_extract_epi32(sum24, 0); + pc3[1] = _mm256_extract_epi32(sum24, 1); + pc3[2] = _mm256_extract_epi32(sum24, 2); + pc3[3] = _mm256_extract_epi32(sum24, 3); + pc3[4] = _mm256_extract_epi32(sum24, 4); + pc3[5] = _mm256_extract_epi32(sum24, 5); + pc3[6] = _mm256_extract_epi32(sum24, 6); + pc3[7] = _mm256_extract_epi32(sum24, 7); + + //the 4 row + sum32 = _mm256_hadd_epi32(sum32, sum33); + sum34 = _mm256_hadd_epi32(sum34, sum35); + sum32 = _mm256_hadd_epi32(sum32, sum34); + + sum36 = _mm256_hadd_epi32(sum36, sum37); + sum38 = _mm256_hadd_epi32(sum38, sum39); + sum36 = _mm256_hadd_epi32(sum36, sum38); + + sum32 = _mm256_add_epi32(_mm256_permute2x128_si256(sum32, sum36, 0x20), + _mm256_permute2x128_si256(sum32, sum36, 0x31)); + + pc4[0] = _mm256_extract_epi32(sum32, 0); + pc4[1] = _mm256_extract_epi32(sum32, 1); + pc4[2] = _mm256_extract_epi32(sum32, 2); + pc4[3] = _mm256_extract_epi32(sum32, 3); + pc4[4] = _mm256_extract_epi32(sum32, 4); + pc4[5] = _mm256_extract_epi32(sum32, 5); + pc4[6] = _mm256_extract_epi32(sum32, 6); + pc4[7] = _mm256_extract_epi32(sum32, 7); + + //the 5 row + sum40 = _mm256_hadd_epi32(sum40, sum41); + sum42 = _mm256_hadd_epi32(sum42, sum43); + sum40 = 
_mm256_hadd_epi32(sum40, sum42); + + sum44 = _mm256_hadd_epi32(sum44, sum45); + sum46 = _mm256_hadd_epi32(sum46, sum47); + sum44 = _mm256_hadd_epi32(sum44, sum46); + + sum40 = _mm256_add_epi32(_mm256_permute2x128_si256(sum40, sum44, 0x20), + _mm256_permute2x128_si256(sum40, sum44, 0x31)); + + pc5[0] = _mm256_extract_epi32(sum40, 0); + pc5[1] = _mm256_extract_epi32(sum40, 1); + pc5[2] = _mm256_extract_epi32(sum40, 2); + pc5[3] = _mm256_extract_epi32(sum40, 3); + pc5[4] = _mm256_extract_epi32(sum40, 4); + pc5[5] = _mm256_extract_epi32(sum40, 5); + pc5[6] = _mm256_extract_epi32(sum40, 6); + pc5[7] = _mm256_extract_epi32(sum40, 7); + + //the 6 row + sum48 = _mm256_hadd_epi32(sum48, sum49); + sum50 = _mm256_hadd_epi32(sum50, sum51); + sum48 = _mm256_hadd_epi32(sum48, sum50); + + sum52 = _mm256_hadd_epi32(sum52, sum53); + sum54 = _mm256_hadd_epi32(sum54, sum55); + sum52 = _mm256_hadd_epi32(sum52, sum54); + + sum48 = _mm256_add_epi32(_mm256_permute2x128_si256(sum48, sum52, 0x20), + _mm256_permute2x128_si256(sum48, sum52, 0x31)); + + pc6[0] = _mm256_extract_epi32(sum48, 0); + pc6[1] = _mm256_extract_epi32(sum48, 1); + pc6[2] = _mm256_extract_epi32(sum48, 2); + pc6[3] = _mm256_extract_epi32(sum48, 3); + pc6[4] = _mm256_extract_epi32(sum48, 4); + pc6[5] = _mm256_extract_epi32(sum48, 5); + pc6[6] = _mm256_extract_epi32(sum48, 6); + pc6[7] = _mm256_extract_epi32(sum48, 7); + + //the 7 row + sum56 = _mm256_hadd_epi32(sum56, sum57); + sum58 = _mm256_hadd_epi32(sum58, sum59); + sum56 = _mm256_hadd_epi32(sum56, sum58); + + sum60 = _mm256_hadd_epi32(sum60, sum61); + sum62 = _mm256_hadd_epi32(sum62, sum63); + sum60 = _mm256_hadd_epi32(sum60, sum62); + + sum56 = _mm256_add_epi32(_mm256_permute2x128_si256(sum56, sum60, 0x20), + _mm256_permute2x128_si256(sum56, sum60, 0x31)); + + pc7[0] = _mm256_extract_epi32(sum56, 0); + pc7[1] = _mm256_extract_epi32(sum56, 1); + pc7[2] = _mm256_extract_epi32(sum56, 2); + pc7[3] = _mm256_extract_epi32(sum56, 3); + pc7[4] = _mm256_extract_epi32(sum56, 4); + pc7[5] = _mm256_extract_epi32(sum56, 5); + pc7[6] = _mm256_extract_epi32(sum56, 6); + pc7[7] = _mm256_extract_epi32(sum56, 7); +} + +inline void block8x4_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int32_t stride) { + //printf("block8x4_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + const int8_t* pa4 = pa0 + 4 * lda; + const int8_t* pa5 = pa0 + 5 * lda; + const int8_t* pa6 = pa0 + 6 * lda; + const int8_t* pa7 = pa0 + 7 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + int* pc4 = c + 4 * ldc; + int* pc5 = c + 5 * ldc; + int* pc6 = c + 6 * ldc; + int* pc7 = c + 7 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma4_l; + __m256i ma5_l; + __m256i ma6_l; + __m256i ma7_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + __m256i ma4_h; + __m256i ma5_h; + __m256i ma6_h; + __m256i ma7_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb2_l; + __m256i mb3_l; + __m256i mb0_h; + __m256i mb1_h; + __m256i mb2_h; + __m256i mb3_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + 
__m256i mc5; + __m256i mc6; + __m256i mc7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + __m256i sum8 = _mm256_setzero_si256(); + __m256i sum9 = _mm256_setzero_si256(); + __m256i sum10 = _mm256_setzero_si256(); + __m256i sum11 = _mm256_setzero_si256(); + __m256i sum12 = _mm256_setzero_si256(); + __m256i sum13 = _mm256_setzero_si256(); + __m256i sum14 = _mm256_setzero_si256(); + __m256i sum15 = _mm256_setzero_si256(); + + __m256i sum16 = _mm256_setzero_si256(); + __m256i sum17 = _mm256_setzero_si256(); + __m256i sum18 = _mm256_setzero_si256(); + __m256i sum19 = _mm256_setzero_si256(); + __m256i sum20 = _mm256_setzero_si256(); + __m256i sum21 = _mm256_setzero_si256(); + __m256i sum22 = _mm256_setzero_si256(); + __m256i sum23 = _mm256_setzero_si256(); + + __m256i sum24 = _mm256_setzero_si256(); + __m256i sum25 = _mm256_setzero_si256(); + __m256i sum26 = _mm256_setzero_si256(); + __m256i sum27 = _mm256_setzero_si256(); + __m256i sum28 = _mm256_setzero_si256(); + __m256i sum29 = _mm256_setzero_si256(); + __m256i sum30 = _mm256_setzero_si256(); + __m256i sum31 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mb2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb2 + 16))); + + mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + mb3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb3 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma0_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma0_h, mb3_h)); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc4 = _mm256_madd_epi16(ma1_l, mb0_l); + mc5 = _mm256_madd_epi16(ma1_l, mb1_l); + mc6 = _mm256_madd_epi16(ma1_l, mb2_l); + mc7 = _mm256_madd_epi16(ma1_l, mb3_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma1_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma1_h, mb1_h)); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma1_h, mb2_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma1_h, mb3_h)); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = 
_mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc0 = _mm256_madd_epi16(ma2_l, mb0_l); + mc1 = _mm256_madd_epi16(ma2_l, mb1_l); + mc2 = _mm256_madd_epi16(ma2_l, mb2_l); + mc3 = _mm256_madd_epi16(ma2_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma2_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma2_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma2_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma2_h, mb3_h)); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc4 = _mm256_madd_epi16(ma3_l, mb0_l); + mc5 = _mm256_madd_epi16(ma3_l, mb1_l); + mc6 = _mm256_madd_epi16(ma3_l, mb2_l); + mc7 = _mm256_madd_epi16(ma3_l, mb3_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma3_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma3_h, mb1_h)); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma3_h, mb2_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma3_h, mb3_h)); + + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + ma4_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa4 + 16))); + + mc0 = _mm256_madd_epi16(ma4_l, mb0_l); + mc1 = _mm256_madd_epi16(ma4_l, mb1_l); + mc2 = _mm256_madd_epi16(ma4_l, mb2_l); + mc3 = _mm256_madd_epi16(ma4_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma4_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma4_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma4_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma4_h, mb3_h)); + + sum16 = _mm256_add_epi32(mc0, sum16); + sum17 = _mm256_add_epi32(mc1, sum17); + sum18 = _mm256_add_epi32(mc2, sum18); + sum19 = _mm256_add_epi32(mc3, sum19); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + ma5_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa5 + 16))); + + mc4 = _mm256_madd_epi16(ma5_l, mb0_l); + mc5 = _mm256_madd_epi16(ma5_l, mb1_l); + mc6 = _mm256_madd_epi16(ma5_l, mb2_l); + mc7 = _mm256_madd_epi16(ma5_l, mb3_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma5_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma5_h, mb1_h)); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma5_h, mb2_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma5_h, mb3_h)); + + sum20 = _mm256_add_epi32(mc4, sum20); + sum21 = _mm256_add_epi32(mc5, sum21); + sum22 = _mm256_add_epi32(mc6, sum22); + sum23 = _mm256_add_epi32(mc7, sum23); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + ma6_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa6 + 16))); + + mc0 = _mm256_madd_epi16(ma6_l, mb0_l); + mc1 = _mm256_madd_epi16(ma6_l, mb1_l); + mc2 = _mm256_madd_epi16(ma6_l, mb2_l); + mc3 = _mm256_madd_epi16(ma6_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma6_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma6_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma6_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma6_h, mb3_h)); + + sum24 = _mm256_add_epi32(mc0, sum24); + sum25 = _mm256_add_epi32(mc1, sum25); + sum26 = _mm256_add_epi32(mc2, sum26); 
+ sum27 = _mm256_add_epi32(mc3, sum27); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + ma7_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa7 + 16))); + + mc4 = _mm256_madd_epi16(ma7_l, mb0_l); + mc5 = _mm256_madd_epi16(ma7_l, mb1_l); + mc6 = _mm256_madd_epi16(ma7_l, mb2_l); + mc7 = _mm256_madd_epi16(ma7_l, mb3_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma7_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma7_h, mb1_h)); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma7_h, mb2_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma7_h, mb3_h)); + + sum28 = _mm256_add_epi32(mc4, sum28); + sum29 = _mm256_add_epi32(mc5, sum29); + sum30 = _mm256_add_epi32(mc6, sum30); + sum31 = _mm256_add_epi32(mc7, sum31); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + pa4 += 32; + pa5 += 32; + pa6 += 32; + pa7 += 32; + + pb0 += 32; + pb1 += 32; + pb2 += 32; + pb3 += 32; + } + + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc0 = _mm256_madd_epi16(ma1_l, mb0_l); + mc1 = _mm256_madd_epi16(ma1_l, mb1_l); + mc2 = _mm256_madd_epi16(ma1_l, mb2_l); + mc3 = _mm256_madd_epi16(ma1_l, mb3_l); + + sum4 = _mm256_add_epi32(mc0, sum4); + sum5 = _mm256_add_epi32(mc1, sum5); + sum6 = _mm256_add_epi32(mc2, sum6); + sum7 = _mm256_add_epi32(mc3, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc0 = _mm256_madd_epi16(ma2_l, mb0_l); + mc1 = _mm256_madd_epi16(ma2_l, mb1_l); + mc2 = _mm256_madd_epi16(ma2_l, mb2_l); + mc3 = _mm256_madd_epi16(ma2_l, mb3_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc0 = _mm256_madd_epi16(ma3_l, mb0_l); + mc1 = _mm256_madd_epi16(ma3_l, mb1_l); + mc2 = _mm256_madd_epi16(ma3_l, mb2_l); + mc3 = _mm256_madd_epi16(ma3_l, mb3_l); + + sum12 = _mm256_add_epi32(mc0, sum12); + sum13 = _mm256_add_epi32(mc1, sum13); + sum14 = _mm256_add_epi32(mc2, sum14); + sum15 = _mm256_add_epi32(mc3, sum15); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + + mc0 = _mm256_madd_epi16(ma4_l, mb0_l); + mc1 = _mm256_madd_epi16(ma4_l, mb1_l); + mc2 = _mm256_madd_epi16(ma4_l, mb2_l); + mc3 = _mm256_madd_epi16(ma4_l, mb3_l); + + sum16 = _mm256_add_epi32(mc0, sum16); + sum17 = _mm256_add_epi32(mc1, sum17); + sum18 = _mm256_add_epi32(mc2, sum18); + sum19 = _mm256_add_epi32(mc3, sum19); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + ma5_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa5 + 16))); + + mc0 = _mm256_madd_epi16(ma5_l, mb0_l); + mc1 = _mm256_madd_epi16(ma5_l, mb1_l); + mc2 = _mm256_madd_epi16(ma5_l, mb2_l); + mc3 = 
_mm256_madd_epi16(ma5_l, mb3_l); + + sum20 = _mm256_add_epi32(mc0, sum20); + sum21 = _mm256_add_epi32(mc1, sum21); + sum22 = _mm256_add_epi32(mc2, sum22); + sum23 = _mm256_add_epi32(mc3, sum23); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + ma6_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa6 + 16))); + + mc0 = _mm256_madd_epi16(ma6_l, mb0_l); + mc1 = _mm256_madd_epi16(ma6_l, mb1_l); + mc2 = _mm256_madd_epi16(ma6_l, mb2_l); + mc3 = _mm256_madd_epi16(ma6_l, mb3_l); + + sum24 = _mm256_add_epi32(mc0, sum24); + sum25 = _mm256_add_epi32(mc1, sum25); + sum26 = _mm256_add_epi32(mc2, sum26); + sum27 = _mm256_add_epi32(mc3, sum27); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + ma7_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa7 + 16))); + + mc0 = _mm256_madd_epi16(ma7_l, mb0_l); + mc1 = _mm256_madd_epi16(ma7_l, mb1_l); + mc2 = _mm256_madd_epi16(ma7_l, mb2_l); + mc3 = _mm256_madd_epi16(ma7_l, mb3_l); + + sum28 = _mm256_add_epi32(mc0, sum28); + sum29 = _mm256_add_epi32(mc1, sum29); + sum30 = _mm256_add_epi32(mc2, sum30); + sum31 = _mm256_add_epi32(mc3, sum31); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + pa4 += 16; + pa5 += 16; + pa6 += 16; + pa7 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + } + + if (0x08 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb2)); + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb3)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma0_l, mb3_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc0 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma1_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma1_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb3_l); + + sum4 = _mm256_add_epi32(mc0, sum4); + sum5 = _mm256_add_epi32(mc1, sum5); + sum6 = _mm256_add_epi32(mc2, sum6); + sum7 = _mm256_add_epi32(mc3, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + + mc0 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma2_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma2_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma2_l, mb3_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + + mc0 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma3_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma3_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma3_l, mb3_l); + + sum12 = _mm256_add_epi32(mc0, sum12); + sum13 = _mm256_add_epi32(mc1, sum13); + sum14 = _mm256_add_epi32(mc2, sum14); + sum15 = _mm256_add_epi32(mc3, sum15); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa4)); + + mc0 = _mm256_mullo_epi32(ma4_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma4_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma4_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma4_l, mb3_l); + + sum16 = _mm256_add_epi32(mc0, sum16); + sum17 = 
_mm256_add_epi32(mc1, sum17); + sum18 = _mm256_add_epi32(mc2, sum18); + sum19 = _mm256_add_epi32(mc3, sum19); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa5)); + + mc0 = _mm256_mullo_epi32(ma5_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma5_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma5_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma5_l, mb3_l); + + sum20 = _mm256_add_epi32(mc0, sum20); + sum21 = _mm256_add_epi32(mc1, sum21); + sum22 = _mm256_add_epi32(mc2, sum22); + sum23 = _mm256_add_epi32(mc3, sum23); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa6)); + + mc0 = _mm256_mullo_epi32(ma6_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma6_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma6_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma6_l, mb3_l); + + sum24 = _mm256_add_epi32(mc0, sum24); + sum25 = _mm256_add_epi32(mc1, sum25); + sum26 = _mm256_add_epi32(mc2, sum26); + sum27 = _mm256_add_epi32(mc3, sum27); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa7)); + + mc0 = _mm256_mullo_epi32(ma7_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma7_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma7_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma7_l, mb3_l); + + sum28 = _mm256_add_epi32(mc0, sum28); + sum29 = _mm256_add_epi32(mc1, sum29); + sum30 = _mm256_add_epi32(mc2, sum30); + sum31 = _mm256_add_epi32(mc3, sum31); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + pa4 += 8; + pa5 += 8; + pa6 += 8; + pa7 += 8; + + pb0 += 8; + pb1 += 8; + pb2 += 8; + pb3 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga4[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga5[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga6[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga7[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + ga4[i] = pa4[i]; + ga5[i] = pa5[i]; + ga6[i] = pa6[i]; + ga7[i] = pa7[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + gb2[i] = pb2[i]; + gb3[i] = pb3[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb2)); + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb3)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma0_l, mb3_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc0 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc1 = 
_mm256_mullo_epi32(ma1_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma1_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb3_l); + + sum4 = _mm256_add_epi32(mc0, sum4); + sum5 = _mm256_add_epi32(mc1, sum5); + sum6 = _mm256_add_epi32(mc2, sum6); + sum7 = _mm256_add_epi32(mc3, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + + mc0 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma2_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma2_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma2_l, mb3_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + + mc0 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma3_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma3_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma3_l, mb3_l); + + sum12 = _mm256_add_epi32(mc0, sum12); + sum13 = _mm256_add_epi32(mc1, sum13); + sum14 = _mm256_add_epi32(mc2, sum14); + sum15 = _mm256_add_epi32(mc3, sum15); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga4)); + + mc0 = _mm256_mullo_epi32(ma4_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma4_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma4_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma4_l, mb3_l); + + sum16 = _mm256_add_epi32(mc0, sum16); + sum17 = _mm256_add_epi32(mc1, sum17); + sum18 = _mm256_add_epi32(mc2, sum18); + sum19 = _mm256_add_epi32(mc3, sum19); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga5)); + + mc0 = _mm256_mullo_epi32(ma5_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma5_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma5_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma5_l, mb3_l); + + sum20 = _mm256_add_epi32(mc0, sum20); + sum21 = _mm256_add_epi32(mc1, sum21); + sum22 = _mm256_add_epi32(mc2, sum22); + sum23 = _mm256_add_epi32(mc3, sum23); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga6)); + + mc0 = _mm256_mullo_epi32(ma6_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma6_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma6_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma6_l, mb3_l); + + sum24 = _mm256_add_epi32(mc0, sum24); + sum25 = _mm256_add_epi32(mc1, sum25); + sum26 = _mm256_add_epi32(mc2, sum26); + sum27 = _mm256_add_epi32(mc3, sum27); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga7)); + + mc0 = _mm256_mullo_epi32(ma7_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma7_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma7_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma7_l, mb3_l); + + sum28 = _mm256_add_epi32(mc0, sum28); + sum29 = _mm256_add_epi32(mc1, sum29); + sum30 = _mm256_add_epi32(mc2, sum30); + sum31 = _mm256_add_epi32(mc3, sum31); + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum0 = _mm256_hadd_epi32(sum0, sum2); + + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1 * stride] = _mm256_extract_epi32(sum0, 1); + pc0[2 * stride] = _mm256_extract_epi32(sum0, 2); + pc0[3 * stride] = _mm256_extract_epi32(sum0, 3); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum4 = _mm256_hadd_epi32(sum4, sum6); + + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum4, 0); + pc1[1 * stride] = _mm256_extract_epi32(sum4, 1); + pc1[2 * stride] = 
_mm256_extract_epi32(sum4, 2); + pc1[3 * stride] = _mm256_extract_epi32(sum4, 3); + + //the 2 row + sum8 = _mm256_hadd_epi32(sum8, sum9); + sum10 = _mm256_hadd_epi32(sum10, sum11); + sum8 = _mm256_hadd_epi32(sum8, sum10); + sum8 = _mm256_add_epi32(sum8, _mm256_permute2x128_si256(sum8, zero, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum8, 0); + pc2[1 * stride] = _mm256_extract_epi32(sum8, 1); + pc2[2 * stride] = _mm256_extract_epi32(sum8, 2); + pc2[3 * stride] = _mm256_extract_epi32(sum8, 3); + + //the 3 row + sum12 = _mm256_hadd_epi32(sum12, sum13); + sum14 = _mm256_hadd_epi32(sum14, sum15); + sum12 = _mm256_hadd_epi32(sum12, sum14); + sum12 = _mm256_add_epi32(sum12, _mm256_permute2x128_si256(sum12, zero, 0x31)); + pc3[0] = _mm256_extract_epi32(sum12, 0); + pc3[1 * stride] = _mm256_extract_epi32(sum12, 1); + pc3[2 * stride] = _mm256_extract_epi32(sum12, 2); + pc3[3 * stride] = _mm256_extract_epi32(sum12, 3); + + //the 4 row + sum16 = _mm256_hadd_epi32(sum16, sum17); + sum18 = _mm256_hadd_epi32(sum18, sum19); + sum16 = _mm256_hadd_epi32(sum16, sum18); + sum16 = _mm256_add_epi32(sum16, _mm256_permute2x128_si256(sum16, zero, 0x31)); + pc4[0] = _mm256_extract_epi32(sum16, 0); + pc4[1 * stride] = _mm256_extract_epi32(sum16, 1); + pc4[2 * stride] = _mm256_extract_epi32(sum16, 2); + pc4[3 * stride] = _mm256_extract_epi32(sum16, 3); + + //the 5 row + sum20 = _mm256_hadd_epi32(sum20, sum21); + sum22 = _mm256_hadd_epi32(sum22, sum23); + sum20 = _mm256_hadd_epi32(sum20, sum22); + sum20 = _mm256_add_epi32(sum20, _mm256_permute2x128_si256(sum20, zero, 0x31)); + pc5[0] = _mm256_extract_epi32(sum20, 0); + pc5[1 * stride] = _mm256_extract_epi32(sum20, 1); + pc5[2 * stride] = _mm256_extract_epi32(sum20, 2); + pc5[3 * stride] = _mm256_extract_epi32(sum20, 3); + + //the 6 row + sum24 = _mm256_hadd_epi32(sum24, sum25); + sum26 = _mm256_hadd_epi32(sum26, sum27); + sum24 = _mm256_hadd_epi32(sum24, sum26); + sum24 = _mm256_add_epi32(sum24, _mm256_permute2x128_si256(sum24, zero, 0x31)); + pc6[0] = _mm256_extract_epi32(sum24, 0); + pc6[1 * stride] = _mm256_extract_epi32(sum24, 1); + pc6[2 * stride] = _mm256_extract_epi32(sum24, 2); + pc6[3 * stride] = _mm256_extract_epi32(sum24, 3); + + //the 7 row + sum28 = _mm256_hadd_epi32(sum28, sum29); + sum30 = _mm256_hadd_epi32(sum30, sum31); + sum28 = _mm256_hadd_epi32(sum28, sum30); + sum28 = _mm256_add_epi32(sum28, _mm256_permute2x128_si256(sum28, zero, 0x31)); + pc7[0] = _mm256_extract_epi32(sum28, 0); + pc7[1 * stride] = _mm256_extract_epi32(sum28, 1); + pc7[2 * stride] = _mm256_extract_epi32(sum28, 2); + pc7[3 * stride] = _mm256_extract_epi32(sum28, 3); + +} + +inline void block8x2_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int32_t stride) { + //printf("block8x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + const int8_t* pa4 = pa0 + 4 * lda; + const int8_t* pa5 = pa0 + 5 * lda; + const int8_t* pa6 = pa0 + 6 * lda; + const int8_t* pa7 = pa0 + 7 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + int* pc4 = c + 4 * ldc; + int* pc5 = c + 5 * ldc; + int* pc6 = c + 6 * ldc; + int* pc7 = c + 7 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i 
ma4_l; + __m256i ma5_l; + __m256i ma6_l; + __m256i ma7_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + __m256i ma4_h; + __m256i ma5_h; + __m256i ma6_h; + __m256i ma7_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb0_h; + __m256i mb1_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + __m256i sum8 = _mm256_setzero_si256(); + __m256i sum9 = _mm256_setzero_si256(); + __m256i sum10 = _mm256_setzero_si256(); + __m256i sum11 = _mm256_setzero_si256(); + __m256i sum12 = _mm256_setzero_si256(); + __m256i sum13 = _mm256_setzero_si256(); + __m256i sum14 = _mm256_setzero_si256(); + __m256i sum15 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma1_h, mb0_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma1_h, mb1_h)); + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma2_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma2_h, mb1_h)); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma3_h, mb0_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma3_h, mb1_h)); + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + ma4_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa4 + 16))); + + mc0 = _mm256_madd_epi16(ma4_l, mb0_l); + mc1 = _mm256_madd_epi16(ma4_l, mb1_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma4_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma4_h, mb1_h)); + + sum8 = _mm256_add_epi32(mc0, 
sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + ma5_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa5 + 16))); + + mc2 = _mm256_madd_epi16(ma5_l, mb0_l); + mc3 = _mm256_madd_epi16(ma5_l, mb1_l); + + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma5_h, mb0_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma5_h, mb1_h)); + + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + ma6_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa6 + 16))); + + mc4 = _mm256_madd_epi16(ma6_l, mb0_l); + mc5 = _mm256_madd_epi16(ma6_l, mb1_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma6_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma6_h, mb1_h)); + + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + ma7_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa7 + 16))); + + mc6 = _mm256_madd_epi16(ma7_l, mb0_l); + mc7 = _mm256_madd_epi16(ma7_l, mb1_l); + + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma7_h, mb0_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma7_h, mb1_h)); + + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + pa4 += 32; + pa5 += 32; + pa6 += 32; + pa7 += 32; + + pb0 += 32; + pb1 += 32; + } + + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + + mc0 = _mm256_madd_epi16(ma4_l, mb0_l); + mc1 = _mm256_madd_epi16(ma4_l, mb1_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + + mc2 = _mm256_madd_epi16(ma5_l, mb0_l); + mc3 = _mm256_madd_epi16(ma5_l, mb1_l); + + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + + mc4 = _mm256_madd_epi16(ma6_l, mb0_l); + mc5 = _mm256_madd_epi16(ma6_l, mb1_l); + + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + + mc6 = _mm256_madd_epi16(ma7_l, mb0_l); + mc7 = 
_mm256_madd_epi16(ma7_l, mb1_l); + + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + pa4 += 16; + pa5 += 16; + pa6 += 16; + pa7 += 16; + + pb0 += 16; + pb1 += 16; + } + + if (0x08 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb1_l); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb1_l); + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa4)); + + mc0 = _mm256_mullo_epi32(ma4_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma4_l, mb1_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa5)); + + mc2 = _mm256_mullo_epi32(ma5_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma5_l, mb1_l); + + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa6)); + + mc4 = _mm256_mullo_epi32(ma6_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma6_l, mb1_l); + + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa7)); + + mc6 = _mm256_mullo_epi32(ma7_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma7_l, mb1_l); + + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + pa4 += 8; + pa5 += 8; + pa6 += 8; + pa7 += 8; + + pb0 += 8; + pb1 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga4[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga5[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga6[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga7[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + ga4[i] = pa4[i]; + ga5[i] = pa5[i]; + ga6[i] = pa6[i]; + ga7[i] = pa7[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + 
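+    // copy the tail (k % 8) values into the zero-padded 8-byte buffers so the same widen-and-multiply path can be reused; the zero lanes contribute nothing to the sums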
} + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb1_l); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb1_l); + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga4)); + + mc0 = _mm256_mullo_epi32(ma4_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma4_l, mb1_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga5)); + + mc2 = _mm256_mullo_epi32(ma5_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma5_l, mb1_l); + + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga6)); + + mc4 = _mm256_mullo_epi32(ma6_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma6_l, mb1_l); + + sum12 = _mm256_add_epi32(mc4, sum12); + sum13 = _mm256_add_epi32(mc5, sum13); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga7)); + + mc6 = _mm256_mullo_epi32(ma7_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma7_l, mb1_l); + + sum14 = _mm256_add_epi32(mc6, sum14); + sum15 = _mm256_add_epi32(mc7, sum15); + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1 * stride] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum2 = _mm256_hadd_epi32(sum2, zero); + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum2, 0); + pc1[1 * stride] = _mm256_extract_epi32(sum2, 1); + + //the 2 row + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum4, 0); + pc2[1 * stride] = _mm256_extract_epi32(sum4, 1); + + //the 3 row + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum6 = _mm256_hadd_epi32(sum6, zero); + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, zero, 0x31)); + + pc3[0] = _mm256_extract_epi32(sum6, 0); + pc3[1 * stride] = _mm256_extract_epi32(sum6, 1); + + //the 4 row + sum8 = _mm256_hadd_epi32(sum8, sum9); + sum8 = _mm256_hadd_epi32(sum8, zero); + sum8 = _mm256_add_epi32(sum8, _mm256_permute2x128_si256(sum8, zero, 0x31)); + + pc4[0] = _mm256_extract_epi32(sum8, 0); + pc4[1 * stride] = _mm256_extract_epi32(sum8, 1); + + //the 5 row + sum10 = 
_mm256_hadd_epi32(sum10, sum11); + sum10 = _mm256_hadd_epi32(sum10, zero); + sum10 = _mm256_add_epi32(sum10, _mm256_permute2x128_si256(sum10, zero, 0x31)); + + pc5[0] = _mm256_extract_epi32(sum10, 0); + pc5[1 * stride] = _mm256_extract_epi32(sum10, 1); + + //the 6 row + sum12 = _mm256_hadd_epi32(sum12, sum13); + sum12 = _mm256_hadd_epi32(sum12, zero); + sum12 = _mm256_add_epi32(sum12, _mm256_permute2x128_si256(sum12, zero, 0x31)); + + pc6[0] = _mm256_extract_epi32(sum12, 0); + pc6[1 * stride] = _mm256_extract_epi32(sum12, 1); + + //the 7 row + sum14 = _mm256_hadd_epi32(sum14, sum15); + sum14 = _mm256_hadd_epi32(sum14, zero); + sum14 = _mm256_add_epi32(sum14, _mm256_permute2x128_si256(sum14, zero, 0x31)); + + pc7[0] = _mm256_extract_epi32(sum14, 0); + pc7[1 * stride] = _mm256_extract_epi32(sum14, 1); +} + +inline void block8x1_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int32_t stride) { + //printf("block8x1_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + const int8_t* pa4 = pa0 + 4 * lda; + const int8_t* pa5 = pa0 + 5 * lda; + const int8_t* pa6 = pa0 + 6 * lda; + const int8_t* pa7 = pa0 + 7 * lda; + + const int8_t* pb0 = b; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + int* pc4 = c + 4 * ldc; + int* pc5 = c + 5 * ldc; + int* pc6 = c + 6 * ldc; + int* pc7 = c + 7 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma4_l; + __m256i ma5_l; + __m256i ma6_l; + __m256i ma7_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + __m256i ma4_h; + __m256i ma5_h; + __m256i ma6_h; + __m256i ma7_h; + + __m256i mb0_l; + __m256i mb0_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc1 = _mm256_madd_epi16(ma1_l, mb0_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma1_h, mb0_h)); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc2 = _mm256_madd_epi16(ma2_l, mb0_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma2_h, mb0_h)); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = 
_mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc3 = _mm256_madd_epi16(ma3_l, mb0_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma3_h, mb0_h)); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + ma4_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa4 + 16))); + + mc4 = _mm256_madd_epi16(ma4_l, mb0_l); + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma4_h, mb0_h)); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + ma5_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa5 + 16))); + + mc5 = _mm256_madd_epi16(ma5_l, mb0_l); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma5_h, mb0_h)); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + ma6_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa6 + 16))); + + mc6 = _mm256_madd_epi16(ma6_l, mb0_l); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma6_h, mb0_h)); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + ma7_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa7 + 16))); + + mc7 = _mm256_madd_epi16(ma7_l, mb0_l); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma7_h, mb0_h)); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + pa4 += 32; + pa5 += 32; + pa6 += 32; + pa7 += 32; + + pb0 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + mc1 = _mm256_madd_epi16(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + mc2 = _mm256_madd_epi16(ma2_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + mc3 = _mm256_madd_epi16(ma3_l, mb0_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa4)); + mc4 = _mm256_madd_epi16(ma4_l, mb0_l); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa5)); + mc5 = _mm256_madd_epi16(ma5_l, mb0_l); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa6)); + mc6 = _mm256_madd_epi16(ma6_l, mb0_l); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa7)); + mc7 = _mm256_madd_epi16(ma7_l, mb0_l); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + pa4 += 16; + pa5 += 16; + pa6 += 16; + pa7 += 16; + + pb0 += 16; + } + + if (0x08 & k_leftover) { + //a + __m256i ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + __m256i mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + mc1 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = 
_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + mc2 = _mm256_mullo_epi32(ma2_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + mc3 = _mm256_mullo_epi32(ma3_l, mb0_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa4)); + mc4 = _mm256_mullo_epi32(ma4_l, mb0_l); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa5)); + mc5 = _mm256_mullo_epi32(ma5_l, mb0_l); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa6)); + mc6 = _mm256_mullo_epi32(ma6_l, mb0_l); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa7)); + mc7 = _mm256_mullo_epi32(ma7_l, mb0_l); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + pa4 += 8; + pa5 += 8; + pa6 += 8; + pa7 += 8; + + pb0 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga4[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga5[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga6[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga7[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + ga4[i] = pa4[i]; + ga5[i] = pa5[i]; + ga6[i] = pa6[i]; + ga7[i] = pa7[i]; + + gb0[i] = pb0[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + mc1 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + mc2 = _mm256_mullo_epi32(ma2_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + mc3 = _mm256_mullo_epi32(ma3_l, mb0_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 row + ma4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga4)); + mc4 = _mm256_mullo_epi32(ma4_l, mb0_l); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 row + ma5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga5)); + mc5 = _mm256_mullo_epi32(ma5_l, mb0_l); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 row + ma6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga6)); + mc6 = _mm256_mullo_epi32(ma6_l, mb0_l); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 row + ma7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga7)); + mc7 = _mm256_mullo_epi32(ma7_l, mb0_l); + sum7 = _mm256_add_epi32(mc7, sum7); + } + + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8)); + sum0 = _mm256_add_epi32(sum0, 
_mm256_srli_si256(sum0, 4)); + pc0[0] = _mm256_extract_epi32(sum0, 0); + + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, sum1, 0x81)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 8)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 4)); + pc1[0] = _mm256_extract_epi32(sum1, 0); + + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, sum2, 0x81)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 8)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 4)); + pc2[0] = _mm256_extract_epi32(sum2, 0); + + sum3 = _mm256_add_epi32(sum3, _mm256_permute2x128_si256(sum3, sum3, 0x81)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 8)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 4)); + pc3[0] = _mm256_extract_epi32(sum3, 0); + + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, sum4, 0x81)); + sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 8)); + sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 4)); + pc4[0] = _mm256_extract_epi32(sum4, 0); + + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, sum5, 0x81)); + sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 8)); + sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 4)); + pc5[0] = _mm256_extract_epi32(sum5, 0); + + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, sum6, 0x81)); + sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 8)); + sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 4)); + pc6[0] = _mm256_extract_epi32(sum6, 0); + + sum7 = _mm256_add_epi32(sum7, _mm256_permute2x128_si256(sum7, sum7, 0x81)); + sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 8)); + sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 4)); + pc7[0] = _mm256_extract_epi32(sum7, 0); + +} + +inline void block4x8_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int32_t stride) { + //printf("block8x4_kernel_avx2\n"); + block8x4_kernel_avx2(k, b, ldb, a, lda, c, stride, ldc); +} + +inline void block4x4_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + //printf("block4x4_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb2_l; + __m256i mb3_l; + __m256i mb0_h; + __m256i mb1_h; + __m256i mb2_h; + __m256i mb3_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + __m256i sum8 = _mm256_setzero_si256(); + __m256i sum9 = _mm256_setzero_si256(); + __m256i sum10 = _mm256_setzero_si256(); + __m256i sum11 = _mm256_setzero_si256(); + 
__m256i sum12 = _mm256_setzero_si256(); + __m256i sum13 = _mm256_setzero_si256(); + __m256i sum14 = _mm256_setzero_si256(); + __m256i sum15 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mb2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb2 + 16))); + + mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + mb3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb3 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma0_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma0_h, mb3_h)); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc0 = _mm256_madd_epi16(ma1_l, mb0_l); + mc1 = _mm256_madd_epi16(ma1_l, mb1_l); + mc2 = _mm256_madd_epi16(ma1_l, mb2_l); + mc3 = _mm256_madd_epi16(ma1_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma1_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma1_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma1_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma1_h, mb3_h)); + + sum4 = _mm256_add_epi32(mc0, sum4); + sum5 = _mm256_add_epi32(mc1, sum5); + sum6 = _mm256_add_epi32(mc2, sum6); + sum7 = _mm256_add_epi32(mc3, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc0 = _mm256_madd_epi16(ma2_l, mb0_l); + mc1 = _mm256_madd_epi16(ma2_l, mb1_l); + mc2 = _mm256_madd_epi16(ma2_l, mb2_l); + mc3 = _mm256_madd_epi16(ma2_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma2_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma2_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma2_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma2_h, mb3_h)); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc0 = _mm256_madd_epi16(ma3_l, mb0_l); + mc1 = _mm256_madd_epi16(ma3_l, mb1_l); + mc2 = _mm256_madd_epi16(ma3_l, mb2_l); + mc3 = _mm256_madd_epi16(ma3_l, mb3_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma3_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma3_h, mb1_h)); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma3_h, mb2_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma3_h, mb3_h)); + + sum12 = 
_mm256_add_epi32(mc0, sum12); + sum13 = _mm256_add_epi32(mc1, sum13); + sum14 = _mm256_add_epi32(mc2, sum14); + sum15 = _mm256_add_epi32(mc3, sum15); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + + pb0 += 32; + pb1 += 32; + pb2 += 32; + pb3 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc0 = _mm256_madd_epi16(ma1_l, mb0_l); + mc1 = _mm256_madd_epi16(ma1_l, mb1_l); + mc2 = _mm256_madd_epi16(ma1_l, mb2_l); + mc3 = _mm256_madd_epi16(ma1_l, mb3_l); + + sum4 = _mm256_add_epi32(mc0, sum4); + sum5 = _mm256_add_epi32(mc1, sum5); + sum6 = _mm256_add_epi32(mc2, sum6); + sum7 = _mm256_add_epi32(mc3, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc0 = _mm256_madd_epi16(ma2_l, mb0_l); + mc1 = _mm256_madd_epi16(ma2_l, mb1_l); + mc2 = _mm256_madd_epi16(ma2_l, mb2_l); + mc3 = _mm256_madd_epi16(ma2_l, mb3_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc0 = _mm256_madd_epi16(ma3_l, mb0_l); + mc1 = _mm256_madd_epi16(ma3_l, mb1_l); + mc2 = _mm256_madd_epi16(ma3_l, mb2_l); + mc3 = _mm256_madd_epi16(ma3_l, mb3_l); + + sum12 = _mm256_add_epi32(mc0, sum12); + sum13 = _mm256_add_epi32(mc1, sum13); + sum14 = _mm256_add_epi32(mc2, sum14); + sum15 = _mm256_add_epi32(mc3, sum15); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + } + + if (0x08 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb2)); + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb3)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma0_l, mb3_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc0 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma1_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma1_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb3_l); + + sum4 = _mm256_add_epi32(mc0, sum4); + sum5 = _mm256_add_epi32(mc1, sum5); + sum6 = _mm256_add_epi32(mc2, sum6); + sum7 = _mm256_add_epi32(mc3, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + + mc0 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc1 = 
_mm256_mullo_epi32(ma2_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma2_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma2_l, mb3_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + + mc0 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma3_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma3_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma3_l, mb3_l); + + sum12 = _mm256_add_epi32(mc0, sum12); + sum13 = _mm256_add_epi32(mc1, sum13); + sum14 = _mm256_add_epi32(mc2, sum14); + sum15 = _mm256_add_epi32(mc3, sum15); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + + pb0 += 8; + pb1 += 8; + pb2 += 8; + pb3 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + gb2[i] = pb2[i]; + gb3[i] = pb3[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb2)); + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb3)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma0_l, mb3_l); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc0 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma1_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma1_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb3_l); + + sum4 = _mm256_add_epi32(mc0, sum4); + sum5 = _mm256_add_epi32(mc1, sum5); + sum6 = _mm256_add_epi32(mc2, sum6); + sum7 = _mm256_add_epi32(mc3, sum7); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + + mc0 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma2_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma2_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma2_l, mb3_l); + + sum8 = _mm256_add_epi32(mc0, sum8); + sum9 = _mm256_add_epi32(mc1, sum9); + sum10 = _mm256_add_epi32(mc2, sum10); + sum11 = _mm256_add_epi32(mc3, sum11); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + + mc0 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma3_l, mb1_l); + mc2 = _mm256_mullo_epi32(ma3_l, mb2_l); + mc3 = _mm256_mullo_epi32(ma3_l, mb3_l); + + sum12 = _mm256_add_epi32(mc0, sum12); + sum13 = _mm256_add_epi32(mc1, sum13); + sum14 = _mm256_add_epi32(mc2, sum14); + sum15 = _mm256_add_epi32(mc3, sum15); + } + 
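+        // Note on the reduction that follows: each sumN register still holds eight
+        // 32-bit partial sums for one (A-row, B-vector) pair of the 4x4 block. The
+        // hadd/permute sequence in the store step collapses them so that a single
+        // register ends up carrying the four finished outputs of one C row before
+        // the per-element extraction.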
+ //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum0 = _mm256_hadd_epi32(sum0, sum2); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + pc0[2] = _mm256_extract_epi32(sum0, 2); + pc0[3] = _mm256_extract_epi32(sum0, 3); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum4 = _mm256_hadd_epi32(sum4, sum6); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum4, 0); + pc1[1] = _mm256_extract_epi32(sum4, 1); + pc1[2] = _mm256_extract_epi32(sum4, 2); + pc1[3] = _mm256_extract_epi32(sum4, 3); + + //the 2 row + sum8 = _mm256_hadd_epi32(sum8, sum9); + sum10 = _mm256_hadd_epi32(sum10, sum11); + sum8 = _mm256_hadd_epi32(sum8, sum10); + sum8 = _mm256_add_epi32(sum8, _mm256_permute2x128_si256(sum8, zero, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum8, 0); + pc2[1] = _mm256_extract_epi32(sum8, 1); + pc2[2] = _mm256_extract_epi32(sum8, 2); + pc2[3] = _mm256_extract_epi32(sum8, 3); + + //the 3 row + sum12 = _mm256_hadd_epi32(sum12, sum13); + sum14 = _mm256_hadd_epi32(sum14, sum15); + sum12 = _mm256_hadd_epi32(sum12, sum14); + sum12 = _mm256_add_epi32(sum12, _mm256_permute2x128_si256(sum12, zero, 0x31)); + pc3[0] = _mm256_extract_epi32(sum12, 0); + pc3[1] = _mm256_extract_epi32(sum12, 1); + pc3[2] = _mm256_extract_epi32(sum12, 2); + pc3[3] = _mm256_extract_epi32(sum12, 3); +} + +inline void block4x2_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + //printf("block4x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb0_h; + __m256i mb1_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + + 
sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma1_h, mb0_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma1_h, mb1_h)); + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma2_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma2_h, mb1_h)); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma3_h, mb0_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma3_h, mb1_h)); + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + + pb0 += 32; + pb1 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16; + pb1 += 16; + } + + if (0x08 & k_leftover) { + //a + __m256i ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + __m256i mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + __m256i mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = 
_mm256_mullo_epi32(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + + pb0 += 8; + pb1 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1 * stride] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum2 = _mm256_hadd_epi32(sum2, zero); + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum2, 0); + pc1[1 * stride] = _mm256_extract_epi32(sum2, 1); + + //the 2 row + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum4, 0); + pc2[1 * stride] = _mm256_extract_epi32(sum4, 1); + + //the 3 row + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum6 = _mm256_hadd_epi32(sum6, zero); + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, zero, 0x31)); + + pc3[0] = _mm256_extract_epi32(sum6, 0); + pc3[1 * stride] = _mm256_extract_epi32(sum6, 1); +} + +inline void block4x1_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + 
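+    // block4x1: dot products of four rows of A (spaced lda apart) with one
+    // contiguous vector of B. Depth k is consumed 32 int8 values at a time,
+    // sign-extended to int16 and combined via _mm256_madd_epi16 into int32
+    // partial sums, with 16-, 8- and sub-8-element leftover paths; each row's
+    // eight partial sums are reduced to a scalar in the store step and written
+    // to C at intervals of ldc.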
//printf("block4x1_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + + __m256i mb0_l; + __m256i mb0_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc1 = _mm256_madd_epi16(ma1_l, mb0_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma1_h, mb0_h)); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc2 = _mm256_madd_epi16(ma2_l, mb0_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma2_h, mb0_h)); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc3 = _mm256_madd_epi16(ma3_l, mb0_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma3_h, mb0_h)); + sum3 = _mm256_add_epi32(mc3, sum3); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + + pb0 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + mc1 = _mm256_madd_epi16(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + mc2 = _mm256_madd_epi16(ma2_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + mc3 = _mm256_madd_epi16(ma3_l, mb0_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16; + } + + if (0x08 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + mc1 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = 
_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + mc2 = _mm256_mullo_epi32(ma2_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + mc3 = _mm256_mullo_epi32(ma3_l, mb0_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + + pb0 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + + gb0[i] = pb0[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + mc1 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + mc2 = _mm256_mullo_epi32(ma2_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + mc3 = _mm256_mullo_epi32(ma3_l, mb0_l); + sum3 = _mm256_add_epi32(mc3, sum3); + } + + //store + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 4)); + pc0[0] = _mm256_extract_epi32(sum0, 0); + + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, sum1, 0x81)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 8)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 4)); + pc1[0] = _mm256_extract_epi32(sum1, 0); + + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, sum2, 0x81)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 8)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 4)); + pc2[0] = _mm256_extract_epi32(sum2, 0); + + sum3 = _mm256_add_epi32(sum3, _mm256_permute2x128_si256(sum3, sum3, 0x81)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 8)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 4)); + pc3[0] = _mm256_extract_epi32(sum3, 0); +} + +inline void block2x8_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + //printf("block2x8_kernel_avx2\n"); + block8x2_kernel_avx2(k, b, ldb, a, lda, c, stride, ldc); +} + +inline void block2x4_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + //printf("block2x4_kernel_avx2\n"); + block4x2_kernel_avx2(k, b, ldb, a, lda, c, stride, ldc); + +} + +inline void block2x2_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + //printf("block2x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + + int* pc0 = c; + 
int* pc1 = c + 1 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma0_h; + __m256i ma1_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb0_h; + __m256i mb1_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + sum0 = _mm256_add_epi32(mc0, sum0); + + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma1_h, mb0_h)); + sum2 = _mm256_add_epi32(mc2, sum2); + + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma1_h, mb1_h)); + sum3 = _mm256_add_epi32(mc3, sum3); + + pa0 += 32; + pa1 += 32; + pb0 += 32; + pb1 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + pa0 += 16; + pa1 += 16; + + pb0 += 16; + pb1 += 16; + } + + if (0x08 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + pa0 += 8; + pb0 += 8; + pa1 += 8; + pb1 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t 
i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum3 = _mm256_add_epi32(mc3, sum3); + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum2 = _mm256_hadd_epi32(sum2, zero); + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum2, 0); + pc1[1] = _mm256_extract_epi32(sum2, 1); +} + +inline void block2x1_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + //printf("block2x1_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + const int8_t* pb0 = b; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma0_h; + __m256i ma1_h; + + __m256i mb0_l; + __m256i mb0_h; + + __m256i mc0; + __m256i mc1; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc1 = _mm256_madd_epi16(ma1_l, mb0_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma1_h, mb0_h)); + sum1 = _mm256_add_epi32(mc1, sum1); + + pa0 += 32; + pa1 += 32; + + pb0 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + mc1 = _mm256_madd_epi16(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + pa0 += 16; + pa1 += 16; + + pb0 += 16; + } + + if (0x08 & k_leftover) { + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + + //the 0 row + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, 
sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + mc1 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + pa0 += 8; + pa1 += 8; + + pb0 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + + gb0[i] = pb0[i]; + } + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + + //the 0 row + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + mc1 = _mm256_mullo_epi32(ma1_l, mb0_l); + sum1 = _mm256_add_epi32(mc1, sum1); + } + + //store + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 4)); + pc0[0] = _mm256_extract_epi32(sum0, 0); + + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, sum1, 0x81)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 8)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 4)); + pc1[0] = _mm256_extract_epi32(sum1, 0); +} + +inline void block1x16_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c) { + //printf("block1x16_kernel_avx2\n"); + const int8_t* pa0 = a; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + const int8_t* pb4 = pb0 + 4 * ldb; + const int8_t* pb5 = pb0 + 5 * ldb; + const int8_t* pb6 = pb0 + 6 * ldb; + const int8_t* pb7 = pb0 + 7 * ldb; + const int8_t* pb8 = pb0 + 8 * ldb; + const int8_t* pb9 = pb0 + 9 * ldb; + const int8_t* pb10 = pb0 + 10 * ldb; + const int8_t* pb11 = pb0 + 11 * ldb; + const int8_t* pb12 = pb0 + 12 * ldb; + const int8_t* pb13 = pb0 + 13 * ldb; + const int8_t* pb14 = pb0 + 14 * ldb; + const int8_t* pb15 = pb0 + 15 * ldb; + + int* pc0 = c; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma0_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb2_l; + __m256i mb3_l; + __m256i mb4_l; + __m256i mb5_l; + __m256i mb6_l; + __m256i mb7_l; + __m256i mb0_h; + __m256i mb1_h; + __m256i mb2_h; + __m256i mb3_h; + __m256i mb4_h; + __m256i mb5_h; + __m256i mb6_h; + __m256i mb7_h; + __m256i mb8_l; + __m256i mb9_l; + __m256i mb10_l; + __m256i mb11_l; + __m256i mb12_l; + __m256i mb13_l; + __m256i mb14_l; + __m256i mb15_l; + __m256i mb8_h; + __m256i mb9_h; + __m256i mb10_h; + __m256i mb11_h; + __m256i mb12_h; + __m256i mb13_h; + __m256i mb14_h; + __m256i mb15_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + __m256i mc8; + __m256i mc9; + __m256i mc10; + __m256i mc11; + __m256i mc12; + __m256i mc13; + __m256i mc14; + __m256i mc15; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = 
_mm256_setzero_si256(); + + __m256i sum8 = _mm256_setzero_si256(); + __m256i sum9 = _mm256_setzero_si256(); + __m256i sum10 = _mm256_setzero_si256(); + __m256i sum11 = _mm256_setzero_si256(); + __m256i sum12 = _mm256_setzero_si256(); + __m256i sum13 = _mm256_setzero_si256(); + __m256i sum14 = _mm256_setzero_si256(); + __m256i sum15 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //the 0 col + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 col + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 col + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mb2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb2 + 16))); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma0_h, mb2_h)); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 col + mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + mb3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb3 + 16))); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma0_h, mb3_h)); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 col + mb4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb4)); + mb4_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb4 + 16))); + mc4 = _mm256_madd_epi16(ma0_l, mb4_l); + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma0_h, mb4_h)); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 col + mb5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb5)); + mb5_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb5 + 16))); + mc5 = _mm256_madd_epi16(ma0_l, mb5_l); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma0_h, mb5_h)); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 col + mb6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb6)); + mb6_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb6 + 16))); + mc6 = _mm256_madd_epi16(ma0_l, mb6_l); + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma0_h, mb6_h)); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 col + mb7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb7)); + mb7_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb7 + 16))); + mc7 = _mm256_madd_epi16(ma0_l, mb7_l); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma0_h, mb7_h)); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 8 col + mb8_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb8)); + mb8_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb8 + 16))); + mc8 = _mm256_madd_epi16(ma0_l, mb8_l); + mc8 = _mm256_add_epi32(mc8, _mm256_madd_epi16(ma0_h, mb8_h)); + sum8 = _mm256_add_epi32(mc8, sum8); + + //the 9 col + mb9_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb9)); + mb9_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb9 + 16))); + mc9 = _mm256_madd_epi16(ma0_l, mb9_l); + mc9 = _mm256_add_epi32(mc9, _mm256_madd_epi16(ma0_h, mb9_h)); + sum9 = _mm256_add_epi32(mc9, sum9); + + //the 10 col + mb10_l = 
_mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb10)); + mb10_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb10 + 16))); + mc10 = _mm256_madd_epi16(ma0_l, mb10_l); + mc10 = _mm256_add_epi32(mc10, _mm256_madd_epi16(ma0_h, mb10_h)); + sum10 = _mm256_add_epi32(mc10, sum10); + + //the 11 col + mb11_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb11)); + mb11_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb11 + 16))); + mc11 = _mm256_madd_epi16(ma0_l, mb11_l); + mc11 = _mm256_add_epi32(mc11, _mm256_madd_epi16(ma0_h, mb11_h)); + sum11 = _mm256_add_epi32(mc11, sum11); + + //the 12 col + mb12_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb12)); + mb12_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb12 + 16))); + mc12 = _mm256_madd_epi16(ma0_l, mb12_l); + mc12 = _mm256_add_epi32(mc12, _mm256_madd_epi16(ma0_h, mb12_h)); + sum12 = _mm256_add_epi32(mc12, sum12); + + //the 13 col + mb13_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb13)); + mb13_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb13 + 16))); + mc13 = _mm256_madd_epi16(ma0_l, mb13_l); + mc13 = _mm256_add_epi32(mc13, _mm256_madd_epi16(ma0_h, mb13_h)); + sum13 = _mm256_add_epi32(mc13, sum13); + + //the 14 col + mb14_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb14)); + mb14_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb14 + 16))); + mc14 = _mm256_madd_epi16(ma0_l, mb14_l); + mc14 = _mm256_add_epi32(mc14, _mm256_madd_epi16(ma0_h, mb14_h)); + sum14 = _mm256_add_epi32(mc14, sum14); + + //the 15 col + mb15_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb15)); + mb15_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb15 + 16))); + mc15 = _mm256_madd_epi16(ma0_l, mb15_l); + mc15 = _mm256_add_epi32(mc15, _mm256_madd_epi16(ma0_h, mb15_h)); + sum15 = _mm256_add_epi32(mc15, sum15); + + pa0 += 32; + + pb0 += 32; + pb1 += 32; + pb2 += 32; + pb3 += 32; + pb4 += 32; + pb5 += 32; + pb6 += 32; + pb7 += 32; + + pb8 += 32; + pb9 += 32; + pb10 += 32; + pb11 += 32; + pb12 += 32; + pb13 += 32; + pb14 += 32; + pb15 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //the 0 col + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 col + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 col + mb2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mc2 = _mm256_madd_epi16(ma0_l, mb2_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 col + mb3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + mc3 = _mm256_madd_epi16(ma0_l, mb3_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 col + mb4_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb4)); + mc4 = _mm256_madd_epi16(ma0_l, mb4_l); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 col + mb5_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb5)); + mc5 = _mm256_madd_epi16(ma0_l, mb5_l); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 col + mb6_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb6)); + mc6 = _mm256_madd_epi16(ma0_l, mb6_l); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 col + mb7_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb7)); + mc7 = _mm256_madd_epi16(ma0_l, mb7_l); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 8 col + mb8_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb8)); + mc8 = 
_mm256_madd_epi16(ma0_l, mb8_l); + sum8 = _mm256_add_epi32(mc8, sum8); + + //the 9 col + mb9_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb9)); + mc9 = _mm256_madd_epi16(ma0_l, mb9_l); + sum9 = _mm256_add_epi32(mc9, sum9); + + //the 10 col + mb10_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb10)); + mc10 = _mm256_madd_epi16(ma0_l, mb10_l); + sum10 = _mm256_add_epi32(mc10, sum10); + + //the 11 col + mb11_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb11)); + mc11 = _mm256_madd_epi16(ma0_l, mb11_l); + sum11 = _mm256_add_epi32(mc11, sum11); + + //the 12 col + mb12_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb12)); + mc12 = _mm256_madd_epi16(ma0_l, mb12_l); + sum12 = _mm256_add_epi32(mc12, sum12); + + //the 13 col + mb13_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb13)); + mc13 = _mm256_madd_epi16(ma0_l, mb13_l); + sum13 = _mm256_add_epi32(mc13, sum13); + + //the 14 col + mb14_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb14)); + mc14 = _mm256_madd_epi16(ma0_l, mb14_l); + sum14 = _mm256_add_epi32(mc14, sum14); + + //the 15 col + mb15_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb15)); + mc15 = _mm256_madd_epi16(ma0_l, mb15_l); + sum15 = _mm256_add_epi32(mc15, sum15); + + pa0 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + pb4 += 16; + pb5 += 16; + pb6 += 16; + pb7 += 16; + + pb8 += 16; + pb9 += 16; + pb10 += 16; + pb11 += 16; + pb12 += 16; + pb13 += 16; + pb14 += 16; + pb15 += 16; + } + + if (0x08 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //the 0 col + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 col + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 col + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb2)); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 col + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb3)); + mc3 = _mm256_mullo_epi32(ma0_l, mb3_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 col + mb4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb4)); + mc4 = _mm256_mullo_epi32(ma0_l, mb4_l); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 col + mb5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb5)); + mc5 = _mm256_mullo_epi32(ma0_l, mb5_l); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 col + mb6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb6)); + mc6 = _mm256_mullo_epi32(ma0_l, mb6_l); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 col + mb7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb7)); + mc7 = _mm256_mullo_epi32(ma0_l, mb7_l); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 8 col + mb8_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb8)); + mc8 = _mm256_mullo_epi32(ma0_l, mb8_l); + sum8 = _mm256_add_epi32(mc8, sum8); + + //the 9 col + mb9_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb9)); + mc9 = _mm256_mullo_epi32(ma0_l, mb9_l); + sum9 = _mm256_add_epi32(mc9, sum9); + + //the 10 col + mb10_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb10)); + mc10 = _mm256_mullo_epi32(ma0_l, mb10_l); + sum10 = _mm256_add_epi32(mc10, sum10); + + //the 11 col + mb11_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb11)); + mc11 = _mm256_mullo_epi32(ma0_l, mb11_l); + sum11 = _mm256_add_epi32(mc11, sum11); + + //the 12 col + mb12_l = 
_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb12)); + mc12 = _mm256_mullo_epi32(ma0_l, mb12_l); + sum12 = _mm256_add_epi32(mc12, sum12); + + //the 13 col + mb13_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb13)); + mc13 = _mm256_mullo_epi32(ma0_l, mb13_l); + sum13 = _mm256_add_epi32(mc13, sum13); + + //the 14 col + mb14_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb14)); + mc14 = _mm256_mullo_epi32(ma0_l, mb14_l); + sum14 = _mm256_add_epi32(mc14, sum14); + + //the 15 col + mb15_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb15)); + mc15 = _mm256_mullo_epi32(ma0_l, mb15_l); + sum15 = _mm256_add_epi32(mc15, sum15); + + pa0 += 8; + + pb0 += 8; + pb1 += 8; + pb2 += 8; + pb3 += 8; + pb4 += 8; + pb5 += 8; + pb6 += 8; + pb7 += 8; + + pb8 += 8; + pb9 += 8; + pb10 += 8; + pb11 += 8; + pb12 += 8; + pb13 += 8; + pb14 += 8; + pb15 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb4[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb5[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb6[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb7[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb8[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb9[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb10[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb11[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb12[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb13[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb14[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb15[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + gb2[i] = pb2[i]; + gb3[i] = pb3[i]; + gb4[i] = pb4[i]; + gb5[i] = pb5[i]; + gb6[i] = pb6[i]; + gb7[i] = pb7[i]; + + gb8[i] = pb8[i]; + gb9[i] = pb9[i]; + gb10[i] = pb10[i]; + gb11[i] = pb11[i]; + gb12[i] = pb12[i]; + gb13[i] = pb13[i]; + gb14[i] = pb14[i]; + gb15[i] = pb15[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //the 0 col + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + //the 1 col + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 2 col + mb2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb2)); + mc2 = _mm256_mullo_epi32(ma0_l, mb2_l); + sum2 = _mm256_add_epi32(mc2, sum2); + + //the 3 col + mb3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb3)); + mc3 = _mm256_mullo_epi32(ma0_l, mb3_l); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 4 col + mb4_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb4)); + mc4 = _mm256_mullo_epi32(ma0_l, mb4_l); + sum4 = _mm256_add_epi32(mc4, sum4); + + //the 5 col + mb5_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb5)); + mc5 = 
_mm256_mullo_epi32(ma0_l, mb5_l); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 6 col + mb6_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb6)); + mc6 = _mm256_mullo_epi32(ma0_l, mb6_l); + sum6 = _mm256_add_epi32(mc6, sum6); + + //the 7 col + mb7_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb7)); + mc7 = _mm256_mullo_epi32(ma0_l, mb7_l); + sum7 = _mm256_add_epi32(mc7, sum7); + + //the 8 col + mb8_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb8)); + mc8 = _mm256_mullo_epi32(ma0_l, mb8_l); + sum8 = _mm256_add_epi32(mc8, sum8); + + //the 9 col + mb9_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb9)); + mc9 = _mm256_mullo_epi32(ma0_l, mb9_l); + sum9 = _mm256_add_epi32(mc9, sum9); + + //the 10 col + mb10_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb10)); + mc10 = _mm256_mullo_epi32(ma0_l, mb10_l); + sum10 = _mm256_add_epi32(mc10, sum10); + + //the 11 col + mb11_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb11)); + mc11 = _mm256_mullo_epi32(ma0_l, mb11_l); + sum11 = _mm256_add_epi32(mc11, sum11); + + //the 12 col + mb12_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb12)); + mc12 = _mm256_mullo_epi32(ma0_l, mb12_l); + sum12 = _mm256_add_epi32(mc12, sum12); + + //the 13 col + mb13_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb13)); + mc13 = _mm256_mullo_epi32(ma0_l, mb13_l); + sum13 = _mm256_add_epi32(mc13, sum13); + + //the 14 col + mb14_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb14)); + mc14 = _mm256_mullo_epi32(ma0_l, mb14_l); + sum14 = _mm256_add_epi32(mc14, sum14); + + //the 15 col + mb15_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb15)); + mc15 = _mm256_mullo_epi32(ma0_l, mb15_l); + sum15 = _mm256_add_epi32(mc15, sum15); + } + + //store + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 4)); + pc0[0] = _mm256_extract_epi32(sum0, 0); + + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, sum1, 0x81)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 8)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 4)); + pc0[1] = _mm256_extract_epi32(sum1, 0); + + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, sum2, 0x81)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 8)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 4)); + pc0[2] = _mm256_extract_epi32(sum2, 0); + + sum3 = _mm256_add_epi32(sum3, _mm256_permute2x128_si256(sum3, sum3, 0x81)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 8)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 4)); + pc0[3] = _mm256_extract_epi32(sum3, 0); + + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, sum4, 0x81)); + sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 8)); + sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 4)); + pc0[4] = _mm256_extract_epi32(sum4, 0); + + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, sum5, 0x81)); + sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 8)); + sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 4)); + pc0[5] = _mm256_extract_epi32(sum5, 0); + + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, sum6, 0x81)); + sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 8)); + sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 4)); + pc0[6] = _mm256_extract_epi32(sum6, 0); + + sum7 = _mm256_add_epi32(sum7, _mm256_permute2x128_si256(sum7, sum7, 0x81)); + sum7 = 
_mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 8)); + sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 4)); + pc0[7] = _mm256_extract_epi32(sum7, 0); + + sum8 = _mm256_add_epi32(sum8, _mm256_permute2x128_si256(sum8, sum8, 0x81)); + sum8 = _mm256_add_epi32(sum8, _mm256_srli_si256(sum8, 8)); + sum8 = _mm256_add_epi32(sum8, _mm256_srli_si256(sum8, 4)); + pc0[8] = _mm256_extract_epi32(sum8, 0); + + sum9 = _mm256_add_epi32(sum9, _mm256_permute2x128_si256(sum9, sum9, 0x81)); + sum9 = _mm256_add_epi32(sum9, _mm256_srli_si256(sum9, 8)); + sum9 = _mm256_add_epi32(sum9, _mm256_srli_si256(sum9, 4)); + pc0[9] = _mm256_extract_epi32(sum9, 0); + + sum10 = _mm256_add_epi32(sum10, _mm256_permute2x128_si256(sum10, sum10, 0x81)); + sum10 = _mm256_add_epi32(sum10, _mm256_srli_si256(sum10, 8)); + sum10 = _mm256_add_epi32(sum10, _mm256_srli_si256(sum10, 4)); + pc0[10] = _mm256_extract_epi32(sum10, 0); + + sum11 = _mm256_add_epi32(sum11, _mm256_permute2x128_si256(sum11, sum11, 0x81)); + sum11 = _mm256_add_epi32(sum11, _mm256_srli_si256(sum11, 8)); + sum11 = _mm256_add_epi32(sum11, _mm256_srli_si256(sum11, 4)); + pc0[11] = _mm256_extract_epi32(sum11, 0); + + sum12 = _mm256_add_epi32(sum12, _mm256_permute2x128_si256(sum12, sum12, 0x81)); + sum12 = _mm256_add_epi32(sum12, _mm256_srli_si256(sum12, 8)); + sum12 = _mm256_add_epi32(sum12, _mm256_srli_si256(sum12, 4)); + pc0[12] = _mm256_extract_epi32(sum12, 0); + + sum13 = _mm256_add_epi32(sum13, _mm256_permute2x128_si256(sum13, sum13, 0x81)); + sum13 = _mm256_add_epi32(sum13, _mm256_srli_si256(sum13, 8)); + sum13 = _mm256_add_epi32(sum13, _mm256_srli_si256(sum13, 4)); + pc0[13] = _mm256_extract_epi32(sum13, 0); + + sum14 = _mm256_add_epi32(sum14, _mm256_permute2x128_si256(sum14, sum14, 0x81)); + sum14 = _mm256_add_epi32(sum14, _mm256_srli_si256(sum14, 8)); + sum14 = _mm256_add_epi32(sum14, _mm256_srli_si256(sum14, 4)); + pc0[14] = _mm256_extract_epi32(sum14, 0); + + sum15 = _mm256_add_epi32(sum15, _mm256_permute2x128_si256(sum15, sum15, 0x81)); + sum15 = _mm256_add_epi32(sum15, _mm256_srli_si256(sum15, 8)); + sum15 = _mm256_add_epi32(sum15, _mm256_srli_si256(sum15, 4)); + pc0[15] = _mm256_extract_epi32(sum15, 0); +} + +void block1x8_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + block8x1_kernel_avx2(k, b, ldb, a, lda, c, stride, ldc); +} + +void block1x4_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + block4x1_kernel_avx2(k, b, ldb, a, lda, c, stride, ldc); +} + +void block1x2_kernel_avx2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + block2x1_kernel_avx2(k, b, ldb, a, lda, c, stride, ldc); +} + +void block1x1_kernel_avx2(const int32_t k, const int8_t* a, const int8_t* b, int* c) { + //printf("block1x1_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pb0 = b; + + int* pc0 = c; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma0_h; + + __m256i mb0_l; + __m256i mb0_h; + + __m256i mc0; + + __m256i sum0 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = 
_mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + sum0 = _mm256_add_epi32(mc0, sum0); + + pa0 += 32; + pb0 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + pa0 += 16; + pb0 += 16; + } + + if (0x08 & k_leftover) { + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + + //the 0 row + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + + pa0 += 8; + pb0 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + gb0[i] = pb0[i]; + } + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + + //the 0 row + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + sum0 = _mm256_add_epi32(mc0, sum0); + } + + //store + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 4)); + pc0[0] = _mm256_extract_epi32(sum0, 0); +} + +void chgemm_c_c_n_t_avx2( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + size_t m_block_size = 8; + size_t mb = m / m_block_size; + size_t m_leftover = m % m_block_size; + + // LOG(INFO)<<"chgemm_c_c_n_t_avx2"; + //m>=8 + for (size_t i = 0; i < mb; ++i) { + size_t n_block_size = 8; + size_t nb = n / n_block_size; + size_t n_leftover = n % n_block_size; + + //n=8 + for (size_t j = 0; j < nb; ++j) { + block8x8_kernel_avx2(k, a + (i * m_block_size) * lda, lda, + b + (j * n_block_size) * ldb, ldb, + c + (i * m_block_size) * ldc + j * n_block_size, ldc); + } + + //n=4 + if (n_leftover & 0x04) { + block8x4_kernel_avx2(k, a + (i * m_block_size) * lda, lda, + b + (nb * n_block_size) * ldb, ldb, + c + (i * m_block_size) * ldc + nb * n_block_size, ldc, 1); + } + + //n=2 + if (n_leftover & 0x02) { + size_t n4 = n_leftover & 0x04 ? 4 : 0; + block8x2_kernel_avx2(k, a + (i * m_block_size) * lda, lda, + b + (nb * n_block_size + n4) * ldb, ldb, + c + (i * m_block_size) * ldc + nb * n_block_size + n4, ldc, 1); + } + + //n=1 + if (n_leftover & 0x01) { + size_t n4 = n_leftover & 0x04 ? 4 : 0; + size_t n2 = n_leftover & 0x02 ? 
2 : 0; + block8x1_kernel_avx2(k, a + (i * m_block_size) * lda, lda, + b + (nb * n_block_size + n2 + n4) * ldb, ldb, + c + (i * m_block_size) * ldc + nb * n_block_size + n2 + n4, ldc, 1); + } + } + + //m==4 + if (m_leftover & 0x04) { + size_t n_block_size = 8; + size_t nb = n / n_block_size; + size_t n_leftover = n % n_block_size; + + //n=8 + for (size_t j = 0; j < nb; ++j) { + block4x8_kernel_avx2(k, a + (mb * m_block_size) * lda, lda, + b + (j * n_block_size) * ldb, ldb, + c + (mb * m_block_size) * ldc + j * n_block_size, ldc, 1); + } + + //n=4 + if (n_leftover & 0x04) { + block4x4_kernel_avx2(k, a + (mb * m_block_size) * lda, lda, + b + (nb * n_block_size) * ldb, ldb, + c + (mb * m_block_size) * ldc + nb * n_block_size, ldc); + } + + //n=2 + if (n_leftover & 0x02) { + size_t n4 = n_leftover & 0x04 ? 4 : 0; + block4x2_kernel_avx2(k, a + (mb * m_block_size) * lda, lda, + b + (nb * n_block_size + n4) * ldb, ldb, + c + (mb * m_block_size) * ldc + nb * n_block_size + n4, ldc, 1); + } + + //n=1 + if (n_leftover & 0x01) { + size_t n4 = n_leftover & 0x04 ? 4 : 0; + size_t n2 = n_leftover & 0x02 ? 2 : 0; + block4x1_kernel_avx2(k, a + (mb * m_block_size) * lda, lda, + b + (nb * n_block_size + n4 + n2) * ldb, ldb, + c + (mb * m_block_size) * ldc + nb * n_block_size + n4 + n2, ldc, 1); + } + } + + //m==2 + if (m_leftover & 0x02) { + LOG(INFO) << "hello m_leftover"; + size_t n_block_size = 8; + size_t nb = n / n_block_size; + size_t n_leftover = n % n_block_size; + + size_t m4 = m_leftover & 0x04 ? 4 : 0; + + //n=8 + for (size_t j = 0; j < nb; ++j) { + block2x8_kernel_avx2(k, a + (mb * m_block_size + m4) * lda, lda, + b + (j * n_block_size) * ldb, ldb, + c + (mb * m_block_size + m4) * ldc + j * n_block_size, ldc, 1); + } + + //n=4 + if (n_leftover & 0x04) { + block2x4_kernel_avx2(k, a + (mb * m_block_size + m4) * lda, lda, + b + (nb * n_block_size) * ldb, ldb, + c + (mb * m_block_size + m4) * ldc + + nb * n_block_size, ldc, 1); + } + + //n=2 + if (n_leftover & 0x02) { + size_t n4 = n_leftover & 0x04 ? 4 : 0; + block2x2_kernel_avx2(k, a + (mb * m_block_size + m4) * lda, lda, + b + (nb * n_block_size + n4) * ldb, ldb, + c + (mb * m_block_size + m4) * ldc + + nb * n_block_size + n4, ldc); + LOG(INFO) << "hello"; + } + + //n=1 + if (n_leftover & 0x01) { + size_t n4 = n_leftover & 0x04 ? 4 : 0; + size_t n2 = n_leftover & 0x02 ? 2 : 0; + block2x1_kernel_avx2(k, a + (mb * m_block_size + m4) * lda, lda, + b + (nb * n_block_size + n4 + n2) * ldb, ldb, + c + (mb * m_block_size + m4) * ldc + + nb * n_block_size + n4 + n2, ldc, 1); + } + } + + //m==1 + if (m_leftover & 0x01) { + size_t n_block_size = 16; + size_t nb = n / n_block_size; + size_t n_leftover = n % n_block_size; + + size_t m4 = m_leftover & 0x04 ? 4 : 0; + size_t m2 = m_leftover & 0x02 ? 2 : 0; + + //n=16 + for (size_t j = 0; j < nb; ++j) { + block1x16_kernel_avx2(k, a + (mb * m_block_size + m4 + m2) * lda, lda, + b + (j * n_block_size) * ldb, ldb, + c + (mb * m_block_size + m4 + m2) * ldc + j * n_block_size); + } + + //n=8 + if (n_leftover & 0x08) { + block1x8_kernel_avx2(k, a + (mb * m_block_size + m4 + m2) * lda, lda, + b + (nb * n_block_size) * ldb, ldb, + c + (mb * m_block_size + m4 + m2) * ldc + nb * n_block_size, ldc, 1); + } + + //n=4 + if (n_leftover & 0x04) { + size_t n8 = n_leftover & 0x08 ? 
8 : 0; + block1x4_kernel_avx2(k, a + (mb * m_block_size + m4 + m2) * lda, lda, + b + (nb * n_block_size + n8) * ldb, ldb, + c + (mb * m_block_size + m4 + m2) * ldc + nb * n_block_size + n8, ldc, 1); + } + + //n=2 + if (n_leftover & 0x02) { + size_t n8 = n_leftover & 0x08 ? 8 : 0; + size_t n4 = n_leftover & 0x04 ? 4 : 0; + block1x2_kernel_avx2(k, a + (mb * m_block_size + m4 + m2) * lda, lda, + b + (nb * n_block_size + n8 + n4) * ldb, ldb, + c + (mb * m_block_size + m4 + m2) * ldc + nb * n_block_size + n8 + n4, ldc, 1); + } + + //n=1 + if (n_leftover & 0x01) { + size_t n8 = n_leftover & 0x08 ? 8 : 0; + size_t n4 = n_leftover & 0x04 ? 4 : 0; + size_t n2 = n_leftover & 0x02 ? 2 : 0; + block1x1_kernel_avx2(k, a + (mb * m_block_size + m4 + m2) * lda, + b + (nb * n_block_size + n8 + n4 + n2) * ldb, + c + (mb * m_block_size + m4 + m2) * ldc + nb * n_block_size + n8 + n4 + n2); + } + } +} + +template <> +SaberStatus IntrinsicGemm< char, char, int >::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx) { + CHECK_EQ(trans_a, false) << "only support no trans"; + CHECK_EQ(trans_b, false) << "only support no trans"; + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? k : n; + _ldc = n; + _m = m; + _n = n; + _k = k; + _trans_a = trans_a ? 'T' : 'N'; + _trans_b = trans_b ? 'T' : 'N'; + return SaberSuccess; +} + +inline void block4x2_kernel_avx2_me( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + //printf("block4x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb0_h; + __m256i mb1_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc2 = 
_mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma1_h, mb0_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma1_h, mb1_h)); + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma2_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma2_h, mb1_h)); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma3_h, mb0_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma3_h, mb1_h)); + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + + pb0 += 32; + pb1 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16; + pb1 += 16; + } + + if (0x08 & k_leftover) { + //a + __m256i ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + __m256i mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + __m256i mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = 
_mm256_mullo_epi32(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + + pb0 += 8; + pb1 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1 * stride] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum2 = _mm256_hadd_epi32(sum2, zero); + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum2, 0); + pc1[1 * stride] = _mm256_extract_epi32(sum2, 1); + + //the 2 row + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum4, 0); + pc2[1 * stride] = _mm256_extract_epi32(sum4, 1); + + //the 3 row + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum6 = _mm256_hadd_epi32(sum6, zero); + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, zero, 0x31)); + + pc3[0] = _mm256_extract_epi32(sum6, 0); + pc3[1 * stride] = _mm256_extract_epi32(sum6, 1); +} +/** + * b must packed + */ +inline void avx_s8s8s32_gemm_2x4_packed( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + // LOG(INFO)<<"my code"; + const int m_block = 4; + const int n_block = 2; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + 
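// NOTE: only full 4x2 tiles of C are handled here; the CHECK_EQ guards below
+ // require m % 4 == 0 and n % 2 == 0, and B is expected in the packed layout
+ // noted above (each output column's k values stored contiguously). +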
CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block4x2_kernel_avx2_me(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc, 1); + } + } +} +template <> +SaberStatus IntrinsicGemm< char, char, int>::dispatch( + const float alpha, const float beta, + const char* ptr_a, const char* ptr_b, int* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_b != nullptr); + CHECK(ptr_c != nullptr); + // LOG(INFO)<<"chgemm_c_c_n_t_avx2 dispatch"; + // LOG(INFO)<<_m<<","<<_n<<","<<_k<<","<<","<<_lda<<","<<","<<_ldb<<","<<_ldc; + chgemm_c_c_n_t_avx2(_m, _n, _k, (int8_t*)ptr_a, _lda, (int8_t*)ptr_b, _ldb, ptr_c, _ldc); + // LOG(INFO)<<"chgemm_c_c_n_t_avx2 end"; + // avx_s8s8s32_gemm_2x4_packed(_m,_n,_k,ptr_a,_lda,ptr_b,_ldb,ptr_c,_ldc); + // exit(0); + return SaberSuccess; +} +#else + +template <> +SaberStatus IntrinsicGemm< char, char, int >::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx) { + LOG(FATAL)<<"not impl"; + return SaberSuccess; +} + +template <> +SaberStatus IntrinsicGemm< char, char, int>::dispatch( + const float alpha, const float beta, + const char* ptr_a, const char* ptr_b, int* ptr_c) { + LOG(FATAL)<<"not impl"; + return SaberSuccess; +} +#endif +} +} + diff --git a/saber/funcs/impl/x86/intrinsic_gemm.h b/saber/funcs/impl/x86/intrinsic_gemm.h new file mode 100644 index 000000000..501149a7e --- /dev/null +++ b/saber/funcs/impl/x86/intrinsic_gemm.h @@ -0,0 +1,46 @@ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_INTRINSIC_GEMM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_INTRINSIC_GEMM_H +#include "saber/core/tensor.h" +#include "saber/funcs/gemm.h" +namespace anakin { +namespace saber { + +template +class IntrinsicGemm { + +public: + IntrinsicGemm() = default; + ~IntrinsicGemm() {} + + SaberStatus init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx); + + SaberStatus dispatch(const float alpha, const float beta, + const inDtype_A* a, const inDtype_B* b, + outDtype* c); + +private: + int _m{-1}; + int _n{-1}; + int _k{-1}; + int _lda{-1}; + int _ldb{-1}; + int _ldc{-1}; + float _alpha{1.f}; + float _beta{0.f}; + char _trans_a{'N'}; + char _trans_b{'N'}; + char _offset_c_flag{'F'}; + int8_t _offset_a{0}; + int8_t _offset_b{0}; + int32_t _offset_c{0}; +}; + + +} +} + +#endif //ANAKIN_INTRINSIC_GEMM_H diff --git a/saber/funcs/impl/x86/intrinsic_packed_fc.cpp b/saber/funcs/impl/x86/intrinsic_packed_fc.cpp new file mode 100644 index 000000000..eb3e17277 --- /dev/null +++ b/saber/funcs/impl/x86/intrinsic_packed_fc.cpp @@ -0,0 +1,3788 @@ + +#include "saber/funcs/impl/x86/intrinsic_packed_fc.h" +#include +#include "jit_generator.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include "debug.h" +namespace anakin { +namespace saber { +namespace jit { + +#define USE_OMP_IN_INTRINSIC_PACKED_FC 0 + +#define GET_OFF(field) offsetof(jit_int8_packed_fc_call_t, field) +using namespace Xbyak; + +void jit_s8s8s32_packed_gemm::cal_one_block() { + /** + ma0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + */ + vpmovsxbw(a0, ptr[address_a_0]); + vpmovsxbw(a1, ptr[address_a_1]); + vpmovsxbw(b0, ptr[address_b_0]); + 
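// Sign-extend the remaining three 16-byte columns of this packed B block from int8 to int16. +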
vpmovsxbw(b1, ptr[address_b_1]); + vpmovsxbw(b2, ptr[address_b_2]); + vpmovsxbw(b3, ptr[address_b_3]); + /** + temp_0 = _mm256_madd_epi16(ma0, mb0); + temp_1 = _mm256_madd_epi16(ma1, mb0); + sum0 = _mm256_add_epi32(sum0, temp_0); + sum1 = _mm256_add_epi32(sum1, temp_1); + */ + + vpmaddwd(vtemp_0, a0, b0); + vpmaddwd(vtemp_1, a1, b0); + vpaddd(sum_row0_col0, vtemp_0, sum_row0_col0); + vpaddd(sum_row1_col0, vtemp_1, sum_row1_col0); + + add(address_a_0, reg_k_block_size); + add(address_a_1, reg_k_block_size); + add(address_b_0, reg_k_block_size); + add(address_b_1, reg_k_block_size); + add(address_b_2, reg_k_block_size); + add(address_b_3, reg_k_block_size); + + vpmaddwd(vtemp_0, a0, b1); + vpmaddwd(vtemp_1, a1, b1); + vpaddd(sum_row0_col1, vtemp_0, sum_row0_col1); + vpaddd(sum_row1_col1, vtemp_1, sum_row1_col1); + + + vpmaddwd(vtemp_0, a0, b2); + vpmaddwd(vtemp_1, a1, b2); + vpaddd(sum_row0_col2, vtemp_0, sum_row0_col2); + vpaddd(sum_row1_col2, vtemp_1, sum_row1_col2); + + + vpmaddwd(vtemp_0, a0, b3); + vpmaddwd(vtemp_1, a1, b3); + vpaddd(sum_row0_col3, vtemp_0, sum_row0_col3); + vpaddd(sum_row1_col3, vtemp_1, sum_row1_col3); + +} + +void jit_s8s8s32_packed_gemm::load_and_init() { + mov(reg_lda, ptr[this->param1 + GET_OFF(lda)]); + mov(reg_ldb, ptr[this->param1 + GET_OFF(ldb)]); + /** + * + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + */ + mov(address_a_0, reg_input); + mov(address_a_1, reg_input); + add(address_a_1, reg_lda); + mov(reg_ldc, ptr[this->param1 + GET_OFF(ldc)]); + + mov(address_b_0, reg_weights); + mov(address_b_1, reg_weights); + add(address_b_1, reg_ldb); + mov(address_b_2, address_b_1); + add(address_b_2, reg_ldb); + mov(address_b_3, address_b_2); + add(address_b_3, reg_ldb); + + vpxor(sum_row0_col0, sum_row0_col0, sum_row0_col0); + vpxor(sum_row1_col0, sum_row1_col0, sum_row1_col0); + vpxor(sum_row0_col1, sum_row0_col1, sum_row0_col1); + vpxor(sum_row1_col1, sum_row1_col1, sum_row1_col1); + vpxor(sum_row0_col2, sum_row0_col2, sum_row0_col2); + vpxor(sum_row1_col2, sum_row1_col2, sum_row1_col2); + vpxor(sum_row0_col3, sum_row0_col3, sum_row0_col3); + vpxor(sum_row1_col3, sum_row1_col3, sum_row1_col3); + +} + +void jit_s8s8s32_packed_gemm::reduction_and_store2mem() { + vpxor(zero_in_reduction, zero_in_reduction, zero_in_reduction); + /** + * + sum0 = _mm256_hadd_epi32(sum0, sum2); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum6); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc0[2] = _mm256_extract_epi32(sum4, 0); + pc0[3] = _mm256_extract_epi32(sum4, 1); + */ + + vphaddd(c_row0_col0_1, sum_row0_col0, sum_row0_col1); + vphaddd(c_row0_col0_1, c_row0_col0_1, zero_in_reduction); + vperm2i128(temp0_in_reduction, c_row0_col0_1, zero_in_reduction, 0x31); + vpaddd(c_row0_col0_1, temp0_in_reduction, c_row0_col0_1); + + + vphaddd(c_row0_col2_3, sum_row0_col2, sum_row0_col3); + vphaddd(c_row0_col2_3, c_row0_col2_3, zero_in_reduction); + vperm2i128(temp1_in_reduction, c_row0_col2_3, zero_in_reduction, 0x31); + vpaddd(c_row0_col2_3, temp1_in_reduction, c_row0_col2_3); + + vpermq(c_row0_col2_3, c_row0_col2_3, 0x00); + vpblendd(c_row0_col0_1_2_3, 
c_row0_col0_1, c_row0_col2_3, 0x0c); + movdqu(ptr[reg_output], c_row0_col0_1_2_3_m128); + /** + * + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum1, 0); + pc1[1] = _mm256_extract_epi32(sum1, 1); + + //the 3 row + sum5 = _mm256_hadd_epi32(sum5, sum7); + sum5 = _mm256_hadd_epi32(sum5, zero); + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, zero, 0x31)); + + pc1[2] = _mm256_extract_epi32(sum5, 0); + pc1[3] = _mm256_extract_epi32(sum5, 1); + */ + + vphaddd(c_row1_col0_1, sum_row1_col0, sum_row1_col1); + vphaddd(c_row1_col0_1, c_row1_col0_1, zero_in_reduction); + vperm2i128(temp2_in_reduction, c_row1_col0_1, zero_in_reduction, 0x31); + vpaddd(c_row1_col0_1, temp2_in_reduction, c_row1_col0_1); + + vphaddd(c_row1_col2_3, sum_row1_col2, sum_row1_col3); + vphaddd(c_row1_col2_3, c_row1_col2_3, zero_in_reduction); + vperm2i128(temp3_in_reduction, c_row1_col2_3, zero_in_reduction, 0x31); + vpaddd(c_row1_col2_3, temp3_in_reduction, c_row1_col2_3); + + + vpermq(c_row1_col2_3, c_row1_col2_3, 0x00); + vpblendd(c_row1_col0_1_2_3, c_row1_col0_1, c_row1_col2_3, 0x0c); + + mov(rax, 4); + mul(reg_ldc); + add(reg_output, rax); + movdqu(ptr[reg_output], c_row1_col0_1_2_3_m128); +} + +/*void jit_s8s8s32_packed_gemm::generate() { + this->preamble(); + mov(reg_input, ptr[this->param1 + GET_OFF(src)]); + mov(reg_weights, ptr[this->param1 + GET_OFF(weights)]); + mov(reg_output, ptr[this->param1 + GET_OFF(output_data)]); + mov(reg_k_block_num, ptr[this->param1 + GET_OFF(k_block)]); + mov(reg_k_block_size, aligned_length); + + load_and_init(); + + L("FOR_01"); + cal_one_block(); + + dec(reg_k_block_num); + jnz("FOR_01"); + + reduction_and_store2mem(); + + this->postamble(); +}*/ + + +void jit_s8s8s32_packed_gemm::generate() { + this->preamble(); + mov(reg_input, ptr[this->param1 + GET_OFF(src)]); + mov(reg_weights, ptr[this->param1 + GET_OFF(weights)]); + mov(reg_output, ptr[this->param1 + GET_OFF(output_data)]); + mov(reg_k_block_num, ptr[this->param1 + GET_OFF(k_block)]); + mov(reg_k_block_size, aligned_length); + + mov(reg_lda, ptr[this->param1 + GET_OFF(lda)]); + mov(reg_ldb, ptr[this->param1 + GET_OFF(ldb)]); + + mov(address_a_0, reg_input); + vpmovsxbw(a0, ptr[address_a_0]); + mov(address_a_1, reg_input); + add(address_a_1, reg_lda); + mov(reg_ldc, ptr[this->param1 + GET_OFF(ldc)]); + vpmovsxbw(a1, ptr[address_a_1]); + + mov(address_b_0, reg_weights); + vpmovsxbw(b0, ptr[address_b_0]); + mov(address_b_1, reg_weights); + add(address_b_1, reg_ldb); + vpmovsxbw(b1, ptr[address_b_1]); + mov(address_b_2, address_b_1); + add(address_b_2, reg_ldb); + vpmovsxbw(b2, ptr[address_b_2]); + mov(address_b_3, address_b_2); + add(address_b_3, reg_ldb); + vpmovsxbw(b3, ptr[address_b_3]); + + vpxor(sum_row0_col0, sum_row0_col0, sum_row0_col0); + vmovdqa(sum_row1_col0, sum_row0_col0); + vmovdqa(sum_row0_col1, sum_row0_col0); + vmovdqa(sum_row1_col1, sum_row0_col0); + vmovdqa(sum_row0_col2, sum_row0_col0); + vmovdqa(sum_row1_col2, sum_row0_col0); + vmovdqa(sum_row0_col3, sum_row0_col0); + vmovdqa(sum_row1_col3, sum_row0_col0); + + // LOG(INFO)<<"jcp.k_block_number "<postamble(); +} + + +} +} +} + +namespace anakin { +namespace saber { + +#if defined(__AVX2__) + +inline __m256i load_int8_to_int16(const void* ptr) { + return _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) ptr)); +} +inline void load_2int16_madd(const int& epi16x2, const __m256i& b, __m256i& c) { + c = 
_mm256_add_epi32(c, _mm256_madd_epi16(_mm256_set1_epi32(epi16x2), b)); +} + +void packed_weights_k2(Tensor& inner_tensor, const Tensor& weights_tensor, const int n, + const int k, int slice_n) { + CHECK_EQ(weights_tensor.get_dtype(), AK_INT8); + CHECK_EQ(k % 2, 0) << "only support k % 16 = 0"; + CHECK_EQ(n % slice_n, 0) << "only support n % 8 = 0"; + const int new_row = n / slice_n; + const int new_col = k * slice_n; + inner_tensor.re_alloc(Shape({1, 1, new_row, new_col}), weights_tensor.get_dtype()); + const int8_t* in_ptr = static_cast(weights_tensor.data()); + int8_t* out_ptr = static_cast(inner_tensor.data()); + + for (int row = 0; row < k; row++) { + for (int col = 0; col < n; col++) { + int out_row = col / slice_n; + int slice_id = row / 2; + int slice_inner_id_0 = row % 2; + int slice_inner_id_1 = col % slice_n; + int output_index = out_row * new_col + slice_id * 2 * slice_n + slice_inner_id_1 * 2 + + slice_inner_id_0; + int input_index = row * n + col; + out_ptr[output_index] = in_ptr[input_index]; + } + } + + Tensortemp_tensor = weights_tensor; +} + +void packed_weights_k2_split_k(Tensor& inner_tensor, const Tensor& weights_tensor, + const int n, const int k, int slice_n, int slice_n_inner_length) { + CHECK_EQ(weights_tensor.get_dtype(), AK_INT8); + CHECK_EQ(k % (2 * 8), 0) << "only support k % 16 = 0"; + CHECK_EQ(n % 8, 0) << "only support n % 8 = 0"; + const int new_row = n / slice_n; + const int new_col = k * slice_n; + inner_tensor.re_alloc(Shape({1, 1, new_row, new_col}), weights_tensor.get_dtype()); + const int8_t* in_ptr = static_cast(weights_tensor.data()); + int8_t* out_ptr = static_cast(inner_tensor.data()); + + for (int row = 0; row < k; row++) { + for (int col = 0; col < n; col++) { + int out_row = col / slice_n; + int slice_id = row / 2; + int slice_inner_id_0 = row % 2; + int slice_inner_id_1 = col % slice_n; + int output_index = out_row * new_col + slice_id * 2 * slice_n + slice_inner_id_1 * 2 + + slice_inner_id_0; + int input_index = row * n + col; + out_ptr[output_index] = in_ptr[input_index]; + } + } + + Tensortemp_tensor = weights_tensor; +} + +void packed_weights_transpose_k(Tensor& inner_tensor, const Tensor& weights_tensor, + const int n, const int k, + const int n_slice, const int k_slice) { + CHECK_EQ(weights_tensor.get_dtype(), AK_INT8); + CHECK_EQ(k % 16, 0) << "only support k % 16 = 0"; + CHECK_EQ(n % n_slice, 0) << "only support n % 8 = 0"; + const int new_row = n / n_slice; + const int new_col = k * n_slice; + inner_tensor.re_alloc(Shape({1, 1, new_row, new_col}), weights_tensor.get_dtype()); + const int8_t* in_ptr = static_cast(weights_tensor.data()); + int8_t* out_ptr = static_cast(inner_tensor.data()); + + for (int row = 0; row < k; row++) { + for (int col = 0; col < n; col++) { + int out_row = col / n_slice; + int slice_id = row / k_slice; + int slice_inner_id_0 = row % k_slice; + int slice_inner_id_1 = col % n_slice; + int output_index = out_row * new_col + slice_id * k_slice * n_slice + slice_inner_id_1 * k_slice + + slice_inner_id_0; + int input_index = row * n + col; + out_ptr[output_index] = in_ptr[input_index]; + } + } +} + +void block4x128_kernel_avx2_me( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * 16; + const int8_t* pb2 = pb0 + 2 * 16; + const int8_t* pb3 = pb0 + 3 * 
16; + const int8_t* pb4 = pb0 + 4 * 16; + const int8_t* pb5 = pb0 + 5 * 16; + const int8_t* pb6 = pb0 + 6 * 16; + const int8_t* pb7 = pb0 + 7 * 16; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 4; + size_t k_leftover = k - (nk << 4); + + + __m256i c0 = _mm256_setzero_si256(); + __m256i c1 = _mm256_setzero_si256(); + __m256i c2 = _mm256_setzero_si256(); + __m256i c3 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + const __m256i b0 = load_int8_to_int16(pb0); + const __m256i b1 = load_int8_to_int16(pb1); + const __m256i b2 = load_int8_to_int16(pb2); + const __m256i b3 = load_int8_to_int16(pb3); + const __m256i b4 = load_int8_to_int16(pb4); + const __m256i b5 = load_int8_to_int16(pb5); + const __m256i b6 = load_int8_to_int16(pb6); + const __m256i b7 = load_int8_to_int16(pb7); + + const __v8si a0 = (__v8si)load_int8_to_int16(pa0); + const __v8si a1 = (__v8si)load_int8_to_int16(pa1); + const __v8si a2 = (__v8si)load_int8_to_int16(pa2); + const __v8si a3 = (__v8si)load_int8_to_int16(pa3); + + load_2int16_madd(a0[0], b0, c0); + load_2int16_madd(a0[1], b1, c0); + load_2int16_madd(a0[2], b2, c0); + load_2int16_madd(a0[3], b3, c0); + load_2int16_madd(a0[4], b4, c0); + load_2int16_madd(a0[5], b5, c0); + load_2int16_madd(a0[6], b6, c0); + load_2int16_madd(a0[7], b7, c0); + + load_2int16_madd(a1[0], b0, c1); + load_2int16_madd(a1[1], b1, c1); + load_2int16_madd(a1[2], b2, c1); + load_2int16_madd(a1[3], b3, c1); + load_2int16_madd(a1[4], b4, c1); + load_2int16_madd(a1[5], b5, c1); + load_2int16_madd(a1[6], b6, c1); + load_2int16_madd(a1[7], b7, c1); + + load_2int16_madd(a2[0], b0, c2); + load_2int16_madd(a2[1], b1, c2); + load_2int16_madd(a2[2], b2, c2); + load_2int16_madd(a2[3], b3, c2); + load_2int16_madd(a2[4], b4, c2); + load_2int16_madd(a2[5], b5, c2); + load_2int16_madd(a2[6], b6, c2); + load_2int16_madd(a2[7], b7, c2); + + load_2int16_madd(a3[0], b0, c3); + load_2int16_madd(a3[1], b1, c3); + load_2int16_madd(a3[2], b2, c3); + load_2int16_madd(a3[3], b3, c3); + load_2int16_madd(a3[4], b4, c3); + load_2int16_madd(a3[5], b5, c3); + load_2int16_madd(a3[6], b6, c3); + load_2int16_madd(a3[7], b7, c3); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16 * 8; + pb1 += 16 * 8; + pb2 += 16 * 8; + pb3 += 16 * 8; + pb4 += 16 * 8; + pb5 += 16 * 8; + pb6 += 16 * 8; + pb7 += 16 * 8; + + } + + _mm256_storeu_si256((__m256i*)pc0, c0); + _mm256_storeu_si256((__m256i*)pc1, c1); + _mm256_storeu_si256((__m256i*)pc2, c2); + _mm256_storeu_si256((__m256i*)pc3, c3); +} +void block_mx8_kernel_avx2_me(const int32_t m, + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * 16; + const int8_t* pb2 = pb0 + 2 * 16; + const int8_t* pb3 = pb0 + 3 * 16; + const int8_t* pb4 = pb0 + 4 * 16; + const int8_t* pb5 = pb0 + 5 * 16; + const int8_t* pb6 = pb0 + 6 * 16; + const int8_t* pb7 = pb0 + 7 * 16; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 4; + size_t k_leftover = k - (nk << 4); + + + __m256i c0 = _mm256_setzero_si256(); + __m256i c1 = _mm256_setzero_si256(); + __m256i c2 = _mm256_setzero_si256(); + __m256i c3 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + const __m256i b0 = load_int8_to_int16(pb0); + const __m256i b1 = load_int8_to_int16(pb1); + const __m256i b2 = load_int8_to_int16(pb2); + const __m256i b3 = 
load_int8_to_int16(pb3); + const __m256i b4 = load_int8_to_int16(pb4); + const __m256i b5 = load_int8_to_int16(pb5); + const __m256i b6 = load_int8_to_int16(pb6); + const __m256i b7 = load_int8_to_int16(pb7); +#pragma unroll + + for (int m_index = 0; m_index < m; m_index++) { + if (k == 0) { + __m256i c0 = _mm256_setzero_si256(); + const __v8si a0 = (__v8si)load_int8_to_int16(a + m_index * lda + k * 16); + load_2int16_madd(a0[0], b0, c0); + load_2int16_madd(a0[1], b1, c0); + load_2int16_madd(a0[2], b2, c0); + load_2int16_madd(a0[3], b3, c0); + load_2int16_madd(a0[4], b4, c0); + load_2int16_madd(a0[5], b5, c0); + load_2int16_madd(a0[6], b6, c0); + load_2int16_madd(a0[7], b7, c0); + _mm256_storeu_si256((__m256i*)(c + m_index * ldc), c0); + } else { + __m256i c0 = _mm256_loadu_si256((__m256i*)(c + m_index * ldc)); + const __v8si a0 = (__v8si)load_int8_to_int16(a + m_index * lda + k * 16); + load_2int16_madd(a0[0], b0, c0); + load_2int16_madd(a0[1], b1, c0); + load_2int16_madd(a0[2], b2, c0); + load_2int16_madd(a0[3], b3, c0); + load_2int16_madd(a0[4], b4, c0); + load_2int16_madd(a0[5], b5, c0); + load_2int16_madd(a0[6], b6, c0); + load_2int16_madd(a0[7], b7, c0); + _mm256_storeu_si256((__m256i*)(c + m_index * ldc), c0); + } + } + + pb0 += 16 * 8; + pb1 += 16 * 8; + pb2 += 16 * 8; + pb3 += 16 * 8; + pb4 += 16 * 8; + pb5 += 16 * 8; + pb6 += 16 * 8; + pb7 += 16 * 8; + + } +} +void block4x8_kernel_avx2_me( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * 16; + const int8_t* pb2 = pb0 + 2 * 16; + const int8_t* pb3 = pb0 + 3 * 16; + const int8_t* pb4 = pb0 + 4 * 16; + const int8_t* pb5 = pb0 + 5 * 16; + const int8_t* pb6 = pb0 + 6 * 16; + const int8_t* pb7 = pb0 + 7 * 16; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 4; + size_t k_leftover = k - (nk << 4); + + + __m256i c0 = _mm256_setzero_si256(); + __m256i c1 = _mm256_setzero_si256(); + __m256i c2 = _mm256_setzero_si256(); + __m256i c3 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + const __m256i b0 = load_int8_to_int16(pb0); + const __m256i b1 = load_int8_to_int16(pb1); + const __m256i b2 = load_int8_to_int16(pb2); + const __m256i b3 = load_int8_to_int16(pb3); + const __m256i b4 = load_int8_to_int16(pb4); + const __m256i b5 = load_int8_to_int16(pb5); + const __m256i b6 = load_int8_to_int16(pb6); + const __m256i b7 = load_int8_to_int16(pb7); + + const __v8si a0 = (__v8si)load_int8_to_int16(pa0); + const __v8si a1 = (__v8si)load_int8_to_int16(pa1); + const __v8si a2 = (__v8si)load_int8_to_int16(pa2); + const __v8si a3 = (__v8si)load_int8_to_int16(pa3); + + load_2int16_madd(a0[0], b0, c0); + load_2int16_madd(a0[1], b1, c0); + load_2int16_madd(a0[2], b2, c0); + load_2int16_madd(a0[3], b3, c0); + load_2int16_madd(a0[4], b4, c0); + load_2int16_madd(a0[5], b5, c0); + load_2int16_madd(a0[6], b6, c0); + load_2int16_madd(a0[7], b7, c0); + + load_2int16_madd(a1[0], b0, c1); + load_2int16_madd(a1[1], b1, c1); + load_2int16_madd(a1[2], b2, c1); + load_2int16_madd(a1[3], b3, c1); + load_2int16_madd(a1[4], b4, c1); + load_2int16_madd(a1[5], b5, c1); + load_2int16_madd(a1[6], b6, c1); + load_2int16_madd(a1[7], b7, c1); + + load_2int16_madd(a2[0], b0, c2); + load_2int16_madd(a2[1], b1, c2); + 
load_2int16_madd(a2[2], b2, c2); + load_2int16_madd(a2[3], b3, c2); + load_2int16_madd(a2[4], b4, c2); + load_2int16_madd(a2[5], b5, c2); + load_2int16_madd(a2[6], b6, c2); + load_2int16_madd(a2[7], b7, c2); + + load_2int16_madd(a3[0], b0, c3); + load_2int16_madd(a3[1], b1, c3); + load_2int16_madd(a3[2], b2, c3); + load_2int16_madd(a3[3], b3, c3); + load_2int16_madd(a3[4], b4, c3); + load_2int16_madd(a3[5], b5, c3); + load_2int16_madd(a3[6], b6, c3); + load_2int16_madd(a3[7], b7, c3); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16 * 8; + pb1 += 16 * 8; + pb2 += 16 * 8; + pb3 += 16 * 8; + pb4 += 16 * 8; + pb5 += 16 * 8; + pb6 += 16 * 8; + pb7 += 16 * 8; + + } + + _mm256_storeu_si256((__m256i*)pc0, c0); + _mm256_storeu_si256((__m256i*)pc1, c1); + _mm256_storeu_si256((__m256i*)pc2, c2); + _mm256_storeu_si256((__m256i*)pc3, c3); +} + +void block4x8_kernel_avx2_k2( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * 16; + const int8_t* pb2 = pb0 + 2 * 16; + const int8_t* pb3 = pb0 + 3 * 16; + const int8_t* pb4 = pb0 + 4 * 16; + const int8_t* pb5 = pb0 + 5 * 16; + const int8_t* pb6 = pb0 + 6 * 16; + const int8_t* pb7 = pb0 + 7 * 16; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 4; + size_t k_leftover = k - (nk << 4); + + + __m256i c0 = _mm256_setzero_si256(); + __m256i c1 = _mm256_setzero_si256(); + __m256i c2 = _mm256_setzero_si256(); + __m256i c3 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + const __m256i b0 = load_int8_to_int16(pb0); + const __m256i b1 = load_int8_to_int16(pb1); + const __m256i b2 = load_int8_to_int16(pb2); + const __m256i b3 = load_int8_to_int16(pb3); + const __m256i b4 = load_int8_to_int16(pb4); + const __m256i b5 = load_int8_to_int16(pb5); + const __m256i b6 = load_int8_to_int16(pb6); + const __m256i b7 = load_int8_to_int16(pb7); + + const __v8si a0 = (__v8si)load_int8_to_int16(pa0); + const __v8si a1 = (__v8si)load_int8_to_int16(pa1); + const __v8si a2 = (__v8si)load_int8_to_int16(pa2); + const __v8si a3 = (__v8si)load_int8_to_int16(pa3); + + + + load_2int16_madd(a0[0], b0, c0); + load_2int16_madd(a0[1], b1, c0); + load_2int16_madd(a0[2], b2, c0); + load_2int16_madd(a0[3], b3, c0); + load_2int16_madd(a0[4], b4, c0); + load_2int16_madd(a0[5], b5, c0); + load_2int16_madd(a0[6], b6, c0); + load_2int16_madd(a0[7], b7, c0); + + load_2int16_madd(a1[0], b0, c1); + load_2int16_madd(a1[1], b1, c1); + load_2int16_madd(a1[2], b2, c1); + load_2int16_madd(a1[3], b3, c1); + load_2int16_madd(a1[4], b4, c1); + load_2int16_madd(a1[5], b5, c1); + load_2int16_madd(a1[6], b6, c1); + load_2int16_madd(a1[7], b7, c1); + + load_2int16_madd(a2[0], b0, c2); + load_2int16_madd(a2[1], b1, c2); + load_2int16_madd(a2[2], b2, c2); + load_2int16_madd(a2[3], b3, c2); + load_2int16_madd(a2[4], b4, c2); + load_2int16_madd(a2[5], b5, c2); + load_2int16_madd(a2[6], b6, c2); + load_2int16_madd(a2[7], b7, c2); + + load_2int16_madd(a3[0], b0, c3); + load_2int16_madd(a3[1], b1, c3); + load_2int16_madd(a3[2], b2, c3); + load_2int16_madd(a3[3], b3, c3); + load_2int16_madd(a3[4], b4, c3); + load_2int16_madd(a3[5], b5, c3); + load_2int16_madd(a3[6], b6, c3); + load_2int16_madd(a3[7], b7, c3); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 
16; + + pb0 += 16 * 8; + pb1 += 16 * 8; + pb2 += 16 * 8; + pb3 += 16 * 8; + pb4 += 16 * 8; + pb5 += 16 * 8; + pb6 += 16 * 8; + pb7 += 16 * 8; + + } + + _mm256_storeu_si256((__m256i*)pc0, c0); + _mm256_storeu_si256((__m256i*)pc1, c1); + _mm256_storeu_si256((__m256i*)pc2, c2); + _mm256_storeu_si256((__m256i*)pc3, c3); +} + +void block4x64_kernel_avx2_split_k( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * 16; + const int8_t* pb2 = pb0 + 2 * 16; + const int8_t* pb3 = pb0 + 3 * 16; + const int8_t* pb4 = pb0 + 4 * 16; + const int8_t* pb5 = pb0 + 5 * 16; + const int8_t* pb6 = pb0 + 6 * 16; + const int8_t* pb7 = pb0 + 7 * 16; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + + size_t nk = k >> 4; + size_t k_leftover = k - (nk << 4); + + __m256i c0_0 = _mm256_setzero_si256(); + __m256i c0_1 = _mm256_setzero_si256(); + __m256i c0_2 = _mm256_setzero_si256(); + __m256i c0_3 = _mm256_setzero_si256(); + __m256i c0_4 = _mm256_setzero_si256(); + __m256i c0_5 = _mm256_setzero_si256(); + __m256i c0_6 = _mm256_setzero_si256(); + __m256i c0_7 = _mm256_setzero_si256(); + __m256i c1_0 = _mm256_setzero_si256(); + __m256i c1_1 = _mm256_setzero_si256(); + __m256i c1_2 = _mm256_setzero_si256(); + __m256i c1_3 = _mm256_setzero_si256(); + __m256i c1_4 = _mm256_setzero_si256(); + __m256i c1_5 = _mm256_setzero_si256(); + __m256i c1_6 = _mm256_setzero_si256(); + __m256i c1_7 = _mm256_setzero_si256(); + __m256i c2_0 = _mm256_setzero_si256(); + __m256i c2_1 = _mm256_setzero_si256(); + __m256i c2_2 = _mm256_setzero_si256(); + __m256i c2_3 = _mm256_setzero_si256(); + __m256i c2_4 = _mm256_setzero_si256(); + __m256i c2_5 = _mm256_setzero_si256(); + __m256i c2_6 = _mm256_setzero_si256(); + __m256i c2_7 = _mm256_setzero_si256(); + __m256i c3_0 = _mm256_setzero_si256(); + __m256i c3_1 = _mm256_setzero_si256(); + __m256i c3_2 = _mm256_setzero_si256(); + __m256i c3_3 = _mm256_setzero_si256(); + __m256i c3_4 = _mm256_setzero_si256(); + __m256i c3_5 = _mm256_setzero_si256(); + __m256i c3_6 = _mm256_setzero_si256(); + __m256i c3_7 = _mm256_setzero_si256(); + + + for (size_t k = 0; k < nk; ++k) { + + __v8si a0 = (__v8si)load_int8_to_int16(pa0); + __v8si a1 = (__v8si)load_int8_to_int16(pa1); + __v8si a2 = (__v8si)load_int8_to_int16(pa2); + __v8si a3 = (__v8si)load_int8_to_int16(pa3); + + // short* a0=(short*)pa0; + // short* a1=(short*)pa1; + // short* a2=(short*)pa2; + // short* a3=(short*)pa3; + + __m256i b0 = load_int8_to_int16(pb0); + __m256i b1 = load_int8_to_int16(pb1); + __m256i b2 = load_int8_to_int16(pb2); + __m256i b3 = load_int8_to_int16(pb3); + __m256i b4 = load_int8_to_int16(pb4); + __m256i b5 = load_int8_to_int16(pb5); + __m256i b6 = load_int8_to_int16(pb6); + __m256i b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[0], b0, c0_0); + load_2int16_madd(a0[0], b1, c0_1); + load_2int16_madd(a0[0], b2, c0_2); + load_2int16_madd(a0[0], b3, c0_3); + load_2int16_madd(a0[0], b4, c0_4); + load_2int16_madd(a0[0], b5, c0_5); + load_2int16_madd(a0[0], b6, c0_6); + load_2int16_madd(a0[0], b7, c0_7); + + load_2int16_madd(a1[0], b0, c1_0); + load_2int16_madd(a1[0], b1, c1_1); + load_2int16_madd(a1[0], b2, c1_2); + load_2int16_madd(a1[0], b3, c1_3); + load_2int16_madd(a1[0], b4, c1_4); + load_2int16_madd(a1[0], b5, 
c1_5); + load_2int16_madd(a1[0], b6, c1_6); + load_2int16_madd(a1[0], b7, c1_7); + + load_2int16_madd(a2[0], b0, c2_0); + load_2int16_madd(a2[0], b1, c2_1); + load_2int16_madd(a2[0], b2, c2_2); + load_2int16_madd(a2[0], b3, c2_3); + load_2int16_madd(a2[0], b4, c2_4); + load_2int16_madd(a2[0], b5, c2_5); + load_2int16_madd(a2[0], b6, c2_6); + load_2int16_madd(a2[0], b7, c2_7); + + load_2int16_madd(a3[0], b0, c3_0); + load_2int16_madd(a3[0], b1, c3_1); + load_2int16_madd(a3[0], b2, c3_2); + load_2int16_madd(a3[0], b3, c3_3); + load_2int16_madd(a3[0], b4, c3_4); + load_2int16_madd(a3[0], b5, c3_5); + load_2int16_madd(a3[0], b6, c3_6); + load_2int16_madd(a3[0], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 * 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + b0 = load_int8_to_int16(pb0); + b1 = load_int8_to_int16(pb1); + b2 = load_int8_to_int16(pb2); + b3 = load_int8_to_int16(pb3); + b4 = load_int8_to_int16(pb4); + b5 = load_int8_to_int16(pb5); + b6 = load_int8_to_int16(pb6); + b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[1], b0, c0_0); + load_2int16_madd(a0[1], b1, c0_1); + load_2int16_madd(a0[1], b2, c0_2); + load_2int16_madd(a0[1], b3, c0_3); + load_2int16_madd(a0[1], b4, c0_4); + load_2int16_madd(a0[1], b5, c0_5); + load_2int16_madd(a0[1], b6, c0_6); + load_2int16_madd(a0[1], b7, c0_7); + + load_2int16_madd(a1[1], b0, c1_0); + load_2int16_madd(a1[1], b1, c1_1); + load_2int16_madd(a1[1], b2, c1_2); + load_2int16_madd(a1[1], b3, c1_3); + load_2int16_madd(a1[1], b4, c1_4); + load_2int16_madd(a1[1], b5, c1_5); + load_2int16_madd(a1[1], b6, c1_6); + load_2int16_madd(a1[1], b7, c1_7); + + load_2int16_madd(a2[1], b0, c2_0); + load_2int16_madd(a2[1], b1, c2_1); + load_2int16_madd(a2[1], b2, c2_2); + load_2int16_madd(a2[1], b3, c2_3); + load_2int16_madd(a2[1], b4, c2_4); + load_2int16_madd(a2[1], b5, c2_5); + load_2int16_madd(a2[1], b6, c2_6); + load_2int16_madd(a2[1], b7, c2_7); + + load_2int16_madd(a3[1], b0, c3_0); + load_2int16_madd(a3[1], b1, c3_1); + load_2int16_madd(a3[1], b2, c3_2); + load_2int16_madd(a3[1], b3, c3_3); + load_2int16_madd(a3[1], b4, c3_4); + load_2int16_madd(a3[1], b5, c3_5); + load_2int16_madd(a3[1], b6, c3_6); + load_2int16_madd(a3[1], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 * 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + b0 = load_int8_to_int16(pb0); + b1 = load_int8_to_int16(pb1); + b2 = load_int8_to_int16(pb2); + b3 = load_int8_to_int16(pb3); + b4 = load_int8_to_int16(pb4); + b5 = load_int8_to_int16(pb5); + b6 = load_int8_to_int16(pb6); + b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[2], b0, c0_0); + load_2int16_madd(a0[2], b1, c0_1); + load_2int16_madd(a0[2], b2, c0_2); + load_2int16_madd(a0[2], b3, c0_3); + load_2int16_madd(a0[2], b4, c0_4); + load_2int16_madd(a0[2], b5, c0_5); + load_2int16_madd(a0[2], b6, c0_6); + load_2int16_madd(a0[2], b7, c0_7); + + load_2int16_madd(a1[2], b0, c1_0); + load_2int16_madd(a1[2], b1, c1_1); + load_2int16_madd(a1[2], b2, c1_2); + load_2int16_madd(a1[2], b3, c1_3); + load_2int16_madd(a1[2], b4, c1_4); + load_2int16_madd(a1[2], b5, c1_5); + load_2int16_madd(a1[2], b6, c1_6); + load_2int16_madd(a1[2], b7, c1_7); + + load_2int16_madd(a2[2], b0, c2_0); + load_2int16_madd(a2[2], b1, c2_1); + load_2int16_madd(a2[2], b2, c2_2); + load_2int16_madd(a2[2], b3, c2_3); + load_2int16_madd(a2[2], b4, c2_4); + load_2int16_madd(a2[2], b5, c2_5); + load_2int16_madd(a2[2], b6, c2_6); + load_2int16_madd(a2[2], b7, c2_7); 
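+ // Row 3 of the 4-row tile (a3) accumulates against the same B panel for this
+ // k pair before the pb pointers advance to the next 8x16-byte panel.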
+ + load_2int16_madd(a3[2], b0, c3_0); + load_2int16_madd(a3[2], b1, c3_1); + load_2int16_madd(a3[2], b2, c3_2); + load_2int16_madd(a3[2], b3, c3_3); + load_2int16_madd(a3[2], b4, c3_4); + load_2int16_madd(a3[2], b5, c3_5); + load_2int16_madd(a3[2], b6, c3_6); + load_2int16_madd(a3[2], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 * 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + b0 = load_int8_to_int16(pb0); + b1 = load_int8_to_int16(pb1); + b2 = load_int8_to_int16(pb2); + b3 = load_int8_to_int16(pb3); + b4 = load_int8_to_int16(pb4); + b5 = load_int8_to_int16(pb5); + b6 = load_int8_to_int16(pb6); + b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[3], b0, c0_0); + load_2int16_madd(a0[3], b1, c0_1); + load_2int16_madd(a0[3], b2, c0_2); + load_2int16_madd(a0[3], b3, c0_3); + load_2int16_madd(a0[3], b4, c0_4); + load_2int16_madd(a0[3], b5, c0_5); + load_2int16_madd(a0[3], b6, c0_6); + load_2int16_madd(a0[3], b7, c0_7); + + load_2int16_madd(a1[3], b0, c1_0); + load_2int16_madd(a1[3], b1, c1_1); + load_2int16_madd(a1[3], b2, c1_2); + load_2int16_madd(a1[3], b3, c1_3); + load_2int16_madd(a1[3], b4, c1_4); + load_2int16_madd(a1[3], b5, c1_5); + load_2int16_madd(a1[3], b6, c1_6); + load_2int16_madd(a1[3], b7, c1_7); + + load_2int16_madd(a2[3], b0, c2_0); + load_2int16_madd(a2[3], b1, c2_1); + load_2int16_madd(a2[3], b2, c2_2); + load_2int16_madd(a2[3], b3, c2_3); + load_2int16_madd(a2[3], b4, c2_4); + load_2int16_madd(a2[3], b5, c2_5); + load_2int16_madd(a2[3], b6, c2_6); + load_2int16_madd(a2[3], b7, c2_7); + + load_2int16_madd(a3[3], b0, c3_0); + load_2int16_madd(a3[3], b1, c3_1); + load_2int16_madd(a3[3], b2, c3_2); + load_2int16_madd(a3[3], b3, c3_3); + load_2int16_madd(a3[3], b4, c3_4); + load_2int16_madd(a3[3], b5, c3_5); + load_2int16_madd(a3[3], b6, c3_6); + load_2int16_madd(a3[3], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 * 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + b0 = load_int8_to_int16(pb0); + b1 = load_int8_to_int16(pb1); + b2 = load_int8_to_int16(pb2); + b3 = load_int8_to_int16(pb3); + b4 = load_int8_to_int16(pb4); + b5 = load_int8_to_int16(pb5); + b6 = load_int8_to_int16(pb6); + b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[4], b0, c0_0); + load_2int16_madd(a0[4], b1, c0_1); + load_2int16_madd(a0[4], b2, c0_2); + load_2int16_madd(a0[4], b3, c0_3); + load_2int16_madd(a0[4], b4, c0_4); + load_2int16_madd(a0[4], b5, c0_5); + load_2int16_madd(a0[4], b6, c0_6); + load_2int16_madd(a0[4], b7, c0_7); + + load_2int16_madd(a1[4], b0, c1_0); + load_2int16_madd(a1[4], b1, c1_1); + load_2int16_madd(a1[4], b2, c1_2); + load_2int16_madd(a1[4], b3, c1_3); + load_2int16_madd(a1[4], b4, c1_4); + load_2int16_madd(a1[4], b5, c1_5); + load_2int16_madd(a1[4], b6, c1_6); + load_2int16_madd(a1[4], b7, c1_7); + + load_2int16_madd(a2[4], b0, c2_0); + load_2int16_madd(a2[4], b1, c2_1); + load_2int16_madd(a2[4], b2, c2_2); + load_2int16_madd(a2[4], b3, c2_3); + load_2int16_madd(a2[4], b4, c2_4); + load_2int16_madd(a2[4], b5, c2_5); + load_2int16_madd(a2[4], b6, c2_6); + load_2int16_madd(a2[4], b7, c2_7); + + load_2int16_madd(a3[4], b0, c3_0); + load_2int16_madd(a3[4], b1, c3_1); + load_2int16_madd(a3[4], b2, c3_2); + load_2int16_madd(a3[4], b3, c3_3); + load_2int16_madd(a3[4], b4, c3_4); + load_2int16_madd(a3[4], b5, c3_5); + load_2int16_madd(a3[4], b6, c3_6); + load_2int16_madd(a3[4], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 
* 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + b0 = load_int8_to_int16(pb0); + b1 = load_int8_to_int16(pb1); + b2 = load_int8_to_int16(pb2); + b3 = load_int8_to_int16(pb3); + b4 = load_int8_to_int16(pb4); + b5 = load_int8_to_int16(pb5); + b6 = load_int8_to_int16(pb6); + b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[5], b0, c0_0); + load_2int16_madd(a0[5], b1, c0_1); + load_2int16_madd(a0[5], b2, c0_2); + load_2int16_madd(a0[5], b3, c0_3); + load_2int16_madd(a0[5], b4, c0_4); + load_2int16_madd(a0[5], b5, c0_5); + load_2int16_madd(a0[5], b6, c0_6); + load_2int16_madd(a0[5], b7, c0_7); + + load_2int16_madd(a1[5], b0, c1_0); + load_2int16_madd(a1[5], b1, c1_1); + load_2int16_madd(a1[5], b2, c1_2); + load_2int16_madd(a1[5], b3, c1_3); + load_2int16_madd(a1[5], b4, c1_4); + load_2int16_madd(a1[5], b5, c1_5); + load_2int16_madd(a1[5], b6, c1_6); + load_2int16_madd(a1[5], b7, c1_7); + + load_2int16_madd(a2[5], b0, c2_0); + load_2int16_madd(a2[5], b1, c2_1); + load_2int16_madd(a2[5], b2, c2_2); + load_2int16_madd(a2[5], b3, c2_3); + load_2int16_madd(a2[5], b4, c2_4); + load_2int16_madd(a2[5], b5, c2_5); + load_2int16_madd(a2[5], b6, c2_6); + load_2int16_madd(a2[5], b7, c2_7); + + load_2int16_madd(a3[5], b0, c3_0); + load_2int16_madd(a3[5], b1, c3_1); + load_2int16_madd(a3[5], b2, c3_2); + load_2int16_madd(a3[5], b3, c3_3); + load_2int16_madd(a3[5], b4, c3_4); + load_2int16_madd(a3[5], b5, c3_5); + load_2int16_madd(a3[5], b6, c3_6); + load_2int16_madd(a3[5], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 * 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + b0 = load_int8_to_int16(pb0); + b1 = load_int8_to_int16(pb1); + b2 = load_int8_to_int16(pb2); + b3 = load_int8_to_int16(pb3); + b4 = load_int8_to_int16(pb4); + b5 = load_int8_to_int16(pb5); + b6 = load_int8_to_int16(pb6); + b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[6], b0, c0_0); + load_2int16_madd(a0[6], b1, c0_1); + load_2int16_madd(a0[6], b2, c0_2); + load_2int16_madd(a0[6], b3, c0_3); + load_2int16_madd(a0[6], b4, c0_4); + load_2int16_madd(a0[6], b5, c0_5); + load_2int16_madd(a0[6], b6, c0_6); + load_2int16_madd(a0[6], b7, c0_7); + + load_2int16_madd(a1[6], b0, c1_0); + load_2int16_madd(a1[6], b1, c1_1); + load_2int16_madd(a1[6], b2, c1_2); + load_2int16_madd(a1[6], b3, c1_3); + load_2int16_madd(a1[6], b4, c1_4); + load_2int16_madd(a1[6], b5, c1_5); + load_2int16_madd(a1[6], b6, c1_6); + load_2int16_madd(a1[6], b7, c1_7); + + load_2int16_madd(a2[6], b0, c2_0); + load_2int16_madd(a2[6], b1, c2_1); + load_2int16_madd(a2[6], b2, c2_2); + load_2int16_madd(a2[6], b3, c2_3); + load_2int16_madd(a2[6], b4, c2_4); + load_2int16_madd(a2[6], b5, c2_5); + load_2int16_madd(a2[6], b6, c2_6); + load_2int16_madd(a2[6], b7, c2_7); + + load_2int16_madd(a3[6], b0, c3_0); + load_2int16_madd(a3[6], b1, c3_1); + load_2int16_madd(a3[6], b2, c3_2); + load_2int16_madd(a3[6], b3, c3_3); + load_2int16_madd(a3[6], b4, c3_4); + load_2int16_madd(a3[6], b5, c3_5); + load_2int16_madd(a3[6], b6, c3_6); + load_2int16_madd(a3[6], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 * 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + b0 = load_int8_to_int16(pb0); + b1 = load_int8_to_int16(pb1); + b2 = load_int8_to_int16(pb2); + b3 = load_int8_to_int16(pb3); + b4 = load_int8_to_int16(pb4); + b5 = load_int8_to_int16(pb5); + b6 = load_int8_to_int16(pb6); + b7 = load_int8_to_int16(pb7); + + load_2int16_madd(a0[7], b0, c0_0); + 
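+        // Note on the two helpers used throughout this unrolled 4x64 micro-kernel: their definitions
+        // appear earlier in this file and are not repeated here, so the semantics below are assumed:
+        //   load_int8_to_int16(p)     : sign-extends 16 packed int8 values at p into 16 int16 lanes,
+        //   load_2int16_madd(a, b, c) : multiplies one broadcast pair of A values against the 16 int16
+        //                               values in b (pair-wise, as _mm256_madd_epi16 does) and
+        //                               accumulates the 8 resulting int32 sums into c, roughly
+        //                               c[j] += a0 * b[2 * j] + a1 * b[2 * j + 1] for j = 0..7.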
load_2int16_madd(a0[7], b1, c0_1); + load_2int16_madd(a0[7], b2, c0_2); + load_2int16_madd(a0[7], b3, c0_3); + load_2int16_madd(a0[7], b4, c0_4); + load_2int16_madd(a0[7], b5, c0_5); + load_2int16_madd(a0[7], b6, c0_6); + load_2int16_madd(a0[7], b7, c0_7); + + load_2int16_madd(a1[7], b0, c1_0); + load_2int16_madd(a1[7], b1, c1_1); + load_2int16_madd(a1[7], b2, c1_2); + load_2int16_madd(a1[7], b3, c1_3); + load_2int16_madd(a1[7], b4, c1_4); + load_2int16_madd(a1[7], b5, c1_5); + load_2int16_madd(a1[7], b6, c1_6); + load_2int16_madd(a1[7], b7, c1_7); + + load_2int16_madd(a2[7], b0, c2_0); + load_2int16_madd(a2[7], b1, c2_1); + load_2int16_madd(a2[7], b2, c2_2); + load_2int16_madd(a2[7], b3, c2_3); + load_2int16_madd(a2[7], b4, c2_4); + load_2int16_madd(a2[7], b5, c2_5); + load_2int16_madd(a2[7], b6, c2_6); + load_2int16_madd(a2[7], b7, c2_7); + + load_2int16_madd(a3[7], b0, c3_0); + load_2int16_madd(a3[7], b1, c3_1); + load_2int16_madd(a3[7], b2, c3_2); + load_2int16_madd(a3[7], b3, c3_3); + load_2int16_madd(a3[7], b4, c3_4); + load_2int16_madd(a3[7], b5, c3_5); + load_2int16_madd(a3[7], b6, c3_6); + load_2int16_madd(a3[7], b7, c3_7); + + pb0 += 8 * 16; + pb1 += 8 * 16; + pb2 += 8 * 16; + pb3 += 8 * 16; + pb4 += 8 * 16; + pb5 += 8 * 16; + pb6 += 8 * 16; + pb7 += 8 * 16; + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + } + + _mm256_storeu_si256((__m256i*)(pc0 + 0 * 8), c0_0); + _mm256_storeu_si256((__m256i*)(pc0 + 1 * 8), c0_1); + _mm256_storeu_si256((__m256i*)(pc0 + 2 * 8), c0_2); + _mm256_storeu_si256((__m256i*)(pc0 + 3 * 8), c0_3); + _mm256_storeu_si256((__m256i*)(pc0 + 4 * 8), c0_4); + _mm256_storeu_si256((__m256i*)(pc0 + 5 * 8), c0_5); + _mm256_storeu_si256((__m256i*)(pc0 + 6 * 8), c0_6); + _mm256_storeu_si256((__m256i*)(pc0 + 7 * 8), c0_7); + + _mm256_storeu_si256((__m256i*)(pc1 + 0 * 8), c1_0); + _mm256_storeu_si256((__m256i*)(pc1 + 1 * 8), c1_1); + _mm256_storeu_si256((__m256i*)(pc1 + 2 * 8), c1_2); + _mm256_storeu_si256((__m256i*)(pc1 + 3 * 8), c1_3); + _mm256_storeu_si256((__m256i*)(pc1 + 4 * 8), c1_4); + _mm256_storeu_si256((__m256i*)(pc1 + 5 * 8), c1_5); + _mm256_storeu_si256((__m256i*)(pc1 + 6 * 8), c1_6); + _mm256_storeu_si256((__m256i*)(pc1 + 7 * 8), c1_7); + + _mm256_storeu_si256((__m256i*)(pc2 + 0 * 8), c2_0); + _mm256_storeu_si256((__m256i*)(pc2 + 1 * 8), c2_1); + _mm256_storeu_si256((__m256i*)(pc2 + 2 * 8), c2_2); + _mm256_storeu_si256((__m256i*)(pc2 + 3 * 8), c2_3); + _mm256_storeu_si256((__m256i*)(pc2 + 4 * 8), c2_4); + _mm256_storeu_si256((__m256i*)(pc2 + 5 * 8), c2_5); + _mm256_storeu_si256((__m256i*)(pc2 + 6 * 8), c2_6); + _mm256_storeu_si256((__m256i*)(pc2 + 7 * 8), c2_7); + + _mm256_storeu_si256((__m256i*)(pc3 + 0 * 8), c3_0); + _mm256_storeu_si256((__m256i*)(pc3 + 1 * 8), c3_1); + _mm256_storeu_si256((__m256i*)(pc3 + 2 * 8), c3_2); + _mm256_storeu_si256((__m256i*)(pc3 + 3 * 8), c3_3); + _mm256_storeu_si256((__m256i*)(pc3 + 4 * 8), c3_4); + _mm256_storeu_si256((__m256i*)(pc3 + 5 * 8), c3_5); + _mm256_storeu_si256((__m256i*)(pc3 + 6 * 8), c3_6); + _mm256_storeu_si256((__m256i*)(pc3 + 7 * 8), c3_7); + +} + +inline void avx_s8s8s32_gemm_4x8_packed_dot_add( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 4; + const int n_block = 8; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) 
<< "only support remainder = 0"; + +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block4x8_kernel_avx2_me(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + } + } + +} + +inline void avx_s8s8s32_gemm_4x64_packed_split_k( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 4; + const int n_block = 64; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; + +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block4x64_kernel_avx2_split_k(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + } + } + +} + +inline void avx_s8s8s32_gemm_mx8_packed_dot_add( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 4; + const int n_block = 8; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; + +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + int32_t* c_ptr = &c[nbi * n_block]; + block_mx8_kernel_avx2_me(m, k, a, lda, b_ptr, ldb, c_ptr, ldc); + } +} + + +void block4x2_kernel_avx2_me( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + //printf("block4x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + __m256i ma0_h; + __m256i ma1_h; + __m256i ma2_h; + __m256i ma3_h; + + __m256i mb0_l; + __m256i mb1_l; + __m256i mb0_h; + __m256i mb1_h; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma0_h = 
_mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa0 + 16))); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb0_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb0 + 16))); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + mb1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pb1 + 16))); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + + mc0 = _mm256_add_epi32(mc0, _mm256_madd_epi16(ma0_h, mb0_h)); + mc1 = _mm256_add_epi32(mc1, _mm256_madd_epi16(ma0_h, mb1_h)); + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + ma1_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa1 + 16))); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + + mc2 = _mm256_add_epi32(mc2, _mm256_madd_epi16(ma1_h, mb0_h)); + mc3 = _mm256_add_epi32(mc3, _mm256_madd_epi16(ma1_h, mb1_h)); + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + ma2_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa2 + 16))); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + + mc4 = _mm256_add_epi32(mc4, _mm256_madd_epi16(ma2_h, mb0_h)); + mc5 = _mm256_add_epi32(mc5, _mm256_madd_epi16(ma2_h, mb1_h)); + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + ma3_h = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(pa3 + 16))); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + + mc6 = _mm256_add_epi32(mc6, _mm256_madd_epi16(ma3_h, mb0_h)); + mc7 = _mm256_add_epi32(mc7, _mm256_madd_epi16(ma3_h, mb1_h)); + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + + pb0 += 32; + pb1 += 32; + } + + //leftover + if (0x10 & k_leftover) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16; + pb1 += 16; + } + + if (0x08 & k_leftover) { + //a + __m256i ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa0)); + + //b + __m256i mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb0)); + __m256i mb1_l = 
_mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) pa3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 8; + pa1 += 8; + pa2 += 8; + pa3 += 8; + + pb0 += 8; + pb1 += 8; + } + + size_t leftover = k_leftover & 0x07; + + if (leftover) { + int8_t ga0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga2[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t ga3[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + int8_t gb0[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + int8_t gb1[8] __attribute__((aligned(16))) = {0, 0, 0, 0, 0, 0, 0, 0}; + + for (size_t i = 0; i < leftover; ++i) { + ga0[i] = pa0[i]; + ga1[i] = pa1[i]; + ga2[i] = pa2[i]; + ga3[i] = pa3[i]; + + gb0[i] = pb0[i]; + gb1[i] = pb1[i]; + } + + //a + ma0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga0)); + + //b + mb0_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb0)); + mb1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) gb1)); + + //the 0 row + mc0 = _mm256_mullo_epi32(ma0_l, mb0_l); + mc1 = _mm256_mullo_epi32(ma0_l, mb1_l); + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga1)); + + mc2 = _mm256_mullo_epi32(ma1_l, mb0_l); + mc3 = _mm256_mullo_epi32(ma1_l, mb1_l); + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga2)); + + mc4 = _mm256_mullo_epi32(ma2_l, mb0_l); + mc5 = _mm256_mullo_epi32(ma2_l, mb1_l); + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi32(_mm_loadl_epi64((__m128i*) ga3)); + + mc6 = _mm256_mullo_epi32(ma3_l, mb0_l); + mc7 = _mm256_mullo_epi32(ma3_l, mb1_l); + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1 * stride] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum2 = _mm256_hadd_epi32(sum2, zero); + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum2, 0); + pc1[1 * stride] = _mm256_extract_epi32(sum2, 1); + + //the 2 row + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, 
_mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum4, 0); + pc2[1 * stride] = _mm256_extract_epi32(sum4, 1); + + //the 3 row + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum6 = _mm256_hadd_epi32(sum6, zero); + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, zero, 0x31)); + + pc3[0] = _mm256_extract_epi32(sum6, 0); + pc3[1 * stride] = _mm256_extract_epi32(sum6, 1); +} + +inline void block4x2_kernel_avx2_me_k16( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc, const int stride) { + //printf("block4x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 4; // k / 32 + size_t k_leftover = k - (nk << 4); // k % 32 + + __m256i ma0_l; + __m256i ma1_l; + __m256i ma2_l; + __m256i ma3_l; + + __m256i mb0_l; + __m256i mb1_l; + + __m256i mc0; + __m256i mc1; + __m256i mc2; + __m256i mc3; + __m256i mc4; + __m256i mc5; + __m256i mc6; + __m256i mc7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + + //b + mb0_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + mb1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + + //the 0 row + mc0 = _mm256_madd_epi16(ma0_l, mb0_l); + mc1 = _mm256_madd_epi16(ma0_l, mb1_l); + + + sum0 = _mm256_add_epi32(mc0, sum0); + sum1 = _mm256_add_epi32(mc1, sum1); + + //the 1 row + ma1_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + + mc2 = _mm256_madd_epi16(ma1_l, mb0_l); + mc3 = _mm256_madd_epi16(ma1_l, mb1_l); + + + sum2 = _mm256_add_epi32(mc2, sum2); + sum3 = _mm256_add_epi32(mc3, sum3); + + //the 2 row + ma2_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa2)); + + mc4 = _mm256_madd_epi16(ma2_l, mb0_l); + mc5 = _mm256_madd_epi16(ma2_l, mb1_l); + + + sum4 = _mm256_add_epi32(mc4, sum4); + sum5 = _mm256_add_epi32(mc5, sum5); + + //the 3 row + ma3_l = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa3)); + + mc6 = _mm256_madd_epi16(ma3_l, mb0_l); + mc7 = _mm256_madd_epi16(ma3_l, mb1_l); + + + sum6 = _mm256_add_epi32(mc6, sum6); + sum7 = _mm256_add_epi32(mc7, sum7); + + pa0 += 16; + pa1 += 16; + pa2 += 16; + pa3 += 16; + + pb0 += 16; + pb1 += 16; + } + + CHECK_EQ(k_leftover, 0); + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1 * stride] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum2 = _mm256_hadd_epi32(sum2, sum3); + sum2 = _mm256_hadd_epi32(sum2, zero); + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum2, 0); + pc1[1 * stride] = _mm256_extract_epi32(sum2, 1); + + //the 2 row + sum4 = _mm256_hadd_epi32(sum4, sum5); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = 
_mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc2[0] = _mm256_extract_epi32(sum4, 0); + pc2[1 * stride] = _mm256_extract_epi32(sum4, 1); + + //the 3 row + sum6 = _mm256_hadd_epi32(sum6, sum7); + sum6 = _mm256_hadd_epi32(sum6, zero); + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, zero, 0x31)); + + pc3[0] = _mm256_extract_epi32(sum6, 0); + pc3[1 * stride] = _mm256_extract_epi32(sum6, 1); +} + +/** + * b packed + * @param k + * @param a + * @param lda + * @param b + * @param ldb + * @param c + * @param ldc + */ +inline void block2x4_kernel_avx2_me_k16( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + //printf("block4x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + + size_t nk = k >> 4; // k / 16 + size_t k_leftover = k - (nk << 4); // k % 16 + + __m256i ma0; + __m256i ma1; + + __m256i mb0; + __m256i mb1; + __m256i mb2; + __m256i mb3;; + + __m256i temp_0; + __m256i temp_1; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + //b + mb0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + temp_0 = _mm256_madd_epi16(ma0, mb0); + temp_1 = _mm256_madd_epi16(ma1, mb0); + sum0 = _mm256_add_epi32(sum0, temp_0); + sum1 = _mm256_add_epi32(sum1, temp_1); + + + //the 1 row + mb1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + temp_0 = _mm256_madd_epi16(ma0, mb1); + temp_1 = _mm256_madd_epi16(ma1, mb1); + + sum2 = _mm256_add_epi32(sum2, temp_0); + sum3 = _mm256_add_epi32(sum3, temp_1); + + + mb2 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + temp_0 = _mm256_madd_epi16(ma0, mb2); + temp_1 = _mm256_madd_epi16(ma1, mb2); + + sum4 = _mm256_add_epi32(sum4, temp_0); + sum5 = _mm256_add_epi32(sum5, temp_1); + + //the 3 row + + mb3 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + temp_0 = _mm256_madd_epi16(ma0, mb3); + temp_1 = _mm256_madd_epi16(ma1, mb3); + sum6 = _mm256_add_epi32(sum6, temp_0); + sum7 = _mm256_add_epi32(sum7, temp_1); + + pa0 += 16; + pa1 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + } + + CHECK_EQ(k_leftover, 0); + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum2); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum6); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc0[2] = _mm256_extract_epi32(sum4, 0); + pc0[3] = _mm256_extract_epi32(sum4, 1); + + //the 2 row + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, zero, 0x31)); + + 
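+        // How this epilogue collapses the 8-lane accumulators (an illustrative summary of the pattern
+        // shared by the *_k16 kernels in this file): _mm256_hadd_epi32(x, y) interleaves pair sums of
+        // x and y within each 128-bit half, the second hadd folds those pairs again, and the
+        // permute2x128 + add folds the upper half onto the lower one. After the three steps the full
+        // reduction of the first accumulator sits in element 0 and that of the second in element 1,
+        // so the two extracts below are equivalent to the scalar
+        //   pc1[0] = sum of the 8 lanes of sum1;   pc1[1] = sum of the 8 lanes of sum3;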
pc1[0] = _mm256_extract_epi32(sum1, 0); + pc1[1] = _mm256_extract_epi32(sum1, 1); + + //the 3 row + sum5 = _mm256_hadd_epi32(sum5, sum7); + sum5 = _mm256_hadd_epi32(sum5, zero); + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, zero, 0x31)); + + pc1[2] = _mm256_extract_epi32(sum5, 0); + pc1[3] = _mm256_extract_epi32(sum5, 1); +} + +/** + * b packed + * @param k + * @param a + * @param lda + * @param b + * @param ldb + * @param c + * @param ldc + */ +inline void block2x4_kernel_avx2_me_k16_packed( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + //printf("block4x2_kernel_avx2\n"); + + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * 16; + const int8_t* pb2 = pb0 + 2 * 16; + const int8_t* pb3 = pb0 + 3 * 16; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + + size_t nk = k >> 4; // k / 16 + size_t k_leftover = k - (nk << 4); // k % 16 + + __m256i ma0; + __m256i ma1; + + __m256i mb0; + __m256i mb1; + __m256i mb2; + __m256i mb3;; + + __m256i temp_0; + __m256i temp_1; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + //b + mb0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + temp_0 = _mm256_madd_epi16(ma0, mb0); + temp_1 = _mm256_madd_epi16(ma1, mb0); + sum0 = _mm256_add_epi32(sum0, temp_0); + sum1 = _mm256_add_epi32(sum1, temp_1); + + + //the 1 row + mb1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + temp_0 = _mm256_madd_epi16(ma0, mb1); + temp_1 = _mm256_madd_epi16(ma1, mb1); + + sum2 = _mm256_add_epi32(sum2, temp_0); + sum3 = _mm256_add_epi32(sum3, temp_1); + + + mb2 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + temp_0 = _mm256_madd_epi16(ma0, mb2); + temp_1 = _mm256_madd_epi16(ma1, mb2); + + sum4 = _mm256_add_epi32(sum4, temp_0); + sum5 = _mm256_add_epi32(sum5, temp_1); + + //the 3 row + + mb3 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + temp_0 = _mm256_madd_epi16(ma0, mb3); + temp_1 = _mm256_madd_epi16(ma1, mb3); + sum6 = _mm256_add_epi32(sum6, temp_0); + sum7 = _mm256_add_epi32(sum7, temp_1); + + pa0 += 16; + pa1 += 16; + + pb0 += 16 * 4; + pb1 += 16 * 4; + pb2 += 16 * 4; + pb3 += 16 * 4; + } + + CHECK_EQ(k_leftover, 0); + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum2); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum6); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc0[2] = _mm256_extract_epi32(sum4, 0); + pc0[3] = _mm256_extract_epi32(sum4, 1); + + //the 2 row + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum1, 0); + pc1[1] = 
_mm256_extract_epi32(sum1, 1); + + //the 3 row + sum5 = _mm256_hadd_epi32(sum5, sum7); + sum5 = _mm256_hadd_epi32(sum5, zero); + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, zero, 0x31)); + + pc1[2] = _mm256_extract_epi32(sum5, 0); + pc1[3] = _mm256_extract_epi32(sum5, 1); +} + + +/** + * b packed + * @param k + * @param a + * @param lda + * @param b + * @param ldb + * @param c + * @param ldc + */ +inline void block1x8_kernel_avx2_me_k16( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + //printf("block4x2_kernel_avx2\n"); + const int8_t* pa0 = a; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + const int8_t* pb4 = pb0 + 4 * ldb; + const int8_t* pb5 = pb0 + 5 * ldb; + const int8_t* pb6 = pb0 + 6 * ldb; + const int8_t* pb7 = pb0 + 7 * ldb; + + int* pc0 = c; + + size_t nk = k >> 4; // k / 16 + size_t k_leftover = k - (nk << 4); // k % 16 + + __m256i ma0; + + __m256i mb0; + __m256i mb1; + __m256i mb2; + __m256i mb3; + __m256i mb4; + __m256i mb5; + __m256i mb6; + __m256i mb7; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + __m256i temp_0; + __m256i temp_1; + __m256i temp_2; + __m256i temp_3; + //a + ma0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + //b + mb0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + mb1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + + //the 0 row + temp_0 = _mm256_madd_epi16(ma0, mb0); + temp_1 = _mm256_madd_epi16(ma0, mb1); + sum0 = _mm256_add_epi32(sum0, temp_0); + sum1 = _mm256_add_epi32(sum1, temp_1); + + + //the 1 row + mb2 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + mb3 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + temp_2 = _mm256_madd_epi16(ma0, mb2); + temp_3 = _mm256_madd_epi16(ma0, mb3); + + sum2 = _mm256_add_epi32(sum2, temp_2); + sum3 = _mm256_add_epi32(sum3, temp_3); + + + mb4 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb4)); + mb5 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb5)); + + temp_0 = _mm256_madd_epi16(ma0, mb4); + temp_1 = _mm256_madd_epi16(ma0, mb5); + + sum4 = _mm256_add_epi32(sum4, temp_0); + sum5 = _mm256_add_epi32(sum5, temp_1); + + //the 3 row + + mb6 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb6)); + mb7 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb7)); + temp_2 = _mm256_madd_epi16(ma0, mb6); + temp_3 = _mm256_madd_epi16(ma0, mb7); + sum6 = _mm256_add_epi32(sum6, temp_2); + sum7 = _mm256_add_epi32(sum7, temp_3); + + pa0 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + pb4 += 16; + pb5 += 16; + pb6 += 16; + pb7 += 16; + } + + CHECK_EQ(k_leftover, 0); + + //store + + //the 0 row + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, sum0, 0x81)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 8)); + sum0 = _mm256_add_epi32(sum0, _mm256_srli_si256(sum0, 4)); + pc0[0] = _mm256_extract_epi32(sum0, 0); + + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, sum1, 0x81)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 8)); + sum1 = _mm256_add_epi32(sum1, _mm256_srli_si256(sum1, 4)); + pc0[1] = 
_mm256_extract_epi32(sum1, 0); + + sum2 = _mm256_add_epi32(sum2, _mm256_permute2x128_si256(sum2, sum2, 0x81)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 8)); + sum2 = _mm256_add_epi32(sum2, _mm256_srli_si256(sum2, 4)); + pc0[2] = _mm256_extract_epi32(sum2, 0); + + sum3 = _mm256_add_epi32(sum3, _mm256_permute2x128_si256(sum3, sum3, 0x81)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 8)); + sum3 = _mm256_add_epi32(sum3, _mm256_srli_si256(sum3, 4)); + pc0[3] = _mm256_extract_epi32(sum3, 0); + + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, sum4, 0x81)); + sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 8)); + sum4 = _mm256_add_epi32(sum4, _mm256_srli_si256(sum4, 4)); + pc0[4] = _mm256_extract_epi32(sum4, 0); + + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, sum5, 0x81)); + sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 8)); + sum5 = _mm256_add_epi32(sum5, _mm256_srli_si256(sum5, 4)); + pc0[5] = _mm256_extract_epi32(sum5, 0); + + sum6 = _mm256_add_epi32(sum6, _mm256_permute2x128_si256(sum6, sum6, 0x81)); + sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 8)); + sum6 = _mm256_add_epi32(sum6, _mm256_srli_si256(sum6, 4)); + pc0[6] = _mm256_extract_epi32(sum6, 0); + + sum7 = _mm256_add_epi32(sum7, _mm256_permute2x128_si256(sum7, sum7, 0x81)); + sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 8)); + sum7 = _mm256_add_epi32(sum7, _mm256_srli_si256(sum7, 4)); + pc0[7] = _mm256_extract_epi32(sum7, 0); +} + + +/** + * b packed + * @param k + * @param a + * @param lda + * @param b + * @param ldb + * @param c + * @param ldc + */ +inline void block2x4_kernel_avx2_me_k16_pad( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + //printf("block4x2_kernel_avx2\n"); + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + + size_t nk = k >> 4; // k / 32 + size_t k_leftover = k - (nk << 4); // k % 32 + + __m256i ma0; + __m256i ma1; + + __m256i mb0; + __m256i mb1; + __m256i mb2; + __m256i mb3;; + + __m256i temp_0; + __m256i temp_1; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + //b + mb0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + temp_0 = _mm256_madd_epi16(ma0, mb0); + temp_1 = _mm256_madd_epi16(ma1, mb0); + sum0 = _mm256_add_epi32(sum0, temp_0); + sum1 = _mm256_add_epi32(sum1, temp_1); + + + //the 1 row + mb1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + temp_0 = _mm256_madd_epi16(ma0, mb1); + temp_1 = _mm256_madd_epi16(ma1, mb1); + + sum2 = _mm256_add_epi32(sum2, temp_0); + sum3 = _mm256_add_epi32(sum3, temp_1); + + + mb2 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + temp_0 = _mm256_madd_epi16(ma0, mb2); + temp_1 = _mm256_madd_epi16(ma1, mb2); + + sum4 = _mm256_add_epi32(sum4, temp_0); + sum5 = _mm256_add_epi32(sum5, temp_1); + + //the 3 row + + mb3 = 
_mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + temp_0 = _mm256_madd_epi16(ma0, mb3); + temp_1 = _mm256_madd_epi16(ma1, mb3); + sum6 = _mm256_add_epi32(sum6, temp_0); + sum7 = _mm256_add_epi32(sum7, temp_1); + + pa0 += 16; + pa1 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum2); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum6); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc0[2] = _mm256_extract_epi32(sum4, 0); + pc0[3] = _mm256_extract_epi32(sum4, 1); + + //the 2 row + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum1, 0); + pc1[1] = _mm256_extract_epi32(sum1, 1); + + //the 3 row + sum5 = _mm256_hadd_epi32(sum5, sum7); + sum5 = _mm256_hadd_epi32(sum5, zero); + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, zero, 0x31)); + + pc1[2] = _mm256_extract_epi32(sum5, 0); + pc1[3] = _mm256_extract_epi32(sum5, 1); +} + +/** + * b packed + * @param k + * @param a + * @param lda + * @param b + * @param ldb + * @param c + * @param ldc + */ +inline void block2x4_kernel_avx2_me_k16_pad_s8s8fp32( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, float* c, const int32_t ldc, const float* scale) { + //printf("block4x2_kernel_avx2\n"); + + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + float* pc0 = c; + float* pc1 = c + 1 * ldc; + + size_t nk = k >> 4; // k / 32 + size_t k_leftover = k - (nk << 4); // k % 32 + + __m256i ma0; + __m256i ma1; + + __m256i mb0; + __m256i mb1; + __m256i mb2; + __m256i mb3; + + + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + __m256i temp_0; + __m256i temp_1; + //a + ma0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + //b + mb0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + temp_0 = _mm256_madd_epi16(ma0, mb0); + temp_1 = _mm256_madd_epi16(ma1, mb0); + sum0 = _mm256_add_epi32(sum0, temp_0); + sum1 = _mm256_add_epi32(sum1, temp_1); + + + //the 1 row + mb1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + temp_0 = _mm256_madd_epi16(ma0, mb1); + temp_1 = _mm256_madd_epi16(ma1, mb1); + + sum2 = _mm256_add_epi32(sum2, temp_0); + sum3 = _mm256_add_epi32(sum3, temp_1); + + + mb2 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + temp_0 = _mm256_madd_epi16(ma0, mb2); + temp_1 = _mm256_madd_epi16(ma1, mb2); + + sum4 = _mm256_add_epi32(sum4, temp_0); + sum5 = _mm256_add_epi32(sum5, temp_1); + + //the 3 row + + mb3 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + temp_0 = 
_mm256_madd_epi16(ma0, mb3); + temp_1 = _mm256_madd_epi16(ma1, mb3); + sum6 = _mm256_add_epi32(sum6, temp_0); + sum7 = _mm256_add_epi32(sum7, temp_1); + + pa0 += 16; + pa1 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + } + + //store + __m256i zero = _mm256_setzero_si256(); + __m256 temp_0; + __m256 temp_1; + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum2); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + // pc0[0] = _mm256_extract_epi32(sum0, 0); + // pc0[1] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum6); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + // pc0[2] = _mm256_extract_epi32(sum4, 0); + // pc0[3] = _mm256_extract_epi32(sum4, 1); + // printf_intrin_var(sum0); + // printf_intrin_var(sum4); + sum4 = _mm256_blend_epi32(sum0, _mm256_permute4x64_epi64(sum4, 0xc0), 0x0c); + // printf_intrin_var(sum4); + temp_0 = _mm256_broadcast_ps((const __m128*)scale); + temp_1 = _mm256_cvtepi32_ps(sum4); + temp_0 = _mm256_mul_ps(temp_0, temp_1); + __m128 write_128 = _mm256_extractf128_ps(temp_0, 0x00); + _mm_storeu_ps(pc0, write_128); + + + + //the 2 row + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, zero, 0x31)); + + + //the 3 row + sum5 = _mm256_hadd_epi32(sum5, sum7); + sum5 = _mm256_hadd_epi32(sum5, zero); + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, zero, 0x31)); + + sum5 = _mm256_blend_epi32(sum1, _mm256_permute4x64_epi64(sum5, 0xc0), 0x0c); + temp_0 = _mm256_broadcast_ps((const __m128*)scale); + temp_1 = _mm256_cvtepi32_ps(sum5); + temp_0 = _mm256_mul_ps(temp_0, temp_1); + write_128 = _mm256_extractf128_ps(temp_0, 0x00); + _mm_storeu_ps(pc1, write_128); +} + + +inline void block2x64_4_kernel_avx2_me_k16_s8s8s8( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int8_t* c, const int32_t ldc, const float* scale_in, + float* scale_out) { + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + int8_t* pc0 = c; + int8_t* pc1 = c + 1 * ldc; + + size_t nk = k >> 4; // k / 32 + size_t k_leftover = k - (nk << 4); // k % 32 + + __m256i ma0; + __m256i ma1; + + __m256i mb0; + __m256i mb1; + __m256i mb2; + __m256i mb3;; + + __m256i temp_0; + __m256i temp_1; + + __m256i sum0 = _mm256_setzero_si256(); + __m256i sum1 = _mm256_setzero_si256(); + __m256i sum2 = _mm256_setzero_si256(); + __m256i sum3 = _mm256_setzero_si256(); + __m256i sum4 = _mm256_setzero_si256(); + __m256i sum5 = _mm256_setzero_si256(); + __m256i sum6 = _mm256_setzero_si256(); + __m256i sum7 = _mm256_setzero_si256(); + + for (size_t k = 0; k < nk; ++k) { + //a + ma0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa0)); + ma1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pa1)); + //b + mb0 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb0)); + + //the 0 row + temp_0 = _mm256_madd_epi16(ma0, mb0); + temp_1 = _mm256_madd_epi16(ma1, mb0); + sum0 = _mm256_add_epi32(sum0, temp_0); + sum1 = _mm256_add_epi32(sum1, temp_1); + + + //the 1 row + mb1 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb1)); + temp_0 = _mm256_madd_epi16(ma0, mb1); + temp_1 = _mm256_madd_epi16(ma1, mb1); + + sum2 = _mm256_add_epi32(sum2, temp_0); + sum3 = 
_mm256_add_epi32(sum3, temp_1); + + + mb2 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb2)); + temp_0 = _mm256_madd_epi16(ma0, mb2); + temp_1 = _mm256_madd_epi16(ma1, mb2); + + sum4 = _mm256_add_epi32(sum4, temp_0); + sum5 = _mm256_add_epi32(sum5, temp_1); + + //the 3 row + + mb3 = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*) pb3)); + temp_0 = _mm256_madd_epi16(ma0, mb3); + temp_1 = _mm256_madd_epi16(ma1, mb3); + sum6 = _mm256_add_epi32(sum6, temp_0); + sum7 = _mm256_add_epi32(sum7, temp_1); + + pa0 += 16; + pa1 += 16; + + pb0 += 16; + pb1 += 16; + pb2 += 16; + pb3 += 16; + } + + //store + __m256i zero = _mm256_setzero_si256(); + + //the 0 row + sum0 = _mm256_hadd_epi32(sum0, sum2); + sum0 = _mm256_hadd_epi32(sum0, zero); + sum0 = _mm256_add_epi32(sum0, _mm256_permute2x128_si256(sum0, zero, 0x31)); + + pc0[0] = _mm256_extract_epi32(sum0, 0); + pc0[1] = _mm256_extract_epi32(sum0, 1); + + //the 1 row + sum4 = _mm256_hadd_epi32(sum4, sum6); + sum4 = _mm256_hadd_epi32(sum4, zero); + sum4 = _mm256_add_epi32(sum4, _mm256_permute2x128_si256(sum4, zero, 0x31)); + + pc0[2] = _mm256_extract_epi32(sum4, 0); + pc0[3] = _mm256_extract_epi32(sum4, 1); + + //the 2 row + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_hadd_epi32(sum1, sum3); + sum1 = _mm256_add_epi32(sum1, _mm256_permute2x128_si256(sum1, zero, 0x31)); + + pc1[0] = _mm256_extract_epi32(sum1, 0); + pc1[1] = _mm256_extract_epi32(sum1, 1); + + //the 3 row + sum5 = _mm256_hadd_epi32(sum5, sum7); + sum5 = _mm256_hadd_epi32(sum5, zero); + sum5 = _mm256_add_epi32(sum5, _mm256_permute2x128_si256(sum5, zero, 0x31)); + + pc1[2] = _mm256_extract_epi32(sum5, 0); + pc1[3] = _mm256_extract_epi32(sum5, 1); +} +#if defined(__AVX512F__) +inline __m512i avx512_reduce_4(__m512i& x0, __m512i& x1, __m512i& x2, __m512i& x3) { + __m512i temp0 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 + }, x0); + __m512i temp1 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 + }, x1); + __m512i temp2 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 + }, x2); + __m512i temp3 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 + }, x3); + temp0 = _mm512_add_epi32(temp0, x0); + temp1 = _mm512_add_epi32(temp1, x1); + temp2 = _mm512_add_epi32(temp2, x2); + temp3 = _mm512_add_epi32(temp3, x3); + temp0 = _mm512_mask_blend_epi32(0xFF00, temp0, temp1); + temp2 = _mm512_mask_blend_epi32(0xFF00, temp2, temp3); + temp1 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 13, 14, 15 + }, temp0); + temp3 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 13, 14, 15 + }, temp2); + temp0 = _mm512_add_epi32(temp0, temp1); + temp2 = _mm512_add_epi32(temp2, temp3); + temp2 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 + }, temp2); + temp0 = _mm512_mask_blend_epi32(0xF0F0, temp0, temp2); + temp1 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 + }, temp0); + temp0 = _mm512_add_epi32(temp0, temp1); + temp1 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13 + }, temp0); + temp0 = _mm512_add_epi32(temp0, temp1); + temp0 = _mm512_permutexvar_epi32((__m512i)(__v16si) { + 0, 8, 4, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }, temp0); + 
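+    // At this point the shuffle/add tree has left the complete 16-lane totals of x0, x1, x2 and x3
+    // in lanes 0, 8, 4 and 12 of temp0; the permutexvar above packs them into lanes 0..3 in the
+    // order {x0, x1, x2, x3}, which is why the callers store the result with mask 0x000F.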
return temp0; +} +inline __m512i avx512_loadfp32_int8(const float* ptr, __m512& in_scale) { + __m512i temp_low = _mm512_castsi256_si512(_mm512_cvtepi32_epi16(_mm512_cvt_roundps_epi32( + _mm512_mul_ps(_mm512_loadu_ps(ptr), in_scale), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))); + __m512i temp_hi = _mm512_castsi256_si512(_mm512_cvtepi32_epi16(_mm512_cvt_roundps_epi32( + _mm512_mul_ps(_mm512_loadu_ps(ptr + 16), in_scale), + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))); + temp_hi = _mm512_permutexvar_epi16((__m512i)(__v32hi) { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15 + }, temp_hi); + return _mm512_mask_blend_epi16(0xFFFF0000, temp_low, temp_hi); +} + +void block4x4_kernel_avx512_me( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, int* c, const int32_t ldc) { + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + int* pc0 = c; + int* pc1 = c + 1 * ldc; + int* pc2 = c + 2 * ldc; + int* pc3 = c + 3 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + __m512i sum0 = _mm512_setzero_si512(); + __m512i sum1 = _mm512_setzero_si512(); + __m512i sum2 = _mm512_setzero_si512(); + __m512i sum3 = _mm512_setzero_si512(); + __m512i sum4 = _mm512_setzero_si512(); + __m512i sum5 = _mm512_setzero_si512(); + __m512i sum6 = _mm512_setzero_si512(); + __m512i sum7 = _mm512_setzero_si512(); + __m512i sum8 = _mm512_setzero_si512(); + __m512i sum9 = _mm512_setzero_si512(); + __m512i sum10 = _mm512_setzero_si512(); + __m512i sum11 = _mm512_setzero_si512(); + __m512i sum12 = _mm512_setzero_si512(); + __m512i sum13 = _mm512_setzero_si512(); + __m512i sum14 = _mm512_setzero_si512(); + __m512i sum15 = _mm512_setzero_si512(); + + for (size_t k = 0; k < nk; ++k) { + __m512i temp0; + __m512i temp1; + __m512i temp2; + __m512i temp3; + __m512i a0 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa0)); + __m512i a1 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa1)); + __m512i a2 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa2)); + __m512i a3 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa3)); + + __m512i b0 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb0)); + temp0 = _mm512_madd_epi16(a0, b0); + temp1 = _mm512_madd_epi16(a1, b0); + temp2 = _mm512_madd_epi16(a2, b0); + temp3 = _mm512_madd_epi16(a3, b0); + sum0 = _mm512_add_epi32(sum0, temp0); + sum4 = _mm512_add_epi32(sum4, temp1); + sum8 = _mm512_add_epi32(sum8, temp2); + sum12 = _mm512_add_epi32(sum12, temp3); + + __m512i b1 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb1)); + temp0 = _mm512_madd_epi16(a0, b1); + temp1 = _mm512_madd_epi16(a1, b1); + temp2 = _mm512_madd_epi16(a2, b1); + temp3 = _mm512_madd_epi16(a3, b1); + sum1 = _mm512_add_epi32(sum1, temp0); + sum5 = _mm512_add_epi32(sum5, temp1); + sum9 = _mm512_add_epi32(sum9, temp2); + sum13 = _mm512_add_epi32(sum13, temp3); + + __m512i b2 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb2)); + temp0 = _mm512_madd_epi16(a0, b2); + temp1 = _mm512_madd_epi16(a1, b2); + temp2 = _mm512_madd_epi16(a2, b2); + temp3 = _mm512_madd_epi16(a3, b2); + sum2 = _mm512_add_epi32(sum2, temp0); + sum6 = _mm512_add_epi32(sum6, temp1); + sum10 = _mm512_add_epi32(sum10, temp2); + sum14 = _mm512_add_epi32(sum14, temp3); 
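+        // Register-blocking scheme of this 4x4 kernel: each sum{0..15} is one __m512i accumulator
+        // for one (A row, packed-B row) pair. Every iteration consumes 32 int8 k-values per operand;
+        // _mm512_madd_epi16 on the sign-extended data produces 16 int32 pair-products per accumulator,
+        // i.e. lane-wise  sum[r * 4 + c] += a_r[2i] * b_c[2i] + a_r[2i + 1] * b_c[2i + 1],
+        // and the 16 lanes are only collapsed to a single dot product per (r, c) after the loop,
+        // via avx512_reduce_4.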
+ + __m512i b3 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb3)); + temp0 = _mm512_madd_epi16(a0, b3); + temp1 = _mm512_madd_epi16(a1, b3); + temp2 = _mm512_madd_epi16(a2, b3); + temp3 = _mm512_madd_epi16(a3, b3); + sum3 = _mm512_add_epi32(sum3, temp0); + sum7 = _mm512_add_epi32(sum7, temp1); + sum11 = _mm512_add_epi32(sum11, temp2); + sum15 = _mm512_add_epi32(sum15, temp3); + + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + + pb0 += 32; + pb1 += 32; + pb2 += 32; + pb3 += 32; + + } + + __m512i temp0 = avx512_reduce_4(sum0, sum1, sum2, sum3); + _mm512_mask_storeu_epi32(pc0, 0x000F, temp0); + __m512i temp1 = avx512_reduce_4(sum4, sum5, sum6, sum7); + _mm512_mask_storeu_epi32(pc1, 0x000F, temp1); + __m512i temp2 = avx512_reduce_4(sum8, sum9, sum10, sum11); + _mm512_mask_storeu_epi32(pc2, 0x000F, temp2); + __m512i temp3 = avx512_reduce_4(sum12, sum13, sum14, sum15); + _mm512_mask_storeu_epi32(pc3, 0x000F, temp3); + + // printf_intrin_var(temp0); + + + // exit(0); + + + // pc0[0]=_mm512_reduce_add_epi32(sum0); + // pc0[1]=_mm512_reduce_add_epi32(sum1); + // pc0[2]=_mm512_reduce_add_epi32(sum2); + // pc0[3]=_mm512_reduce_add_epi32(sum3); + // pc1[0]=_mm512_reduce_add_epi32(sum4); + // pc1[1]=_mm512_reduce_add_epi32(sum5); + // pc1[2]=_mm512_reduce_add_epi32(sum6); + // pc1[3]=_mm512_reduce_add_epi32(sum7); + // pc2[0]=_mm512_reduce_add_epi32(sum8); + // pc2[1]=_mm512_reduce_add_epi32(sum9); + // pc2[2]=_mm512_reduce_add_epi32(sum10); + // pc2[3]=_mm512_reduce_add_epi32(sum11); + // pc3[0]=_mm512_reduce_add_epi32(sum12); + // pc3[1]=_mm512_reduce_add_epi32(sum13); + // pc3[2]=_mm512_reduce_add_epi32(sum14); + // pc3[3]=_mm512_reduce_add_epi32(sum15); +} + +void block4x4_kernel_avx512_me( + const int32_t k, const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, float* c, const int32_t ldc, const float* scale) { + const int8_t* pa0 = a; + const int8_t* pa1 = pa0 + 1 * lda; + const int8_t* pa2 = pa0 + 2 * lda; + const int8_t* pa3 = pa0 + 3 * lda; + + const int8_t* pb0 = b; + const int8_t* pb1 = pb0 + 1 * ldb; + const int8_t* pb2 = pb0 + 2 * ldb; + const int8_t* pb3 = pb0 + 3 * ldb; + + float* pc0 = c; + float* pc1 = c + 1 * ldc; + float* pc2 = c + 2 * ldc; + float* pc3 = c + 3 * ldc; + + size_t nk = k >> 5; // k / 32 + size_t k_leftover = k - (nk << 5); // k % 32 + __m512i sum0 = _mm512_setzero_si512(); + __m512i sum1 = _mm512_setzero_si512(); + __m512i sum2 = _mm512_setzero_si512(); + __m512i sum3 = _mm512_setzero_si512(); + __m512i sum4 = _mm512_setzero_si512(); + __m512i sum5 = _mm512_setzero_si512(); + __m512i sum6 = _mm512_setzero_si512(); + __m512i sum7 = _mm512_setzero_si512(); + __m512i sum8 = _mm512_setzero_si512(); + __m512i sum9 = _mm512_setzero_si512(); + __m512i sum10 = _mm512_setzero_si512(); + __m512i sum11 = _mm512_setzero_si512(); + __m512i sum12 = _mm512_setzero_si512(); + __m512i sum13 = _mm512_setzero_si512(); + __m512i sum14 = _mm512_setzero_si512(); + __m512i sum15 = _mm512_setzero_si512(); + + for (size_t k = 0; k < nk; ++k) { + __m512i temp0; + __m512i temp1; + __m512i temp2; + __m512i temp3; + __m512i a0 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa0)); + __m512i a1 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa1)); + __m512i a2 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa2)); + __m512i a3 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pa3)); + + __m512i b0 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb0)); + temp0 = _mm512_madd_epi16(a0, b0); + temp1 = _mm512_madd_epi16(a1, b0); + temp2 = 
_mm512_madd_epi16(a2, b0); + temp3 = _mm512_madd_epi16(a3, b0); + sum0 = _mm512_add_epi32(sum0, temp0); + sum4 = _mm512_add_epi32(sum4, temp1); + sum8 = _mm512_add_epi32(sum8, temp2); + sum12 = _mm512_add_epi32(sum12, temp3); + + __m512i b1 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb1)); + temp0 = _mm512_madd_epi16(a0, b1); + temp1 = _mm512_madd_epi16(a1, b1); + temp2 = _mm512_madd_epi16(a2, b1); + temp3 = _mm512_madd_epi16(a3, b1); + sum1 = _mm512_add_epi32(sum1, temp0); + sum5 = _mm512_add_epi32(sum5, temp1); + sum9 = _mm512_add_epi32(sum9, temp2); + sum13 = _mm512_add_epi32(sum13, temp3); + + __m512i b2 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb2)); + temp0 = _mm512_madd_epi16(a0, b2); + temp1 = _mm512_madd_epi16(a1, b2); + temp2 = _mm512_madd_epi16(a2, b2); + temp3 = _mm512_madd_epi16(a3, b2); + sum2 = _mm512_add_epi32(sum2, temp0); + sum6 = _mm512_add_epi32(sum6, temp1); + sum10 = _mm512_add_epi32(sum10, temp2); + sum14 = _mm512_add_epi32(sum14, temp3); + + __m512i b3 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb3)); + temp0 = _mm512_madd_epi16(a0, b3); + temp1 = _mm512_madd_epi16(a1, b3); + temp2 = _mm512_madd_epi16(a2, b3); + temp3 = _mm512_madd_epi16(a3, b3); + sum3 = _mm512_add_epi32(sum3, temp0); + sum7 = _mm512_add_epi32(sum7, temp1); + sum11 = _mm512_add_epi32(sum11, temp2); + sum15 = _mm512_add_epi32(sum15, temp3); + + + pa0 += 32; + pa1 += 32; + pa2 += 32; + pa3 += 32; + + pb0 += 32; + pb1 += 32; + pb2 += 32; + pb3 += 32; + + } + + const __m512 scale_float4 = _mm512_mask_loadu_ps(_mm512_setzero_ps(), 0x000F, scale); + + __m512i temp0 = avx512_reduce_4(sum0, sum1, sum2, sum3); + __m512 wirte_0 = _mm512_cvt_roundepi32_ps(temp0, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + wirte_0 = _mm512_mul_ps(wirte_0, scale_float4); + _mm512_mask_storeu_ps(pc0, 0x000F, wirte_0); + + __m512i temp1 = avx512_reduce_4(sum4, sum5, sum6, sum7); + __m512 wirte_1 = _mm512_cvt_roundepi32_ps(temp1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + wirte_1 = _mm512_mul_ps(wirte_1, scale_float4); + _mm512_mask_storeu_ps(pc1, 0x000F, wirte_1); + + __m512i temp2 = avx512_reduce_4(sum8, sum9, sum10, sum11); + __m512 wirte_2 = _mm512_cvt_roundepi32_ps(temp2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + wirte_2 = _mm512_mul_ps(wirte_2, scale_float4); + _mm512_mask_storeu_ps(pc2, 0x000F, wirte_2); + + __m512i temp3 = avx512_reduce_4(sum12, sum13, sum14, sum15); + __m512 wirte_3 = _mm512_cvt_roundepi32_ps(temp3, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + wirte_3 = _mm512_mul_ps(wirte_3, scale_float4); + _mm512_mask_storeu_ps(pc3, 0x000F, wirte_3); + + // __m512i temp2=avx512_reduce_4(sum8,sum9,sum10,sum11); + // _mm512_mask_storeu_epi32(pc2,0x000F,temp2); + // __m512i temp3=avx512_reduce_4(sum12,sum13,sum14,sum15); + // _mm512_mask_storeu_epi32(pc3,0x000F,temp3); + + // printf_intrin_var(temp0); + + + // exit(0); + + + // pc0[0]=_mm512_reduce_add_epi32(sum0); + // pc0[1]=_mm512_reduce_add_epi32(sum1); + // pc0[2]=_mm512_reduce_add_epi32(sum2); + // pc0[3]=_mm512_reduce_add_epi32(sum3); + // pc1[0]=_mm512_reduce_add_epi32(sum4); + // pc1[1]=_mm512_reduce_add_epi32(sum5); + // pc1[2]=_mm512_reduce_add_epi32(sum6); + // pc1[3]=_mm512_reduce_add_epi32(sum7); + // pc2[0]=_mm512_reduce_add_epi32(sum8); + // pc2[1]=_mm512_reduce_add_epi32(sum9); + // pc2[2]=_mm512_reduce_add_epi32(sum10); + // pc2[3]=_mm512_reduce_add_epi32(sum11); + // pc3[0]=_mm512_reduce_add_epi32(sum12); + // pc3[1]=_mm512_reduce_add_epi32(sum13); + // pc3[2]=_mm512_reduce_add_epi32(sum14); 
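+    // Epilogue of the fp32-output variant: avx512_reduce_4 packs the four per-column totals of each
+    // C row into lanes 0..3, the int32 values are converted to float with _mm512_cvt_roundepi32_ps
+    // and multiplied element-wise by the first four entries of `scale`, and the masked store with
+    // 0x000F writes exactly those four floats per row; roughly pcR[j] = (float)dot(R, j) * scale[j].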
+    // pc3[3]=_mm512_reduce_add_epi32(sum15);
+}
+
+void block4x4_kernel_avx512_me(
+    const int32_t k, const float* a, const int32_t lda, const float scale_a,
+    const int8_t* b, const int32_t ldb, float* c, const int32_t ldc, const float* scale) {
+    // LOG(INFO)<<"in_scale = "<<scale_a;
+    const float* pa0 = a;
+    const float* pa1 = pa0 + 1 * lda;
+    const float* pa2 = pa0 + 2 * lda;
+    const float* pa3 = pa0 + 3 * lda;
+
+    const int8_t* pb0 = b;
+    const int8_t* pb1 = pb0 + 1 * ldb;
+    const int8_t* pb2 = pb0 + 2 * ldb;
+    const int8_t* pb3 = pb0 + 3 * ldb;
+
+    float* pc0 = c;
+    float* pc1 = c + 1 * ldc;
+    float* pc2 = c + 2 * ldc;
+    float* pc3 = c + 3 * ldc;
+
+    size_t nk = k >> 5; // k / 32
+    size_t k_leftover = k - (nk << 5); // k % 32
+    __m512i sum0 = _mm512_setzero_si512();
+    __m512i sum1 = _mm512_setzero_si512();
+    __m512i sum2 = _mm512_setzero_si512();
+    __m512i sum3 = _mm512_setzero_si512();
+    __m512i sum4 = _mm512_setzero_si512();
+    __m512i sum5 = _mm512_setzero_si512();
+    __m512i sum6 = _mm512_setzero_si512();
+    __m512i sum7 = _mm512_setzero_si512();
+    __m512i sum8 = _mm512_setzero_si512();
+    __m512i sum9 = _mm512_setzero_si512();
+    __m512i sum10 = _mm512_setzero_si512();
+    __m512i sum11 = _mm512_setzero_si512();
+    __m512i sum12 = _mm512_setzero_si512();
+    __m512i sum13 = _mm512_setzero_si512();
+    __m512i sum14 = _mm512_setzero_si512();
+    __m512i sum15 = _mm512_setzero_si512();
+    __m512 in_scale = _mm512_set1_ps(scale_a);
+
+    for (size_t k = 0; k < nk; ++k) {
+        __m512i temp0;
+        __m512i temp1;
+        __m512i temp2;
+        __m512i temp3;
+
+        __m512i a0 = avx512_loadfp32_int8(pa0, in_scale);
+        __m512i a1 = avx512_loadfp32_int8(pa1, in_scale);
+        __m512i a2 = avx512_loadfp32_int8(pa2, in_scale);
+        __m512i a3 = avx512_loadfp32_int8(pa3, in_scale);
+
+        __m512i b0 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb0));
+        temp0 = _mm512_madd_epi16(a0, b0);
+        temp1 = _mm512_madd_epi16(a1, b0);
+        temp2 = _mm512_madd_epi16(a2, b0);
+        temp3 = _mm512_madd_epi16(a3, b0);
+        sum0 = _mm512_add_epi32(sum0, temp0);
+        sum4 = _mm512_add_epi32(sum4, temp1);
+        sum8 = _mm512_add_epi32(sum8, temp2);
+        sum12 = _mm512_add_epi32(sum12, temp3);
+
+        __m512i b1 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb1));
+        temp0 = _mm512_madd_epi16(a0, b1);
+        temp1 = _mm512_madd_epi16(a1, b1);
+        temp2 = _mm512_madd_epi16(a2, b1);
+        temp3 = _mm512_madd_epi16(a3, b1);
+        sum1 = _mm512_add_epi32(sum1, temp0);
+        sum5 = _mm512_add_epi32(sum5, temp1);
+        sum9 = _mm512_add_epi32(sum9, temp2);
+        sum13 = _mm512_add_epi32(sum13, temp3);
+
+        __m512i b2 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb2));
+        temp0 = _mm512_madd_epi16(a0, b2);
+        temp1 = _mm512_madd_epi16(a1, b2);
+        temp2 = _mm512_madd_epi16(a2, b2);
+        temp3 = _mm512_madd_epi16(a3, b2);
+        sum2 = _mm512_add_epi32(sum2, temp0);
+        sum6 = _mm512_add_epi32(sum6, temp1);
+        sum10 = _mm512_add_epi32(sum10, temp2);
+        sum14 = _mm512_add_epi32(sum14, temp3);
+
+        __m512i b3 = _mm512_cvtepi8_epi16(_mm256_loadu_si256((__m256i*)pb3));
+        temp0 = _mm512_madd_epi16(a0, b3);
+        temp1 = _mm512_madd_epi16(a1, b3);
+        temp2 = _mm512_madd_epi16(a2, b3);
+        temp3 = _mm512_madd_epi16(a3, b3);
+        sum3 = _mm512_add_epi32(sum3, temp0);
+        sum7 = _mm512_add_epi32(sum7, temp1);
+        sum11 = _mm512_add_epi32(sum11, temp2);
+        sum15 = _mm512_add_epi32(sum15, temp3);
+
+
+        pa0 += 32;
+        pa1 += 32;
+        pa2 += 32;
+        pa3 += 32;
+
+        pb0 += 32;
+        pb1 += 32;
+        pb2 += 32;
+        pb3 += 32;
+
+    }
+
+    const __m512 scale_float4 = _mm512_mask_loadu_ps(_mm512_setzero_ps(), 0x000F, scale);
+
+    __m512i temp0 = avx512_reduce_4(sum0, sum1, sum2, sum3);
+    __m512 wirte_0 = _mm512_cvt_roundepi32_ps(temp0, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    wirte_0 = _mm512_mul_ps(wirte_0, scale_float4);
+    _mm512_mask_storeu_ps(pc0, 0x000F, wirte_0);
+
+    __m512i temp1 = avx512_reduce_4(sum4, sum5, sum6, sum7);
+    __m512 wirte_1 = _mm512_cvt_roundepi32_ps(temp1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    wirte_1 =
_mm512_mul_ps(wirte_1, scale_float4); + _mm512_mask_storeu_ps(pc1, 0x000F, wirte_1); + + __m512i temp2 = avx512_reduce_4(sum8, sum9, sum10, sum11); + __m512 wirte_2 = _mm512_cvt_roundepi32_ps(temp2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + wirte_2 = _mm512_mul_ps(wirte_2, scale_float4); + _mm512_mask_storeu_ps(pc2, 0x000F, wirte_2); + + __m512i temp3 = avx512_reduce_4(sum12, sum13, sum14, sum15); + __m512 wirte_3 = _mm512_cvt_roundepi32_ps(temp3, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + wirte_3 = _mm512_mul_ps(wirte_3, scale_float4); + _mm512_mask_storeu_ps(pc3, 0x000F, wirte_3); + + // __m512i temp2=avx512_reduce_4(sum8,sum9,sum10,sum11); + // _mm512_mask_storeu_epi32(pc2,0x000F,temp2); + // __m512i temp3=avx512_reduce_4(sum12,sum13,sum14,sum15); + // _mm512_mask_storeu_epi32(pc3,0x000F,temp3); + + // printf_intrin_var(temp0); + + + // exit(0); + + + // pc0[0]=_mm512_reduce_add_epi32(sum0); + // pc0[1]=_mm512_reduce_add_epi32(sum1); + // pc0[2]=_mm512_reduce_add_epi32(sum2); + // pc0[3]=_mm512_reduce_add_epi32(sum3); + // pc1[0]=_mm512_reduce_add_epi32(sum4); + // pc1[1]=_mm512_reduce_add_epi32(sum5); + // pc1[2]=_mm512_reduce_add_epi32(sum6); + // pc1[3]=_mm512_reduce_add_epi32(sum7); + // pc2[0]=_mm512_reduce_add_epi32(sum8); + // pc2[1]=_mm512_reduce_add_epi32(sum9); + // pc2[2]=_mm512_reduce_add_epi32(sum10); + // pc2[3]=_mm512_reduce_add_epi32(sum11); + // pc3[0]=_mm512_reduce_add_epi32(sum12); + // pc3[1]=_mm512_reduce_add_epi32(sum13); + // pc3[2]=_mm512_reduce_add_epi32(sum14); + // pc3[3]=_mm512_reduce_add_epi32(sum15); +} +/** +* b must packed +*/ +inline void avx512_s8s8s32_gemm_4x4_packed( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 4; + const int n_block = 4; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0 ," << m; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0 ," << n; +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block4x4_kernel_avx512_me(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + } + } +} + +inline void avx512_s8s8s32_gemm_4x4_packed( + const int32_t m, const int32_t n, const int32_t k, + const float* a, const int32_t lda, const float scale_a, + const int8_t* b, const int32_t ldb, + float* c, const int32_t ldc, const float* scale) { + const int m_block = 4; + const int n_block = 4; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0 ," << m; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0 ," << n; +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const float* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + + float* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block4x4_kernel_avx512_me(k, a_ptr, lda, scale_a, b_ptr, ldb, c_ptr, ldc, scale); + } + } +} + +inline void avx512_s8s8s32_gemm_4x4_packed( + const int32_t m, 
const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + float* c, const int32_t ldc, const float* scale) { + const int m_block = 4; + const int n_block = 4; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0 ," << m; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0 ," << n; +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + + float* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block4x4_kernel_avx512_me(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc, scale); + } + } +} + +#endif + +/** +* b must packed +*/ +inline void avx_s8s8s32_gemm_2x4_packed( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 2; + const int n_block = 4; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + // block4x2_kernel_avx2_me(k,a_ptr,lda,b_ptr,ldb,c_ptr,ldc,1); + // block4x2_kernel_avx2_me_k16(k,a_ptr,lda,b_ptr,ldb,c_ptr,ldc,1); + block2x4_kernel_avx2_me_k16(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + } + } +} + +inline void avx_s8s8s32_gemm_2x4_packed_omp_packed( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 2; + const int n_block = 4; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block2x4_kernel_avx2_me_k16_packed(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + } + } +} + +/** +* b must packed +*/ +inline void avx_s8s8s32_gemm_2x4_packed_omp( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 2; + const int n_block = 4; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; + // auto ker = [&](const int ithr, const int nthr) { + // for (int mbi = 0; mbi < mb; mbi++) { + // for (int nbi = 0; nbi < nb; nbi++) { + // const int8_t* a_ptr = &a[mbi * 
m_block * lda]; + // const int8_t* b_ptr = &b[nbi * n_block * ldb]; + // int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + // block2x4_kernel_avx2_me_k16(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + // } + // } + // }; + ////#pragma omp parallel + // { + // ker(anakin_get_thread_num(), anakin_get_num_threads()); + // } + +#if USE_OMP_IN_INTRINSIC_PACKED_FC +#pragma omp parallel for schedule(static) if (anakin_get_max_threads() > 1) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block2x4_kernel_avx2_me_k16(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + } + } + + +} + +inline void avx_s8s8s32_gemm_1x8_packed_omp( + const int32_t m, const int32_t n, const int32_t k, + const int8_t* a, const int32_t lda, + const int8_t* b, const int32_t ldb, + int32_t* c, const int32_t ldc) { + const int m_block = 1; + const int n_block = 8; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; +#if USE_OMP_IN_INTRINSIC_PACKED_FC +#pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + int32_t* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block1x8_kernel_avx2_me_k16(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc); + } + } +} +#if 0 +template +SaberStatus PackedFC::init(int n, int k, int8_t* weights) { + CHECK_EQ(k % 16, 0); + _inner_weights.re_alloc(Shape({1, 1, n, k}), AK_INT8); + int8_t* out_ptr = static_cast(_inner_weights.mutable_data()); + + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + int in_index = i * n + j; + int out_index = j * k + i; + out_ptr[out_index] = weights[in_index]; + } + } + + jit::jit_int8_packed_fc_config_t int8_generate_config; + int8_generate_config.m_block_size = 2; + int8_generate_config.n_block_size = 4; + int8_generate_config.k_block_number = k / 16; + _packed_gemm = new jit::jit_s8s8s32_packed_gemm(int8_generate_config); + _packed_gemm->dump_code(_packed_gemm->getCode()); + return SaberSuccess; +} +#endif +template +SaberStatus PackedFC::init(int n, int k, Tensor& weights_tensor, + float input_scale, + float output_scale, PackedFCAlg alg) { + _alg = alg; + + if (B_Dtype == AK_INT8) { + LOG(INFO) << "init = " << alg; + + if (alg == DotAdd) { + CHECK_EQ(k % 16, 0); + packed_weights_k2(_inner_weights, weights_tensor, n, k, 8); + return SaberSuccess; + } else if (alg == DotReductionPacked) { + CHECK_EQ(k % 16, 0); + packed_weights_transpose_k(_inner_weights, weights_tensor, n, k, 4, 16); + return SaberSuccess; + } else if (alg == DotSplitK) { + CHECK_EQ(k % 2, 0); + packed_weights_k2(_inner_weights, weights_tensor, n, k, 64); + return SaberSuccess; + } else { + CHECK_EQ(k % 16, 0); + _inner_weights.re_alloc(Shape({1, 1, n, k}), AK_INT8); + int8_t* out_ptr = static_cast(_inner_weights.mutable_data()); + + const int8_t* weights = static_cast(weights_tensor.data()); + + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + int in_index = i * n + j; + int out_index = j * k + i; + out_ptr[out_index] = weights[in_index]; + } + } + + jit::jit_int8_packed_fc_config_t int8_generate_config; + int8_generate_config.m_block_size = 
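// The two loops above pack the weights by plain transpose: element (i, j) of
// the row-major k x n weight matrix moves from in_index = i * n + j to
// out_index = j * k + i, so every output neuron ends up with its k weights
// contiguous -- the layout the k-contiguous micro-kernels stream through.
// Tiny illustration (n = 2, k = 3): the weight at (i = 1, j = 0) moves from
// in_index = 1 * 2 + 0 = 2 to out_index = 0 * 3 + 1 = 1.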
2; + int8_generate_config.n_block_size = 4; + int8_generate_config.k_block_number = k / 16; + // _packed_gemm = new jit::jit_s8s8s32_packed_gemm(int8_generate_config); + // _packed_gemm->dump_code(_packed_gemm->getCode()); + return SaberSuccess; + } + } else { + CHECK_EQ(weights_tensor.get_dtype(), AK_FLOAT); + _inner_weights.re_alloc(Shape({1, 1, n, k}), AK_INT8); + Tensor temp_tensor(Shape({1, 1, n, k}), AK_INT8); + int8_t* out_ptr = static_cast(_inner_weights.mutable_data()); + utils::ScaleUtils::scale_gemm_xw_weights_to_nchw_host(temp_tensor, weights_tensor); + const int8_t* weights = static_cast(temp_tensor.data()); + + // printf_pointer(weights,n*k); + // printf_pointer(temp_tensor.get_scale().data(),n); + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + int in_index = i * n + j; + int out_index = j * k + i; + out_ptr[out_index] = weights[in_index]; + } + } + + _inner_weights.set_scale(temp_tensor.get_scale()); + auto weights_scales = _inner_weights.get_scale(); + _scale.clear(); + + for (auto weights_scale : weights_scales) { + _scale.push_back(input_scale * weights_scale / output_scale); + } + + return SaberSuccess; + } + +} + +#if 0 +SaberStatus PackedFC::dispatch(const int m, const int n, const int k, const int8_t* a, + int* c) { + const int8_t* b = static_cast(_inner_weights.data()); + + // if (m == 1 || m % 2 == 1) { + // avx_s8s8s32_gemm_1x8_packed_omp(m, n, k, a, k, b, k, c, n); + // } else { + // avx_s8s8s32_gemm_2x4_packed_omp(m, n, k, a, k, b, k, c, n); + // } + // LOG(INFO)<<"m = "<jit_ker(&int8_config); + // } + // } + + return SaberSuccess; +} + +SaberStatus PackedFC::dispatch(const int m, const int n, const int k, const int8_t* a, + float* c) { + + const int8_t* b = static_cast(_inner_weights.data()); + const float* sclae = _inner_weights.get_scale().data(); + const int m_block = 2; + const int n_block = 4; + const int lda = k; + const int ldb = k; + const int ldc = n; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; +#if USE_OMP_IN_INTRINSIC_PACKED_FC +#pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + float* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block2x4_kernel_avx2_me_k16_pad_s8s8fp32(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc, sclae); + } + } + + return SaberSuccess; +} + +SaberStatus PackedFC::dispatch(const int m, const int n, const int k, const Tensor& a, + float* c) { + if (jit::mayiuse(jit::avx512_core) && a.get_dtype() == AK_INT8 && _scale.size() > 0) { + const int8_t* a_scale_ptr = static_cast(_scale_inputs.data()); + const int8_t* b = static_cast(_inner_weights.data()); + const float* sclae = _scale.data(); + // printf_pointer(sclae,_scale.size()); + const int m_block = 4; + const int n_block = 4; + const int lda = k; + const int ldb = k; + const int ldc = n; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; + LOG(INFO) << "it is scale gemm "; +#if USE_OMP_IN_INTRINSIC_PACKED_FC + #pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; 
nbi++) { + const int8_t* a_ptr = &a_scale_ptr[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + float* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + block4x4_kernel_avx512_scale_me(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc, sclae); + } + } + + } else { + CHECK_EQ(a.get_dtype(), AK_FLOAT); + utils::try_expand_tensor(_scale_inputs, a.valid_shape()); + utils::ScaleUtils::scale_fp32_int8(_scale_inputs, a); + const int8_t* a_scale_ptr = static_cast(_scale_inputs.data()); + const int8_t* b = static_cast(_inner_weights.data()); + const float* sclae = _scale.data(); + // printf_pointer(sclae,_scale.size()); + const int m_block = 2; + const int n_block = 4; + const int lda = k; + const int ldb = k; + const int ldc = n; + int mb = m / m_block; + int nb = n / n_block; + int m_remainder = m % m_block; + int n_remainder = n % n_block; + CHECK_EQ(m_remainder, 0) << "only support remainder = 0"; + CHECK_EQ(n_remainder, 0) << "only support remainder = 0"; + +#if USE_OMP_IN_INTRINSIC_PACKED_FC +#pragma omp parallel for schedule(static) +#endif + + for (int mbi = 0; mbi < mb; mbi++) { + for (int nbi = 0; nbi < nb; nbi++) { + const int8_t* a_ptr = &a_scale_ptr[mbi * m_block * lda]; + const int8_t* b_ptr = &b[nbi * n_block * ldb]; + float* c_ptr = &c[mbi * m_block * n + nbi * n_block]; + // LOG(INFO)<<"are you ok"; + // printf_pointer(a_ptr,2*k); + + block2x4_kernel_avx2_me_k16_pad_s8s8fp32(k, a_ptr, lda, b_ptr, ldb, c_ptr, ldc, sclae); + } + } + } + + return SaberSuccess; +} + +#endif + +template < DataType datatype> +struct MyDataTrait { + typedef __invalid_type Dtype; +}; +template <> +struct MyDataTrait { + typedef float Dtype; +}; +template <> +struct MyDataTrait { + typedef int Dtype; +}; +template <> +struct MyDataTrait { + typedef int8_t Dtype; +}; +template <> +struct MyDataTrait { + typedef uint8_t Dtype; +}; +template <> +SaberStatus PackedFC::dispatch(const int m, const int n, const int k, + const Tensor& tensor_a, + Tensor& tensor_c) { + CHECK_EQ(tensor_a.get_dtype(), AK_INT8); + CHECK(tensor_c.get_dtype() == AK_INT32 || tensor_c.get_dtype() == AK_FLOAT); + const int8_t* b = static_cast(_inner_weights.data()); + const int8_t* a = static_cast(tensor_a.data()); + int* c = static_cast(tensor_c.mutable_data()); + + if (_alg == DotAdd) { +#if defined(__AVX2__) and defined(__FMA__) + // avx_s8s8s32_gemm_mx8_packed_dot_add(m, n, k, a, k, b, k, c, n); + avx_s8s8s32_gemm_4x8_packed_dot_add(m, n, k, a, k, b, k, c, n); +#else + LOG(FATAL) << "not impl"; +#endif + } else if (_alg == DotReductionPacked) { +#if defined(__AVX2__) and defined(__FMA__) + avx_s8s8s32_gemm_2x4_packed_omp_packed(m, n, k, a, k, b, k, c, n); +#else + LOG(FATAL) << "not impl"; +#endif + } else if (_alg == DotSplitK) { +#if defined(__AVX2__) and defined(__FMA__) + avx_s8s8s32_gemm_4x64_packed_split_k(m, n, k, a, k, b, k, c, n); +#else + LOG(FATAL) << "not impl"; +#endif + } else { +#if defined(__AVX512F__) + avx512_s8s8s32_gemm_4x4_packed(m, n, k, a, k, b, k, c, n); +#elif defined(__AVX2__) and defined(__FMA__) + avx_s8s8s32_gemm_2x4_packed_omp(m, n, k, a, k, b, k, c, n); +#else + LOG(FATAL) << "not impl"; +#endif + } + + return SaberSuccess; +} +template <> +SaberStatus PackedFC::dispatch(const int m, const int n, const int k, + const Tensor& tensor_a, + Tensor& tensor_c) { + CHECK_EQ(tensor_a.get_dtype(), AK_FLOAT); + CHECK_EQ(tensor_c.get_dtype(), AK_FLOAT); + CHECK_EQ(_scale.size(), n); + CHECK_EQ(tensor_a.get_scale().size(), 1); + const float scale_a = 1.f / tensor_a.get_scale()[0]; + const float* sclae = 
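// Note: "sclae" here and elsewhere in this file is the patch's spelling of
// "scale"; it points at the per-output-channel factors that init() computed
// as input_scale * weights_scale / output_scale.
// Rough usage sketch of PackedFC as this file suggests (template arguments
// are elided in this diff's rendering; values are illustrative only):
//   PackedFC<...> fc;
//   fc.init(n, k, weights_tensor, input_scale, output_scale, DotReduction);
//   fc.dispatch(m, n, k, input_tensor, output_tensor);
// dispatch() routes on _alg -- DotAdd, DotReductionPacked and DotSplitK go to
// their dedicated AVX2 kernels, anything else falls back to the AVX-512 4x4
// or AVX2 2x4 packed GEMM -- as the guarded branches above show.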
_scale.data(); + const int8_t* b = static_cast(_inner_weights.data()); + const float* a = static_cast(tensor_a.data()); + float* c = static_cast(tensor_c.mutable_data()); +#if defined(__AVX512F__) + avx512_s8s8s32_gemm_4x4_packed(m, n, k, a, k, scale_a, b, k, c, n, sclae); +#else + LOG(FATAL) << "not impl"; +#endif + return SaberSuccess; +} +//template <> +//SaberStatus PackedFC::dispatch(const int m, const int n, const int k, const Tensor& tensor_a, +// Tensor &tensor_c) { +// CHECK_EQ(_scale.size(),n); +// CHECK_EQ(tensor_a.get_scale().size(),1); +// const float scale_a=1.f/tensor_a.get_scale()[0]; +// const float* sclae=_scale.data(); +// const int8_t* b = static_cast(_inner_weights.data()); +// const int8_t * a= static_cast(tensor_a.data()); +// float* c= static_cast(tensor_c.mutable_data()); +// avx512_s8s8s32_gemm_4x4_packed(m, n, k, a, k,scale_a, b, k, c, n,sclae); +// return SaberSuccess; +//} + +template class PackedFC; +template class PackedFC; +//template class PackedFC; +#else + +template <> +SaberStatus PackedFC:: +init(int n, int k, Tensor& weights_tensor,float input_scale,float output_scale,PackedFCAlg alg) { + LOG(FATAL) << "not impl"; + return SaberSuccess; +} + +template <> +SaberStatus PackedFC:: +init(int n, int k, Tensor& weights_tensor,float input_scale,float output_scale,PackedFCAlg alg) { + LOG(FATAL) << "not impl"; + return SaberSuccess; +} + +template <> +SaberStatus PackedFC:: +dispatch(const int m, const int n, const int k, const Tensor& tensor_a, + Tensor& tensor_c) { + LOG(FATAL) << "not impl"; + return SaberSuccess; +}; + +template <> +SaberStatus PackedFC:: +dispatch(const int m, const int n, const int k, const Tensor& tensor_a, + Tensor& tensor_c) { + LOG(FATAL) << "not impl"; + return SaberSuccess; +}; + +#endif + +} +} diff --git a/saber/funcs/impl/x86/intrinsic_packed_fc.h b/saber/funcs/impl/x86/intrinsic_packed_fc.h new file mode 100644 index 000000000..71d1a4b49 --- /dev/null +++ b/saber/funcs/impl/x86/intrinsic_packed_fc.h @@ -0,0 +1,184 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_INTRINSIC_PACKED_FC_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_INTRINSIC_PACKED_FC_H +#include "saber/core/tensor.h" +#include "saber/funcs/gemm.h" +#include "jit_generator.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" + +namespace anakin { +namespace saber { +namespace jit{ +static int print_buffer[32] {0}; +struct jit_s8s8s32_packed_gemm: public jit_generator { + + jit_s8s8s32_packed_gemm(jit_int8_packed_fc_config_t ajcp) : jcp(ajcp) { + +// real_printf(123); +// real_printf_fp32(); + print_func_ptr = (void*)&real_printf; + print_vec_func_ptr = (void*)&real_printf_fp32; + this->generate(); + jit_ker = (void (*)(jit_int8_packed_fc_call_t*))this->getCode(); +// LOG(INFO) << "gen done"; + + } + + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_s8s8s32_packed_gemm); + + void (*jit_ker)(jit_int8_packed_fc_call_t*); + + + +private: + void cal_one_block(); + void load_and_init(); + void reduction_and_store2mem(); + static void real_printf(size_t x) { + printf("real_printf %d , %p \n", x, x); + } + static void real_printf_fp32() { + for (int i = 0; i < 8; i++) { + printf("avx printf[%d] = %d\n",i, print_buffer[i]); + } + for (int i = 0; i < 8; i++) { + print_buffer[i]=-i; + } + } + + + + void* print_func_ptr{nullptr}; + void* print_vec_func_ptr{nullptr}; + void print_jit(Xbyak::Reg64 reg) { + save_common_regs(); + mov(rax, (size_t)print_func_ptr); + mov(abi_param1, reg); + call(rax); + restore_common_regs(); + } + + void print_jit_vec(Xbyak::Ymm reg) { + save_common_regs(); + mov(rax, 
(size_t)print_vec_func_ptr); + mov(r15, (size_t)&print_buffer[0]); + vmovdqu(ptr[r15], reg); + call(rax); + restore_common_regs(); + } + + void print_jit_vec(Xbyak::Xmm reg) { + save_common_regs(); + mov(rax, (size_t)print_vec_func_ptr); + mov(r15, (size_t)&print_buffer[0]); + movdqu(ptr[r15], reg); + call(rax); + restore_common_regs(); + } + + using reg64_t = const Xbyak::Reg64; + reg64_t reg_input = rax; + reg64_t reg_output = rbx; + reg64_t reg_weights = rcx; + reg64_t reg_k_block_size = rdx; + reg64_t reg_k_block_num = r8; + // reg64_t reg_debug=r9; + + reg64_t reg_lda = rsi; + reg64_t reg_ldb = r9; + reg64_t temp_0 = rsi; + reg64_t temp_1 = r9; + reg64_t reg_ldc = rsi; + + + + reg64_t address_a_0 = r10; + reg64_t address_a_1 = r11; + reg64_t address_b_0 = r12; + reg64_t address_b_1 = r13; + reg64_t address_b_2 = r14; + reg64_t address_b_3 = r15; + + + + Xbyak::Ymm sum_row0_col0 = Xbyak::Ymm(0); + Xbyak::Ymm sum_row0_col1 = Xbyak::Ymm(1); + Xbyak::Ymm sum_row0_col2 = Xbyak::Ymm(2); + Xbyak::Ymm sum_row0_col3 = Xbyak::Ymm(3); + Xbyak::Ymm c_row0_col0_1 = Xbyak::Ymm(0); + Xbyak::Ymm c_row0_col2_3 = Xbyak::Ymm(1); + Xbyak::Ymm c_row0_col0_1_2_3 = Xbyak::Ymm(0); + Xbyak::Xmm c_row0_col0_1_2_3_m128 = Xbyak::Xmm(0); + + Xbyak::Ymm sum_row1_col0 = Xbyak::Ymm(4); + Xbyak::Ymm sum_row1_col1 = Xbyak::Ymm(5); + Xbyak::Ymm sum_row1_col2 = Xbyak::Ymm(6); + Xbyak::Ymm sum_row1_col3 = Xbyak::Ymm(7); + Xbyak::Ymm c_row1_col0_1 = Xbyak::Ymm(4); + Xbyak::Ymm c_row1_col2_3 = Xbyak::Ymm(5); + Xbyak::Ymm c_row1_col0_1_2_3 = Xbyak::Ymm(4); + Xbyak::Xmm c_row1_col0_1_2_3_m128 = Xbyak::Xmm(4); + + + Xbyak::Ymm a0 = Xbyak::Ymm(8); + Xbyak::Ymm a1 = Xbyak::Ymm(9); + Xbyak::Ymm b0 = Xbyak::Ymm(10); + Xbyak::Ymm b1 = Xbyak::Ymm(11); + Xbyak::Ymm b2 = Xbyak::Ymm(12); + Xbyak::Ymm b3 = Xbyak::Ymm(13); + Xbyak::Xmm a0_xmm = Xbyak::Xmm(8); + Xbyak::Xmm a1_xmm = Xbyak::Xmm(9); + Xbyak::Xmm b0_xmm = Xbyak::Xmm(10); + Xbyak::Xmm b1_xmm = Xbyak::Xmm(11); + Xbyak::Xmm b2_xmm = Xbyak::Xmm(12); + Xbyak::Xmm b3_xmm = Xbyak::Xmm(13); + Xbyak::Ymm zero_in_reduction = Xbyak::Ymm(8); + Xbyak::Ymm temp0_in_reduction = Xbyak::Ymm(9); + Xbyak::Ymm temp1_in_reduction = Xbyak::Ymm(10); + Xbyak::Ymm temp2_in_reduction = Xbyak::Ymm(11); + Xbyak::Ymm temp3_in_reduction = Xbyak::Ymm(12); + + Xbyak::Ymm vtemp_0 = Xbyak::Ymm(14); + Xbyak::Ymm vtemp_1 = Xbyak::Ymm(15); + Xbyak::Ymm vtemp_3 = Xbyak::Ymm(16); + Xbyak::Ymm vtemp_4 = Xbyak::Ymm(17); + jit_int8_packed_fc_config_t jcp; + const size_t aligned_length = 16; + + void generate(); +}; +} + +enum PackedFCAlg : int{ + DotReduction=0, + DotAdd, + DotReductionPacked, + DotSplitK, +}; + +template +class PackedFC { +public: + PackedFC(){ + _scale_inputs.re_alloc(Shape({1,1,1,64}),AK_INT8); + } + ~PackedFC(){ + delete _packed_gemm; + } +// SaberStatus init(int n,int k,int8_t* weights); + SaberStatus init(int n, int k, Tensor& weights_tensor,float input_scale=1.f,float output_scale=1.f,PackedFCAlg alg=DotReduction); + + SaberStatus dispatch(const int m, const int n, const int k, const Tensor&tensor_a, + Tensor &tensor_c); + + Tensor _inner_weights; +private: + + Tensor _scale_inputs; + jit::jit_s8s8s32_packed_gemm* _packed_gemm{nullptr}; + std::vector _scale; + PackedFCAlg _alg; +}; + +} +} +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_INTRINSIC_PACKED_FC_H diff --git a/saber/funcs/impl/x86/kernel/.DS_Store b/saber/funcs/impl/x86/kernel/.DS_Store new file mode 100644 index 000000000..5008ddfcf Binary files /dev/null and b/saber/funcs/impl/x86/kernel/.DS_Store differ diff --git 
a/saber/funcs/impl/x86/kernel/jit_avx2_conv.cpp b/saber/funcs/impl/x86/kernel/jit_avx2_conv.cpp index 5ae62f209..426c59228 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx2_conv.cpp +++ b/saber/funcs/impl/x86/kernel/jit_avx2_conv.cpp @@ -2,17 +2,19 @@ #include "saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.h" #include "saber/funcs/impl/x86/kernel/jit_avx2_conv.h" #include "saber/funcs/impl/x86/x86_utils.h" +#include "saber/funcs/impl/x86/saber_normal_activation.h" +#include "debug.h" namespace anakin { namespace saber { using namespace jit; -using jit_conv_ker_t = void (*)(jit_conv_call_t *); +using jit_conv_ker_t = void (*)(jit_conv_call_t*); -inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t &p, - const void *src, const void *dst, - const void *filt, const void *bias, +inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t& p, + const void* src, const void* dst, + const void* filt, const void* bias, int channel, int kh_padding) { #define PIPELINE(field) \ do { \ @@ -34,24 +36,27 @@ inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t &p, template <> SaberStatus JitAvx2Conv::check_conf( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m) { + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { - ConvParam *conv_param = ¶m.conv_param; - const Tensor *weights = conv_param->weight(); - const Tensor *bias = conv_param->bias(); + ConvParam* conv_param = ¶m.conv_param; + const Tensor* weights = conv_param->weight(); + const Tensor* bias = conv_param->bias(); const jit_conv_conf_t jcp = kernel->jcp; - Tensor *input = inputs[0]; - Tensor *output = outputs[0]; + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; // check format - if (((inputs[0]->get_layout() != Layout_NCHW) && ( - inputs[0]->get_layout() != Layout_NCHW_C8)) - || (outputs[0]->get_layout() != Layout_NCHW_C8) - || (weights->get_layout() != Layout_NCHW)) - { - LOG(ERROR) << "wrong format"; + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); + bool is_layout_ok = (input_layout == Layout_NCHW || input_layout == Layout_NCHW_C8 + || input_layout == Layout_NCHW_C8R) + && (output_layout == Layout_NCHW || output_layout == Layout_NCHW_C8 + || output_layout == Layout_NCHW_C8R); + + if (!is_layout_ok) { + LOG(FATAL) << "wrong format layout " << inputs[0]->get_layout() << "," << outputs[0]->get_layout(); return SaberUnImplError; } @@ -61,19 +66,19 @@ SaberStatus JitAvx2Conv::check_conf( && jcp.l_pad == conv_param->pad_w && jcp.stride_h == conv_param->stride_h && jcp.stride_w == conv_param->stride_w - && jcp.dilate_h == conv_param->dilation_h - && jcp.dilate_w == conv_param->dilation_w; - + && jcp.dilate_h == conv_param->dilation_h - 1 + && jcp.dilate_w == conv_param->dilation_w - 1; +// LOG(INFO) << "jcp.t_pad " << jcp.t_pad << "," << conv_param->pad_h; // check shape bool shape_ok = true && jcp.kh == weights->height() && jcp.kw == weights->width() && jcp.ngroups == 1 && jcp.mb == input->num() - && jcp.ic == input->channel() + && jcp.ic == utils::round_up(input->channel(), 8) && jcp.ih == input->height() && jcp.iw == input->width() - && jcp.oc == output->channel() + && jcp.oc == utils::round_up(output->channel(), 8) && jcp.oh == output->height() && jcp.ow == output->width(); @@ -87,22 +92,27 @@ SaberStatus JitAvx2Conv::check_conf( template<> SaberStatus JitAvx2Conv::create( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m, Context &ctx) { + 
const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, Context& ctx) { + DLOG(INFO) << "input layout " << inputs[0]->get_layout() << " , output layout " << + outputs[0]->get_layout(); SaberStatus status = SaberSuccess; - ConvParam *conv_param = ¶m.conv_param; - ActivationParam *act_param = nullptr; - const Tensor *weights = conv_param->weight(); - Tensor *input = inputs[0]; - Tensor *output = outputs[0]; + ConvParam* conv_param = ¶m.conv_param; + ActivationParam* act_param = nullptr; + const Tensor* weights = conv_param->weight(); + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; + // check conf if (kernel) { status = check_conf(inputs, outputs, param); - if(status != SaberNotInitialized) { + + if (status != SaberNotInitialized) { return status; } } + // init conf conf.src_fmt = input->get_layout(); conf.ngroups = 1; @@ -115,6 +125,18 @@ SaberStatus JitAvx2Conv::create( conf.oh = output->height(); conf.ow = output->width(); + if (input->get_layout() == Layout_NCHW_C8R) { + conf.ic = utils::round_up(input->channel(), 8); + conf.src_fmt = Layout_NCHW_C8; + DLOG(INFO) << "input->get_layout == Layout_NCHW_C8R"; + } + + if (output->get_layout() == Layout_NCHW_C8R) { + conf.oc = utils::round_up(output->channel(), 8); + } + + DLOG(INFO) << "oc = " << conf.oc << ", ic = " << conf.ic; + conf.kh = weights->height(); conf.kw = weights->width(); conf.stride_h = conv_param->stride_h; @@ -124,97 +146,543 @@ SaberStatus JitAvx2Conv::create( conf.dilate_h = conv_param->dilation_h <= 0 ? 0 : (conv_param->dilation_h - 1); conf.dilate_w = conv_param->dilation_w <= 0 ? 0 : (conv_param->dilation_w - 1); - conf.with_bias = (conv_param->bias()!= NULL); + conf.with_sum = false; + + if (param.eltwise_param.has_eltwise){ + conf.with_sum = true; + } + conf.with_bias = (conv_param->bias() != NULL)&&(conv_param->bias()->valid_size()>0); conf.with_relu = conv_param->activation_param.has_active; - + if (conf.with_relu) { act_param = &(conv_param->activation_param); conf.relu_negative_slope = act_param->negative_slope; } + status = jit_avx2_conv_act_kernel::init_conf(conf); + if (status == SaberSuccess) { if (kernel != nullptr) { delete kernel; kernel = nullptr; } + kernel = new jit_avx2_conv_act_kernel(this->conf); } else { return SaberUnImplError; } + // reorder weights - Tensor *weights_reorder = conv_param->mutable_weight(); + Tensor* weights_reorder = conv_param->mutable_weight(); weights_internal.reset(new Tensor(weights_reorder->valid_shape())); if (inputs[0]->get_layout() == Layout_NCHW) { weight_reorder_OIhwi8o(*weights_reorder, *weights_internal); - } else if (inputs[0]->get_layout() == Layout_NCHW_C8) { + } else if (inputs[0]->get_layout() == Layout_NCHW_C8 + || inputs[0]->get_layout() == Layout_NCHW_C8R) { weight_reorder_OIhw8i8o(*weights_reorder, *weights_internal); } if (conf.with_bias) { - Shape bias_s({1,conf.oc,1,1}, Layout_NCHW); + Shape bias_s({1, conf.oc, 1, 1}, Layout_NCHW); bias_internal.reset(new Tensor(bias_s)); bias_internal->set_shape(conv_param->bias()->valid_shape(), bias_s); bias_internal->copy_from(*conv_param->bias()); } + if (outputs[0]->get_layout() == Layout_NCHW) { + Shape shape = outputs[0]->valid_shape(); + int n_value = shape[0], c_value = shape[1], h_value = shape[2], w_value = shape[3]; + Shape new_shape({n_value, utils::round_up(c_value, 8) / 8, h_value, w_value, 8}, Layout_NCHW_C8); + _temp_output.reshape(new_shape); + } + return SaberSuccess; } template <> SaberStatus JitAvx2Conv::init( - const std::vector*>& inputs, - std::vector*>& outputs, 
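// Note on the plain-NCHW output path added in this hunk: when outputs[0] is
// Layout_NCHW, create() sizes the blocked scratch tensor _temp_output as
// {n, round_up(c, 8) / 8, h, w, 8} in Layout_NCHW_C8, the JIT kernel writes
// into that buffer, and dispatch() converts it back at the end with
// reorder_nchwc8_nchw(_temp_output, *outputs[0]).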
- ConvEltwiseParam ¶m, Context &ctx) { - - ConvParam *conv_param = ¶m.conv_param; - if (((inputs[0]->get_layout() != Layout_NCHW) && ( - inputs[0]->get_layout() != Layout_NCHW_C8)) - || (outputs[0]->get_layout() != Layout_NCHW_C8) - || (conv_param->weight()->get_layout() != Layout_NCHW)) { - - LOG(ERROR) << "wrong format"; + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, Context& ctx) { + + ConvParam* conv_param = ¶m.conv_param; + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); + bool is_layout_ok = (input_layout == Layout_NCHW || input_layout == Layout_NCHW_C8 + || input_layout == Layout_NCHW_C8R) + && (output_layout == Layout_NCHW || output_layout == Layout_NCHW_C8 + || output_layout == Layout_NCHW_C8R); + + if (!is_layout_ok) { + LOG(FATAL) << "wrong format layout " << inputs[0]->get_layout() << "," << outputs[0]->get_layout(); return SaberUnImplError; } + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } +void conv_basic_check(Tensor& tensor_in, Tensor& tensor_out, + const float* weights, const float* bias, int group, + int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, + int pad_w, int pad_h, bool flag_bias, bool flag_relu, float beta = 0.f) { + + auto src_data = reinterpret_cast(tensor_in.data()); + auto dst_data_ref = reinterpret_cast(tensor_out.mutable_data()); + Tensor bk; + bk.re_alloc(tensor_out.valid_shape(), AK_FLOAT); + bk.copy_from(tensor_out); + auto weights_data = weights; + bool with_bias = flag_bias; + auto bias_data = bias; + + int in_num = tensor_out.num(); + int out_channels = tensor_out.channel(); + int out_h = tensor_out.height(); + int out_w = tensor_out.width(); + + int in_channel = tensor_in.channel(); + int in_h = tensor_in.height(); + int in_w = tensor_in.width(); + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + #pragma omp parallel for num_threads(8) collapse(5) schedule(static) + + for (int n = 0; n < in_num; ++n) { + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * group * out_c_group * out_h * out_w + g * out_c_group * out_h * out_w + + oc * out_h * out_w + oh * out_w + ow; + float bias_d = with_bias ? (float)(bias_data[g * out_c_group + oc]) : 0.f; + dst_data_ref[out_idx] = bias_d + dst_data_ref[out_idx] * beta; + + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dilation_w); + int ih = oh * stride_h - pad_h + kh * (dilation_h); + + if (iw < 0 || iw >= in_w) { + continue; + } + + if (ih < 0 || ih >= in_h) { + continue; + } + + int iidx = n * in_channel * in_h * in_w + + g * in_c_group * in_h * in_w + + ic * in_h * in_w + + ih * in_w + + iw; + int widx = g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + + kh * kernel_w + + kw; + + dst_data_ref[out_idx] + += src_data[iidx] + * weights_data[widx]; + } + } + } + + if (flag_relu) { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > 0.f ? 
dst_data_ref[out_idx] : 0.f; + } + } + } + } + } + } +} + +static inline void conv_basic_check_nchwc(const float* src_data, float* dst_data_ref, int in_num, + int in_channel, int in_h, int in_w, + int out_channels, int out_h, int out_w, + const float* weights, const float* bias, + int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, + int pad_w, int pad_h, bool flag_bias, bool flag_relu) { + + // #pragma omp parallel for num_threads(8) collapse(5) schedule(static) + int in_channel_div8 = utils::div_up(in_channel, 8); + int out_channel_div8 = utils::div_up(out_channels, 8); + + for (int n = 0; n < in_num; ++n) { + for (int oc = 0; oc < out_channel_div8; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * out_channel_div8 * out_h * out_w * 8 + + oc * out_h * out_w * 8 + + oh * out_w * 8 + + ow * 8; + float result[8] = {0.f}; + + if (flag_bias) { + for (int i = 0; i < 8; i++) { + result[i] = bias[oc * 8 + i]; + } + } + + for (int ic = 0; ic < in_channel_div8; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dilation_w); + int ih = oh * stride_h - pad_h + kh * (dilation_h); + + if (iw < 0 || iw >= in_w) { + continue; + } + + if (ih < 0 || ih >= in_h) { + continue; + } + + for (int inner_oc = 0; inner_oc < 8; inner_oc++) { + for (int inner_ic = 0; inner_ic < 8; inner_ic++) { + + int iidx = n * in_channel_div8 * in_h * in_w * 8 + + ic * in_h * in_w * 8 + + ih * in_w * 8 + + iw * 8 + inner_ic; + int widx = oc * in_channel_div8 * kernel_h * kernel_w * 8 * 8 + + ic * kernel_h * kernel_w * 8 * 8 + + kh * kernel_w * 8 * 8 + + kw * 8 * 8 + + inner_ic * 8 + inner_oc; + + result[inner_oc] + += src_data[iidx] + * weights[widx]; + + } + } + } + } + } + + for (int inner_oc = 0; inner_oc < 8; inner_oc++) { + if (flag_relu) { + dst_data_ref[out_idx + inner_oc] = result[inner_oc] > 0.f ? 
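// Layout reminder for this NCHW_C8 reference path: with C8 = ceil(C / 8), an
// activation element (n, c, h, w) lives at
//   ((n * C8 + c / 8) * H + h) * W * 8 + w * 8 + (c % 8)
// which is exactly how out_idx and iidx are formed above, while the weights
// are read in the OIhw8i8o order produced by weight_reorder_OIhw8i8o
// (kh, kw, then 8 input channels, then 8 output channels innermost).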
result[inner_oc] : 0.f; + } else { + dst_data_ref[out_idx + inner_oc] = result[inner_oc]; + } + } + + } + } + } + } +} +#if defined(__AVX2__) and defined(__FMA__) +static inline void conv_basic_check_nchwc_avx2(const float* src_data, float* dst_data_ref, + int in_num, + int in_channel, int in_h, int in_w, + int out_channels, int out_h, int out_w, + const float* weights, const float* bias, + int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, + int pad_w, int pad_h, bool flag_bias, bool flag_relu) { + + // #pragma omp parallel for num_threads(8) collapse(5) schedule(static) + int in_channel_div8 = utils::div_up(in_channel, 8); + int out_channel_div8 = utils::div_up(out_channels, 8); + + for (int n = 0; n < in_num; ++n) { + for (int oc = 0; oc < out_channel_div8; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * out_channel_div8 * out_h * out_w * 8 + + oc * out_h * out_w * 8 + + oh * out_w * 8 + + ow * 8; + __m256 result = _mm256_setzero_ps(); + + if (flag_bias) { + result = _mm256_loadu_ps(bias + oc * 8); + } + + for (int ic = 0; ic < in_channel_div8; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dilation_w); + int ih = oh * stride_h - pad_h + kh * (dilation_h); + + if (iw < 0 || iw >= in_w) { + continue; + } + + if (ih < 0 || ih >= in_h) { + continue; + } + + const float* inpute_base = src_data + n * in_channel_div8 * in_h * in_w * 8 + + ic * in_h * in_w * 8 + + ih * in_w * 8 + + iw * 8; + __m256 input_8 = _mm256_loadu_ps(inpute_base); + // LOG(INFO)<<":::"<= in_w) { + continue; + } + + if (ih < 0 || ih >= in_h) { + continue; + } + + const float* inpute_base = src_data + n * in_channel_div8 * in_h * in_w * 8 + + ic * in_h * in_w * 8 + + ih * in_w * 8 + + iw * 8; + // LOG(INFO)<<":::"< SaberStatus JitAvx2Conv::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m) { - - ConvParam *conv_param = ¶m.conv_param; - const Tensor *bias = conv_param->bias(); + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + + + ConvParam* conv_param = ¶m.conv_param; + + bool with_bias=(conv_param->bias() != NULL)&&(conv_param->bias()->valid_size()>0); + + + const float* ptr_src = reinterpret_cast(inputs[0]->data()); + const float* ptr_weights = reinterpret_cast(weights_internal->data()); + const float* ptr_bias = (conv_param->bias() != NULL)&&(conv_param->bias()->valid_size()>0) ? 
reinterpret_cast(bias_internal->data()) : nullptr; + float* ptr_dst = nullptr; + + // if(inputs[0]->get_layout()==Layout_NCHW_C8R&&outputs[0]->get_layout()==Layout_NCHW_C8R){ + //// Shape in_nchw=inputs[0]->valid_shape(); + //// in_nchw.set_layout_without_shape(Layout_NCHW); + //// Tensor temp_in(in_nchw); + //// Shape out_nchw=outputs[0]->valid_shape(); + //// out_nchw.set_layout_without_shape(Layout_NCHW); + //// Tensor temp_out(out_nchw); + //// reorder_nchwc8_nchw(*inputs[0],temp_in); + //// conv_basic_check(temp_in,temp_out, static_cast(conv_param->weight()->data()), + //// static_cast(conv_param->bias()->data()),conv_param->group,conv_param->weight()->width(), + //// conv_param->weight()->height(),conv_param->stride_w,conv_param->stride_h,conv_param->dilation_w, + //// conv_param->dilation_h,conv_param->pad_w,conv_param->pad_h,conv_param->bias()!=nullptr, + //// conv_param->activation_param.active==Active_relu,0); + //// input_reorder_nChwc8(temp_out,*outputs[0]); + // + //// LOG(INFO)<valid_shape()<<",out = "<valid_shape(); + //// weight_reorder_nchw2nchw8o8i(*conv_param->mutable_weight(),*weights_internal); + // conv_basic_check_nchwc_avx2_h4(ptr_src,reinterpret_cast(outputs[0]->mutable_data()),inputs[0]->num(),inputs[0]->channel(),inputs[0]->height(), + // inputs[0]->width(),outputs[0]->channel(),outputs[0]->height(),outputs[0]->width(), + // ptr_weights,ptr_bias,conv_param->weight()->width(), + // conv_param->weight()->height(),conv_param->stride_w,conv_param->stride_h,conv_param->dilation_w, + // conv_param->dilation_h,conv_param->pad_w,conv_param->pad_h,conv_param->bias()!=nullptr, conv_param->activation_param.active==Active_relu); + // return SaberSuccess; + // + // } + + + if (outputs[0]->get_layout() == Layout_NCHW) { + ptr_dst = reinterpret_cast(_temp_output.mutable_data()); + } else { + ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + } - const float *ptr_src = reinterpret_cast(inputs[0]->data()); - const float *ptr_weights = reinterpret_cast(weights_internal->data()); - const float *ptr_bias = bias? 
reinterpret_cast(bias_internal->data()) : nullptr; - auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); - const auto &jcp = kernel->jcp; + DLOG(INFO) << "input layout " << inputs[0]->get_layout() << " , output layout " << + outputs[0]->get_layout() << "," << anakin_get_thread_num() << "," << anakin_get_num_threads() << "::" << + conf.with_relu << "," << conf.with_bias; + const auto& jcp = kernel->jcp; int ocb_work = utils::div_up(jcp.nb_oc, jcp.nb_oc_blocking); const size_t work_amount = jcp.mb * jcp.ngroups * ocb_work * jcp.oh; auto ker = [&](const int ithr, const int nthr) { size_t start{0}, end{0}; - utils::balance211(work_amount, nthr, ithr, start, end); - + balance211(work_amount, nthr, ithr, start, end); int icbb = 0; + while (icbb < jcp.nb_ic) { int icb_step = jcp.nb_ic_blocking; int icb_step_rem = jcp.nb_ic - icbb; + if (icb_step_rem < jcp.nb_ic_blocking_max) { icb_step = icb_step_rem; } size_t n{0}, g{0}, ocbb{0}, oh{0}; - utils::nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + for (size_t iwork = start; iwork < end; ++iwork) { int ocb = ocbb * jcp.nb_oc_blocking; int ocb_num = jcp.nb_oc_blocking; @@ -224,8 +692,8 @@ SaberStatus JitAvx2Conv::dispatch( par_conv.flags = 0; const int ij = oh * jcp.stride_h; const int i_t_overflow = utils::max(0, jcp.t_pad - ij); - const int i_b_overflow = utils::max(jcp.ih, ij - + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih; + const int i_b_overflow = utils::max(jcp.ih, ij + + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih; const size_t _oc = g * jcp.nb_oc + ocb; const size_t _ic = g * jcp.nb_ic + icb; @@ -234,25 +702,28 @@ SaberStatus JitAvx2Conv::dispatch( const int wgt_ic = jcp.ic == 3 ? 0 : icb; const int ih = utils::max(ij - jcp.t_pad + utils::div_up(i_t_overflow, - (jcp.dilate_h + 1)) * (jcp.dilate_h + 1), 0); + (jcp.dilate_h + 1)) * (jcp.dilate_h + 1), 0); - par_conv.src = (jcp.src_fmt == Layout_NCHW)? ptr_src + n * jcp.ic * jcp.ih * jcp.iw + - src_ic * jcp.ih * jcp.iw + ih * jcp.iw : - ptr_src + n * jcp.ic * jcp.ih * jcp.iw + src_ic * jcp.ih * jcp.iw * 8 + par_conv.src = (jcp.src_fmt == Layout_NCHW) ? ptr_src + n * jcp.ic * jcp.ih * jcp.iw + + src_ic * jcp.ih * jcp.iw + ih * jcp.iw : + ptr_src + n * jcp.ic * jcp.ih * jcp.iw + src_ic * jcp.ih * jcp.iw * 8 + ih * jcp.iw * 8; - - par_conv.dst = ptr_dst + n * jcp.oc * jcp.oh * jcp.ow + _oc * jcp.oh * jcp.ow * 8 + + par_conv.dst = ptr_dst + n * jcp.oc * jcp.oh * jcp.ow + _oc * jcp.oh * jcp.ow * 8 + oh * jcp.ow * 8; - + const int wh = utils::div_up(i_t_overflow, (jcp.dilate_h + 1)); - par_conv.filt = ptr_weights + ocb * jcp.ic * jcp.kh * jcp.kw * 8 * 8 + + par_conv.filt = (jcp.src_fmt == Layout_NCHW) ? 
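// Weight addressing note for the two branches below: an NCHW source walks
// weights in the order produced by weight_reorder_OIhwi8o (8 output channels
// innermost, input channels unblocked), while a blocked NCHW_C8 source uses
// the OIhw8i8o order from weight_reorder_OIhw8i8o, matching the two reorder
// calls in create() above.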
ptr_weights + ocb * jcp.kh * jcp.kw * jcp.ic * 8 + + wh * jcp.kw * jcp.ic * 8 + wgt_ic * 8 : + ptr_weights + ocb * jcp.ic * jcp.kh * jcp.kw * 8 + wgt_ic * jcp.kh * jcp.kw * 8 * 8 + wh * jcp.kw * 8 * 8; if (icb == 0) { - if (bias) { + if (with_bias) { par_conv.bias = ptr_bias + _oc * 8; } + par_conv.flags |= FLAG_IC_FIRST; } @@ -270,16 +741,21 @@ SaberStatus JitAvx2Conv::dispatch( kernel->jit_ker(&par_conv); } - utils::nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, - oh, jcp.oh); + + nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); } + icbb += icb_step; } }; -#pragma omp parallel + #pragma omp parallel { - ker(omp_get_thread_num(), omp_get_num_threads()); + ker(anakin_get_thread_num(), anakin_get_num_threads()); + } + + if (outputs[0]->get_layout() == Layout_NCHW) { + reorder_nchwc8_nchw(_temp_output, *outputs[0]); } return SaberSuccess; diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_conv.h b/saber/funcs/impl/x86/kernel/jit_avx2_conv.h index eb936455d..7d8b91aec 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx2_conv.h +++ b/saber/funcs/impl/x86/kernel/jit_avx2_conv.h @@ -55,6 +55,7 @@ class JitAvx2Conv : public ImplBase< jit::jit_avx2_conv_act_kernel *kernel = nullptr; std::shared_ptr > weights_internal; std::shared_ptr > bias_internal; + Tensor _temp_output; SaberStatus check_conf(const std::vector *>& inputs, std::vector*>& outputs, ConvEltwiseParam ¶m); diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.cpp index 5cef3cb69..fba8d0bb6 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.cpp +++ b/saber/funcs/impl/x86/kernel/jit_avx2_conv_kernel.cpp @@ -393,7 +393,7 @@ void jit_avx2_conv_act_kernel::generate() { SaberStatus jit_avx2_conv_act_kernel::init_conf(jit_conv_conf_t& jcp) { if (!mayiuse(avx2)) { - LOG(ERROR) << "init a AVX2 kernel in a non-avx2 machine is not permitted"; + LOG(FATAL) << "init a AVX2 kernel in a non-avx2 machine is not permitted"; return SaberUnImplError; } @@ -456,7 +456,9 @@ SaberStatus jit_avx2_conv_act_kernel::init_conf(jit_conv_conf_t& jcp) { && utils::implication(mimo, jcp.ic % simd_w == 0); if (!args_ok) { - LOG(ERROR) << "arguments check failed"; + LOG(FATAL) << "arguments check failed "<<(jcp.oc % simd_w)<<",("< 7, (jcp.t_pad == 0 && jcp.l_pad == 0) || (jcp.stride_w == 1 && jcp.stride_h == 1))) + <<(utils::implication(mimo, jcp.ic % simd_w == 0)); return SaberUnImplError; } diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_deconv.cpp b/saber/funcs/impl/x86/kernel/jit_avx2_deconv.cpp new file mode 100644 index 000000000..d14b0cc5c --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx2_deconv.cpp @@ -0,0 +1,320 @@ +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_deconv.h" +#include "x86_utils.h" +#include "tensor_op.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +using jit_deconv_ker_t = void (*)(jit_deconv_call_t*); + +inline void jit_deconv_ker_pipeline(jit_deconv_ker_t ker, jit_deconv_call_t& p, + const void* src, const void* dst, const void* filt, + const void* bias, int channel, int kh_padding) { + +#define PIPELINE(field) \ + do { \ + p.field = p.field ## _prf; \ + p.field ## _prf = field; \ + } while (0) + PIPELINE(src); + PIPELINE(dst); + PIPELINE(filt); + PIPELINE(bias); + PIPELINE(channel); + PIPELINE(kh_padding); + + if (p.src&&ker) { + ker(&p); + }else{ + + } +} + +template <> +SaberStatus JitAvx2Deconv::check_conf( + const std::vector*>& inputs, + 
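// About jit_deconv_ker_pipeline above: PIPELINE(field) shifts the queued
// "*_prf" value into the active field and stores the new argument as the next
// prefetch target, roughly
//   p.src     = p.src_prf;   // promote the previously queued work item
//   p.src_prf = src;         // queue the new one for prefetching
// so the JIT kernel runs one call behind (it is skipped while p.src is still
// null) and dispatch() issues a final dummy call to drain the last item.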
std::vector*>& outputs, + ConvParam& param) { + + ConvParam* conv_param = &(param); + const Tensor* weights = conv_param->weight(); + const jit_deconv_conf_t jcp = kernel->jcp; + Tensor* input = outputs[0]; + Tensor* output = inputs[0]; + + // check param + bool param_ok = true + && jcp.t_pad == conv_param->pad_h + && jcp.l_pad == conv_param->pad_w + && jcp.stride_h == conv_param->stride_h + && jcp.stride_w == conv_param->stride_w; + + // check shape + bool shape_ok = true + && jcp.kh == weights->height() + && jcp.kw == weights->width() + && jcp.ngroups == 1 + && jcp.mb == input->num() + && jcp.ic == input->channel() + && jcp.ih == input->height() + && jcp.iw == input->width() + && jcp.oc == output->channel() + && jcp.oh == output->height() + && jcp.ow == output->width(); + + if (param_ok && shape_ok) { + return SaberSuccess; + } else { + LOG(INFO) << "param or shape changed, re-init kernel"; + return SaberNotInitialized; + } +} + +template <> +SaberStatus JitAvx2Deconv::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param, Context& ctx) { + + SaberStatus status = SaberSuccess; + ConvParam* conv_param = &(param); + ActivationParam* act_param = nullptr; + const Tensor* weights = conv_param->weight(); + Tensor* input = outputs[0]; + Tensor* output = inputs[0]; + + // check conf + if (kernel) { + status = check_conf(inputs, outputs, param); + + if (status != SaberNotInitialized) { + LOG(INFO) << "check_conf != SaberNotInitialized"; + return status; + } + } + + // init conf + conf.src_fmt = input->get_layout(); + + if (input->get_layout() == Layout_NCHW_C8R) { + conf.src_fmt = Layout_NCHW_C8; + } + + conf.ngroups = 1; + + conf.ndims = input->dims(); + conf.mb = input->num(); + + // swap param + conf.ic = input->channel(); + conf.ih = input->height(); + conf.iw = input->width(); + + conf.oc = output->channel(); + conf.oc_without_padding = conf.oc; + conf.oh = output->height(); + conf.ow = output->width(); + + conf.kh = weights->height(); + conf.kw = weights->width(); + conf.stride_h = conv_param->stride_h; + conf.stride_w = conv_param->stride_w; + conf.t_pad = conv_param->pad_h; + conf.l_pad = conv_param->pad_w; + conf.dilate_h = conv_param->dilation_h <= 0 ? 0 : (conv_param->dilation_h - 1); + conf.dilate_w = conv_param->dilation_w <= 0 ? 
0 : (conv_param->dilation_w - 1); + + conf.with_bias = (conv_param->bias() != nullptr && conv_param->bias()->valid_size() > 0); + conf.with_relu = conv_param->activation_param.has_active; + conf.with_sum = false; + + if (conf.with_relu) { + return SaberUnImplError; + } + + if (conf.dilate_h != 0 || conf.dilate_w != 0) { + return SaberUnImplError; + } + + if (conf.with_relu) { + act_param = &(conv_param->activation_param); + conf.relu_negative_slope = act_param->negative_slope; + } + + status = jit_avx2_deconv_act_kernel::init_conf(conf); + + if (status == SaberSuccess) { + if (kernel != nullptr) { + delete kernel; + kernel = nullptr; + } + + kernel = new jit_avx2_deconv_act_kernel(this->conf); + } else { + return SaberUnImplError; + } + + // reorder weights + Tensor* weights_reorder = conv_param->mutable_weight(); + + weights_internal.reset(new Tensor(weights_reorder->valid_shape())); + + if (conf.src_fmt == Layout_NCHW_C8) { + weight_reorder_OIhw8o8i(*weights_reorder, *weights_internal); + } + + return SaberSuccess; +} + +template <> +SaberStatus JitAvx2Deconv::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param, Context& ctx) { + + ConvParam* conv_param = &(param); + + if ((inputs[0]->get_layout() != Layout_NCHW_C8R) + || (outputs[0]->get_layout() != Layout_NCHW_C8R) + || (conv_param->weight()->get_layout() != Layout_NCHW)) { + LOG(FATAL) << "data layout is not supported " << inputs[0]->get_layout() << "," << + outputs[0]->get_layout(); + return SaberUnImplError; + } + + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus JitAvx2Deconv::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param) { + using namespace std; + ConvParam* conv_param = &(param); + const Tensor* bias = conv_param->bias(); + + auto diff_src = reinterpret_cast(outputs[0]->data()); + auto weights = reinterpret_cast(weights_internal->data()); + auto diff_dst = reinterpret_cast(inputs[0]->data()); + const float* diff_bias = (bias != nullptr + && bias->valid_size() > 0) ? 
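// Reminder: the roles are swapped on purpose in this deconv implementation
// (input = outputs[0], output = inputs[0]), since the forward deconvolution
// is evaluated like a convolution backward-data pass; that is also why
// dispatch() below names its pointers diff_src and diff_dst.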
reinterpret_cast(bias->data()) : nullptr; + + const auto& jcp = kernel->jcp; + + + size_t diff_src_h_stride = jcp.iw * jcp.ic_block; + size_t diff_src_C_stride = jcp.ih * jcp.iw * jcp.ic_block; + size_t diff_src_n_stride = jcp.ih * jcp.iw * jcp.ic; + size_t diff_dst_h_stride = jcp.ow * jcp.oc_block; + size_t diff_dst_C_stride = jcp.oh * jcp.ow * jcp.oc_block; + size_t diff_dst_n_stride = jcp.oh * jcp.ow * jcp.oc; + size_t wht_h_stride = jcp.kw * jcp.ic_block * jcp.oc_block; + size_t wht_ic_stride = jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block; + size_t wht_oc_stride = jcp.kh * jcp.kw * jcp.ic * jcp.oc_block; + size_t wht_g_stride = wht_oc_stride / jcp.ngroups; + + bool is_fast_path = jcp.dilate_h == 0 && jcp.stride_h == 1; + + auto ker = [&](const int ithr, const int nthr) { + int start{0}, end{0}, start_copy; + int ic_chunks = jcp.nb_ic / jcp.nb_ic_blocking; + int work_amount = jcp.ngroups * jcp.mb * ic_chunks * jcp.ih; + balance211(work_amount, nthr, ithr, start, end); + start_copy = start; + + jit_deconv_call_t par_deconv; + par_deconv.src_prf = nullptr; + par_deconv.dst_prf = nullptr; + par_deconv.filt_prf = nullptr; + par_deconv.bias_prf = nullptr; + par_deconv.kh_padding_prf = 0; + par_deconv.channel_prf = 0; + + for (int ocb_l2 = 0; ocb_l2 < jcp.nb_oc; ocb_l2 += jcp.nb_oc_L2) { + start = start_copy; + int n{0}, g{0}, icc{0}, ih_s{0}; + + nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, icc, ic_chunks, ih_s, jcp.ih); + + while (start < end) { + int icb = icc * jcp.nb_ic_blocking; + int g_icb = g * jcp.nb_ic + icb; + int g_ocb = g * jcp.nb_oc; + + int work_rem = end - start; + int ih_e = ih_s + work_rem > jcp.ih ? jcp.ih : ih_s + work_rem; + + auto diff_src_w = diff_src + n * diff_src_n_stride + g_icb * + diff_src_C_stride; //diff_src_d.blk_off(n, g_icb); + auto diff_dst_w = diff_dst + n * diff_dst_n_stride + (g_ocb + ocb_l2) * diff_dst_C_stride; + auto wht_w = weights + g * wht_g_stride + ocb_l2 * wht_oc_stride + icb * wht_ic_stride; + auto bias_w = diff_bias ? 
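// Work decomposition used here: balance211 splits the flattened
// (mb, ngroups, ic_chunks, ih) range of size work_amount into nthr near-equal
// contiguous chunks (e.g. 10 items over 4 threads -> 3, 3, 2, 2), and
// nd_iterator_init / nd_iterator_jump turn each thread's [start, end) slice
// back into the multi-dimensional indices walked in this loop.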
diff_bias + g_icb * jcp.ic_block : nullptr; + + for (int ocb = ocb_l2; + ocb < utils::min(jcp.nb_oc, ocb_l2 + jcp.nb_oc_L2); ++ocb) { + for (int ij = ih_s; ij < ih_e; ++ij) { + int oj, k_len, k_lo; + + if (is_fast_path) { // dilate == 0 && stride == 1 + int i_t_overflow = utils::max(0, jcp.kh - 1 - ij + - jcp.t_pad); + int i_b_overflow = utils::max(0, jcp.kh - jcp.ih + ij + - jcp.b_pad); + k_len = jcp.kh - i_t_overflow - i_b_overflow; + k_lo = i_b_overflow; + oj = ij + jcp.t_pad - i_b_overflow; + } else { + int i_t_overflow = utils::max(0, (jcp.kh - 1 - ij + - jcp.t_pad) / jcp.stride_h); + int i_b_overflow = utils::max(0, (jcp.kh - jcp.ih + ij + - jcp.b_pad) / jcp.stride_h); + int overflow_kh_hi = jcp.kh - 1 - std::abs((jcp.ih - 1 + + jcp.b_pad - ij) % jcp.stride_h); + int overflow_kh_lo = (ij + jcp.t_pad) + % jcp.stride_h; + + k_len = (overflow_kh_hi - overflow_kh_lo) + / jcp.stride_h + 1 - i_t_overflow + - i_b_overflow; + k_lo = overflow_kh_lo + i_b_overflow * jcp.stride_h; + oj = (ij + jcp.t_pad - k_lo) / jcp.stride_h; + } + + assert(k_len >= 0); + + jit_deconv_ker_pipeline(kernel->jit_ker, par_deconv, + diff_src_w + ij * diff_src_h_stride, + diff_dst_w + oj * diff_dst_h_stride, + wht_w + k_lo * wht_h_stride, + bias_w, ocb, k_len); + } + + diff_dst_w += diff_dst_C_stride; + wht_w += wht_oc_stride; + } + + nd_iterator_jump(start, end, n, jcp.mb, g, jcp.ngroups, icc, ic_chunks, ih_s, jcp.ih); + } + } + + jit_deconv_ker_pipeline(kernel->jit_ker, par_deconv, + diff_src, diff_dst, weights, 0, 0, 1); + }; + + #pragma omp parallel + { + ker(omp_get_thread_num(), omp_get_num_threads()); + } + + return SaberSuccess; +} + +template class JitAvx2Deconv; +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_deconv.h b/saber/funcs/impl/x86/kernel/jit_avx2_deconv.h new file mode 100644 index 000000000..1af476ab8 --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx2_deconv.h @@ -0,0 +1,52 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_DECONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_DECONV_H +#include + +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/core/tensor.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +template +class JitAvx2Deconv : public ImplBase< + X86, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + JitAvx2Deconv() : kernel(nullptr) {} + ~JitAvx2Deconv() { + if (kernel) { + delete kernel; + } + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) override; + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m, Context&ctx) override; + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m) override; +private: + jit_deconv_conf_t conf; + jit_avx2_deconv_act_kernel *kernel = nullptr; + std::shared_ptr > weights_internal; + std::shared_ptr > bias_internal; + SaberStatus check_conf(const std::vector *>& inputs, + std::vector*>& outputs, + ConvParam ¶m); +}; + +} +} +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_DECONV_H \ No newline at end of file diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.cpp new file mode 100644 index 000000000..e1c85a282 --- /dev/null 
+++ b/saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.cpp @@ -0,0 +1,457 @@ +#include "saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.h" +#define GET_OFF(field) offsetof(jit_deconv_call_t, field) + +namespace anakin { +namespace saber { +namespace jit { + +using namespace Xbyak; + +void jit_avx2_deconv_act_kernel::prepare_output(int ur_w) +{ + int ic_chunks = jcp.nb_ic / jcp.nb_ic_blocking; + for (int k = 0; k < jcp.nb_ic_blocking; k++) { + // vmovups(); + for (int j = 0; j < ur_w; j++) { + Ymm ymm = ymm_out(j, k); + vxorpd(ymm, ymm, ymm); + } + } +} + +void jit_avx2_deconv_act_kernel::store_output(int ur_w) +{ + Label no_update_label; + Label store_label; + + mov(reg_channel, ptr[param + GET_OFF(channel)]); + if (jcp.with_bias) { + mov(reg_bias, ptr[param1 + GET_OFF(bias)]); + } + + cmp(reg_channel, 0); + je(no_update_label, T_NEAR); + for (int k = 0; k < jcp.nb_ic_blocking; k++) { + for (int j = 0; j < ur_w; j++) { + Ymm ymm = ymm_out(j, k); + size_t aux_src_offset = (size_t)typesize + * ((size_t)k * jcp.ih * jcp.iw + j) * jcp.ic_block; + vadd(ymm, make_safe_addr(reg_src, aux_src_offset, + reg_long_offt)); + } + } + jmp(store_label, T_NEAR); + + L(no_update_label); + if (jcp.with_bias) { + for (int k = 0; k < jcp.nb_ic_blocking; k++) { + int bias_offset = typesize * k * jcp.ic_block; + for (int j = 0; j < ur_w; j++) { + Ymm ymm = ymm_out(j, k); + vadd(ymm, make_safe_addr(reg_bias, bias_offset, reg_long_offt)); + } + } + } + + L(store_label); + for (int k = 0; k < jcp.nb_ic_blocking; k++) { + for (int j = 0; j < ur_w; j++) { + Ymm ymm = ymm_out(j, k); + size_t aux_src_offset = (size_t)typesize + * ((size_t)k * jcp.ih * jcp.iw + j) * jcp.ic_block; + vmovups(make_safe_addr(reg_src, aux_src_offset, + reg_long_offt), ymm); + } + } + +} + +void jit_avx2_deconv_act_kernel::compute_loop_fma( + int ur_w, int l_overflow, int r_overflow) +{ + Label kh_label; + Label kd_label; + Label skip_kd_loop; + Label store_output_label; + int kw = jcp.kw; + int ow = jcp.ow; + + int ic_block = jcp.ic_block; + int oc_block = jcp.oc_block; + int l_pad = jcp.l_pad; + int dilate_w = jcp.dilate_w + 1; + int stride_w = jcp.stride_w; + int stride_h = jcp.stride_h; + + int ker_pipeline_depth = 1; + assert(ker_reg_base_idx + ker_pipeline_depth <= 15); + assert(oc_block >= ker_pipeline_depth); + + int num_ker_loads = oc_block * kw; + int num_inp_prfs = ur_w * utils::min(kw, stride_w) + + utils::max(0, kw - stride_w); + int num_prfs = num_ker_loads + num_inp_prfs; + int num_fmas = num_ker_loads * ur_w / stride_w; + int prf_inst_spacing = utils::max(1, num_fmas / num_prfs); + int prf_inst_trigger = (num_fmas % prf_inst_spacing) / 2; + + prepare_output(ur_w); + + mov(aux_reg_dst, reg_dst); + mov(aux_reg_ker, reg_ker); + + mov(aux_reg_dst_prf, reg_dst_prf); + mov(aux_reg_ker_prf, reg_ker_prf); + + mov(reg_kj, reg_kh); + + cmp(reg_kj, 0); + je(store_output_label, T_NEAR); + + L(kh_label); { + for (int ki = 0; ki < kw; ki++) { + for (int oc = 0; oc < oc_block; oc++) { + int aux_kernel_offset = typesize * ((oc * oc_block + + ki * ic_block * oc_block)); + vmovups(ymm_wei, make_safe_addr(aux_reg_ker, aux_kernel_offset, reg_long_offt)); + + int jj_start = get_iw_start(ki, l_overflow); + int jj_end = get_iw_end(ur_w, ki, r_overflow); + assert(stride_w != 1 + || jj_start == utils::max(0, + l_overflow - (kw - 1 - ki) * dilate_w)); + assert(stride_w != 1 + || jj_end == ur_w - utils::max(0, + r_overflow - ki * dilate_w)); + + for (int jj = jj_start; jj < jj_end; jj += stride_w) { + assert((jj + l_pad - ki * dilate_w) % 
stride_w == 0); + int aux_dst_offset = typesize * + (((jj + l_pad - ki * dilate_w) + / stride_w) * jcp.oc_block + oc); + vbroadcastss(ymm_temp, ptr[aux_reg_dst + aux_dst_offset]); + vfmadd231ps(ymm_out(jj, 0), ymm_wei, ymm_temp); + } + } + } + + add(aux_reg_ker, typesize * stride_h * kw * oc_block * ic_block); + sub(aux_reg_dst, typesize * (jcp.dilate_h + 1) * ow * oc_block); + add(aux_reg_ker_prf, typesize * stride_h * kw * oc_block * ic_block); + sub(aux_reg_dst_prf, typesize * (jcp.dilate_h + 1) * ow * oc_block); + + dec(reg_kj); + cmp(reg_kj, 0); + jg(kh_label, T_NEAR); + } + + L(store_output_label); { + store_output(ur_w); + } +} + +void jit_avx2_deconv_act_kernel::compute_loop_fma_core(int ur_w, int l_overflow, int r_overflow) { + int kw = jcp.kw; + int ow = jcp.ow; + int dilate_w = jcp.dilate_w + 1; + int stride_w = jcp.stride_w; + int ic_block = jcp.ic_block; + int oc_block = jcp.oc_block; + int nb_ic_block = jcp.nb_ic_blocking; + Label kh_label; + Label skip_kh_loop; + Label kd_label; + Label skip_kd_loop; + + int shift_ker_ptr = typesize * kw * oc_block * ic_block; + int shift_dst_ptr = typesize * (jcp.dilate_h + 1) * ow * oc_block; + + auto output_offset = [=](int oi, int oc, int ki) { + return typesize * + (((oi + jcp.l_pad - ki * dilate_w) / stride_w) * oc_block + oc); + }; + auto kernel_offset = [=](int icb, int oc, int ki) { + int blk_idx = icb * jcp.kh * jcp.kw + ki; + int blk_offset = blk_idx * jcp.oc_block * jcp.ic_block; + int oc_offset = oc * jcp.oc_block; + return typesize * (blk_offset + oc_offset); + }; + + prepare_output(ur_w); + + mov(aux_reg_dst, reg_dst); + mov(aux_reg_ker, reg_ker); + + mov(reg_kj, reg_kh); + + cmp(reg_kj, 0); + je(skip_kh_loop, T_NEAR); + + L(kh_label); + { + for (int ki = 0; ki < kw; ki++) { + int jj_start = get_iw_start(ki, l_overflow); + int jj_end = get_iw_end(ur_w, ki, r_overflow); + for (int oc = 0; oc < oc_block; oc++) { + if (jcp.kernel_kind == expl_bcast) { + for (int jj = jj_start; jj < jj_end; jj++) { + int aux_output_offset = output_offset(jj, oc, ki); + vbroadcastss(ymm_inp(jj, nb_ic_block), + ptr[aux_reg_dst + aux_output_offset]); + } + } + for (int ii = 0; ii < nb_ic_block; ii++) { + int aux_kernel_offset = kernel_offset(ii, oc, ki); + if (jj_end - jj_start > 0) { + vmovups(ymm_wei, make_safe_addr(aux_reg_ker, + aux_kernel_offset, reg_long_offt)); + } + for (int jj = jj_start; jj < jj_end; jj += stride_w) { + if (jcp.kernel_kind == expl_bcast) { + vfmadd231ps(ymm_out(jj, ii), + ymm_inp(jj, nb_ic_block), ymm_wei); + } else { + vbroadcastss(ymm_temp, ptr[aux_reg_dst + output_offset(jj, oc, ki)]); + vfmadd231ps(ymm_out(jj, ii), ymm_wei, ymm_temp); + } + } + } + } + } + add(aux_reg_ker, shift_ker_ptr); + sub(aux_reg_dst, shift_dst_ptr); + dec(reg_kj); + cmp(reg_kj, 0); + jg(kh_label, T_NEAR); + } + L(skip_kh_loop); + store_output(ur_w); +} + +inline void jit_avx2_deconv_act_kernel::compute_loop( + int ur_w, int l_overflow, int r_overflow) +{ + + if (jcp.ver == ver_fma) + if (jcp.kernel_kind == embd_bcast && jcp.nb_ic_blocking == 1) + compute_loop_fma(ur_w, l_overflow, r_overflow); + else + compute_loop_fma_core(ur_w, l_overflow, r_overflow); + else + assert("!unknown convolution version"); +} + +void jit_avx2_deconv_act_kernel::generate() { + int iw = jcp.iw; + int kw = jcp.kw; + int ur_w = jcp.ur_w; + int ic_block = jcp.ic_block; + int oc_block = jcp.oc_block; + int ur_w_tail = jcp.ur_w_tail; + int dilate_w = jcp.dilate_w + 1; + int stride_w = jcp.stride_w; + + int dst_shift = jcp.typesize_in * (ur_w / stride_w) * ic_block; + int 
src_shift = jcp.typesize_out * ur_w * oc_block; + + preamble(); + + mov(reg_src, ptr[param + GET_OFF(src)]); + mov(reg_dst, ptr[param + GET_OFF(dst)]); + mov(reg_ker, ptr[param + GET_OFF(filt)]); + + mov(reg_kh, ptr[param + GET_OFF(kh_padding)]); + mov(reg_src_prf, ptr[param + GET_OFF(src_prf)]); + mov(reg_dst_prf, ptr[param + GET_OFF(dst_prf)]); + mov(reg_ker_prf, ptr[param + GET_OFF(filt_prf)]); + + int l_overflow = utils::max(0, ((kw - 1) * dilate_w - jcp.l_pad) / stride_w); + int r_overflow = utils::max(0, ((kw - 1) * dilate_w + - utils::max(0, jcp.r_pad)) / stride_w); + int r_overflow1 = utils::max(0, ((kw - 1) * dilate_w + - utils::max(0, jcp.r_pad) - ur_w_tail) / stride_w); + + int n_oi = iw / ur_w; + if (r_overflow1 > 0) n_oi--; + + if (ur_w == iw) { + compute_loop(ur_w, l_overflow, r_overflow); + } else if (n_oi == 0) { + compute_loop(ur_w, l_overflow, r_overflow1); + add(reg_src, src_shift); + add(reg_dst, dst_shift); + add(reg_src_prf, src_shift); + add(reg_dst_prf, dst_shift); + if (ur_w_tail != 0) + compute_loop(ur_w_tail, 0, r_overflow); + } else { + xor_(reg_oi, reg_oi); + if (l_overflow > 0) { + compute_loop(ur_w, l_overflow, 0); + add(reg_src, src_shift); + add(reg_dst, dst_shift); + add(reg_src_prf, src_shift); + add(reg_dst_prf, dst_shift); + + inc(reg_oi); + } + if ((l_overflow <= 0 && n_oi > 0) + || (l_overflow > 0 && n_oi > 1)) { + Label ow_loop_label; + L(ow_loop_label); { + compute_loop(ur_w, 0, 0); + add(reg_src, src_shift); + add(reg_dst, dst_shift); + add(reg_src_prf, src_shift); + add(reg_dst_prf, dst_shift); + + inc(reg_oi); + cmp(reg_oi, n_oi); + jl(ow_loop_label, T_NEAR); + } + } + if (r_overflow1 > 0) { + compute_loop(ur_w, 0, r_overflow1); + add(reg_src, src_shift); + add(reg_dst, dst_shift); + add(reg_src_prf, src_shift); + add(reg_dst_prf, dst_shift); + } + if (ur_w_tail != 0) { + compute_loop(ur_w_tail, 0, r_overflow); + } + } + + postamble(); +} + +SaberStatus jit_avx2_deconv_act_kernel::init_conf(jit_deconv_conf_t &jcp) { + if (!mayiuse(avx2)) { + LOG(ERROR) << "init a AVX2 kernel in a non-avx2 machine is not permitted"; + return SaberUnImplError; + } + + unsigned int L1_cache_size = get_cache_size(1, true); + + const int simd_w = cpu_isa_traits::vlen / sizeof(float); + int ndims = jcp.ndims; + + jcp.r_pad = (jcp.ow - 1) * jcp.stride_w + (jcp.kw - 1) * (jcp.dilate_w + 1) + - (jcp.iw + jcp.l_pad - 1); + jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1) + - (jcp.ih + jcp.t_pad - 1); + + jcp.oc_block = simd_w; + jcp.ic_block = simd_w; + + jcp.nb_ic = jcp.ic / jcp.ic_block; + jcp.nb_oc = jcp.oc / jcp.oc_block; + + jcp.ur_w = jcp.stride_w; + + int regs = 14; + if (jcp.iw <= regs) { + jcp.ur_w = jcp.iw; + } else { + for (int ur_w = regs; ur_w > 0; --ur_w) { + if (ur_w % jcp.stride_w == 0) { + jcp.ur_w = ur_w; + break; + } + } + } + + int l_overflow = utils::max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) + - jcp.l_pad) / jcp.stride_w); + int r_overflow1 = utils::max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) + - utils::max(0, jcp.r_pad) - jcp.iw % jcp.ur_w) / jcp.stride_w); + int n_oi = jcp.iw / jcp.ur_w; + if (r_overflow1 > 0) n_oi--; + + if (mayiuse(avx2)) { + jcp.ver = ver_fma; + jcp.typesize_in = sizeof(float); + jcp.typesize_out = sizeof(float); + } + else + return SaberUnImplError; + + jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; + + bool large_code_size = (jcp.ur_w != jcp.ow) + && ((l_overflow <= 0 && n_oi > 0) ||(l_overflow > 0 && n_oi > 1)) + && (r_overflow1 > 0) && (l_overflow > 0); + if (large_code_size) { + const int max_code_size 
= 12 * 1024; + const int num_ops_per_reg = 3 + jcp.oc_block * jcp.kw; + int mult = 1; + if (l_overflow > 0) mult += 1; + if (r_overflow1 > 0) mult += 1; + for (int ur_w = jcp.ur_w; ur_w > regs/2; --ur_w) { + if ((ur_w / jcp.stride_w) * mult * num_ops_per_reg * 9.2 + < max_code_size) { + if (ur_w % jcp.stride_w == 0) { + jcp.ur_w = ur_w; + break; + } + } + } + } + + if (jcp.ver == ver_fma && mayiuse(avx2)) { + int try_nb_ic_blocking = 2; + unsigned int ker_inp_size = typesize * jcp.iw * jcp.ic_block + * try_nb_ic_blocking * jcp.kh; + unsigned int ker_out_size = typesize * jcp.ow * jcp.oc_block; + unsigned int ker_wei_size = typesize * jcp.kh * jcp.kw * jcp.ic_block + * jcp.oc_block * try_nb_ic_blocking; + unsigned int ker_total_size = ker_inp_size + ker_out_size + + ker_wei_size; + if (!(jcp.kw == 1 || (jcp.kw == 5 && jcp.iw < 8) + || (jcp.kw < 5 && ((jcp.iw <= 5 || (jcp.iw > 8 && jcp.iw <= 13)) + || ker_total_size > L1_cache_size ))) + || jcp.stride_h > 1) { + jcp.kernel_kind = embd_bcast; + jcp.ur_w = utils::min(jcp.iw, regs); + jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; + if (!(jcp.kw > 3 || (jcp.kw == 3 && ker_total_size < L1_cache_size + && jcp.ow > 8)) && jcp.stride_h == 1) + if (jcp.nb_ic % try_nb_ic_blocking == 0) { + jcp.nb_ic_blocking = try_nb_ic_blocking; + jcp.ur_w = 15 / (jcp.nb_ic_blocking + 1); + if (jcp.iw < jcp.ur_w) jcp.ur_w = jcp.iw; + } + } else { + jcp.kernel_kind = expl_bcast; + jcp.nb_oc_blocking = 1; + jcp.nb_ic_blocking = 4; + if (jcp.nb_ic < jcp.nb_ic_blocking) jcp.nb_ic_blocking = jcp.nb_ic; + if (jcp.nb_ic % jcp.nb_ic_blocking != 0) + for (int i = jcp.nb_ic_blocking; i > 0; i--) { + if (jcp.nb_ic % i == 0) { + jcp.nb_ic_blocking = i; + break; + } + } + jcp.ur_w = 15 / (jcp.nb_ic_blocking + 1); + if (jcp.iw < jcp.ur_w) jcp.ur_w = jcp.iw; + } + } + jcp.ur_w_tail = jcp.iw % jcp.ur_w; + + if (l_overflow * jcp.stride_w > jcp.ur_w) + return SaberUnImplError; + int r_overflow_no_tail = utils::max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) + - utils::max(0, jcp.r_pad) - jcp.ur_w_tail) / jcp.stride_w); + if (r_overflow_no_tail * jcp.stride_w > jcp.ur_w) + return SaberUnImplError; + if ((jcp.iw > jcp.ur_w) && (jcp.ur_w % jcp.stride_w != 0)) + return SaberUnImplError; + + jcp.nb_oc_L2 = jcp.nb_oc; + + return SaberSuccess; +} +} // namespace jit +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.h new file mode 100644 index 000000000..4bda539a8 --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx2_deconv_act_kernel.h @@ -0,0 +1,155 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_DECONV_ACT_KERNEL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_DECONV_ACT_KERNEL_H + +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/saber_types.h" +#include "saber/funcs/impl/x86/x86_utils.h" + +namespace anakin { +namespace saber { +namespace jit { + +struct jit_avx2_deconv_act_kernel : public jit_generator { + +public: + jit_avx2_deconv_act_kernel(jit_deconv_conf_t ajcp): jcp(ajcp) + { + this->generate(); + jit_ker = (void (*)(jit_deconv_call_t *))this->getCode(); + } + + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_deconv_act_kernel); + + static SaberStatus init_conf(jit_deconv_conf_t &jcp); + + jit_deconv_conf_t jcp; + void (*jit_ker)(jit_deconv_call_t *); +private: + using reg64_t = const Xbyak::Reg64; + enum { + typesize = sizeof(float), + ker_reg_base_idx = 14, + }; + + reg64_t 
param = abi_param1; + reg64_t reg_dst = r8; + reg64_t reg_ker = r9; + reg64_t reg_src = r10; + + reg64_t reg_dst_prf = r11; + reg64_t reg_ker_prf = r12; + reg64_t reg_src_prf = r13; + + reg64_t aux_reg_dst = r14; + reg64_t aux_reg_ker = r15; + + reg64_t aux_reg_dst_prf = rsi; + reg64_t aux_reg_ker_prf = rdx; + + reg64_t aux_reg_dst_d_prf = r13; + reg64_t aux_reg_dst_d = rbx; + reg64_t aux_reg_ker_d_prf = abi_not_param1; + reg64_t aux_reg_ker_d = r9; + reg64_t reg_ki = r10; + + reg64_t reg_kj = rax; + reg64_t reg_oi = rbx; + reg64_t reg_kh = abi_not_param1; + + reg64_t reg_channel = rsi; + + reg64_t reg_bias = rdx; + reg64_t reg_long_offt = r14; + + Xbyak::Ymm ymm_wei = Xbyak::Ymm(15); + Xbyak::Ymm ymm_temp = Xbyak::Ymm(14); + + inline Xbyak::Ymm ymm_ker(int i_ic) { + assert(i_ic < 2); + return Xbyak::Ymm(ker_reg_base_idx + i_ic); + } + + inline Xbyak::Ymm ymm_inp(int i_ic, int nb_x_blocking) { + int idx = i_ic + nb_x_blocking * jcp.ur_w; + assert(idx < 15); + return Xbyak::Ymm(idx); + } + + inline Xbyak::Ymm ymm_out(int i_ur, int i_oc) { + int idx = i_ur + i_oc * jcp.ur_w; + // print1(idx); + assert(idx < ker_reg_base_idx); + return Xbyak::Ymm(idx); + } + + inline void vadd(Xbyak::Ymm ymm, const Xbyak::Operand& op) { + vaddps(ymm, ymm, op); + } + + inline int get_iw_start(int ki, int l_overflow) + { + int res = (jcp.iw - 1 + jcp.r_pad) % jcp.stride_w + + l_overflow * jcp.stride_w + - (jcp.kw - 1 - ki) * (jcp.dilate_w + 1); + while (res < 0) + res += jcp.stride_w; + + return res; + } + + inline int get_iw_end(int ur_w, int ki, int r_overflow) + { + if (utils::one_of(ur_w, jcp.iw, jcp.ur_w_tail)) + ur_w += utils::min(0, jcp.r_pad); // remove negative padding + int res = (ur_w - 1 + jcp.l_pad) % jcp.stride_w + + r_overflow * jcp.stride_w - ki * (jcp.dilate_w + 1); + while (res < 0) + res += jcp.stride_w; + + return ur_w - res; + } + + template + inline Xbyak::Address VEX_compress_addr(Xbyak::Reg64 base, + T raw_offt, bool bcast = false) + { + using Xbyak::Ymm; + using Xbyak::Reg64; + using Xbyak::Address; + using Xbyak::RegExp; + + assert(raw_offt <= INT_MAX); + auto offt = static_cast(raw_offt); + + int scale = 0; + + if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { + offt = offt - 2 * EVEX_max_8b_offt; + scale = 1; + } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { + offt = offt - 4 * EVEX_max_8b_offt; + scale = 2; + } + + auto re = RegExp() + base + offt; + if (scale) + re = re + reg_EVEX_max_8b_offt * scale; + + if (bcast) + return yword_b [re]; + else + return yword [re]; + } + + inline void prepare_output(int ur_w); + inline void store_output(int ur_w); + inline void compute_loop_fma(int ur_w, int l_overflow, int r_overflow); + inline void compute_loop_fma_core(int ur_w, int l_overflow, int r_overflow); + inline void compute_loop(int ur_w, int l_overflow, int r_overflow); + void generate(); +}; +} // namespace jit +} // namespace saber +} // namespace anakin +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_DECONV_ACT_KERNEL_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_group_conv.cpp b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv.cpp new file mode 100644 index 000000000..e4dcb7acc --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv.cpp @@ -0,0 +1,348 @@ +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_group_conv.h" +#include "saber/funcs/impl/x86/x86_utils.h" +namespace anakin { +namespace saber 
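// A note on the double-buffered call convention implemented by the PIPELINE
// macro just below: each jit_conv_call_t field has a matching "_prf" shadow.
// Every call shifts the previously staged _prf value into the active slot and
// stages the newly passed pointer into _prf, so the generated kernel sees the
// current iteration's pointers together with the next iteration's pointers
// for software prefetch. The first call only primes the buffers; ker() is
// invoked only once p.src is non-null.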
{ + +using namespace jit; + +using jit_conv_ker_t = void (*)(jit_conv_call_t*); + +inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t& p, + const void* src, const void* dst, + const void* filt, const void* bias, + int channel, int kh_padding) { +#define PIPELINE(field) \ + do { \ + p.field = p.field ## _prf; \ + p.field ## _prf = field; \ + } while (0) + + PIPELINE(src); + PIPELINE(dst); + PIPELINE(filt); + PIPELINE(bias); + PIPELINE(channel); + PIPELINE(kh_padding); + + if (p.src) { + ker(&p); + } +} + +template <> +SaberStatus JitAvx2GroupConv::check_conf( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + + ConvParam* conv_param = &(param.conv_param); + const Tensor* weights = conv_param->weight(); + const Tensor* bias = conv_param->bias(); + const jit_conv_conf_t jcp = kernel->jcp; + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; + + // check format + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); + bool is_layout_ok = (input_layout == Layout_NCHW || input_layout == Layout_NCHW_C8 + || input_layout == Layout_NCHW_C8R) + && (output_layout == Layout_NCHW || output_layout == Layout_NCHW_C8 + || output_layout == Layout_NCHW_C8R); + + if (!is_layout_ok) { + LOG(FATAL) << "wrong format layout " << inputs[0]->get_layout() << "," << outputs[0]->get_layout(); + return SaberUnImplError; + } + + // check param + bool param_ok = true + && jcp.t_pad == conv_param->pad_h + && jcp.l_pad == conv_param->pad_w + && jcp.stride_h == conv_param->stride_h + && jcp.stride_w == conv_param->stride_w + && jcp.dilate_h == conv_param->dilation_h - 1 + && jcp.dilate_w == conv_param->dilation_w - 1; + + // check shape + bool shape_ok = true + && jcp.kh == weights->height() + && jcp.kw == weights->width() + && jcp.ngroups == conv_param->group + && jcp.mb == input->num() + && jcp.ic == input->channel() / conv_param->group + && jcp.ih == input->height() + && jcp.iw == input->width() + && jcp.oc == output->channel() / conv_param->group + && jcp.oh == output->height() + && jcp.ow == output->width(); + + if (param_ok && shape_ok) { + return SaberSuccess; + } else { + LOG(INFO) << "param or shape changed, re-init kernel"; + return SaberNotInitialized; + } +} + +template <> +SaberStatus JitAvx2GroupConv::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, Context& ctx) { + SaberStatus status = SaberSuccess; + ConvParam* conv_param = &(param.conv_param); + ActivationParam* act_param = nullptr; + const Tensor* weights = conv_param->weight(); + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; + + // check conf + if (kernel) { + status = check_conf(inputs, outputs, param); + + if (status != SaberNotInitialized) { + return status; + } + } + + // init conf + conf.src_fmt = input->get_layout(); + conf.ngroups = conv_param->group; + conf.mb = input->num(); + conf.ic = input->channel(); + conf.ih = input->height(); + conf.iw = input->width(); + + conf.oc = output->channel(); + conf.oh = output->height(); + conf.ow = output->width(); + + if (input->get_layout() == Layout_NCHW_C8R) { + conf.ic = utils::round_up(input->channel(), 8); + conf.src_fmt = Layout_NCHW_C8; + DLOG(INFO) << "input->get_layout == Layout_NCHW_C8R"; + } + + if (output->get_layout() == Layout_NCHW_C8R) { + conf.oc = utils::round_up(output->channel(), 8); + } + + + conf.kh = weights->height(); + conf.kw = weights->width(); + conf.stride_h = conv_param->stride_h; + conf.stride_w = 
conv_param->stride_w; + conf.t_pad = conv_param->pad_h; + conf.l_pad = conv_param->pad_w; + conf.dilate_h = conv_param->dilation_h <= 0 ? 0 : (conv_param->dilation_h - 1); + conf.dilate_w = conv_param->dilation_w <= 0 ? 0 : (conv_param->dilation_w - 1); + + conf.with_bias = (conv_param->bias() != NULL)&&(conv_param->bias()->valid_size()>0); + conf.with_relu = conv_param->activation_param.has_active; + conf.with_sum = false; + + if (conf.with_relu) { + act_param = &(conv_param->activation_param); + conf.relu_negative_slope = act_param->negative_slope; + } + + status = jit_avx2_group_conv_act_kernel::init_conf(conf); + + if (status == SaberSuccess) { + if (kernel != nullptr) { + delete kernel; + kernel = nullptr; + } + + kernel = new jit_avx2_group_conv_act_kernel(this->conf); + } else { + return SaberUnImplError; + } + + // reorder weights + Shape weights_s({conf.oc, conf.ic, conf.kh, conf.kw}, Layout_NCHW); + Tensor* weights_reorder = conv_param->mutable_weight(); + + weights_internal.clear(); + + for (int i = 0; i < conf.ngroups; i++) { + Tensor weights_temp(static_cast(weights_reorder->data()) + i * weights_s.count(), + X86(), 0, weights_s, AK_FLOAT); + weights_internal.push_back(std::make_shared >(weights_s)); + + if (inputs[0]->get_layout() == Layout_NCHW) { + weight_reorder_OIhwi8o(weights_temp, *(weights_internal.back())); + } else if (inputs[0]->get_layout() == Layout_NCHW_C8 + || inputs[0]->get_layout() == Layout_NCHW_C8R) { + weight_reorder_OIhw8i8o(weights_temp, *(weights_internal.back())); + } + } + LOG(INFO)<<"ready to init bias "<(bias_s)); + bias_internal->set_shape(conv_param->bias()->valid_shape(), bias_s); + bias_internal->copy_from(*conv_param->bias()); + } + + if (outputs[0]->get_layout() == Layout_NCHW) { + Shape shape = outputs[0]->valid_shape(); + int n_value = shape[0], c_value = shape[1], h_value = shape[2], w_value = shape[3]; + Shape new_shape({n_value, utils::round_up(c_value, 8) / 8, h_value, w_value, 8}, Layout_NCHW_C8); + _temp_output.reshape(new_shape); + } + + return SaberSuccess; +} + +template <> +SaberStatus JitAvx2GroupConv::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, Context& ctx) { + ConvParam* conv_param = &(param.conv_param); + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); + bool is_layout_ok = (input_layout == Layout_NCHW || input_layout == Layout_NCHW_C8 + || input_layout == Layout_NCHW_C8R) + && (output_layout == Layout_NCHW || output_layout == Layout_NCHW_C8 + || output_layout == Layout_NCHW_C8R); + + if (!is_layout_ok) { + LOG(FATAL) << "wrong format layout " << inputs[0]->get_layout() << "," << outputs[0]->get_layout(); + return SaberUnImplError; + } + + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus JitAvx2GroupConv::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + + ConvParam* conv_param = &(param.conv_param); + bool with_bias=(conv_param->bias() != NULL)&&(conv_param->bias()->valid_size()>0); + + const float* ptr_src = reinterpret_cast(inputs[0]->data()); + const float* ptr_bias = with_bias ? 
reinterpret_cast(bias_internal->data()) : nullptr; + + float* ptr_dst = nullptr; + + if (outputs[0]->get_layout() == Layout_NCHW) { + ptr_dst = reinterpret_cast(_temp_output.mutable_data()); + } else { + ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + } + + + const auto& jcp = kernel->jcp; + + int ocb_work = utils::div_up(jcp.nb_oc, jcp.nb_oc_blocking); + const size_t work_amount = jcp.mb * jcp.ngroups * ocb_work * jcp.oh; + auto ker = [&](const int ithr, const int nthr) { + size_t start{0}, end{0}; + balance211(work_amount, nthr, ithr, start, end); + + int icbb = 0; + + while (icbb < jcp.nb_ic) { + int icb_step = jcp.nb_ic_blocking; + int icb_step_rem = jcp.nb_ic - icbb; + + if (icb_step_rem < jcp.nb_ic_blocking_max) { + icb_step = icb_step_rem; + } + + size_t n{0}, g{0}, ocbb{0}, oh{0}; + nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + + for (size_t iwork = start; iwork < end; ++iwork) { + int ocb = ocbb * jcp.nb_oc_blocking; + int ocb_num = jcp.nb_oc_blocking; + const float* ptr_weights = reinterpret_cast(weights_internal[g]->data()); + + for (int icb = icbb; icb < icbb + icb_step; ++icb) { + jit_conv_call_t par_conv; + par_conv.flags = 0; + const int ij = oh * jcp.stride_h; + const int i_t_overflow = utils::max(0, jcp.t_pad - ij); + const int i_b_overflow = utils::max(jcp.ih, ij + + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih; + + const size_t _oc = g * jcp.nb_oc + ocb; + const size_t _ic = g * jcp.nb_ic + icb; + + const int src_ic = jcp.ic == 3 ? 0 : _ic; + const int wgt_ic = jcp.ic == 3 ? 0 : icb; + + const int ih = utils::max(ij - jcp.t_pad + utils::div_up(i_t_overflow, + (jcp.dilate_h + 1)) * (jcp.dilate_h + 1), 0); + + par_conv.src = (jcp.src_fmt == Layout_NCHW) ? ptr_src + n * jcp.ngroups * jcp.ic * jcp.ih * jcp.iw + + src_ic * 8 * jcp.ih * jcp.iw + ih * jcp.iw : ptr_src + + n * jcp.ngroups * jcp.ic * jcp.ih * jcp.iw + src_ic * jcp.ih * jcp.iw * 8 + + ih * jcp.iw * 8; + + par_conv.dst = ptr_dst + n * jcp.ngroups * jcp.oc * jcp.oh * jcp.ow + _oc * jcp.oh * jcp.ow * 8 + + oh * jcp.ow * 8; + + const int wh = utils::div_up(i_t_overflow, (jcp.dilate_h + 1)); + + par_conv.filt = (jcp.src_fmt == Layout_NCHW) ? 
ptr_weights + ocb * jcp.kh * jcp.kw * jcp.ic * 8 + + wh * jcp.kw * jcp.ic * 8 + wgt_ic * 8 : ptr_weights + ocb * jcp.ic * jcp.kh * jcp.kw * 8 + + wgt_ic * jcp.kh * jcp.kw * 8 * 8 + wh * jcp.kw * 8 * 8; + + if (icb == 0) { + if (with_bias) { + par_conv.bias = ptr_bias + _oc * 8; + } + + par_conv.flags |= FLAG_IC_FIRST; + } + + if (jcp.with_relu && icb + 1 == jcp.nb_ic) { + par_conv.flags |= FLAG_IC_LAST; + } + + par_conv.oc_blocks = utils::min(ocb + ocb_num, jcp.nb_oc) - ocb; + par_conv.kw_padding = 0; + + const int kh_padding = jcp.kh - + utils::div_up(i_t_overflow, (jcp.dilate_h + 1)) - + utils::div_up(i_b_overflow, (jcp.dilate_h + 1)); + par_conv.kh_padding = utils::max(0, kh_padding); + + kernel->jit_ker(&par_conv); + } + + nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh); + } + + icbb += icb_step; + } + }; + + #pragma omp parallel + { + ker(anakin_get_thread_num(), anakin_get_num_threads()); + } + + if (outputs[0]->get_layout() == Layout_NCHW) { + reorder_nchwc8_nchw(_temp_output, *outputs[0]); + } + return SaberSuccess; +} + +template class JitAvx2GroupConv; + + +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_group_conv.h b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv.h new file mode 100755 index 000000000..b309a94c1 --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_GROUP_CONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_GROUP_CONV_H + +#include + +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/core/tensor.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.h" + +namespace anakin { +namespace saber { + +template +class JitAvx2GroupConv : public ImplBase< + X86, OpDtype, ConvEltwiseParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + JitAvx2GroupConv() {kernel = nullptr;} + ~JitAvx2GroupConv() { + if (kernel) { + delete kernel; + } + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector*>& outputs, + ConvEltwiseParam ¶m, Context&ctx) override; + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector*>& outputs, + ConvEltwiseParam ¶m, Context&ctx) override; + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector*>& outputs, + ConvEltwiseParam ¶m) override; +private: + jit::jit_conv_conf_t conf; + jit::jit_avx2_group_conv_act_kernel *kernel = nullptr; + std::vector >> weights_internal; + std::shared_ptr > bias_internal; + Tensor _temp_output; + SaberStatus check_conf(const std::vector *>& inputs, + std::vector*>& outputs, + ConvEltwiseParam ¶m); +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX2_CONV_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.cpp new file mode 100644 index 000000000..942e0c104 --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.cpp @@ -0,0 +1,494 @@ +#include "saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.h" +#define GET_OFF(field) offsetof(jit_conv_call_t, field) +namespace anakin { +namespace saber { +namespace jit { + +using namespace Xbyak; + +inline void jit_avx2_group_conv_act_kernel::oh_step_unroll_kw(int ur_w, + int pad_l, int pad_r, int oc_blocks) { + int ic = jcp.ic; + int iw = jcp.iw; + int ih = jcp.ih; + int id = 1; + int kw = jcp.kw; + int kh = jcp.kh; + int kd = 1; + int nb_ic = jcp.nb_ic; + int stride_w = jcp.stride_w; + int dilate_w = jcp.dilate_w + 1; + int ic_blk = jcp.ic_block; + int oc_blk = jcp.oc_block; + + for (int ki = 0; ki < kw; ki++) { + int jj_start = utils::max(0, utils::div_up(pad_l - ki * dilate_w, stride_w)); + int jj_end = ur_w - utils::max(0, + utils::div_up(ki * dilate_w + pad_r - (kw - 1) * dilate_w, stride_w)); + + for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) { + for (int jj = jj_start; jj < jj_end; jj++) { + size_t inp_off = 0; + + if (jcp.src_fmt == Layout_NCHW) { + inp_off = sizeof(float) * ((size_t)ifm2 * id * ih * iw + + (ki * dilate_w + jj * stride_w - pad_l)); + } else { + inp_off = sizeof(float) * ((ki * dilate_w + jj * stride_w + - pad_l) * ic_blk + ifm2); + } + + vbroadcastss(Ymm(oc_blocks * ur_w + jj), + make_safe_addr(aux_reg_input, inp_off, reg_long_offt)); + } + + for (int ii = 0; ii < oc_blocks; ii++) { + int ker_off = ii * nb_ic * kd * kh * kw * ic_blk * oc_blk + + ki * ic_blk * oc_blk + ifm2 * oc_blk; + vmovups(ymm15, ptr[aux_reg_kernel + sizeof(float) * ker_off]); + + for (int jj = jj_start; jj < jj_end; jj++) { + vfmadd231ps(Ymm(ur_w * ii + jj), + Ymm(oc_blocks * ur_w + jj), ymm15); + } + } + } + } +} + +inline void jit_avx2_group_conv_act_kernel::oh_step_nopad(int ur_w, + int pad_l, int pad_r, char pad_tag, + int oc_blocks, char oc_blocks_tag) { + 
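// Register-layout sketch for this no-padding fast path (same convention as
// oh_step_unroll_kw above): Ymm(ur_w * ii + jj) accumulates the output for
// output-channel block ii at output column jj, Ymm(oc_blocks * ur_w + jj)
// holds the broadcast input pixel for column jj, and ymm15 carries the
// current weight vector. With the defaults chosen in init_conf
// (nb_oc_blocking = 4, ur_w = 3) the accumulators occupy Ymm0..Ymm11 and the
// broadcasts Ymm12..Ymm14, filling the 15 data registers permitted by the
// check ur_w * (nb_oc_blocking + 1) <= num_avail_regs, with ymm15 reserved
// for the weights.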
jit_tagged_label kw_label("kw", pad_tag, oc_blocks_tag); + + int iw = jcp.iw; + int ih = jcp.ih; + int id = 1; + int kw = jcp.kw; + int kh = jcp.kh; + int kd = 1; + int nb_ic = jcp.nb_ic; + int stride_w = jcp.stride_w; + int dilate_w = jcp.dilate_w + 1; + int ic_blk = jcp.ic_block; + int oc_blk = jcp.oc_block; + + xor_(ki_iter, ki_iter); + L(kw_label); + { + int jj_start = 0; + int jj_end = ur_w; + + for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) { + for (int jj = jj_start; jj < jj_end; jj++) { + size_t inp_off=0; + + if (jcp.src_fmt == Layout_NCHW) + inp_off = sizeof(float) * ((size_t)ifm2 * id * ih * iw + + (jj * stride_w - pad_l)); + else + inp_off = sizeof(float) * ((jj * stride_w - pad_l) * ic_blk + + ifm2); + + vbroadcastss(Ymm(oc_blocks * ur_w + jj), + make_safe_addr(aux_reg_input, inp_off, reg_long_offt)); + } + + for (int ii = 0; ii < oc_blocks; ii++) { + int aux_kernel_offset = + ii * nb_ic * kd * kh * kw * ic_blk * oc_blk + ifm2 * oc_blk; + vmovups(ymm15, ptr[aux_reg_kernel + + sizeof(float) * aux_kernel_offset]); + + for (int jj = jj_start; jj < jj_end; jj++) { + vfmadd231ps(Ymm(ur_w * ii + jj), + Ymm(oc_blocks * ur_w + jj), ymm15); + } + } + } + + add(aux_reg_kernel, sizeof(float) * oc_blk * ic_blk); + add(aux_reg_input, sizeof(float) * ((jcp.src_fmt == Layout_NCHW) + ? dilate_w : ic_blk * dilate_w)); + + inc(ki_iter); + cmp(ki_iter, kw); + jl(kw_label, T_NEAR); + } +} + +inline void jit_avx2_group_conv_act_kernel::width_blk_step(int ur_w, + int pad_l, int pad_r, char pad_tag, + int oc_blocks, char oc_blocks_tag) { + int iw = jcp.iw; + int kw = jcp.kw; + int ow = jcp.ow; + int oh = jcp.oh; + int od = 1; + int dilate_h = jcp.dilate_h + 1; + int dilate_w = jcp.dilate_w + 1; + int ic_blk = jcp.ic_block; + int oc_blk = jcp.oc_block; + + bool dw = jcp.is_dw; + const int inp_mult = (jcp.src_fmt == Layout_NCHW) + ? dilate_h : ic_blk * dilate_h; + const int inp_off = (jcp.src_fmt == Layout_NCHW) + ? 
dilate_w : ic_blk * dilate_w; + + jit_tagged_label init_done_label("init", pad_tag, oc_blocks_tag); + jit_tagged_label init_first_label("first", pad_tag, oc_blocks_tag); + + if (!jcp.with_sum) { + //if (dw) { + // jmp(init_first_label, T_NEAR); + //} + test(reg_ci_flag, FLAG_IC_FIRST); + jne(init_first_label, T_NEAR); + } + + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + size_t offt = + sizeof(float) * ((size_t)ii * od * oh * ow + jj) * oc_blk; + vmovups(Ymm(ur_w * ii + jj), + make_safe_addr(reg_output, offt, reg_long_offt)); + } + } + + if (jcp.with_sum && jcp.with_bias) { + //if (!dw) { + test(reg_ci_flag, FLAG_IC_FIRST); + je(init_done_label, T_NEAR); + //} + + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), + yword[reg_bias + sizeof(float) * ii * oc_blk]); + + } + } + } + + jmp(init_done_label); + + L(init_first_label); + + if (this->jcp.with_bias) { + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + vmovups(Ymm(ur_w * ii + jj), + yword[reg_bias + sizeof(float) * ii * oc_blk]); + } + } + } else { + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + uni_vpxor(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj)); + } + } + } + + L(init_done_label); + + mov(aux_reg_input, reg_input); + mov(aux_reg_kernel, reg_kernel); + + Label skip_kh_loop; + + mov(kj, reg_kh); + + if ((jcp.kh - 1) * (jcp.dilate_h + 1) < utils::max(jcp.t_pad, jcp.b_pad)) { + cmp(kj, 0); + je(skip_kh_loop, T_NEAR); + } + + jit_tagged_label kh_label("kh", pad_tag, oc_blocks_tag); + + L(kh_label); + { + if (jcp.kw >= 5 && pad_l == 0 && pad_r == 0) { + oh_step_nopad(ur_w, pad_l, pad_r, pad_tag, oc_blocks, + oc_blocks_tag); + sub(aux_reg_input, sizeof(float) * kw * inp_off); + add(aux_reg_input, sizeof(float) * iw * inp_mult); + } else { + oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks); + add(aux_reg_kernel, sizeof(float) * kw * oc_blk * ic_blk); + add(aux_reg_input, sizeof(float) * iw * inp_mult); + } + + dec(kj); + cmp(kj, 0); + jg(kh_label, T_NEAR); + } + + L(skip_kh_loop); + + jit_tagged_label done_label("done", pad_tag, oc_blocks_tag); + jit_tagged_label regular_store_label("store", pad_tag, oc_blocks_tag); + + if (this->jcp.with_relu) { + assert(oc_blocks * ur_w < 15); + //if (!dw) { + test(reg_ci_flag, FLAG_IC_LAST); + je(regular_store_label, T_NEAR); + //} + vxorps(yzero, yzero, yzero); + + if (jcp.relu_negative_slope == 0) { + ymm_relu_ns = yzero; + } else { + mov(imm_addr64, float2int(jcp.relu_negative_slope)); + movq(xmm_relu_ns, imm_addr64); + uni_vbroadcastss(ymm_relu_ns, xmm_relu_ns); + } + + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + const size_t o_off = sizeof(float) * ((size_t)ii * od * oh * ow + + jj) * oc_blk; + Ymm reg_out = Ymm(ur_w * ii + jj); + + vcmpgtps(ymask, reg_out, yzero); + vmulps(ymm_res_ns, ymm_relu_ns, reg_out); + vblendvps(reg_out, ymm_res_ns, reg_out, ymask); + vmovups(make_safe_addr(reg_output, o_off, reg_long_offt), + reg_out); + } + } + + jmp(done_label); + L(regular_store_label); + } + + for (int ii = 0; ii < oc_blocks; ii++) { + for (int jj = 0; jj < ur_w; jj++) { + const size_t o_off + = sizeof(float) * ((size_t)ii * od * oh * ow + jj) * oc_blk; + Ymm reg_out = Ymm(ur_w * ii + jj); + vmovups(make_safe_addr(reg_output, o_off, reg_long_offt), reg_out); + } + } + + L(done_label); +} + +inline void jit_avx2_group_conv_act_kernel::solve_common( + int oc_blocks, char 
oc_blocks_tag) { + int ur_w = jcp.ur_w; + int ur_w_tail = jcp.ur_w_tail; + int n_oi = jcp.ow / ur_w; + int iw = jcp.iw; + int kw = jcp.kw; + int ic_blk = jcp.ic_block; + int oc_blk = jcp.oc_block; + int dilate_w = jcp.dilate_w + 1; + int str_w = jcp.stride_w; + const int inp_mult = (jcp.src_fmt == Layout_NCHW) ? 1 : ic_blk; + + int l_pad = jcp.l_pad; + int r_pad = utils::max(0, (int(jcp.ow) - 1) * str_w + (kw - 1) * dilate_w + - (iw + l_pad - 1)); + int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w + - (iw + l_pad - 1); + + if (r_pad1 > 0) { + n_oi--; + } + + if (l_pad > 0) { + n_oi--; + + if (n_oi < 0 && r_pad1 > 0) + width_blk_step(ur_w, l_pad, r_pad1, + 'l', oc_blocks, oc_blocks_tag); // "lrpad" + else + width_blk_step(ur_w, l_pad, 0, + 'l', oc_blocks, oc_blocks_tag); // "lpad" + + add(reg_input, sizeof(float) * (ur_w * str_w - l_pad) * inp_mult); + add(reg_output, sizeof(float) * ur_w * oc_blk); + } + + jit_tagged_label ow_loop_label("ow", oc_blocks_tag); + xor_(oi_iter, oi_iter); + + if (n_oi > 0) { + L(ow_loop_label); + + width_blk_step(ur_w, 0, 0, + 'm', oc_blocks, oc_blocks_tag); // "middle" + add(reg_input, sizeof(float) * ur_w * str_w * inp_mult); + add(reg_output, sizeof(float) * ur_w * oc_blk); + + inc(oi_iter); + cmp(oi_iter, n_oi); + jl(ow_loop_label, T_NEAR); + } + + if (r_pad1 > 0 && n_oi >= 0) { + width_blk_step(ur_w, 0, r_pad1, + 'r', oc_blocks, oc_blocks_tag); // "rpad" + add(reg_input, sizeof(float) * ur_w * str_w * inp_mult); + add(reg_output, sizeof(float) * ur_w * oc_blk); + } + + if (ur_w_tail != 0) + width_blk_step(ur_w_tail, 0, r_pad, + 't', oc_blocks, oc_blocks_tag); // "tail" +} + +void jit_avx2_group_conv_act_kernel::generate() { + this->preamble(); + mov(reg_input, ptr[this->param1 + GET_OFF(src)]); + mov(reg_output, ptr[this->param1 + GET_OFF(dst)]); + mov(reg_kernel, ptr[this->param1 + GET_OFF(filt)]); + + if (jcp.with_bias) { + mov(reg_bias, ptr[this->param1 + GET_OFF(bias)]); + } + + mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]); + mov(reg_ci_flag, ptr[this->param1 + GET_OFF(flags)]); + mov(reg_oc_blocks, ptr[this->param1 + GET_OFF(oc_blocks)]); + + int nb_oc_tail = jcp.nb_oc % jcp.nb_oc_blocking; + const char* tail_label = ".tail"; + const char* exit_label = ".exit"; + + //if (jcp.is_dw) { + // solve_common(jcp.ic_block, '0'); + // jmp(exit_label, T_NEAR); + //} + + if (jcp.nb_oc > jcp.nb_oc_blocking) { + cmp(reg_oc_blocks, jcp.nb_oc_blocking); + jne(nb_oc_tail ? 
tail_label : exit_label, T_NEAR); + + solve_common(jcp.nb_oc_blocking, '0' + jcp.nb_oc_blocking); + jmp(exit_label, T_NEAR); + + if (nb_oc_tail) { + L(tail_label); + cmp(reg_oc_blocks, nb_oc_tail); + jne(exit_label, T_NEAR); + solve_common(nb_oc_tail, '0' + nb_oc_tail); + } + + L(exit_label); + } else if (jcp.nb_oc == jcp.nb_oc_blocking) { + solve_common(jcp.nb_oc_blocking, '0' + jcp.nb_oc_blocking); + } else { + solve_common(nb_oc_tail, '0' + nb_oc_tail); + } + + this->postamble(); +} + + +SaberStatus jit_avx2_group_conv_act_kernel::init_conf(jit_conv_conf_t& jcp) { + if (!mayiuse(avx2)) { + LOG(ERROR) << "init a AVX2 kernel in a non-avx2 machine is not permitted"; + return SaberUnImplError; + } + + jcp.ic = jcp.ic / jcp.ngroups; + jcp.oc = jcp.oc / jcp.ngroups; + + jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1) + - (jcp.ih + jcp.t_pad - 1); + + const int simd_w = 8; + const bool flat = jcp.src_fmt == Layout_NCHW; + const bool mimo = !flat; + + bool ok_to_pad_channels = true && jcp.ngroups == 1; + + if (ok_to_pad_channels) { + jcp.oc = utils::rnd_up(jcp.oc, simd_w); + + if (mimo) { + jcp.ic = utils::rnd_up(jcp.ic, simd_w); + } + } + + jcp.ur_h = 1; /* no code-unrolling by h so far */ + jcp.ur_w = 3; + + jcp.oc_block = simd_w; + jcp.nb_oc = jcp.oc / jcp.oc_block; + jcp.nb_oc_blocking = 4; + + // AVX and AVX2 kernels need 2 and 1 temporary YMMs, respectively + // Thus, we can only assign 14 or 15 YMMs for data storage + const int num_avail_regs = mayiuse(avx2) ? 15 : 14; + + if (!mayiuse(avx2)) { + if ((jcp.nb_oc_blocking + 1) * jcp.ur_w > num_avail_regs) { + // current register assignment requires more YMMs than available + // adjust one of nb_oc_block, ur_w preserving to ur_w >= l_pad + if (jcp.ur_w > jcp.l_pad && jcp.ur_w > 1) { + jcp.ur_w -= 1; + } else { + for (int b = 3; b > 1; b--) { + if (jcp.nb_oc % b == 0) { + jcp.nb_oc_blocking = b; + break; + } + } + } + } + } + + if (jcp.ow < jcp.ur_w) { + jcp.ur_w = jcp.ow; + } + + jcp.ur_w_tail = jcp.ow % jcp.ur_w; + bool args_ok = true + && jcp.oc % simd_w == 0 + && jcp.l_pad <= jcp.ur_w + && utils::implication(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) + || (jcp.stride_w == 1 && jcp.stride_h == 1)) + && utils::implication(mimo, jcp.ic % simd_w == 0); + + if (!args_ok) { + LOG(ERROR) << "arguments check failed"; + return SaberUnImplError; + } + + int r_pad_no_tail = utils::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); + + if (r_pad_no_tail > jcp.ur_w * jcp.stride_w && jcp.ow / jcp.ur_w > 1) { + /* recalculate ur_w, nb_oc_blocking and ur_w_tail */ + jcp.ur_w = utils::min(r_pad_no_tail / jcp.stride_w + jcp.ur_w_tail, + utils::min(jcp.ow, num_avail_regs / 2)); + jcp.nb_oc_blocking = (num_avail_regs - jcp.ur_w) / jcp.ur_w; + jcp.ur_w_tail = jcp.ow % jcp.ur_w; + /* check again ... */ + r_pad_no_tail = utils::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); + + if (jcp.ur_w < utils::max(jcp.l_pad, r_pad_no_tail)) { + return SaberUnImplError; + } + } + + assert(jcp.nb_oc_blocking > 0); + assert(jcp.ur_w * (jcp.nb_oc_blocking + 1) <= num_avail_regs); + + jcp.ic_block = flat ? 
jcp.ic : simd_w; + jcp.nb_ic = jcp.ic / jcp.ic_block; + + jcp.nb_ic_blocking = 12; + jcp.nb_ic_blocking_max = 16; + + return SaberSuccess; +} + +} // namespace jit +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.h new file mode 100644 index 000000000..4a2c21c4f --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_avx2_group_conv_kernel.h @@ -0,0 +1,69 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_GROUP_CONV_KERNEL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX2_GROUP_CONV_KERNEL_H + +#include +#include + +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" +#include "saber/saber_types.h" +#include "saber/funcs/impl/x86/x86_utils.h" + +namespace anakin { +namespace saber { +namespace jit { + +struct jit_avx2_group_conv_act_kernel: public jit_generator { + + jit_avx2_group_conv_act_kernel(jit_conv_conf_t ajcp) : jcp(ajcp) { + this->generate(); + jit_ker = (void (*)(jit_conv_call_t *))this->getCode(); + } + + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_group_conv_act_kernel); + + static SaberStatus init_conf(jit_conv_conf_t &jcp); + + jit_conv_conf_t jcp; + void (*jit_ker)(jit_conv_call_t *); + +private: + using reg64_t = const Xbyak::Reg64; + reg64_t reg_input = rax; + reg64_t aux_reg_input = r8; + reg64_t reg_kernel = rdx; + reg64_t aux_reg_kernel = r9; + reg64_t reg_output = rsi; + reg64_t reg_bias = rbx; + + reg64_t kj = r10; + reg64_t oi_iter = r11; + reg64_t ki_iter = r12; + reg64_t reg_kh = abi_not_param1; + reg64_t reg_oc_blocks = r14; + reg64_t imm_addr64 = r15; + reg64_t reg_long_offt = r15; + Xbyak::Reg32 reg_ci_flag = r13d; + + Xbyak::Xmm xmm_relu_ns = Xbyak::Xmm(13); + Xbyak::Ymm ymm_relu_ns = Xbyak::Ymm(13); + Xbyak::Ymm ymm_res_ns = Xbyak::Ymm(12); + Xbyak::Ymm yzero = Xbyak::Ymm(15); + Xbyak::Ymm ymask = Xbyak::Ymm(14); + + inline void oh_step_unroll_kw(int ur_w, int pad_l, int pad_r, + int oc_blocks); + inline void oh_step_nopad(int ur_w, int pad_l, int pad_r, + char pad_label, int oc_blocks, char oc_blocks_label); + inline void width_blk_step(int ur_w, int pad_l, int pad_r, + char pad_label, int oc_blocks, char oc_blocks_label); + inline void solve_common(int oc_blocks, char oc_blocks_label); + + void generate(); +}; + +} // namespace jit +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX2_GROUP_CONV_ACT_KERNEL_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_conv.cpp index ad2c16e4e..b53293822 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv.cpp +++ b/saber/funcs/impl/x86/kernel/jit_avx512_conv.cpp @@ -3,16 +3,16 @@ #include "saber/funcs/impl/x86/kernel/jit_avx512_conv.h" #include "saber/funcs/impl/x86/kernel/jit_call_conf.h" #include "saber/funcs/impl/x86/x86_utils.h" - +#include "tensor_op.h" namespace anakin { namespace saber { using namespace jit; -using jit_conv_ker_t = void (*)(jit_conv_call_t *); +using jit_conv_ker_t = void (*)(jit_conv_call_t*); -inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t &p, - const void *src, const void *dst, const void *filt, const void *bias, +inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_t& p, + const void* src, const void* dst, const void* filt, const void* bias, int channel, int kh_padding) { #define PIPELINE(field) \ do { \ @@ -35,31 +35,39 @@ inline void jit_conv_ker_pipeline(jit_conv_ker_t 
ker, jit_conv_call_t &p, template <> SaberStatus JitAvx512Conv::check_conf( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m) { - ConvParam *conv_param = &(param.conv_param); - const Tensor *weights = conv_param->weight(); - const Tensor *bias = conv_param->bias(); + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + ConvParam* conv_param = &(param.conv_param); + const Tensor* weights = conv_param->weight(); + const Tensor* bias = conv_param->bias(); const jit_conv_conf_t jcp = kernel->jcp; - Tensor *input = inputs[0]; - Tensor *output = outputs[0]; - conf.is_1stconv = utils::one_of(input->channel(), 1, 3); - + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; // check format + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); + conf.is_1stconv = input_layout == Layout_NCHW + && utils::one_of(input->channel(), 1, 3); //utils::one_of(input->channel(), 1, 3); + + if (conf.is_1stconv) { - if (!(inputs[0]->get_layout() == Layout_NCHW && - (outputs[0]->get_layout() == Layout_NCHW_C16 || - outputs[0]->get_layout() == Layout_NHWC) && - weights->get_layout() == Layout_NCHW)) { - LOG(ERROR) << "1stconv wrong format "; + bool is_layout_ok = (input_layout == Layout_NCHW) + && (output_layout == Layout_NHWC || output_layout == Layout_NCHW_C16 + || output_layout == Layout_NCHW_C16R || output_layout == Layout_NCHW) + && weights->get_layout() == Layout_NCHW; + + if (!is_layout_ok) { + LOG(FATAL) << "1stconv wrong format "; return SaberUnImplError; } } else { - if ((inputs[0]->get_layout() != Layout_NCHW_C16) - || (outputs[0]->get_layout() != Layout_NCHW_C16) - || (conv_param->weight()->get_layout() != Layout_NCHW)) { - LOG(ERROR) << "wrong format"; + bool is_layout_ok = (input_layout == Layout_NCHW_C16 || input_layout == Layout_NCHW_C16R) && + (output_layout == Layout_NCHW_C16 || output_layout == Layout_NCHW_C16R) && + (conv_param->weight()->get_layout() == Layout_NCHW); + + if (!is_layout_ok) { + LOG(FATAL) << "wrong format"; return SaberUnImplError; } } @@ -70,8 +78,8 @@ SaberStatus JitAvx512Conv::check_conf( && jcp.l_pad == conv_param->pad_w && jcp.stride_h == conv_param->stride_h && jcp.stride_w == conv_param->stride_w - && jcp.dilate_h == conv_param->dilation_h - && jcp.dilate_w == conv_param->dilation_w; + && jcp.dilate_h == (conv_param->dilation_h <= 0 ? 0 : (conv_param->dilation_h - 1)) + && jcp.dilate_w == (conv_param->dilation_w <= 0 ? 
0 : (conv_param->dilation_w - 1)); // check shape bool shape_ok = true @@ -96,21 +104,22 @@ SaberStatus JitAvx512Conv::check_conf( template <> SaberStatus JitAvx512Conv::create( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m, - Context &ctx) { + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + Context& ctx) { SaberStatus status; - ConvParam *conv_param = &(param.conv_param); - ActivationParam *act_param = nullptr; - const Tensor *weights = conv_param->weight(); - Tensor *output = outputs[0]; - Tensor *input = inputs[0]; + ConvParam* conv_param = &(param.conv_param); + ActivationParam* act_param = nullptr; + const Tensor* weights = conv_param->weight(); + Tensor* output = outputs[0]; + Tensor* input = inputs[0]; // check conf if (kernel) { status = check_conf(inputs, outputs, param); - if(status != SaberNotInitialized) { + + if (status != SaberNotInitialized) { return status; } } @@ -138,12 +147,15 @@ SaberStatus JitAvx512Conv::create( conf.dilate_w = conv_param->dilation_w <= 0 ? 0 : (conv_param->dilation_w - 1); conf.with_relu = conv_param->activation_param.has_active; + if (conf.with_relu) { act_param = &(conv_param->activation_param); conf.relu_negative_slope = static_cast(act_param->negative_slope); } - conf.with_bias = (conv_param->bias() != NULL); + conf.with_bias = (conv_param->bias() != nullptr && conv_param->bias()->valid_size() > 0); + + conf.with_sum = false; conf.dst_dt = output->get_dtype(); if (outputs[0]->get_layout() == Layout_NHWC) { @@ -152,25 +164,49 @@ SaberStatus JitAvx512Conv::create( conf.output_nhwc = false; } - status = jit_conv_kernel::init_conf(conf); + status = jit_conv_act_kernel::init_conf(conf); + if (status == SaberSuccess) { if (kernel != nullptr) { delete kernel; kernel = nullptr; } - kernel = new jit_conv_kernel(conf); + + kernel = new jit_conv_act_kernel(conf); } else { + LOG(FATAL) << "jit_conv_act_kernel SaberUnImplError "; return SaberUnImplError; } // reorder weights - Tensor *weights_reorder = conv_param->mutable_weight(); + Tensor* weights_reorder = conv_param->mutable_weight(); weights_internal.reset(new Tensor(weights_reorder->valid_shape())); if (inputs[0]->get_layout() == Layout_NCHW) { weight_reorder_OIhwi16o(*weights_reorder, *weights_internal); - } else if (inputs[0]->get_layout() == Layout_NCHW_C16) { + } else if (inputs[0]->get_layout() == Layout_NCHW_C16 + || inputs[0]->get_layout() == Layout_NCHW_C16R) { weight_reorder_OIhw16i16o(*weights_reorder, *weights_internal); + } else { + LOG(FATAL) << "unsupport "; + } + + if (output[0].get_dtype() == AK_UINT8) { + CHECK(output[0].get_scale().size() > 0); + float scale = 1.f / (output[0].get_scale()[0] * (127.f / 255.f)); + utils::ScaleUtils::scale_fp32_fp32(*weights_internal, scale); + + if ((conv_param->bias() != nullptr && conv_param->bias()->valid_size() > 0)) { + utils::try_expand_tensor(bias_internal, conv_param->bias()->valid_shape()); + bias_internal.copy_from(*conv_param->bias()); + utils::ScaleUtils::scale_fp32_fp32(bias_internal, scale); + } + } + + if (output->get_layout() == Layout_NCHW) { + utils::try_expand_tensor(_inner_tensor, Shape({output->num(), conf.oc, conf.oh, conf.ow}, Layout_NCHW_C16R)); + DLOG(INFO) << "try_expand_tensor " << _inner_tensor.valid_size() << "," << conf.oc << "," << conf.oh + << "," << conf.ow; } return SaberSuccess; @@ -178,75 +214,76 @@ SaberStatus JitAvx512Conv::create( template <> SaberStatus JitAvx512Conv::init( - const std::vector*>& inputs, - std::vector*>& outputs, - 
ConvEltwiseParam ¶m, - Context &ctx) { + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + Context& ctx) { SaberStatus ret = SaberSuccess; - ConvParam *conv_param = &(param.conv_param); - Tensor *input = inputs[0]; - conf.is_1stconv = utils::one_of(input->channel(), 1, 3); + ConvParam* conv_param = &(param.conv_param); + Tensor* input = inputs[0]; + + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); + conf.is_1stconv = input_layout == Layout_NCHW; //utils::one_of(input->channel(), 1, 3); if (conf.is_1stconv) { - if (!(inputs[0]->get_layout() != Layout_NCHW && - (outputs[0]->get_layout() == Layout_NCHW_C16 || - outputs[0]->get_layout() != Layout_NHWC) && - conv_param->weight()->get_layout() != Layout_NCHW )) { - LOG(ERROR) << "data layout is not supported"; + bool is_layout_ok = (input_layout == Layout_NCHW) + && (output_layout == Layout_NHWC || output_layout == Layout_NCHW_C16 + || output_layout == Layout_NCHW_C16R || output_layout == Layout_NCHW) + && conv_param->weight()->get_layout() == Layout_NCHW; + + if (!is_layout_ok) { + LOG(FATAL) << "1stconv wrong format "; return SaberUnImplError; } } else { - if ((inputs[0]->get_layout() != Layout_NCHW_C16) - || (outputs[0]->get_layout() != Layout_NCHW_C16) - || (conv_param->weight()->get_layout() != Layout_NCHW)) { - LOG(ERROR) << "data layout is not supported"; + bool is_layout_ok = (input_layout == Layout_NCHW_C16 || input_layout == Layout_NCHW_C16R) && + (output_layout == Layout_NCHW_C16 || output_layout == Layout_NCHW_C16R) && + (conv_param->weight()->get_layout() == Layout_NCHW); + + if (!is_layout_ok) { + LOG(FATAL) << "wrong format"; return SaberUnImplError; } } this->_ctx = &ctx; ret = create(inputs, outputs, param, ctx); + if (ret != SaberSuccess) { - LOG(ERROR) << "create failed"; + LOG(FATAL) << "create failed"; return ret; } + return ret; } template <> -SaberStatus JitAvx512Conv::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m) { +SaberStatus JitAvx512Conv::dispatch_nchw_c16( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { - ConvParam *conv_param = &(param.conv_param); - const Tensor *bias = conv_param->bias(); - const DataType type = outputs[0]->get_dtype(); - const float *ptr_src = reinterpret_cast(inputs[0]->data()); - const float *ptr_weights = reinterpret_cast(weights_internal->data()); - const float *ptr_bias = reinterpret_cast(bias->data()); - - auto ptr_dst = NULL; - switch (type){ - case AK_UINT8: ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); break; - case AK_INT8: ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); break; - case AK_UINT32: ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); break; - case AK_INT32: ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); break; - case AK_FLOAT: ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); break; - default: LOG(FATAL) << "data type: " << type << " is unsupported now"; - } - //ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + ConvParam* conv_param = &(param.conv_param); + const Tensor* bias = conv_param->bias(); + const DataType type = outputs[0]->get_dtype(); - const auto &jcp = kernel->jcp; + const float* ptr_src = reinterpret_cast(inputs[0]->data()); + const float* ptr_weights = reinterpret_cast(weights_internal->data()); + const float* ptr_bias = reinterpret_cast(bias->data()); + DLOG(INFO) << "outputs " << outputs.size() << "," << outputs[0]->valid_shape(); + auto 
ptr_dst = static_cast(outputs[0]->mutable_data()); -#pragma omp parallel + const auto& jcp = kernel->jcp; + DLOG(INFO) << "dispatch_nchw_c16 " << jcp.is_1stconv << "," << jcp.output_nhwc; + #pragma omp parallel { - int ithr = omp_get_thread_num(), nthr = omp_get_num_threads(); + int ithr = anakin_get_thread_num(), nthr = anakin_get_num_threads(); int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; int start, end, start_copy; int work_amount = jcp.mb * jcp.ngroups * oc_chunks * jcp.oh; - utils::balance211(work_amount, nthr, ithr, start, end); + balance211(work_amount, nthr, ithr, start, end); start_copy = start; auto par_conv = jit_conv_call_t(); @@ -262,19 +299,14 @@ SaberStatus JitAvx512Conv::dispatch( wht_ic_stride = jcp.oc_block; } - // for output layout NHWC, dst_h_stride = ow * oc; - if (outputs[0]->get_layout() == Layout_NHWC) { - dst_h_stride = jcp.ow * oc_chunks * jcp.oc_block; - } - for (int icb_l2 = 0; icb_l2 < jcp.nb_ic; icb_l2 += jcp.nb_ic_L2) { start = start_copy; int n{0}, g{0}, occ{0}, oh_s{0}; + if (jcp.loop_order == conv_loop_order_t::loop_cgn) { - utils::nd_iterator_init(start, occ, oc_chunks, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); - } - else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { - utils::nd_iterator_init(start, g, jcp.ngroups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); + nd_iterator_init(start, occ, oc_chunks, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + } else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { + nd_iterator_init(start, g, jcp.ngroups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); } while (start < end) { @@ -292,17 +324,13 @@ SaberStatus JitAvx512Conv::dispatch( (g_ocb * jcp.oh * jcp.ow + oh_s * jcp.ow) * jcp.oc_block; size_t src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + (g_icb + icb_l2) * jcp.ih * jcp.iw * jcp.ic_block + ih_s * jcp.iw * jcp.ic_block; - size_t weight_blk_off= ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block + - icb_l2 * jcp.kh * jcp.kw * jcp.oc_block * jcp.ic_block; + size_t weight_blk_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block + + icb_l2 * jcp.kh * jcp.kw * jcp.oc_block * jcp.ic_block; if (jcp.is_1stconv) { src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + ih_s * jcp.iw; weight_blk_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block; } - // for output layout NHWC, dst_blk_off = n * n_stride + h * h_stride + c_offset; - if (outputs[0]->get_layout() == Layout_NHWC) { - dst_blk_off = n * jcp.oh * jcp.ow * jcp.oc + oh_s * jcp.ow * jcp.oc + g_ocb * jcp.oc_block; - } auto bias_w = ptr_bias ? 
ptr_bias + bias_blk_off : 0; auto dst_w = ptr_dst + dst_blk_off; @@ -310,12 +338,12 @@ SaberStatus JitAvx512Conv::dispatch( auto wht_w = ptr_weights + weight_blk_off; for (int icb = icb_l2; - icb < utils::min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2); ++icb) { + icb < utils::min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2); ++icb) { auto src_c = src_w; auto dst_c = dst_w; - int offset = dst_blk_off; + for (int oj = oh_s, ij = ih_s; - oj < oh_e; ++oj, ij += jcp.stride_h) { + oj < oh_e; ++oj, ij += jcp.stride_h) { int i_t_overflow = -utils::min(0, ij); int i_b_overflow = utils::max(jcp.ih, ij + jcp.kh) - jcp.ih; @@ -325,20 +353,19 @@ SaberStatus JitAvx512Conv::dispatch( src_c + i_t_overflow * src_h_stride, dst_c, wht_w + i_t_overflow * wht_h_stride, bias_w, icb, kh_padding); - src_c += src_h_stride * jcp.stride_h; dst_c += dst_h_stride; - offset += dst_h_stride; } + src_w += src_c_stride; wht_w += wht_ic_stride; } if (jcp.loop_order == conv_loop_order_t::loop_cgn) { - utils::nd_iterator_jump(start, end, - occ, oc_chunks, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + nd_iterator_jump(start, end, + occ, oc_chunks, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); } else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { - utils::nd_iterator_jump(start, end, g, jcp.ngroups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); + nd_iterator_jump(start, end, g, jcp.ngroups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); } } } @@ -351,7 +378,157 @@ SaberStatus JitAvx512Conv::dispatch( return SaberSuccess; } -template class JitAvx512Conv; +template <> +SaberStatus JitAvx512Conv::dispatch_nhwc( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + + CHECK(outputs[0]->get_dtype() == AK_UINT8); + ConvParam* conv_param = &(param.conv_param); + const Tensor* bias = conv_param->bias(); + const DataType type = outputs[0]->get_dtype(); + + const float* ptr_src = reinterpret_cast(inputs[0]->data()); + const float* ptr_weights = reinterpret_cast(weights_internal->data()); + const float* ptr_bias = reinterpret_cast(bias_internal.data()); + + auto ptr_dst = static_cast(outputs[0]->mutable_data()); + + const auto& jcp = kernel->jcp; + DLOG(INFO) << "dispatch_nhwc " << jcp.is_1stconv << "," << jcp.output_nhwc; + #pragma omp parallel + { + int ithr = omp_get_thread_num(), nthr = omp_get_num_threads(); + int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; + int start, end, start_copy; + int work_amount = jcp.mb * jcp.ngroups * jcp.oh; + + balance211(work_amount, nthr, ithr, start, end); + start_copy = start; + + auto par_conv = jit_conv_call_t(); + size_t src_h_stride = jcp.iw * jcp.ic_block; + size_t src_c_stride = jcp.ih * jcp.iw * jcp.ic_block; + size_t dst_h_stride = jcp.ow * jcp.oc; + size_t wht_h_stride = jcp.kw * jcp.ic_block * jcp.oc_block; + size_t wht_ic_stride = jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block; + + if (jcp.is_1stconv) { + src_h_stride = jcp.iw; + src_c_stride = jcp.ih * jcp.iw; + wht_ic_stride = jcp.oc_block; + } else { + LOG(FATAL) << "not support"; + } + + for (int icb_l2 = 0; icb_l2 < jcp.nb_ic; icb_l2 += jcp.nb_ic_L2) { + start = start_copy; + int n{0}, g{0}, oh_s{0}; + + if (jcp.loop_order == conv_loop_order_t::loop_cgn) { + nd_iterator_init(start, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + } else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { + nd_iterator_init(start, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + } + + while (start < end) { + for (int occ = 0; occ < oc_chunks; occ++) { + int ocb = occ * jcp.nb_oc_blocking; + int g_ocb = g * jcp.nb_oc + ocb; + int g_oc = g_ocb * jcp.oc_block; 
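// ----------------------------------------------------------------------------
// Aside (not part of the patch): a minimal, standalone sketch of the
// balance211 / nd_iterator_init pattern that the dispatch loops above rely on.
// balance211 splits a flat work amount into near-equal [start, end) ranges per
// OpenMP thread; nd_iterator_init then decomposes the flat start index back
// into multi-dimensional coordinates such as (mb, group, oc_chunk, oh).
// The names mirror the diff, but these fixed-arity helpers are simplified
// stand-ins, not the Anakin utils implementation.
// ----------------------------------------------------------------------------
#include <cstdio>

static void balance211(int work, int nthr, int ithr, int& start, int& end) {
    // First (work % nthr) threads get one extra work item.
    int base = work / nthr;
    int rem  = work % nthr;
    start = ithr * base + (ithr < rem ? ithr : rem);
    end   = start + base + (ithr < rem ? 1 : 0);
}

// Decompose a flat index over dims {d0, d1, d2, d3}; the first listed
// dimension is outermost, the last is innermost (same convention as the
// variadic nd_iterator_init used in the diff).
static void nd_iterator_init4(int flat, int& i0, int d0, int& i1, int d1,
                              int& i2, int d2, int& i3, int d3) {
    i3 = flat % d3; flat /= d3;
    i2 = flat % d2; flat /= d2;
    i1 = flat % d1; flat /= d1;
    i0 = flat % d0;
}

int main() {
    const int mb = 2, ngroups = 1, oc_chunks = 4, oh = 28;
    const int work_amount = mb * ngroups * oc_chunks * oh;
    const int nthr = 8;
    for (int ithr = 0; ithr < nthr; ++ithr) {
        int start = 0, end = 0;
        balance211(work_amount, nthr, ithr, start, end);
        int n, g, occ, oh_s;
        nd_iterator_init4(start, n, mb, g, ngroups, occ, oc_chunks, oh_s, oh);
        std::printf("thr %d: [%d,%d) -> n=%d g=%d occ=%d oh_s=%d\n",
                    ithr, start, end, n, g, occ, oh_s);
    }
    return 0;
}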
+ int g_icb = g * jcp.nb_ic; + + int work_rem = end - start; + int ih_s = -jcp.t_pad + oh_s * jcp.stride_h; + int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem; + + size_t bias_blk_off = g_oc; + size_t dst_blk_off = n * jcp.oh * jcp.ow * jcp.oc + oh_s * jcp.ow * jcp.oc + g_ocb * jcp.oc_block; + size_t src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + + (g_icb + icb_l2) * jcp.ih * jcp.iw * jcp.ic_block + ih_s * jcp.iw * jcp.ic_block; + size_t weight_blk_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block + + icb_l2 * jcp.kh * jcp.kw * jcp.oc_block * jcp.ic_block; + + if (jcp.is_1stconv) { + src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + ih_s * jcp.iw; + weight_blk_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block; + } else { + LOG(FATAL) << "not support"; + } + + auto bias_w = ptr_bias ? ptr_bias + bias_blk_off : 0; + auto dst_w = ptr_dst + dst_blk_off; + auto src_w = ptr_src + src_blk_off; + auto wht_w = ptr_weights + weight_blk_off; + + for (int icb = icb_l2; + icb < utils::min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2); ++icb) { + auto src_c = src_w; + auto dst_c = dst_w; + + for (int oj = oh_s, ij = ih_s; + oj < oh_e; ++oj, ij += jcp.stride_h) { + + int i_t_overflow = -utils::min(0, ij); + int i_b_overflow = utils::max(jcp.ih, ij + jcp.kh) - jcp.ih; + int kh_padding = utils::max(0, jcp.kh - i_t_overflow - i_b_overflow); + + jit_conv_ker_pipeline(kernel->jit_ker, par_conv, + src_c + i_t_overflow * src_h_stride, + dst_c, wht_w + i_t_overflow * wht_h_stride, + bias_w, icb, kh_padding); + src_c += src_h_stride * jcp.stride_h; + dst_c += dst_h_stride; + } + + src_w += src_c_stride; + wht_w += wht_ic_stride; + } + } + + if (jcp.loop_order == conv_loop_order_t::loop_cgn) { + nd_iterator_jump(start, end, + g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + } else if (jcp.loop_order == conv_loop_order_t::loop_gnc) { + nd_iterator_jump(start, end, g, jcp.ngroups, n, jcp.mb, oh_s, jcp.oh); + } + } + } + + jit_conv_ker_pipeline(kernel->jit_ker, par_conv, + ptr_src, ptr_dst, ptr_weights, ptr_bias, 0, 0); + + } + + return SaberSuccess; +} +template <> +SaberStatus JitAvx512Conv::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + + const auto& jcp = kernel->jcp; + + if (outputs[0]->get_layout() == Layout_NCHW) { + std::vector*> temp_tensor_vec; + temp_tensor_vec.push_back(&_inner_tensor); + // print_tensor(*inputs[0]); + dispatch_nchw_c16(inputs, temp_tensor_vec, param); + // LOG(INFO)<<"dispatch_nchw_c16 finish"; + // print_tensor(_inner_tensor); + utils::reorder_nchwc_nchw(_inner_tensor, *outputs[0]); + return SaberSuccess; + } else if (jcp.output_nhwc) { + return dispatch_nhwc(inputs, outputs, param); + } else { + return dispatch_nchw_c16(inputs, outputs, param); + } + +} + +// template class JitAvx512Conv; } // namespace saber } // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv.h b/saber/funcs/impl/x86/kernel/jit_avx512_conv.h index 983fec4ca..f8d8f3ecb 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv.h +++ b/saber/funcs/impl/x86/kernel/jit_avx512_conv.h @@ -20,7 +20,7 @@ #include "saber/funcs/impl/impl_base.h" #include "saber/core/tensor.h" #include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h" +#include "saber/funcs/impl/x86/kernel/jit_avx512_conv_act_kernel.h" #include "saber/saber_funcs_param.h" namespace anakin{ @@ -60,11 +60,19 @@ typedef typename DataTrait::Dtype OpDataType; private: jit::jit_conv_conf_t conf; - jit::jit_conv_kernel *kernel = nullptr; + 
jit::jit_conv_act_kernel *kernel = nullptr; std::shared_ptr > weights_internal; + Tensor bias_internal; SaberStatus check_conf(const std::vector*>& inputs, std::vector*>& outputs, ConvEltwiseParam ¶m); + SaberStatus dispatch_nchw_c16(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam ¶m); + SaberStatus dispatch_nhwc(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam ¶m); + Tensor _inner_tensor; }; diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1.cpp index 6f3ddee35..a888a9b8b 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1.cpp +++ b/saber/funcs/impl/x86/kernel/jit_avx512_conv1x1.cpp @@ -95,7 +95,7 @@ struct memory_block_t { memory_block_t(LayoutType layout_type, Shape &shape) { int ndims = 0; - if (layout_type == Layout_NCHW_C16) { + if (layout_type == Layout_NCHW_C16R) { ndims = 4; } else if (layout_type == Layout_GOIHW16I16O) { @@ -106,7 +106,7 @@ struct memory_block_t { } shape_to_jit_dim(md_dims, shape); - if (layout_type == Layout_NCHW_C16) { + if (layout_type == Layout_NCHW_C16R) { fill_nChw16c(md_dims, ndims, strides); } else if (layout_type == Layout_GOIHW16I16O) { @@ -141,26 +141,6 @@ void JitAvx512Conv1x1::prepare_rtus() { return; } -template -void balance2D(U nthr, U ithr, T ny, T &ny_start, T &ny_end, - T nx, T &nx_start, T &nx_end, T nx_divider) { - const T grp_size = utils::div_up(nthr, nx_divider); - const T grp_count = utils::div_up(nthr, grp_size); - - T grp = ithr / grp_size; - T grp_ithr = ithr % grp_size; - T grp_nthr = grp_size; - T first_grps = nthr % grp_count; - if (first_grps > 0 && grp >= first_grps) { - ithr -= first_grps * grp_size; - grp_nthr--; - grp = ithr / grp_nthr + first_grps; - grp_ithr = ithr % grp_nthr; - } - utils::balance211(nx, grp_count, grp, nx_start, nx_end); - utils::balance211(ny, grp_nthr, grp_ithr, ny_start, ny_end); -} - template <> SaberStatus JitAvx512Conv1x1::check_conf( @@ -174,19 +154,14 @@ SaberStatus JitAvx512Conv1x1::check_conf( const jit_1x1_conv_conf_t jcp = kernel->jcp; Tensor *input = inputs[0]; Tensor *output = outputs[0]; + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); - // check format -// if (!(typeid(LayOutType_in) == typeid(NCHW_C16) && -// typeid(LayOutType_out) == typeid(NCHW_C16) && -// typeid(LayOutType_op) == typeid(NCHW))) { -// LOG(ERROR) << "wrong format"; -// return SaberUnImplError; -// } - if ((inputs[0]->get_layout() != Layout_NCHW_C16) - || (outputs[0]->get_layout() != Layout_NCHW_C16) + if ((inputs[0]->get_layout() != Layout_NCHW_C16R) + || (outputs[0]->get_layout() != Layout_NCHW_C16R) || (conv_param->weight()->get_layout() != Layout_NCHW)) { - LOG(ERROR) << "wrong format"; + LOG(FATAL) << "wrong format"; return SaberUnImplError; } @@ -203,17 +178,17 @@ SaberStatus JitAvx512Conv1x1::check_conf( && jcp.kw == weights->width() && jcp.ngroups == 1 && jcp.mb == input->num() - && jcp.ic == input->channel() + && jcp.ic == utils::round_up(input->channel(), 16) && jcp.ih == input->height() && jcp.iw == input->width() - && jcp.oc == output->channel() + && jcp.oc == utils::round_up(output->channel(), 16) && jcp.oh == output->height() && jcp.ow == output->width(); if (param_ok && shape_ok) { return SaberSuccess; } else { - LOG(INFO) << "param or shape changed, re-init kernel"; + LOG(FATAL) << "param or shape changed, re-init kernel"; return SaberNotInitialized; } @@ -244,11 +219,11 @@ SaberStatus JitAvx512Conv1x1::create( conf.ngroups = 
with_groups ? weights->num() : 1; conf.mb = input->num(); - conf.ic = input->channel() / conf.ngroups; + conf.ic = utils::round_up(input->channel(), 16) / conf.ngroups; conf.ih = input->height(); conf.iw = input->width(); - conf.oc = output->channel() / conf.ngroups; + conf.oc = utils::round_up(output->channel(), 16) / conf.ngroups; conf.oh = output->height(); conf.ow = output->width(); @@ -264,7 +239,7 @@ SaberStatus JitAvx512Conv1x1::create( act_param = &(conv_param->activation_param); conf.relu_negative_slope = static_cast(act_param->negative_slope); } - conf.with_bias = !(conv_param->bias() == nullptr); + conf.with_bias = (conv_param->bias() != nullptr&&conv_param->bias()->valid_size()>0); conv_d.n = input->num(); conv_d.ic = input->channel() / conf.ngroups; @@ -280,7 +255,7 @@ SaberStatus JitAvx512Conv1x1::create( prepare_rtus(); - status = jit_avx512_common_1x1_conv_kernel::init_conf(conf, conv_d, omp_get_max_threads(), reduce_src); + status = jit_avx512_common_1x1_conv_kernel::init_conf(conf, conv_d, anakin_get_max_threads(), reduce_src); if (status == SaberSuccess) { if (kernel != nullptr) { delete kernel; @@ -310,19 +285,16 @@ SaberStatus JitAvx512Conv1x1::init( ConvEltwiseParam ¶m, Context &ctx) { ConvParam *conv_param = &(param.conv_param); -// if (!(typeid(LayOutType_in) == typeid(NCHW_C16) && -// typeid(LayOutType_out) == typeid(NCHW_C16) && -// typeid(LayOutType_op) == typeid(NCHW)) -// ) { -// return SaberUnImplError; -// } - if ((inputs[0]->get_layout() != Layout_NCHW_C16) - || (outputs[0]->get_layout() != Layout_NCHW_C16) + + if ((inputs[0]->get_layout() != Layout_NCHW_C16R) + || (outputs[0]->get_layout() != Layout_NCHW_C16R) || (conv_param->weight()->get_layout() != Layout_NCHW)) { LOG(ERROR) << "wrong format"; return SaberUnImplError; } + CHECK_EQ(conv_param->pad_w,0)<<"pad must == 0"; + CHECK_EQ(conv_param->pad_h,0)<<"pad must == 0"; this->_ctx = &ctx; @@ -370,7 +342,7 @@ SaberStatus JitAvx512Conv1x1::dispatch( #pragma omp parallel { - int ithr = omp_get_thread_num(), nthr = omp_get_num_threads(); + int ithr = anakin_get_thread_num(), nthr = anakin_get_num_threads(); jit_1x1_conv_call_t p; @@ -402,16 +374,16 @@ SaberStatus JitAvx512Conv1x1::dispatch( iw = utils::max(ow * stride_w - pad_l, 0); rp.iw_start = iw; - p.bcast_dim = this_block_size(os, jcp.os, - bcast_step * os_block); + p.bcast_dim = utils::this_block_size(os, jcp.os, + bcast_step * os_block); rp.os = p.bcast_dim; }; auto init_load = [&](int ocb, int &load_step) { load_step = step(jcp.nb_load_blocking, ocb_end - ocb, jcp.nb_load_blocking_max); - p.load_dim = this_block_size(ocb * jcp.oc_block, - ocb_end * jcp.oc_block, load_step * jcp.oc_block); + p.load_dim = utils::this_block_size(ocb * jcp.oc_block, + ocb_end * jcp.oc_block, load_step * jcp.oc_block); }; auto init_reduce = [&](int icb) { @@ -422,8 +394,8 @@ SaberStatus JitAvx512Conv1x1::dispatch( | (icb + nb_ic_blocking_step >= nb_ic ? 
FLAG_REDUCE_LAST : 0); - p.reduce_dim = this_block_size(icb * jcp.ic_block, - jcp.ic, nb_ic_blocking_step * jcp.ic_block); + p.reduce_dim = utils::this_block_size(icb * jcp.ic_block, + jcp.ic, nb_ic_blocking_step * jcp.ic_block); rp.icb = p.reduce_dim / jcp.reduce_block; }; diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.cpp deleted file mode 100644 index ad7917c92..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.cpp +++ /dev/null @@ -1,593 +0,0 @@ -#include - -#include "jit_avx512_conv_kernel.h" - -#define GET_OFF(field) offsetof(jit_conv_call_t, field) -#define KNx_L2_EFFECTIVE_CAPACITY ((512 - 64) * 1024) - -namespace anakin { -namespace saber { -namespace jit { - -using namespace Xbyak; - -static unsigned int L1_cache_size = get_cache_size(1, true); - -static inline void pick_loop_order(jit_conv_conf_t &jcp) { - // auto w = jcp.ow; - // auto h = jcp.oh; - switch (jcp.ver) { - case ver_fma: - jcp.loop_order = loop_cgn; - break; - default: - assert(!"unsupported convolution version"); - } -} - - -void jit_conv_kernel::prepare_output(int ur_w) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - vpxord(zmm, zmm, zmm); - int aux_output_offset = get_output_offset(j, k); - mic_prefetcht1(EVEX_compress_addr(reg_out_prf, aux_output_offset)); - } -} - - -void jit_conv_kernel::store_output(int ur_w) { - - Label no_update_label, store_label, relu_label; - - mov(reg_channel, ptr[param1 + GET_OFF(channel)]); - if (jcp.with_bias) { - mov(reg_bias, ptr[param1 + GET_OFF(bias)]); - } - - if (!jcp.with_sum) { - cmp(reg_channel, 0); - je(no_update_label, T_NEAR); - } - - for (int k = 0; k < jcp.nb_oc_blocking; k++) { - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - int aux_output_offset = get_output_offset(j, k); - vadd(zmm, reg_out, aux_output_offset); - } - } - - if (!jcp.with_sum) { - jmp(relu_label, T_NEAR); - } else { - cmp(reg_channel, 0); - jne(relu_label, T_NEAR); - } - - - L(no_update_label); - if (jcp.with_bias) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) { - int bias_offset = jcp.typesize_out * k * jcp.oc_block; - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - vadd(zmm, reg_bias, bias_offset); - } - mic_prefetcht1(EVEX_compress_addr(reg_bias, bias_offset + 64)); - } - } - - L(relu_label); - if (jcp.with_relu) { - vpxord(zmm_zero, zmm_zero, zmm_zero); - if (jcp.relu_negative_slope == 0 || jcp.ver == ver_4vnni) { - zmm_relu_ns = zmm_zero; - } else { - mov(imm_addr64, float2int(jcp.relu_negative_slope)); - vmovq(xmm_relu_ns, imm_addr64); - vbroadcastss(zmm_relu_ns, xmm_relu_ns); - } - cmp(reg_channel, jcp.nb_ic - 1); - jl(store_label, T_NEAR); - for (int k = 0; k < jcp.nb_oc_blocking; k++) - for (int j = 0; j < ur_w; j++){ - Opmask kmask = Opmask(7); - Zmm zmm = zmm_out(j, k); - vcmp(kmask, zmm, zmm_zero, _cmp_lt_os); - vmul(zmm, kmask, zmm, zmm_relu_ns); - } - } - - L(store_label); - for (int k = 0; k < jcp.nb_oc_blocking; k++) { - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - int aux_output_offset - = typesize * (k * jcp.oh * jcp.ow + j) * jcp.oc_block; - vmovups(EVEX_compress_addr(reg_out, aux_output_offset), zmm); - mic_prefetcht0(EVEX_compress_addr(reg_out_prf, aux_output_offset)); - } - } -} - - -void jit_conv_kernel::compute_loop_fma_core(int ur_w, - int pad_l, int pad_r) { - int kw = jcp.kw; - int stride_w = jcp.stride_w; - int ic_block = jcp.ic_block; - int oc_block = jcp.oc_block; - int 
nb_oc_block = jcp.nb_oc_blocking; - Label kh_label, skip_kh_loop; - int shift_kernel_ptr = jcp.typesize_in * jcp.kw * jcp.oc_block - * jcp.ic_block; - int shift_input_ptr = jcp.typesize_in * jcp.iw - * (!jcp.is_1stconv ? ic_block : 1); - auto input_offset = [=](int oi, int ic, int ki) { - return jcp.typesize_in * ((ki + oi * stride_w - pad_l) * ic_block + ic); - }; - mov(aux_reg_inp, reg_inp); - mov(aux_reg_ker, reg_ker); - - prepare_output(ur_w); - - mov(reg_kj, reg_kh); - if (jcp.kh <= jcp.t_pad) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } - - L(kh_label); - { - for (int ki = 0; ki < kw; ki++) { - int jj_start = get_ow_start(ki, pad_l); - int jj_end = get_ow_end(ur_w, ki, pad_r); - for (int ic = 0; ic < ic_block; ic++) { - if (jcp.kernel_kind == expl_bcast) { - for (int jj = jj_start; jj < jj_end; jj++) { - int aux_input_offset = input_offset(jj, ic, ki); - vbroadcastss(zmm_inp(jj, nb_oc_block), - ptr[aux_reg_inp + aux_input_offset]); - } - } - for (int ii = 0; ii < nb_oc_block; ii++) { - int aux_kernel_offset = jcp.typesize_in - * (ii * jcp.nb_ic * jcp.kh * jcp.kw * ic_block - * oc_block + ki * ic_block * oc_block + ic * oc_block); - if (jj_end - jj_start > 0) { - vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker, - aux_kernel_offset)); - } - for (int jj = jj_start; jj < jj_end; jj++) { - if (jcp.kernel_kind == expl_bcast) { - vfmadd231ps(zmm_out(jj, ii), - zmm_inp(jj, nb_oc_block), zmm_wei); - } - else { - vfmadd231ps(zmm_out(jj, ii), zmm_wei, - EVEX_compress_addr(aux_reg_inp, - input_offset(jj, ic, ki), true)); - } - } - } - } - } - add(aux_reg_ker, shift_kernel_ptr); - add(aux_reg_inp, shift_input_ptr); - dec(reg_kj); - cmp(reg_kj, 0); - jg(kh_label, T_NEAR); - } - - L(skip_kh_loop); - store_output(ur_w); -} - - -void jit_conv_kernel::compute_loop_fma(int ur_w, int pad_l, int pad_r) { - bool prf_ker = true; - bool prf_inp = true; - int iw = jcp.iw; - int ih = jcp.ih; - int kw = jcp.kw; - int stride_w = jcp.stride_w; - int ic_block = jcp.ic_block; - int oc_block = jcp.oc_block; - int nb_oc_block = jcp.nb_oc_blocking; - Label kh_label; - - int ker_pipeline_depth = 4; - assert(ker_reg_base_idx + ker_pipeline_depth <= 32); - assert(oc_block >= ker_pipeline_depth); - - int num_ker_loads = ic_block * nb_oc_block * kw; - const int simd_w = 16; - int num_ker_prfs = prf_ker ? num_ker_loads : 0; - int num_inp_prfs = prf_inp ? - ur_w * utils::min(kw, stride_w) + utils::max(0, kw - stride_w) : 0; - if (jcp.is_1stconv && prf_inp) { - num_inp_prfs = utils::div_up(num_inp_prfs, simd_w) * ic_block; - } - int num_prfs = num_ker_prfs + num_inp_prfs; - int num_fmas = num_ker_loads * ur_w; - int prf_inst_spacing - = (prf_ker || prf_inp) ? 
utils::max(1, num_fmas / num_prfs) : 1; - int prf_inst_trigger = (num_fmas % prf_inst_spacing) / 2; - - mov(aux_reg_inp, reg_inp); - mov(aux_reg_ker, reg_ker); - - prepare_output(ur_w); - - mov(aux_reg_inp_prf, reg_inp_prf); - mov(aux_reg_ker_prf, reg_ker_prf); - mov(reg_kj, reg_kh); - Label skip_kh_loop; - if (jcp.kh <= jcp.t_pad) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } - align(16); - L(kh_label); - { - int step = 0; - int ker_prfs = 0; - for (int ki = 0; ki < kw; ki++) { - for (int ic = 0; ic < ic_block; ic++) { - int aux_kernel_offset = 0; - if (step == 0) { - for (int i = 0; i < ker_pipeline_depth; i++) { - aux_kernel_offset = get_kernel_offset(ki, ic, 0, i); - vmovups(zmm_ker(i), EVEX_compress_addr( - aux_reg_ker, aux_kernel_offset)); - } - } else if (step < num_ker_loads - ker_pipeline_depth + 1) { - int load_offset = ker_pipeline_depth - 1; - int ker_load_reg_idx - = (step + load_offset) % ker_pipeline_depth; - aux_kernel_offset = get_kernel_offset(ki,ic,0,load_offset); - vmovups(zmm_ker(ker_load_reg_idx), - EVEX_compress_addr(aux_reg_ker, aux_kernel_offset)); - } - - bool ker_prf_inserted = false; - Zmm zmm_kernel = zmm_ker(step % ker_pipeline_depth); - int j_start = get_ow_start(ki, pad_l); - int j_end = get_ow_end(ur_w, ki, pad_r); - for (int j = j_start; j < j_end; j++) { - int aux_input_offset = get_input_offset(ki, ic, j, pad_l); - vfmadd231ps(zmm_out(j, 0), zmm_kernel, - EVEX_compress_addr(aux_reg_inp, aux_input_offset, true)); - - int fma_idx = step * ur_w + j; - int prf_slot_idx = fma_idx / prf_inst_spacing; - if (fma_idx % prf_inst_spacing == prf_inst_trigger) { - if (prf_ker && !ker_prf_inserted - && ker_prfs < num_ker_prfs) { - int ker_prf_offset - = jcp.typesize_in * ker_prfs * jcp.oc_block; - mic_prefetcht2(EVEX_compress_addr( - aux_reg_ker_prf, ker_prf_offset)); - ker_prf_inserted = true; - ker_prfs++; - } else if (prf_inp) { - int inp_prf_idx = prf_slot_idx - ker_prfs; - if (inp_prf_idx < num_inp_prfs) { - int inp_prf_stride = utils::max(kw, stride_w); - int inp_prf_offset; - if (!jcp.is_1stconv) { - inp_prf_offset - = ic_block * jcp.typesize_in - * ((inp_prf_idx / kw) - * inp_prf_stride - + (inp_prf_idx % kw)); - } else { - int ic_prf_stride = jcp.typesize_in*iw*ih; - int iw_prf_stride = jcp.typesize_in*simd_w; - inp_prf_offset = ((inp_prf_idx / ic_block) - * iw_prf_stride - + (inp_prf_idx % ic_block) - * ic_prf_stride); - } - - mic_prefetcht0(EVEX_compress_addr( - aux_reg_inp_prf, inp_prf_offset)); - } - } - } - } - - step++; - } - } - add(aux_reg_ker, jcp.typesize_in * kw * oc_block * ic_block); - if (prf_ker) { - add(aux_reg_ker_prf, jcp.typesize_in * kw * oc_block * ic_block); - } - int inp_mul = !jcp.is_1stconv ? 
ic_block : 1; - add(aux_reg_inp, jcp.typesize_in * iw * inp_mul); - if (prf_inp) { - add(aux_reg_inp_prf, jcp.typesize_in * iw * inp_mul); - } - - dec(reg_kj); - cmp(reg_kj, 0); - jg(kh_label, T_NEAR); - } - - L(skip_kh_loop); - store_output(ur_w); -} - - -void jit_conv_kernel::compute_loop(int ur_w, int pad_l, int pad_r) { - - if (jcp.ver == ver_fma){ - if (jcp.is_1stconv || mayiuse(avx512_mic)) { - compute_loop_fma(ur_w, pad_l, pad_r); - } - else if (jcp.kernel_kind == embd_bcast && jcp.nb_oc_blocking == 1) { - compute_loop_fma(ur_w, pad_l, pad_r); - } - else { - compute_loop_fma_core(ur_w, pad_l, pad_r); - } - } else { - assert(!"unknown convolution version"); - } -} - - -void jit_conv_kernel::generate() { - int iw = jcp.iw; - int ow = jcp.ow; - int kw = jcp.kw; - int l_pad = jcp.l_pad; - int ur_w = jcp.ur_w; - int ur_w_tail = jcp.ur_w_tail; - int stride_w = jcp.stride_w; - int ic_block = jcp.ic_block; - int oc_block = jcp.oc_block; - - int inp_mult = !jcp.is_1stconv ? ic_block : 1; - int inp_shift_pad = jcp.typesize_in * (ur_w * stride_w - l_pad) * inp_mult; - int inp_shift = jcp.typesize_in * (ur_w * stride_w * inp_mult); - int out_shift = jcp.typesize_out * (ur_w * oc_block); - preamble(); - - mov(reg_inp, ptr[param1 + GET_OFF(src)]); - mov(reg_out, ptr[param1 + GET_OFF(dst)]); - mov(reg_ker, ptr[param1 + GET_OFF(filt)]); - mov(reg_ker_prf, ptr[param1 + GET_OFF(filt_prf)]); - mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]); - - int r_pad = utils::max(0, (ow - 1) * stride_w + (kw - 1) - (iw + l_pad - 1)); - - int n_oi = ow / ur_w; - int r_pad1 = (ur_w * n_oi - 1) * stride_w + kw - 1 - (iw + l_pad - 1); - if (r_pad1 > 0) n_oi--; - - - if (ow == ur_w) { - mov(reg_inp_prf, ptr[param1 + GET_OFF(src_prf)]); - mov(reg_out_prf, ptr[param1 + GET_OFF(dst_prf)]); - compute_loop(ur_w, l_pad, r_pad); - } else { - //TODO: potentially suboptimal - mov(reg_inp_prf, reg_inp); - mov(reg_out_prf, reg_out); - - if (n_oi == 0) { - add(reg_inp_prf, inp_shift_pad); - add(reg_out_prf, out_shift); - compute_loop(ur_w, l_pad, r_pad1); - add(reg_inp, inp_shift_pad); - add(reg_out, out_shift); - if (ur_w_tail != 0) { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w_tail, 0, r_pad); - } - } else { - xor_(reg_oi, reg_oi); - if (l_pad > 0) { - add(reg_inp_prf, inp_shift_pad); - add(reg_out_prf, out_shift); - compute_loop(ur_w, l_pad, 0); - add(reg_inp, inp_shift_pad); - add(reg_out, out_shift); - inc(reg_oi); - } - if ((l_pad <= 0 && n_oi > 0) || (l_pad > 0 && n_oi > 1)) { - if (l_pad <= 0 && r_pad1 > 0) - n_oi--; - Label ow_loop_label; - L(ow_loop_label); - { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w, 0, 0); - add(reg_inp, inp_shift); - add(reg_out, out_shift); - inc(reg_oi); - cmp(reg_oi, n_oi); - jl(ow_loop_label, T_NEAR); - } - } - if (r_pad1 > 0) { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w, 0, r_pad1); - add(reg_inp, inp_shift); - add(reg_out, out_shift); - } - if (ur_w_tail != 0) { - add(reg_inp_prf, inp_shift); - add(reg_out_prf, out_shift); - compute_loop(ur_w_tail, 0, r_pad); - } - } - } - postamble(); -} - - -SaberStatus jit_conv_kernel::init_conf(jit_conv_conf_t &jcp) { - if (!mayiuse(avx512_common)) { - LOG(ERROR) << "init a AVX512 kernel in non-avx512 machine is not permitted"; - return SaberUnImplError; - } - - const int simd_w = cpu_isa_traits::vlen / sizeof(float); - const int regs = 28; - - jcp.ur_h = 1; - jcp.oc_block = simd_w; - jcp.ic_block = (jcp.ic % simd_w != 0) ? 
jcp.ic : simd_w; - - if (mayiuse(avx512_common)) { - jcp.ver = ver_fma; - jcp.typesize_in = sizeof(float); - jcp.typesize_out = sizeof(float); - - if (jcp.is_1stconv) { - // TODO: fix & remove constraints below - if (jcp.l_pad != 0 || jcp.r_pad != 0 - || jcp.b_pad != 0 || jcp.t_pad != 0 - || (jcp.kw < 7 && jcp.kh < 7)) - jcp.ver = ver_fma; - } - } - - // set jcp.ur_w - if (jcp.is_1stconv) { - jcp.ur_w = utils::min(jcp.ow, regs); - } else { - for (int ur_w = regs; ur_w > 0; --ur_w) { - if (jcp.ow % ur_w == 0) { - jcp.ur_w = ur_w; - break; - } - } - if (jcp.ur_w == 1) { - jcp.ur_w = utils::min(jcp.ow, regs); - } - } - - // TODO (Tanya): currenly applied to Segnet convolutions only. - // Need to try for other topologies - if (jcp.ow > 150 && jcp.ur_w < regs / 2) { - jcp.ur_w = regs; - } - - int n_oi = (jcp.ow / jcp.ur_w); - int r_pad = (jcp.ur_w * n_oi - 1) * jcp.stride_w + jcp.kw - jcp.iw - jcp.l_pad; - if (jcp.l_pad > 0 && r_pad > 0) { - n_oi--; - } - - bool large_code_size = jcp.ur_w != jcp.ow && jcp.l_pad > 0 && r_pad > 0 && - ((jcp.l_pad <= 0 && n_oi > 0) || (jcp.l_pad > 0 && n_oi > 1)); - if (large_code_size) { - const int max_code_size = 24 * 1024; - const int num_ops_per_reg = 6 + jcp.ic_block * jcp.kw; - int mult = 1; - if (jcp.l_pad > 0) { - mult += 1; - } - if (r_pad > 0) { - mult += 1; - } - for (int ur_w = jcp.ur_w; ur_w > regs / 2; --ur_w) { - if (ur_w * mult * num_ops_per_reg * 9.0 < max_code_size) { - jcp.ur_w = ur_w; - break; - } - } - } - - jcp.nb_ic = jcp.ic / jcp.ic_block; - jcp.nb_oc = jcp.oc / jcp.oc_block; - jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; - if (jcp.ver == ver_fma && mayiuse(avx512_core)) { - int try_nb_oc_blocking = 2; - unsigned int ker_inp_size = typesize * (jcp.iw / jcp.stride_w) - * jcp.ic_block * jcp.kh; - unsigned int ker_out_size = typesize * jcp.ow * jcp.oc_block - * try_nb_oc_blocking; - unsigned int ker_wei_size = typesize * jcp.kh * jcp.kw * jcp.ic_block - * jcp.oc_block * try_nb_oc_blocking; - unsigned int ker_total_size = ker_inp_size + ker_out_size - + ker_wei_size; - - if (jcp.mb == 1) { - jcp.kernel_kind = embd_bcast; - } else if (jcp.is_1stconv || jcp.kw > 3 - || ((jcp.kw == 3 && jcp.ow <= 28 && ker_total_size < L1_cache_size) - && !(jcp.kw == 3 && jcp.ow == 13 && jcp.ic >= 192) - && !(jcp.kw == 3 && jcp.ow == 28 && jcp.ic >= 512)) - ) { - jcp.kernel_kind = embd_bcast; - jcp.ur_w = utils::min(jcp.ow, regs); - jcp.nb_ic_blocking = jcp.nb_oc_blocking = 1; - if (ker_total_size < L1_cache_size && jcp.ow <= 8 && jcp.kh <= 3 - && jcp.kw <= 3) { - if (jcp.nb_oc % try_nb_oc_blocking == 0 && !jcp.is_1stconv) { - jcp.nb_oc_blocking = try_nb_oc_blocking; - jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); - if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; - } - } - } else { - jcp.kernel_kind = expl_bcast; - jcp.nb_ic_blocking = 1; - jcp.nb_oc_blocking = 4; - if (jcp.nb_oc < jcp.nb_oc_blocking) { - jcp.nb_oc_blocking = jcp.nb_oc; - } - if (jcp.nb_oc % jcp.nb_oc_blocking != 0) { - for (int i = jcp.nb_oc_blocking; i > 0; i--) { - if (jcp.nb_oc % i == 0) { - jcp.nb_oc_blocking = i; - break; - } - } - } - jcp.ur_w = 31 / (jcp.nb_oc_blocking + 1); - if (jcp.ow < jcp.ur_w) { - jcp.ur_w = jcp.ow; - } - } - } - - jcp.ur_w_tail = jcp.ow % jcp.ur_w; - - bool args_ok = true && - jcp.oc % simd_w == 0 && - jcp.l_pad <= jcp.ur_w && - utils::implication(!jcp.is_1stconv, jcp.ic % simd_w == 0) && - jcp.dilate_h == 0 && jcp.dilate_w == 0; - if (!args_ok) { - LOG(ERROR) << "arguments check failed"; - return SaberUnImplError; - } - - int r_pad_no_tail = utils::max(0, (jcp.ow - 
jcp.ur_w_tail - 1) * jcp.stride_w + - jcp.kw - jcp.iw - jcp.l_pad); - if (r_pad_no_tail > jcp.ur_w) { - LOG(ERROR) << "tail should not be greater than ur_w"; - return SaberUnImplError; - } - - pick_loop_order(jcp); - jcp.nb_ic_L2 = jcp.nb_ic; - - return SaberSuccess; -} - - -} // namespace jit -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h deleted file mode 100644 index 3d2446dc9..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_conv_kernel.h +++ /dev/null @@ -1,163 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CONV_KERNEL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX512_CONV_KERNEL_H - -#include -#include - -#include "jit_generator.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "saber/saber_types.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -namespace anakin { -namespace saber { -namespace jit { - -struct jit_conv_kernel : public jit_generator { - -public: - jit_conv_kernel(jit_conv_conf_t ajcp) : jcp(ajcp) { - generate(); - jit_ker = (void (*)(jit_conv_call_t *))getCode(); - } - - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_conv_act_kernel); - - static SaberStatus init_conf(jit_conv_conf_t &jcp); - - jit_conv_conf_t jcp; - void (*jit_ker)(jit_conv_call_t *); - -private: - using reg64_t = const Xbyak::Reg64; - enum { - typesize = sizeof(float), - ker_reg_base_idx = 28, - }; - - reg64_t param = abi_param1; - reg64_t reg_inp = r8; - reg64_t reg_ker = r9; - reg64_t reg_out = r10; - - reg64_t reg_inp_prf = r11; - reg64_t reg_ker_prf = r12; - reg64_t reg_out_prf = r13; - - reg64_t aux_reg_inp = r14; - reg64_t aux_reg_ker = r15; - - reg64_t aux_reg_inp_prf = rsi; - reg64_t aux_reg_ker_prf = rdx; - - reg64_t reg_channel = rsi; - reg64_t reg_bias = rdx; - - reg64_t reg_kj = rax; - reg64_t reg_relu_ns = rax; - reg64_t reg_oi = rbx; - reg64_t reg_kh = abi_not_param1; - - reg64_t reg_tmp = rbp; - - reg64_t reg_ic_loop = rdx; - reg64_t reg_inp_loop = rsi; - - reg64_t reg_init_flag = r13; - reg64_t reg_bias_ptr = param; - - reg64_t aux_reg_ic = r12; - reg64_t reg_binp = rax; - reg64_t reg_bout = r11; - reg64_t aux1_reg_inp = rbx; - reg64_t aux_reg_out = abi_not_param1; - - inline Xbyak::Zmm zmm_ker(int i_ic) { - assert(i_ic < 4); - return Xbyak::Zmm(ker_reg_base_idx + i_ic); - } - - inline Xbyak::Zmm zmm_out(int i_ur, int i_oc) { - int idx = i_ur + i_oc * jcp.ur_w; - assert(idx < ker_reg_base_idx); - return Xbyak::Zmm(idx); - } - - inline Xbyak::Zmm zmm_inp(int i_ic, int nb_x_blocking) { - int idx = i_ic + nb_x_blocking * jcp.ur_w; - assert(idx < 31); - return Xbyak::Zmm(idx); - } - - Xbyak::Reg64 imm_addr64 = r15; - Xbyak::Xmm xmm_relu_ns = Xbyak::Xmm(30); - Xbyak::Zmm zmm_relu_ns = Xbyak::Zmm(30); - Xbyak::Zmm zmm_zero = Xbyak::Zmm(31); - Xbyak::Zmm zmm_wei = Xbyak::Zmm(31); - - inline void prepare_output(int ur_w); - inline void store_output(int ur_w); - inline void compute_loop_fma(int ur_w, int pad_l, int pad_r); - inline void compute_loop_fma_core(int ur_w, int pad_l, int pad_r); - inline void compute_loop_4fma(int ur_w, int pad_l, int pad_r); - inline void compute_loop_4fma_1st(int ur_w, int pad_l, int pad_r); - inline void compute_loop(int ur_w, int pad_l, int pad_r); - - void generate(); - - inline void vpXdpwssd(Xbyak::Zmm zmm1, Xbyak::Zmm zmm2, reg64_t reg, - int offset) { - vpdpwssd(zmm1, zmm2, EVEX_compress_addr(reg, offset, true)); - } - - inline void vadd(Xbyak::Zmm zmm, reg64_t reg, int offset) { - vaddps(zmm, zmm, 
EVEX_compress_addr(reg, offset)); - } - - inline void vcmp(Xbyak::Opmask kmask, - Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2, const unsigned char cmp) { - vcmpps(kmask, zmm_src1, zmm_src2, cmp); - } - - inline void vmul(Xbyak::Zmm zmm_dst, Xbyak::Opmask kmask, - Xbyak::Zmm zmm_src1, Xbyak::Zmm zmm_src2) { - vmulps(zmm_dst | kmask, zmm_src1, zmm_src2); - } - - inline int get_output_offset(int oi, int n_oc_block) { - return jcp.typesize_out - * (n_oc_block * jcp.oh * jcp.ow + oi) * jcp.oc_block; - } - - inline int get_input_offset(int ki, int ic, int oi, int pad_l) { - int scale = 1; - int iw_str = !jcp.is_1stconv ? jcp.ic_block : 1; - int ic_str = !jcp.is_1stconv ? 1 : jcp.iw * jcp.ih; - return jcp.typesize_in - * ((ki + oi * jcp.stride_w - pad_l) * iw_str + scale * ic * ic_str); - } - - inline int get_kernel_offset(int ki,int ic,int n_oc_block,int ker_number) { - int scale = 1; - return jcp.typesize_in * jcp.oc_block - * (n_oc_block * jcp.nb_ic * jcp.ic_block * jcp.kh * jcp.kw - + (ic + ker_number) * scale + ki * jcp.ic_block); - } - - inline int get_ow_start(int ki, int pad_l) { - return utils::max(0, (pad_l - ki + jcp.stride_w - 1) / jcp.stride_w); - } - - inline int get_ow_end(int ur_w, int ki, int pad_r) { - return ur_w - utils::max(0, - (ki + pad_r - (jcp.kw - 1) + jcp.stride_w - 1) / jcp.stride_w); - } - -}; - - -} // namespace jit -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_KERMEL_JIT_AVX512_CONV_ACT_KERNEL_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_concat_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_concat_kernel.cpp deleted file mode 100644 index 0e37aa5da..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_concat_kernel.cpp +++ /dev/null @@ -1,127 +0,0 @@ -#include -#include -#include "jit_avx512_core_8bit_concat_kernel.h" - -namespace anakin { -namespace saber { -namespace jit { - -using namespace Xbyak; - -void jit_avx512_core_8bit_concat_kernel::compute_one_input_with_scale(int block_size) { - Label l_next_block; - Label l_tail_block; - Label l_end; - - uni_vpxor(zmm_zero, zmm_zero, zmm_zero); - mov(reg_ptr_src_i, ptr[reg_ptr_src]); - mov(reg_ptr_dst_i, reg_ptr_dst); - - cmp(reg_nb, 0); - je(l_tail_block, T_NEAR); - L(l_next_block); { - vpmovzxbd(zmm_src_s32, ptr[reg_ptr_src_i]); - vcvtdq2ps(zmm_dst_f32, zmm_src_s32); - vfmadd132ps(zmm_dst_f32, zmm_zero, zword_b[reg_scale]); - vcvtps2dq(zmm_dst_s32 | T_rn_sae, zmm_dst_f32); - vpmovusdb(ptr[reg_ptr_dst_i], zmm_dst_s32); - - add(reg_ptr_src_i, block_size); - add(reg_ptr_dst_i, block_size); - dec(reg_nb); - cmp(reg_nb, 0); - jg(l_next_block, T_NEAR); - } - - cmp(reg_tail, 0); - je(l_end, T_NEAR); - - L(l_tail_block); - { - vpmovzxbd(zmm_src_s32 | mask(0), ptr[reg_ptr_src_i]); - vcvtdq2ps(zmm_dst_f32, zmm_src_s32); - vfmadd132ps(zmm_dst_f32, zmm_zero, zword_b[reg_scale]); - vcvtps2dq(zmm_dst_s32 | T_rn_sae, zmm_dst_f32); - vpmovusdb(ptr[reg_ptr_dst_i] ,zmm_dst_s32 | mask(0)); - } - - L(l_end); -} - -void jit_avx512_core_8bit_concat_kernel::compute_one_input_without_scale(int block_size) { - Label l_next_block; - Label l_tail_block; - Label l_end; - - uni_vpxor(zmm_zero, zmm_zero, zmm_zero); - mov(reg_ptr_src_i, ptr[reg_ptr_src]); - mov(reg_ptr_dst_i, reg_ptr_dst); - - cmp(reg_nb, 0); - je(l_tail_block, T_NEAR); - L(l_next_block); { - vmovdqu8(zmm_src_s32, ptr[reg_ptr_src_i]); - vmovdqu8(ptr[reg_ptr_dst_i], zmm_src_s32); - - add(reg_ptr_src_i, block_size); - add(reg_ptr_dst_i, block_size); - dec(reg_nb); - cmp(reg_nb, 0); - jg(l_next_block, 
T_NEAR); - } - - cmp(reg_tail, 0); - je(l_end, T_NEAR); - - L(l_tail_block); { - vmovdqu8(zmm_src_s32 | mask(0), ptr[reg_ptr_src_i]); - vmovdqu8(ptr[reg_ptr_dst_i] , zmm_src_s32 | mask(0)); - } - - L(l_end); -} - -void jit_avx512_core_8bit_concat_kernel::generate() { - preamble(); - -# define READ_PARAM(reg, field) \ - mov(reg, ptr[abi_param1 + offsetof(jit_concat_call_t, field)]) - - READ_PARAM(reg_ptr_src, src); - READ_PARAM(reg_ptr_dst, dst); -# undef READ_PARAM - - mov(reg_scale, (size_t)jpp.scales); - for (int i = 0; i < jpp.n_inputs; i++) { - mov(reg_tail, jpp.tail[i]); - kmovq(mask(0), reg_tail); - mov(reg_nb, jpp.nb_ic[i]); - - if (std::fabs(1.0f - jpp.scales[i]) > FLT_MIN) { - compute_one_input_with_scale(jpp.block[i]); - } - else { - compute_one_input_without_scale(jpp.block[i]); - } - - add(reg_ptr_src, sizeof(unsigned char*)); - add(reg_ptr_dst, jpp.ic[i]); - add(reg_scale, sizeof(float)); - } - - postamble(); -} - -SaberStatus jit_avx512_core_8bit_concat_kernel::init_conf(jit_concat_conf_t &jpp) { - SaberStatus ret = SaberUnImplError; - - if (!mayiuse(avx512_core)) { - return ret; - } - - return SaberSuccess; -} - -} -} -} diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_concat_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_concat_kernel.h deleted file mode 100644 index b2a2061bc..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_concat_kernel.h +++ /dev/null @@ -1,78 +0,0 @@ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_8BIT_CONCAT_KERNEL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_8BIT_CONCAT_KERNEL_H - -#include -#include -#include -#include - -#include "saber/funcs/impl/x86/kernel/jit_generator.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "saber/saber_types.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -namespace anakin { -namespace saber { -namespace jit { - -using namespace Xbyak; - -struct jit_avx512_core_8bit_concat_kernel: public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_8bit_concat_kernel) - - enum { - USE_ZMM = 512, - USE_YMM = 256, - USE_XMM = 128, - }; - - Reg64 param = abi_param1; - Reg64 reg_ptr_src = r8; - Reg64 reg_ptr_src_i = r9; - Reg64 reg_ptr_dst = r10; - Reg64 reg_ptr_dst_i = r11; - Reg64 reg_nb = r15; - Reg64 reg_scale = r13; - Reg64 reg_tail = r14; - Reg64 reg_ninputs = rbx; - - Xmm xmm_src = Xmm(30); - Xmm xmm_dst = Xmm(31); - - Zmm zmm_zero = Zmm(23); - Zmm zmm_src_s32 = Zmm(26); - Zmm zmm_dst_s32 = Zmm(27); - Zmm zmm_dst_f32 = Zmm(28); - Zmm zmm_scale = Zmm(25); - Xmm xmm_scale = Xmm(25); - Zmm zmm_scale_min = Zmm(24); - Xmm xmm_scale_min = Xmm(24); - - Opmask mask(int idx) { - return Opmask(6 - idx); - } - - void compute_one_input_with_scale(int block_size); - void compute_one_input_without_scale(int block_size); - void (*ker_)(const jit_concat_call_t *); - jit_concat_conf_t jpp; - - void generate(); - - static SaberStatus init_conf(jit_concat_conf_t &jpp); - - jit_avx512_core_8bit_concat_kernel(const jit_concat_conf_t &jpp_) - : jpp(jpp_) { - generate(); - ker_ = reinterpret_cast(const_cast(getCode())); - } - - void operator()(jit_concat_call_t *arg) {ker_(arg);} -}; - -} -} -} - -#endif diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_pooling_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_pooling_kernel.cpp deleted file mode 100644 index b38f72c62..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_pooling_kernel.cpp +++ /dev/null @@ -1,418 +0,0 @@ -#include -#include - -#include 
"jit_avx512_core_8bit_pooling_kernel.h" - -namespace anakin { -namespace saber { -namespace jit { - -using namespace Xbyak; - -void jit_avx512_core_8bit_pooling_kernel::load_src(int jj, - int ll, - int c_tail) { - int c_block = jpp.c_block; - int ur_c = jpp.ur_c; - - switch (jpp.alg) { - case Pooling_max: { - auto offset = jj * c_block * sizeof_src_dt(); - if (jj == ur_c - 1 && c_tail) { - if (jpp.src_dt == AK_INT32) { - vmovups(vreg_src(jj) | mask(0), - ptr[aux_reg_src_w + offset]); - } else { - vmovdqu8(vreg_src(jj) | mask(0), - ptr[aux_reg_src_w + offset]); - } - } else { - vmovups(vreg_src(jj), ptr[aux_reg_src_w + offset]); - } - break; - } - case Pooling_average_include_padding: - case Pooling_average_exclude_padding: { - auto offset = (ll * (c_block / 4) + jj * c_block) * sizeof_src_dt(); - if (jj == jpp.ur_c - 1 && c_tail) { - if (jpp.tail[ll]) { - switch (jpp.src_dt) { - case AK_INT32: - vmovups(vreg_src_s32(jj, ll) | mask(ll), - ptr[aux_reg_src_w + offset]); - break; - case AK_INT8: - vpmovsxbd(vreg_src_s32(jj, ll) | mask(ll), - ptr[aux_reg_src_w + offset]); - break; - case AK_UINT8: - vpmovzxbd(vreg_src_s32(jj, ll) | mask(ll), - ptr[aux_reg_src_w + offset]); - break; - // case AK_FLOAT: - // vmovups(vreg_src_s32(jj, ll) | mask(ll), - // ptr[aux_reg_src_w + offset]); - // break; - default: - assert(!"unsupported src data type"); - } - } - } else { - switch (jpp.src_dt) { - case AK_INT32: - vmovups(vreg_src_s32(jj, ll), - ptr[aux_reg_src_w + offset]); - break; - case AK_INT8: - vpmovsxbd(vreg_src_s32(jj, ll), - ptr[aux_reg_src_w + offset]); - break; - case AK_UINT8: - vpmovzxbd(vreg_src_s32(jj, ll), - ptr[aux_reg_src_w + offset]); - break; - // case AK_FLOAT: - // vmovups(vreg_src_s32(jj, ll), - // ptr[aux_reg_src_w + offset]); - // break; - default: - assert(!"unsupported src data type"); - } - } - break; - } - default: - assert(!"unsupported algorithm"); - } -} - -void jit_avx512_core_8bit_pooling_kernel::store_dst(int jj, - int ll, - int c_tail) { - int c_block = jpp.c_block; - int ur_c = jpp.ur_c; - - switch (jpp.alg) { - case Pooling_max: { - auto offset = jj * c_block * sizeof_dst_dt(); - if (jj == ur_c - 1 && c_tail) { - if (jpp.dst_dt == AK_INT32) { - vmovups(ptr[reg_ptr_dst + offset], - vreg_dst(jj) | mask(0)); - } else{ - vmovdqu8(ptr[reg_ptr_dst + offset], - vreg_dst(jj) | mask(0)); - } - } else { - vmovups(ptr[reg_ptr_dst + offset], vreg_dst(jj)); - } - break; - } - case Pooling_average_include_padding: - case Pooling_average_exclude_padding: { - auto offset = (ll * (c_block / 4) + jj * c_block) * sizeof_dst_dt(); - if (jj == ur_c - 1 && c_tail) { - if (jpp.tail[ll]) { - switch (jpp.dst_dt) { - case AK_INT32: - vmovups(ptr[reg_ptr_dst + offset], - vreg_dst_s32(jj, ll) | mask(ll)); - break; - case AK_INT8: - vpmovdb(ptr[reg_ptr_dst + offset], - vreg_dst_s32(jj, ll) | mask(ll)); - break; - case AK_UINT8: - vpmovusdb(ptr[reg_ptr_dst + offset], - vreg_dst_s32(jj, ll) | mask(ll)); - break; - case AK_FLOAT: - vmovups(ptr[reg_ptr_dst + offset], - vreg_dst_f32(jj, ll) | mask(ll)); - break; - default: - assert(!"unsupported dst data_type"); - } - } - } else { - switch (jpp.dst_dt) { - case AK_INT32: - vmovups(ptr[reg_ptr_dst + offset], - vreg_dst_s32(jj, ll)); - break; - case AK_INT8: - vpmovdb(ptr[reg_ptr_dst + offset], - vreg_dst_s32(jj, ll)); - break; - case AK_UINT8: - vpmovusdb(ptr[reg_ptr_dst + offset], - vreg_dst_s32(jj, ll)); - break; - case AK_FLOAT: - vmovups(ptr[reg_ptr_dst + offset], - vreg_dst_f32(jj, ll)); - break; - default: - assert(!"unsuppotred dst data_type"); 
- } - } - break; - } - default: - assert(!"unsupported pooling algorithm"); - } -} - -void jit_avx512_core_8bit_pooling_kernel::compute_max_step(int ur_c, - int c_tail) { - Label l_kw; - Label l_kh; - int iw = jpp.iw; - int c = jpp.c; - - for (int jj = 0; jj < ur_c; jj++) { - vmovups(vreg_dst(jj), vreg_tmp); - } - - mov(aux_reg_src_h, reg_ptr_src); - - xor_(kj, kj); - L(l_kh); { - mov(aux_reg_src_w, aux_reg_src_h); - xor_(ki, ki); - L(l_kw); { - for (int jj = 0; jj < ur_c; jj++) { - load_src(jj, 0, c_tail); - if (jpp.src_dt == AK_INT32) { - vpcmpd(k_cmp_mask, vreg_dst(jj), vreg_src(jj), _cmp_lt_os); - vpblendmd(vreg_dst(jj) | k_cmp_mask, vreg_dst(jj), - vreg_src(jj)); - } else { - if (jpp.src_dt == AK_INT8) { - vpcmpb(k_cmp_mask, vreg_dst(jj), vreg_src(jj), - _cmp_lt_os); - } else { - vpcmpub(k_cmp_mask, vreg_dst(jj), vreg_src(jj), - _cmp_lt_os); - } - vpblendmb(vreg_dst(jj) | k_cmp_mask, vreg_dst(jj), - vreg_src(jj)); - } - } - add(aux_reg_src_w, c * sizeof_src_dt()); - inc(ki); - cmp(ki, reg_kw); - jl(l_kw, T_NEAR); - } - add(aux_reg_src_h, iw * c * sizeof_src_dt()); - inc(kj); - cmp(kj, reg_kh); - jl(l_kh, T_NEAR); - } - - for (int jj = 0; jj < ur_c; jj++) { - store_dst(jj, 0, c_tail); - } -} - -void jit_avx512_core_8bit_pooling_kernel::compute_avg_step(int ur_c, - int c_tail) { - Label l_kw; - Label l_kh; - int iw = jpp.iw; - int c = jpp.c; - int num_ll = 0; - - switch (jpp.src_dt) { - case AK_INT32: - case AK_FLOAT: - num_ll = 1; - break; - case AK_INT8: - case AK_UINT8: - num_ll = 4; - break; - default: - assert(!"unsuppotred src data_type"); - } - - for (int jj = 0; jj < ur_c; jj++) { - for (int ll = 0; ll < 4; ll++) { - uni_vpxor(vreg_src_s32(jj, ll), - vreg_src_s32(jj, ll), vreg_src_s32(jj, ll)); - uni_vpxor(vreg_dst_s32(jj, ll), - vreg_dst_s32(jj, ll), vreg_dst_s32(jj, ll)); - uni_vpxor(vreg_dst_f32(jj, ll), - vreg_dst_f32(jj, ll), vreg_dst_f32(jj, ll)); - } - } - - mov(aux_reg_src_h, reg_ptr_src); - - xor_(kj, kj); - L(l_kh); { - mov(aux_reg_src_w, aux_reg_src_h); - xor_(ki, ki); - L(l_kw); { - for (int jj = 0; jj < ur_c; jj++) { - for (int ll = 0; ll < num_ll; ll++) { - load_src(jj, ll, c_tail); - vpaddd(vreg_dst_s32(jj, ll), - vreg_dst_s32(jj, ll), - vreg_src_s32(jj, ll)); - } - } - add(aux_reg_src_w, c * sizeof_src_dt()); - inc(ki); - cmp(ki, reg_kw); - jl(l_kw, T_NEAR); - } - add(aux_reg_src_h, iw * c * sizeof_src_dt()); - inc(kj); - cmp(kj, reg_kh); - jl(l_kh, T_NEAR); - } - - for (int jj = 0; jj < ur_c; jj++) { - for (int ll = 0; ll < num_ll; ll++) { - if (jpp.src_dt != AK_FLOAT) { - vcvtdq2ps(vreg_dst_f32(jj, ll), vreg_dst_s32(jj, ll)); - } - vfmadd132ps(vreg_dst_f32(jj, ll), vreg_zeros, vreg_tmp); - if (jpp.dst_dt == AK_UINT8) { - vcvtps2dq(vreg_dst_s32(jj, ll) | T_rn_sae, vreg_dst_f32(jj, ll)); - } - store_dst(jj, ll, c_tail); - } - } -} - -void jit_avx512_core_8bit_pooling_kernel::compute_step(int ur_c, - int c_tail) { - switch (jpp.alg) { - case Pooling_max: - compute_max_step(ur_c, c_tail); - break; - case Pooling_average_include_padding: - case Pooling_average_exclude_padding: - compute_avg_step(ur_c, c_tail); - break; - default: assert(!"unsupported pooling algorithm"); - } -} - -void jit_avx512_core_8bit_pooling_kernel::compute_c_block() { - Label l_main_loop; - - int nb_c = jpp.nb_c; - int c_block = jpp.c_block; - int ur_c = jpp.ur_c; - int ur_c_tail = jpp.ur_c_tail; - int c_steps = nb_c / ur_c; - int c_tail = jpp.c_tail; - - xor_(c_iter, c_iter); - if (c_steps > 0) { - L(l_main_loop); { - compute_step(ur_c, 0); - add(reg_ptr_src, ur_c * c_block * 
sizeof_src_dt()); - add(reg_ptr_dst, ur_c * c_block * sizeof_dst_dt()); - inc(c_iter); - cmp(c_iter, c_steps); - jl(l_main_loop, T_NEAR); - } - } - - if (ur_c_tail != 0) { - compute_step(ur_c_tail, c_tail); - } -} - -void jit_avx512_core_8bit_pooling_kernel::init_mask() { - for (int i = 0; i < 4; i++) { - mov(reg_mask, jpp.tail[i]); - kmovq(mask(i), reg_mask); - } -} - -void jit_avx512_core_8bit_pooling_kernel::init_tmp_reg() { - switch (jpp.alg) { - case Pooling_average_include_padding: - case Pooling_average_exclude_padding: - mov(reg_tmp, ptr[abi_param1 + offsetof(jit_pool_call_nhwc_t, idivider)]); - movq(xmm_tmp, reg_tmp); - vpbroadcastd(vreg_tmp, xmm_tmp); - break; - case Pooling_max: - switch (jpp.src_dt) { - case AK_INT32: - mov(reg_tmp, std::numeric_limits::lowest()); - break; - case AK_INT8: - mov(reg_tmp, std::numeric_limits::lowest()); - break; - case AK_UINT8: - mov(reg_tmp, std::numeric_limits::lowest()); - break; - default: assert(!"unsupported src data_type"); - } - - movq(xmm_tmp, reg_tmp); - if (jpp.src_dt == AK_INT32) - vpbroadcastd(vreg_tmp, xmm_tmp); - else - vpbroadcastb(vreg_tmp, xmm_tmp); - break; - default: assert(!"unsupported pooling algorithm"); - } - -} - -void jit_avx512_core_8bit_pooling_kernel::generate() { - preamble(); - - #define READ_PARAM(reg, field) \ - mov(reg, ptr[abi_param1 + offsetof(jit_pool_call_nhwc_t, field)]) - - if (jpp.src_dt == AK_FLOAT) { - READ_PARAM(reg_ptr_src, src_fp32); - } - else { - READ_PARAM(reg_ptr_src, src_i8); - } - - if (jpp.dst_dt == AK_FLOAT) { - READ_PARAM(reg_ptr_dst, dst_fp32); - } - else { - READ_PARAM(reg_ptr_dst, dst_i8); - } - - READ_PARAM(reg_kw, kw_range); - READ_PARAM(reg_kh, kh_range); - - #undef READ_PARAM - - init_tmp_reg(); - init_mask(); - - uni_vpxor(vreg_zeros, vreg_zeros, vreg_zeros); - - compute_c_block(); - - postamble(); -} - -SaberStatus jit_avx512_core_8bit_pooling_kernel::init_conf(jit_pool_conf_t &jpp) { - SaberStatus ret = SaberUnImplError; - - if (!mayiuse(avx512_core)) { - return ret; - } - - return SaberSuccess; -} - -} // namespace jit -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_pooling_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_pooling_kernel.h deleted file mode 100644 index 8d89216b7..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_8bit_pooling_kernel.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CORE_8BIT_POOLING_KERNEL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CORE_8BIT_POOLING_KERNEL_H - -#include -#include - -#include "saber/funcs/impl/x86/kernel/jit_generator.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "saber/saber_types.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -namespace anakin { -namespace saber { -namespace jit { - -using namespace Xbyak; - -struct jit_avx512_core_8bit_pooling_kernel : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_8bit_pooling_kernel) - - jit_avx512_core_8bit_pooling_kernel(const jit_pool_conf_t &jpp_) : jpp(jpp_) { - generate(); - ker_ = reinterpret_cast(const_cast(getCode())); - } - - Reg64 reg_ptr_src = r8; - Reg64 reg_ptr_dst = r9; - - Reg64 ki = r10; - Reg64 kj = r11; - Reg64 reg_kw = r12; - Reg64 reg_kh = r13; - Reg64 c_iter = r14; - - Reg64 aux_reg_src_h = rax; - Reg64 aux_reg_src_w = rbx; - - Reg64 reg_tmp = rdx; - - Reg64 reg_mask = r15; - - Opmask k_cmp_mask = Opmask(7); - - Opmask mask(int idx) { - return Opmask(6 - idx); - } - - Xmm xmm_tmp = Xmm(0); - Xmm xmm_zeros 
= Xmm(31); - Zmm vreg_tmp = Zmm(30); - Zmm vreg_zeros = Zmm(31); - - size_t sizeof_src_dt() const { - return datatype_size(jpp.src_dt); - } - size_t sizeof_dst_dt() const { - return datatype_size(jpp.dst_dt); - } - - /* max pooling */ - Zmm vreg_src(int idx) { - return Zmm(idx); - } - - Zmm vreg_dst(int idx) { - return Zmm(jpp.ur_c + idx); - } - - /* avg pooling */ - Zmm vreg_src_s32(int jj, int ll) { - return Zmm(12*jj + ll); - } - - Zmm vreg_dst_s32(int jj, int ll) { - return Zmm(12*jj + ll + 4); - } - - Zmm vreg_dst_f32(int jj, int ll) { - return Zmm(12*jj + ll + 8); - } - - void (*ker_)(const jit_pool_call_nhwc_t *); - jit_pool_conf_t jpp; - - void init_tmp_reg(); - void init_mask(); - - void load_src(int jj, int ll, int c_tail); - void store_dst(int jj, int ll, int c_tail); - - void compute_avg_step(int ur_c, int c_tail); - void compute_max_step(int ur_c, int c_tail); - void compute_step(int ur_c, int c_tail); - - void compute_c_block(); - void generate(); - - static SaberStatus init_conf(jit_pool_conf_t &jpp); - - void operator()(jit_pool_call_nhwc_t *arg) {ker_(arg);} -}; - -} // namespace jit -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CORE_8BIT_POOLING_KERNEL_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.cpp deleted file mode 100644 index 09f6fda28..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.cpp +++ /dev/null @@ -1,497 +0,0 @@ -#include "saber/funcs/impl/x86/x86_utils.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -void JitAvx512u8s8s32xConv1x1::prepare_rtus(const std::vector*>& inputs, - jit_1x1_conv_conf_t& conf) { - bool rtus_applicable = true && - (conf.stride_h != 1 || conf.stride_w != 1) && - (inputs[0]->get_layout() == Layout_NCHW_C16 || inputs[0]->get_layout() == Layout_NCHW_C8); - - rtus_applicable = rtus_applicable && - conf.t_pad == 0 && conf.l_pad == 0 && - conf.oh * conf.stride_h == conf.ih && - conf.ow * conf.stride_w == conf.iw; - - // LOG(ERROR) << "rtus applicable:" << rtus_applicable; - if (rtus_applicable) { - this->reduce_src = true; - conf.stride_h = conf.stride_w = 1; - conf.ih = conf.oh; - conf.iw = conf.ow; - } - - return; -} - - -template -void balance2D(U nthr, U ithr, T ny, T& ny_start, T& ny_end, - T nx, T& nx_start, T& nx_end, T nx_divider) { - const T grp_size = utils::div_up(nthr, nx_divider); - const T grp_count = utils::div_up(nthr, grp_size); - - T grp = ithr / grp_size; - T grp_ithr = ithr % grp_size; - T grp_nthr = grp_size; - T first_grps = nthr % grp_count; - - if (first_grps > 0 && grp >= first_grps) { - ithr -= first_grps * grp_size; - grp_nthr--; - grp = ithr / grp_nthr + first_grps; - grp_ithr = ithr % grp_nthr; - } - - utils::balance211(nx, grp_count, grp, nx_start, nx_end); - utils::balance211(ny, grp_nthr, grp_ithr, ny_start, ny_end); -} - -SaberStatus JitAvx512u8s8s32xConv1x1::init(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam& param, - Context& ctx) { - this->_ctx = &ctx; - ConvParam* conv_param = &(param.conv_param); - const Tensor* weights = conv_param->weight(); - - if (!(inputs[0]->get_layout() == Layout_NHWC && - outputs[0]->get_layout() == Layout_NHWC && - weights->get_layout() == Layout_NCHW)) { - return SaberUnImplError; - } - - // reorder weights - Tensor* weights_reorder = conv_param->mutable_weight(); - - if 
(weights_internal_ != nullptr) { - delete weights_internal_; - } - - weights_internal_ = new Tensor(weights_reorder->shape(), AK_INT8); - weights_internal_->set_scale(weights_reorder->get_scale()); - weight_reorder_OIhw4i16o4i(*weights_reorder, *weights_internal_, weights_reorder->get_scale()); - - return create(inputs, outputs, param, ctx); -} - -SaberStatus JitAvx512u8s8s32xConv1x1::create(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam& param, - Context& ctx) { - SaberStatus status; - ConvParam* conv_param = &(param.conv_param); - EltwiseParam* eltwise_param = &(param.eltwise_param); - ActivationParam* act_param = &(conv_param->activation_param); - const Tensor* weights = conv_param->weight(); - const Tensor* bias = conv_param->bias(); - Tensor* input = inputs[0]; - Tensor* output = outputs[0]; - Shape src_shape(input->shape()); - Shape dst_shape(output->shape()); - Shape wgt_shape(weights->shape()); - - - // check conf - if (kernel_) { - status = check_conf(inputs, outputs, param); - - if (status != SaberNotInitialized) { - return status; - } - } - - // init conf - const bool with_groups = (conv_param->group > 1); - conf.ngroups = with_groups ? weights->num() : 1; - - conf.mb = src_shape[0]; - conf.ic = wgt_shape[1]; - conf.ih = src_shape[1]; - conf.iw = src_shape[2]; - - conf.oc = wgt_shape[0]; - conf.oh = dst_shape[1]; - conf.ow = dst_shape[2]; - conf.oc_without_padding = conf.oc; - conf.ic_without_padding = conf.ic; - - conf.kh = wgt_shape[2]; - conf.kw = wgt_shape[3]; - conf.stride_h = conv_param->stride_h; - conf.stride_w = conv_param->stride_w; - conf.t_pad = conv_param->pad_h; - conf.l_pad = conv_param->pad_w; - - conf.with_relu = act_param->has_active; - - if (conf.with_relu) { - conf.relu_negative_slope = static_cast(act_param->negative_slope); - } - - conf.with_sum = eltwise_param->has_eltwise && (eltwise_param->operation == Eltwise_sum); - - if (conf.with_sum) { - conf.sum_scale = eltwise_param->coeff[1]; - } - - conf.with_bias = (bias != NULL); - - if (bias != nullptr) { - conf.bia_dt = bias->get_dtype(); - } - - conf.dst_dt = output->get_dtype(); - conf.typesize_in = type_length(input->get_dtype()); - conf.typesize_out = type_length(output->get_dtype()); - conf.typesize_acc = sizeof(int32_t); - conf.typesize_bia = conf.with_bias ? 
type_length(conf.bia_dt) : 0; - conf.rm = conv_param->rm; - - prepare_rtus(inputs, conf); - - conv_d.n = src_shape[0]; - conv_d.ic = wgt_shape[1]; - conv_d.ih = src_shape[1]; - conv_d.iw = src_shape[2]; - conv_d.oc = wgt_shape[0]; - conv_d.oh = dst_shape[1]; - conv_d.ow = dst_shape[2]; - conv_d.t_pad = conv_param->pad_h; - conv_d.l_pad = conv_param->pad_w; - conv_d.stride_h = conv_param->stride_h; - conv_d.stride_w = conv_param->stride_w; - - status = jit_avx512_core_u8s8s32x_conv1x1_kernel::init_conf(conf, conv_d, omp_get_max_threads(), - reduce_src); - - if (status == SaberSuccess) { - if (kernel_ != nullptr) { - delete kernel_; - kernel_ = nullptr; - } - - kernel_ = new jit_avx512_core_u8s8s32x_conv1x1_kernel(conf); - } else { - return SaberUnImplError; - } - - if (reduce_src) { - init_rtus_driver(&rtus_driver_, conf, conv_d, ws_per_thread_, &scratch_); - } - - // bias reorder - Tensor* bias_src = conv_param->mutable_bias(); - - if (bias_internal_ != nullptr) { - delete bias_internal_; - bias_internal_ = nullptr; - } - - if (bias_src != nullptr) { - bias_internal_ = new Tensor(bias_src->shape(), AK_INT32); - bias_internal_->set_scale(bias_src->get_scale()); - bias_reorder_nchw(*bias_src, *bias_internal_, bias_src->get_scale()); - } - - float scale_in = inputs[0]->get_scale()[0]; - float scale_out = outputs[0]->get_scale()[0]; - auto scale_w = weights_internal_->get_scale(); - std::vector().swap(scale_); - - for (int i = 0; i < scale_w.size(); i++) { - this->scale_.push_back((scale_w[i] * scale_in) / scale_out); - } - - return SaberSuccess; -} - -SaberStatus JitAvx512u8s8s32xConv1x1::dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam& param) { - ConvParam* conv_param = &(param.conv_param); - const Tensor* bias = conv_param->bias(); - - // check input and output data type, do scale or not - CHECK_EQ(inputs[0]->get_dtype(), AK_UINT8) << "only support uint8 input type"; - const unsigned char* ptr_src = reinterpret_cast(inputs[0]->data()); - const char* ptr_weights = reinterpret_cast(weights_internal_->data()); - const int32_t* ptr_bias = nullptr; - - if (bias_internal_ != nullptr) { - ptr_bias = reinterpret_cast(bias_internal_->data()); - } - - char* ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); - int dst_type_size = type_length(outputs[0]->get_dtype()); - - const auto& jcp = kernel_->jcp; - const auto& oscales = scale_; - const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; - - const int stride_h = conv_param->stride_h; - const int stride_w = conv_param->stride_w; - const int pad_t = conv_param->pad_h; - const int pad_l = conv_param->pad_w; - - auto step = [](int default_step, int remaining, int tail_step) { - assert(default_step <= tail_step); - return remaining < tail_step ? 
remaining : default_step; - }; - - #pragma omp parallel - { - int ithr = omp_get_thread_num(); - int nthr = omp_get_num_threads(); - - auto p = jit_1x1_conv_call_t(); - - auto rp = rtus_driver_t::call_params_t(); - - const int nb_oc = jcp.nb_load; - const int os_block = jcp.bcast_block; - // LOG(INFO) << "saber [nb_oc, nb_ic, nb_ic_blocking, os_block, load_grp_count] is [" << jcp.nb_load << ", " << jcp.nb_reduce << ", " << jcp.nb_reduce_blocking - // << ", " << jcp.bcast_block << ", " << jcp.load_grp_count; - - int bcast_start{ 0 }, bcast_end{ 0 }, ocb_start{ 0 }, ocb_end{ 0 }; - balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, - jcp.nb_load, ocb_start, ocb_end, jcp.load_grp_count); - - auto init_bcast = [&](int iwork, int& n, int& g, int& bcast_step, - int& oh, int& ow, int& ih, int& iw) { - int osb{0}; - nd_iterator_init(iwork, n, jcp.mb, g, jcp.ngroups, osb, - jcp.nb_bcast); - bcast_step = step(jcp.nb_bcast_blocking, jcp.nb_bcast - osb, - jcp.nb_bcast_blocking_max); - bcast_step = utils::min(bcast_step, bcast_end - iwork); - - const int os = osb * os_block; - oh = os / jcp.ow; - ow = os % jcp.ow; - - ih = utils::max(oh * stride_h - pad_t, 0); - iw = utils::max(ow * stride_w - pad_l, 0); - rp.iw_start = iw; - - p.bcast_dim = this_block_size(os, jcp.os, bcast_step * os_block); - rp.os = p.bcast_dim; - }; - - auto init_load = [&](int ocb, int& load_step) { - load_step = step(jcp.nb_load_blocking, ocb_end - ocb, - jcp.nb_load_blocking_max); - p.load_dim = this_block_size(ocb * jcp.oc_block, - ocb_end * jcp.oc_block, load_step * jcp.oc_block); - - if (ocb + load_step >= nb_oc) { - p.first_last_flag |= FLAG_OC_LAST; - } else { - p.first_last_flag &= ~FLAG_OC_LAST; - } - }; - - auto init_reduce = [&]() { - p.reduce_dim = this_block_size(0, jcp.ic, jcp.ic); - rp.icb = p.reduce_dim / jcp.reduce_block; - }; - - auto inner_ker = [&](int ocb, int n, int g, int oh, int ow, - int ih, int iw) { - const int icb = 0; // Start from the first IC block - const int _ocb = g * nb_oc + ocb; - const int _icb = g; - - //const size_t dst_off = dst_d.blk_off(n, _ocb * jcp.oc_block, oh, ow); - const size_t dst_off = n * jcp.oc * jcp.oh * jcp.ow + oh * jcp.ow * jcp.oc - + ow * jcp.oc + _ocb * jcp.oc_block; - const size_t wei_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block - + icb * jcp.kh * jcp.kw * jcp.oc_block * jcp.ic_block; - - // p.output_data = &ptr_dst[dst_off]; - p.output_data = ptr_dst + dst_off * dst_type_size; - // p.load_data = &weights[conf_.with_groups() - // ? 
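inner_ker() above replaces mkl-dnn's blk_off() descriptors (still visible in the commented-out lines) with hand-rolled NHWC offset arithmetic. The convention is easier to read written once as a helper: dst_off is nhwc_off(n, oh, ow, _ocb * oc_block) over the output geometry, and the source offset is the same formula over (ih, iw, _icb * ic_block) and the input geometry.

    #include <cstddef>

    // NHWC addressing used throughout this file: channels innermost, blocked
    // in groups of oc_block/ic_block (16).
    inline size_t nhwc_off(size_t n, size_t h, size_t w, size_t c,
                           size_t H, size_t W, size_t C) {
        return ((n * H + h) * W + w) * C + c;
    }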
weights_d.blk_off(g, ocb, icb) - // : weights_d.blk_off(ocb, icb)]; - p.load_data = &ptr_weights[wei_off]; - p.bias_data = &ptr_bias[_ocb * jcp.oc_block]; - p.scales = &oscales[jcp.is_oc_scale * _ocb * jcp.oc_block]; - - if (reduce_src) { - rp.ws = scratch_ + ithr * ws_per_thread_ - + _icb * jcp.is * jcp.ic_block; - - if (ocb == ocb_start) { - // rp.src = src + src_d.blk_off(n, _icb * jcp.ic_block, ih, iw); - rp.src = ptr_src + n * jcp.ic * jcp.ih * jcp.iw + - + ih * jcp.iw * jcp.ic + iw * jcp.ic + _icb * jcp.ic_block; - rtus_driver_->ker_(&rp); - } - - p.bcast_data = rp.ws; - } else { - // p.bcast_data = src + src_d.blk_off(n, _icb * jcp.ic_block, ih, iw); - p.bcast_data = ptr_src + n * jcp.ic * jcp.ih * jcp.iw + - + ih * jcp.iw * jcp.ic + iw * jcp.ic + _icb * jcp.ic_block;; - } - - kernel_->jit_ker(&p); - }; - - if (jcp.loop_order == loop_rlb) { - init_reduce(); - int ocb = ocb_start; - - while (ocb < ocb_end) { - int load_step = 0; - init_load(ocb, load_step); - int iwork = bcast_start; - - while (iwork < bcast_end) { - int n = 0; - int g = 0; - int bcast_step = 0; - int oh = 0; - int ow = 0; - int ih = 0; - int iw = 0; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); - inner_ker(ocb, n, g, oh, ow, ih, iw); - iwork += bcast_step; - } - - ocb += load_step; - } - } else if (jcp.loop_order == loop_lbr) { - int ocb = ocb_start; - - while (ocb < ocb_end) { - int load_step = 0; - init_load(ocb, load_step); - int iwork = bcast_start; - - while (iwork < bcast_end) { - int n = 0; - int g = 0; - int bcast_step = 0; - int oh = 0; - int ow = 0; - int ih = 0; - int iw = 0; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); - init_reduce(); - inner_ker(ocb, n, g, oh, ow, ih, iw); - iwork += bcast_step; - } - - ocb += load_step; - } - } else if (jcp.loop_order == loop_rbl) { - init_reduce(); - int iwork = bcast_start; - - while (iwork < bcast_end) { - int n = 0; - int g = 0; - int bcast_step = 0; - int oh = 0; - int ow = 0; - int ih = 0; - int iw = 0; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); - int ocb = ocb_start; - - while (ocb < ocb_end) { - int load_step = 0; - init_load(ocb, load_step); - inner_ker(ocb, n, g, oh, ow, ih, iw); - ocb += load_step; - } - - iwork += bcast_step; - } - } else if (jcp.loop_order == loop_blr) { - int iwork = bcast_start; - - while (iwork < bcast_end) { - int n = 0; - int g = 0; - int bcast_step = 0; - int oh = 0; - int ow = 0; - int ih = 0; - int iw = 0; - init_bcast(iwork, n, g, bcast_step, oh, ow, ih, iw); - int ocb = ocb_start; - - while (ocb < ocb_end) { - int load_step = 0; - init_load(ocb, load_step); - init_reduce(); - inner_ker(ocb, n, g, oh, ow, ih, iw); - ocb += load_step; - } - - iwork += bcast_step; - } - } else { - assert(!"unsupported loop order"); - } - } - - return SaberSuccess; -} - -SaberStatus JitAvx512u8s8s32xConv1x1::check_conf(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam& param) { - ConvParam* conv_param = &(param.conv_param); - const Tensor* weights = conv_param->weight(); - const jit_1x1_conv_conf_t jcp = kernel_->jcp; - - // check format - if (!(inputs[0]->get_layout() == Layout_NHWC && - outputs[0]->get_layout() == Layout_NHWC && - weights->get_layout() == Layout_NCHW)) { - LOG(ERROR) << "wrong format"; - return SaberUnImplError; - } - - // check param - bool param_ok = true && - jcp.t_pad == conv_param->pad_h && - jcp.l_pad == conv_param->pad_w && - jcp.stride_h == conv_param->stride_h && - jcp.stride_w == conv_param->stride_w; - -#if 0 - // check shape - bool shape_ok = true && - jcp.kh == 
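The reduce_src branch above hands part of the work to rtus_driver_, the "reduce through unit stride" helper declared in jit_avx512_rtus_driver.h, which is not part of this excerpt. Conceptually it packs the strided input pixels that a 1x1 convolution actually reads into a dense per-thread workspace (rp.ws), so the 1x1 kernel itself can run as if the stride were 1. Below is a scalar model of that gather, assuming NHWC input; the real driver is JIT-generated and works on the blocked workspace layout.

    #include <cstddef>
    #include <cstdint>

    // Copy every (stride_h, stride_w)-th NHWC pixel into a dense workspace so
    // a unit-stride 1x1 kernel can consume it. Sketch of the idea only.
    void rtus_gather_nhwc(const uint8_t* src, uint8_t* ws,
                          int iw, int ic, int oh, int ow,
                          int stride_h, int stride_w) {
        for (int y = 0; y < oh; ++y) {
            for (int x = 0; x < ow; ++x) {
                const uint8_t* s = src + (static_cast<size_t>(y) * stride_h * iw
                                          + static_cast<size_t>(x) * stride_w) * ic;
                uint8_t* d = ws + (static_cast<size_t>(y) * ow + x) * ic;
                for (int c = 0; c < ic; ++c) d[c] = s[c];
            }
        }
    }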
weights->height() && - jcp.kw == weights->width() && - jcp.ngroups == 1 && - jcp.mb == input->num() && - jcp.ic == input->channel() && - jcp.ih == input->height() && - jcp.iw == input->width() && - jcp.oc == output->channel() && - jcp.oh == output->height() && - jcp.ow == output->width(); - - if (param_ok && shape_ok) { - return SaberSuccess; - } else { - LOG(ERROR) << "param or shape changed, re-init kernel"; - return SaberNotInitialized; - } - -#endif - return SaberSuccess; -} - -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.h b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.h deleted file mode 100644 index 3df8a8c78..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_1X1_CONV_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_1X1_CONV_H - -#include "anakin_config.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/impl_macro.h" -#include "saber/saber_funcs_param.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv_kernel.h" - -#include "x86_utils.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -class JitAvx512u8s8s32xConv1x1 : public ImplBase< - X86, - AK_INT8, - ConvEltwiseParam > { -public: - - JitAvx512u8s8s32xConv1x1() - : kernel_(nullptr), rtus_driver_(nullptr), scratch_(nullptr), - weights_internal_(nullptr), ws_per_thread_(0), - bias_internal_(nullptr), reduce_src(false) { - } - - ~JitAvx512u8s8s32xConv1x1() { - if (kernel_) { - delete kernel_; - kernel_ = nullptr; - } - if (rtus_driver_) { - delete rtus_driver_; - rtus_driver_ = nullptr; - } - if (scratch_) { - zfree(scratch_); - scratch_ = nullptr; - } - if (weights_internal_ != nullptr) { - delete weights_internal_; - weights_internal_ = nullptr; - } - } - - virtual SaberStatus init(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m, - Context &ctx) override; - - virtual SaberStatus create(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m, - Context &ctx) override; - - virtual SaberStatus dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m) override; - -private: - bool reduce_src; - jit_avx512_core_u8s8s32x_conv1x1_kernel *kernel_; - rtus_driver_t *rtus_driver_; - size_t ws_per_thread_; - uint8_t *scratch_; - Tensor* weights_internal_; - Tensor* bias_internal_; - jit_1x1_conv_conf_t conf; - conv_1x1_desc conv_d; - - // quantization scale(s) - std::vector scale_; - - void prepare_rtus(const std::vector*> &inputs, jit_1x1_conv_conf_t &jcp); - - SaberStatus 
check_conf(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m); -}; - -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CORE_U8S8S32X_CONV1x1_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv_kernel.cpp deleted file mode 100644 index add11b5f1..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv_kernel.cpp +++ /dev/null @@ -1,640 +0,0 @@ -#include "jit_avx512_core_u8s8s32x_1x1_conv_kernel.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -using namespace anakin::saber::utils; - -namespace anakin { -namespace saber { -namespace jit { - -using namespace Xbyak; -#define GET_OFF(field) offsetof(jit_1x1_conv_call_t, field) - -bool jit_avx512_core_u8s8s32x_conv1x1_kernel::maybe_relu(int position, const float* post_sum) { - if (position == 0) { - /* if do sum, then skip relu before sum */ - if (post_sum) { - return false; - } - return false || jcp.with_relu; - } else if (position == 1) { - /* relu after sum */ - if (post_sum == nullptr) { - return false; - } - return false || - jcp.dst_dt == AK_UINT8 || - jcp.with_relu; - } - - return false; -} - -void jit_avx512_core_u8s8s32x_conv1x1_kernel::bcast_loop(int load_loop_blk) { - mov(aux1_reg_bcast_data, reg_bcast_data); - mov(aux_reg_bcast_data, reg_bcast_data); - - mov(aux_reg_output_data, reg_output_data); - mov(bcast_loop_iter, EVEX_compress_addr(rsp, bcast_loop_work_offt)); - - Label bcast_loop; - Label bcast_loop_tail; - - cmp(bcast_loop_iter, jcp.ur); - jl(bcast_loop_tail, T_NEAR); - - L(bcast_loop); { - assert(jcp.bcast_block % jcp.ur == 0); - int num_substeps = jcp.bcast_block / jcp.ur; - assert(num_substeps > 0 && num_substeps < 10); - for (int i = 0; i < num_substeps; i++) { - reduce_loop(load_loop_blk, jcp.ur, i, false); - if (i < num_substeps - 1) { - add(aux1_reg_bcast_data, jcp.bcast_loop_bcast_substep); - add(aux_reg_output_data, jcp.bcast_loop_output_substep); - } else { - add(aux1_reg_bcast_data, jcp.bcast_loop_bcast_step - - (num_substeps - 1) * jcp.bcast_loop_bcast_substep); - int output_offset = jcp.bcast_loop_output_step - - (num_substeps - 1) * jcp.bcast_loop_output_substep; - add(aux_reg_output_data, output_offset); - } - } - sub(bcast_loop_iter, jcp.bcast_block); - cmp(bcast_loop_iter, jcp.bcast_block); - jge(bcast_loop, T_NEAR); - } - - L(bcast_loop_tail); - if (jcp.ur_tail) { - Label bcast_loop_tail_out; - cmp(bcast_loop_iter, 0); - jz(bcast_loop_tail_out, T_NEAR); - reduce_loop(load_loop_blk, jcp.ur_tail, 0, true); - L(bcast_loop_tail_out); - } -} - -void jit_avx512_core_u8s8s32x_conv1x1_kernel::cvt2ps(DataType type_in, - zmm_t zmm_in, - const Xbyak::Operand &op, - bool mask_flag) { - zmm_t zmm = mask_flag ? 
zmm_in | ktail_mask | T_z : zmm_in; - switch (type_in) { - case AK_FLOAT: - case AK_INT32: - vmovups(zmm, op); - break; - case AK_INT8: - vpmovsxbd(zmm, op); - break; - case AK_UINT8: - vpmovzxbd(zmm, op); - break; - default: - assert(!"unsupported data type"); - } - if (type_in != AK_FLOAT) { - vcvtdq2ps(zmm_in, zmm_in); - } -} - -void jit_avx512_core_u8s8s32x_conv1x1_kernel::reduce_loop(int load_loop_blk, - int ur, - int substep, - bool wraparound) { - auto vreg_load = [=](int i_load) { - return Zmm(ur * load_loop_blk + i_load); - }; - - auto vreg_accum = [=](int i_load, int i_ur) { - return Zmm(i_ur * load_loop_blk + i_load); - }; - - auto bias_ptr = [=](int i_load) { - return EVEX_compress_addr(reg_bias_data, - jcp.typesize_bia * jcp.oc_block * i_load); - }; - auto scale_ptr = [=](int i_load) { - return EVEX_compress_addr(reg_ptr_scales, - jcp.is_oc_scale * (sizeof(float) * jcp.oc_block * i_load)); - }; - - auto bcast_ptr = [=](int i_reduce, int i_ur, bool bcast) { - assert(i_ur < jcp.ur); - assert(i_reduce <= jcp.reduce_loop_unroll); - assert(jcp.reduce_loop_unroll == jcp.reduce_block); - - int offt = (jcp.ic_without_padding * i_ur + i_reduce); - - return EVEX_compress_addr(aux_reg_bcast_data, jcp.typesize_in * offt, - bcast); - }; - - auto load_ptr = [=](int i_reduce, int i_load) { - int u0 = i_reduce % jcp.reduce_loop_unroll; - int u1 = i_reduce / jcp.reduce_loop_unroll; - - int offt = (i_load * jcp.reduce_dim + u0) * jcp.load_block; - - return EVEX_compress_addr(aux_reg_load_data, - u1 * jcp.reduce_loop_load_step - + jcp.typesize_in * offt); - }; - - auto output_ptr = [=](int i_load, int i_ur) { - return EVEX_compress_addr(aux_reg_output_data, - jcp.typesize_out * (jcp.oc_without_padding * i_ur + i_load * jcp.load_block)); - }; - - auto init = [=]() { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - for (int i_ur = 0; i_ur < ur; ++i_ur) { - auto r = vreg_accum(i_load, i_ur); - vpxord(r, r, r); - } - } - }; - - auto store = [=](const bool mask_flag_in) { - const float *p_sum_scale = nullptr; - if (jcp.with_sum) { - p_sum_scale = &(jcp.sum_scale); - } - mov(EVEX_compress_addr(rsp, reg_bcast_data_off), reg_bcast_data); - mov(reg_ptr_scales, EVEX_compress_addr(rsp, reg_ptr_sum_scale_off)); - - if (p_sum_scale && *p_sum_scale != 1.f) { - mov(EVEX_compress_addr(rsp, reg_load_data_off), reg_load_data); - mov(reg_ptr_sum_scale, (size_t)p_sum_scale); - } - - vpxord(zmm_zero, zmm_zero, zmm_zero); - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - const bool mask_flag = mask_flag_in && i_load == load_loop_blk - 1; - auto zmm_bias = zmm_tmp; - if (jcp.with_bias) { - cvt2ps(jcp.bia_dt, zmm_bias, bias_ptr(i_load), mask_flag); - } - for (int i_ur = 0; i_ur < ur; ++i_ur) { - auto r = vreg_accum(i_load, i_ur); - vcvtdq2ps(r, r); - if (jcp.with_bias) { - vaddps(r, r, zmm_bias); - } - zmm_t mask_zmm = mask_flag ? 
r | ktail_mask | T_z : r; - vmulps(mask_zmm, r, scale_ptr(i_load)); - if (maybe_relu(0, p_sum_scale)) { - vmaxps(r, zmm_zero, r); - } - if (p_sum_scale) { // post_op: sum - auto zmm_prev_dst = zmm_bcast; - cvt2ps(jcp.dst_dt, zmm_prev_dst, output_ptr(i_load, i_ur), - mask_flag); - if (*p_sum_scale == 1.f) { - vaddps(r, zmm_prev_dst); - } else { - vfmadd231ps(r, zmm_prev_dst, zword_b[reg_ptr_sum_scale]); - } - } - if (maybe_relu(1, p_sum_scale)) { - vmaxps(r, zmm_zero, r); - } - if (jcp.dst_dt != AK_FLOAT) { - if (jcp.rm == round_mode::nearest) { - vcvtps2dq(r | T_rn_sae, r); - } else if (jcp.rm == round_mode::down) { - vcvtps2dq(r | T_rd_sae, r); - } else { - assert(!"unimplemented"); - } - } - } - for (int i_ur = 0; i_ur < ur; ++i_ur) { - auto r = vreg_accum(i_load, i_ur); - zmm_t r_zmm = mask_flag ? r | ktail_mask : r; - switch (jcp.dst_dt) { - case AK_FLOAT: - case AK_INT32: - vmovups(output_ptr(i_load, i_ur), r_zmm); - break; - case AK_INT8: - vpmovsdb(output_ptr(i_load, i_ur), r_zmm); - break; - case AK_UINT8: - vpmovusdb(output_ptr(i_load, i_ur), r_zmm); - break; - default: - assert(!"unknown dst_dt"); - } - } - } - - mov(reg_bcast_data, EVEX_compress_addr(rsp, reg_bcast_data_off)); - if (p_sum_scale && *p_sum_scale != 1.f) { - mov(reg_load_data, EVEX_compress_addr(rsp, reg_load_data_off)); - } - }; - - auto compute = [=](Zmm vreg_acc, Zmm vreg_wei, Zmm vreg_src) { - if (jcp.ver == ver_vnni) { - vpdpbusd(vreg_acc, vreg_src, vreg_wei); - } else { - vpmaddubsw(zmm_tmp, vreg_src, vreg_wei); - vpmaddwd(zmm_tmp, zmm_tmp, zmm_one); - vpaddd(vreg_acc, vreg_acc, zmm_tmp); - } - }; - - auto fma_block = [=](bool last_block) { - int reduce_step = 4; - int tail_size = jcp.ic_without_padding % reduce_step; - int loop_unroll = last_block && jcp.ic != jcp.ic_without_padding ? 
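The store lambda above compresses the whole post-processing chain into masked AVX-512 instructions; written out per element it is: convert the int32 accumulator to float, add bias, apply the folded per-channel scale, ReLU before or after the fused eltwise sum depending on maybe_relu(), round (nearest or down, per jcp.rm), then saturate to the destination type. A scalar model for an int8 destination, keeping the same ordering:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar counterpart of the store() lambda for dst_dt == AK_INT8.
    int8_t store_one_s8(int32_t acc, float bias, float scale,
                        bool with_relu, bool with_sum, float sum_scale,
                        float prev_dst, bool round_nearest) {
        float r = (static_cast<float>(acc) + bias) * scale;
        if (with_relu && !with_sum) r = std::max(0.0f, r);   // maybe_relu(0)
        if (with_sum) {
            r += prev_dst * sum_scale;                        // fused eltwise sum
            if (with_relu) r = std::max(0.0f, r);             // maybe_relu(1)
        }
        float rounded = round_nearest ? std::nearbyint(r) : std::floor(r);
        int32_t i = static_cast<int32_t>(rounded);
        return static_cast<int8_t>(std::min(127, std::max(-128, i)));  // vpmovsdb
    }

For a uint8 destination, maybe_relu(1) also clamps negatives even without an explicit ReLU, and vpmovusdb saturates to [0, 255] instead.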
- rnd_up(jcp.ic_without_padding % jcp.ic_block, reduce_step) : - jcp.reduce_loop_unroll; - for (int i_reduce = 0; i_reduce < loop_unroll; i_reduce += reduce_step) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - vmovups(vreg_load(i_load), load_ptr(i_reduce, i_load)); - } - for (int i_ur = 0; i_ur < ur; ++i_ur) { - if (last_block && tail_size != 0 - && i_reduce == loop_unroll - reduce_step) { - Xmm xmm_bcast = Xmm(zmm_bcast.getIdx()); - for (int r = 0; r < tail_size; ++r) { - vpinsrb(xmm_bcast, xmm_bcast, - ptr[aux_reg_bcast_data + jcp.ic_without_padding * i_ur + i_reduce + r], - r); - } - vpbroadcastd(zmm_bcast, xmm_bcast); - } else { - vpbroadcastd(zmm_bcast, bcast_ptr(i_reduce, i_ur, false)); - } - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - compute(vreg_accum(i_load, i_ur), vreg_load(i_load), zmm_bcast); - } - } - } - }; - - Label reduce_loop; - Label reduce_loop_tail; - - mov(aux_reg_load_data, reg_load_data); - - mov(aux_reg_bcast_data, aux1_reg_bcast_data); - init(); - - mov(reduce_loop_iter, reg_reduce_loop_work); - sub(reduce_loop_iter, jcp.reduce_loop_unroll); - jle(reduce_loop_tail, T_NEAR); - - L(reduce_loop); { - fma_block(false); - add(aux_reg_bcast_data, jcp.reduce_loop_bcast_step); - add(aux_reg_load_data, jcp.reduce_loop_load_step); - sub(reduce_loop_iter, jcp.reduce_loop_unroll); - jg(reduce_loop, T_NEAR); - } - - L(reduce_loop_tail); - if (jcp.ic != jcp.ic_without_padding) { - fma_block(true); - } else { - fma_block(false); - } - - if (jcp.oc_without_padding != jcp.oc) { - Label end_store; - Label common_store; - mov(EVEX_compress_addr(rsp, reg_bcast_data_off), reg_bcast_data); - - /*Check if it is the last load_loop_blk*/ - sub(reg_load_loop_work, load_loop_blk * jcp.load_loop_iter_step); - cmp(reg_load_loop_work, 0); - jg(common_store, T_NEAR); - - /*Check if it is the last ocb*/ - test(reg_reduce_pos_flag, FLAG_OC_LAST); - jz(common_store, T_NEAR); - - store(true); - jmp(end_store, T_NEAR); - - L(common_store); - store(false); - - L(end_store); - - add(reg_load_loop_work, load_loop_blk * jcp.load_loop_iter_step); - } else { - store(false); - } -} - -void jit_avx512_core_u8s8s32x_conv1x1_kernel::generate() { - preamble(); - - xor_(reg_scratch, reg_scratch); - Reg16 _t = reg_scratch.cvt16(); - mov(_t, 0x1); - vpbroadcastw(zmm_one, _t); - - sub(rsp, stack_space_needed); - - if (jcp.oc_without_padding != jcp.oc) { - int tail_size = jcp.oc_without_padding % jcp.oc_block; - int mask = (1 << tail_size) - 1; - Reg32 regw_tmp = reg_last_load.cvt32(); - mov(regw_tmp, mask); - kmovw(ktail_mask, regw_tmp); - } - - if (jcp.with_bias) { - mov(reg_bias_data, ptr[param1 + GET_OFF(bias_data)]); - } - mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]); - mov(EVEX_compress_addr(rsp, reg_ptr_sum_scale_off), reg_ptr_scales); - mov(reg_bcast_data, ptr[param1 + GET_OFF(bcast_data)]); - mov(reg_load_data, ptr[param1 + GET_OFF(load_data)]); - mov(reg_output_data, ptr[param1 + GET_OFF(output_data)]); - - mov(reg_load_loop_work, ptr[param1 + GET_OFF(load_dim)]); - mov(reg_bcast_loop_work, ptr[param1 + GET_OFF(bcast_dim)]); - mov(EVEX_compress_addr(rsp, bcast_loop_work_offt), reg_bcast_loop_work); - mov(reg_reduce_loop_work, ptr[param1 + GET_OFF(reduce_dim)]); - mov(reg_reduce_pos_flag, ptr[param1 + GET_OFF(first_last_flag)]); - - auto load_loop_body = [=](int load_loop_blk) { - bcast_loop(load_loop_blk); - add(reg_load_data, load_loop_blk * jcp.load_loop_load_step); - if (jcp.with_bias) { - add(reg_bias_data, - load_loop_blk * jcp.load_block * jcp.typesize_bia); - } 
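fma_block() and the compute lambda above are the arithmetic core of the kernel: each 32-bit lane holds 4 consecutive input channels (hence the 4i grouping of the reordered weights), and the accumulation is a 4-way u8 x s8 dot product per lane. With VNNI a single vpdpbusd does it; without VNNI it is emulated with vpmaddubsw followed by vpmaddwd against a vector of ones. Scalar reference:

    #include <cstdint>

    // acc += sum over k<4 of u8[k] * s8[k], accumulated in int32. Note the
    // non-VNNI emulation can saturate in the intermediate int16 step
    // (vpmaddubsw), which the vpdpbusd path does not.
    int32_t dot4_u8s8(const uint8_t src[4], const int8_t wei[4], int32_t acc) {
        for (int k = 0; k < 4; ++k) {
            acc += static_cast<int32_t>(src[k]) * static_cast<int32_t>(wei[k]);
        }
        return acc;
    }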
- mov(EVEX_compress_addr(rsp, reg_bcast_data_off), reg_bcast_data); - mov(reg_ptr_scales, EVEX_compress_addr(rsp, reg_ptr_sum_scale_off)); - add(reg_ptr_scales, - jcp.is_oc_scale * load_loop_blk * jcp.load_block * sizeof(float)); - mov(EVEX_compress_addr(rsp, reg_ptr_sum_scale_off), reg_ptr_scales); - mov(reg_bcast_data, EVEX_compress_addr(rsp, reg_bcast_data_off)); - add(reg_output_data, - load_loop_blk * jcp.load_block * jcp.typesize_out); - sub(reg_load_loop_work, load_loop_blk * jcp.load_loop_iter_step); - }; - - const int simd_w = 16; - - Label load_loop_blk[7]; - - static const int ur_cases_fma_expl_bcast[] = { 2, 5, 6, 9, 14, 32 }; - const int size_ur_cases_fma = sizeof(ur_cases_fma_expl_bcast); - const int *ur_cases_fma = ur_cases_fma_expl_bcast; - const int *ur_cases = ur_cases_fma; - const int num_ur_cases = (size_ur_cases_fma) / sizeof(*ur_cases); - - for (int ur_idx = num_ur_cases - 1; ur_idx > 0; ur_idx--) { - int label_idx = num_ur_cases - ur_idx - 1; - if (jcp.ur <= ur_cases[ur_idx]) { - cmp(reg_load_loop_work, simd_w * (label_idx + 1)); - jle(load_loop_blk[label_idx], T_NEAR); - } - } - - for (int ur_idx = 0; ur_idx < num_ur_cases; ur_idx++) { - if (jcp.ur <= ur_cases[ur_idx]) { - int label_idx = num_ur_cases - ur_idx - 1; - L(load_loop_blk[label_idx]); - { - if (label_idx == 0) { - cmp(reg_load_loop_work, 0); - je(load_loop_blk[num_ur_cases], T_NEAR); - } - load_loop_body(label_idx + 1); - if (label_idx - 1 > 0) { - cmp(reg_load_loop_work, 2 * label_idx * simd_w); - je(load_loop_blk[label_idx - 1], T_NEAR); - } - cmp(reg_load_loop_work, (label_idx + 1) * simd_w); - jge(load_loop_blk[label_idx]); - } - for (int idx = label_idx - 1; idx > 0; --idx) { - cmp(reg_load_loop_work, simd_w * (idx + 1)); - je(load_loop_blk[idx], T_NEAR); - } - if (ur_idx < num_ur_cases - 2) { - cmp(reg_load_loop_work, simd_w); - jle(load_loop_blk[0], T_NEAR); - } - } - } - L(load_loop_blk[num_ur_cases]); - - add(rsp, stack_space_needed); - - postamble(); -} - -SaberStatus jit_avx512_core_u8s8s32x_conv1x1_kernel::init_conf(jit_1x1_conv_conf_t &jcp, - conv_1x1_desc &conv_d, - int nthreads, - bool reduce_src) { - if (!mayiuse(avx512_core)) { - LOG(ERROR) << "init a AVX512 kernel on non-avx512 machine is not permitted"; - return SaberUnImplError; - } - jcp.ver = ver_avx512_core; - if (mayiuse(avx512_core_vnni)) { - jcp.ver = ver_vnni; - } - - bool args_ok = true; - - const int simd_w = 16; - jcp.oc = rnd_up(jcp.oc, simd_w); - jcp.ic = rnd_up(jcp.ic, simd_w); - - args_ok = true && - jcp.oc % simd_w == 0 && jcp.ic % simd_w == 0 && - jcp.t_pad == 0 && jcp.l_pad == 0 && - jcp.stride_w == 1 && jcp.stride_h == 1 && - jcp.kh == 1 && jcp.kw == 1; - if (!args_ok) { - LOG(ERROR) << "ic:" << jcp.ic << ", oc:" << jcp.oc << ", stride_h:" << jcp.stride_h << ", stride_w:" << jcp.stride_w << ", kh:" << jcp.kh << ", kw:" << jcp.kw << ", pad:" << jcp.t_pad; - return SaberUnImplError; - } - - jcp.os = jcp.oh * jcp.ow; - jcp.is = jcp.ih * jcp.iw; - jcp.tr_is = rnd_up(jcp.is, 4); - - jcp.ic_block = jcp.oc_block = simd_w; - - const int SMALL_SPATIAL = 7 * 7; - const int BIG_REDUCE_DIM = 1024; - - int load_blocking = 0; - int load_blocking_max = 0; - int bcast_blocking = 0; - int bcast_blocking_max = 0; - int reduce_blocking = 0; - int reduce_blocking_max = 0; - jcp.load_grp_count = 1; - jcp.use_vmovntps = false; - - const int L2_size = get_cache_size(2, true) / sizeof(jcp.typesize_in); - const int L2_capacity = (L2_size * 3) / 4; - - int size_treshold = 28; - int max_regs = 0; - int min_regs = 6; - if (jcp.ver == ver_vnni) { 
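The ktail_mask setup in generate() above is what lets the kernel write a last, partially filled 16-wide output-channel block without touching memory past the real tensor: one mask bit per lane. In scalar form:

    #include <cstdint>

    // Opmask for the last oc block; the JIT only builds it when
    // oc_without_padding is not a multiple of the 16-wide block.
    uint16_t oc_tail_mask(int oc_without_padding, int oc_block = 16) {
        int tail = oc_without_padding % oc_block;
        return tail == 0 ? static_cast<uint16_t>(0xffffu)
                         : static_cast<uint16_t>((1u << tail) - 1u);
    }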
- max_regs = ((jcp.oh > size_treshold && jcp.ow > size_treshold) && - (jcp.oc < 128 || jcp.ic < 128)) ? min_regs : 9; - } else { - max_regs = 8; - } - jcp.expl_bcast = true; - - const int spatial = jcp.oh; - jcp.ur = 1; - for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) { - if ((spatial >= size_treshold && spatial % ur_w == 0) || - (spatial < size_treshold && jcp.os % ur_w == 0)) { - jcp.ur = ur_w; - break; - } - } - if (jcp.ur == 1) { - jcp.ur = utils::min(max_regs, jcp.os); - int os_tail = jcp.os % max_regs; - for (int i = max_regs; i >= min_regs; i--) { - int i_tail = jcp.os % i; - if (i_tail > os_tail || i_tail == 0) { - jcp.ur = i; - os_tail = i_tail; - if (i_tail == 0) { - break; - } - } - } - } - - jcp.reduce_dim = jcp.ic; - jcp.reduce_block = jcp.ic_block; - - jcp.load_dim = jcp.oc; - jcp.load_block = jcp.oc_block; - - jcp.bcast_dim = jcp.is; - - jcp.bcast_block = jcp.ur; - - jcp.reduce_loop_unroll = jcp.reduce_block; - jcp.reduce_loop_bcast_step = jcp.reduce_loop_unroll * jcp.typesize_in; - - jcp.reduce_loop_load_step = jcp.reduce_loop_unroll * jcp.load_block * jcp.typesize_in; - - jcp.bcast_loop_output_step = jcp.ur * jcp.oc_without_padding * jcp.typesize_out; - jcp.bcast_loop_output_substep = -1; // unused - jcp.bcast_loop_bcast_step = jcp.ur * jcp.ic_without_padding * jcp.typesize_in; - jcp.bcast_loop_bcast_substep = -1; // unused - - jcp.load_loop_load_step = jcp.reduce_dim * jcp.load_block * jcp.typesize_in; - - jcp.load_loop_iter_step = jcp.load_block; - - jcp.loop_order = reduce_src ? loop_blr : loop_lbr; - - int nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block); - int nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block); - - reduce_blocking = nb_reduce; - if (jcp.bcast_dim <= SMALL_SPATIAL && jcp.reduce_dim >= BIG_REDUCE_DIM) { - reduce_blocking = 64; - } else if (jcp.bcast_dim > SMALL_SPATIAL && jcp.reduce_dim >= BIG_REDUCE_DIM) { - reduce_blocking = 16; - } - reduce_blocking = best_divider(nb_reduce, 1, reduce_blocking, true); - reduce_blocking *= jcp.reduce_block; - - bool cmp_reduce = reduce_blocking <= jcp.reduce_dim; - if (cmp_reduce) { - jcp.loop_order = reduce_src ? 
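The unroll-factor search above reads more easily in scalar form: prefer the largest ur between min_regs and max_regs that divides the spatial size evenly, otherwise fall back to the candidate whose tail iteration is fullest. The sketch mirrors the two loops above (spelling size_treshold as size_threshold):

    #include <algorithm>

    // Mirror of the jcp.ur selection in init_conf().
    int choose_ur(int os, int spatial, int size_threshold,
                  int min_regs, int max_regs) {
        for (int ur = max_regs; ur >= min_regs; --ur) {
            if ((spatial >= size_threshold && spatial % ur == 0) ||
                (spatial <  size_threshold && os % ur == 0)) {
                return ur;
            }
        }
        int best = std::min(max_regs, os);
        int best_tail = os % max_regs;
        for (int ur = max_regs; ur >= min_regs; --ur) {
            int tail = os % ur;
            if (tail > best_tail || tail == 0) {
                best = ur;
                best_tail = tail;
                if (tail == 0) break;
            }
        }
        return best;
    }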
loop_rbl : loop_rlb; - } - load_blocking = jcp.load_dim; - - jcp.load_grp_count = div_up(nthreads, jcp.mb * jcp.ngroups * nb_bcast); - jcp.load_grp_count = best_divider(nthreads, jcp.load_grp_count, 2 * jcp.load_grp_count, false); - - if (jcp.bcast_dim <= SMALL_SPATIAL && jcp.load_dim * jcp.reduce_dim >= L2_size) { - jcp.load_grp_count = utils::max(jcp.load_grp_count, 4); - } else if (jcp.bcast_dim <= SMALL_SPATIAL && jcp.mb <= nthreads && - jcp.load_dim > 512 && jcp.load_dim / jcp.reduce_dim >= 4) { - jcp.load_grp_count = utils::max(jcp.load_grp_count, 2); - load_blocking = jcp.load_block; - } - - bcast_blocking = div_up(jcp.mb * jcp.ngroups * nb_bcast, - div_up(nthreads, jcp.load_grp_count)) * jcp.bcast_block; - bcast_blocking = utils::min(jcp.bcast_dim, bcast_blocking); - bcast_blocking = rnd_up(bcast_blocking, jcp.bcast_block); - - int space_for_bcast - = (L2_capacity - /* kernel_size - */ - 2 * jcp.load_block * reduce_blocking - - jcp.ur * reduce_blocking - 3 * 1024); - if (jcp.reduce_dim * jcp.bcast_dim > L2_capacity) { - space_for_bcast /= 2; - } - - int bcast_in_cache = utils::max(jcp.bcast_block, space_for_bcast / reduce_blocking); - bcast_blocking = utils::min(bcast_blocking, rnd_dn(bcast_in_cache, jcp.bcast_block)); - - load_blocking_max = load_blocking; - bcast_blocking_max = bcast_blocking * 3 / 2; - reduce_blocking_max = reduce_blocking; - - assert(load_blocking); - assert(load_blocking_max); - assert(bcast_blocking); - assert(bcast_blocking_max); - assert(reduce_blocking); - assert(reduce_blocking_max); - assert(load_blocking % jcp.load_block == 0); - assert(reduce_blocking % jcp.reduce_block == 0); - assert(load_blocking_max % jcp.load_block == 0); - assert(reduce_blocking_max % jcp.reduce_block == 0); - - assert(jcp.reduce_loop_unroll % 4 == 0); - assert(jcp.reduce_dim % jcp.reduce_loop_unroll == 0); - - assert(jcp.bcast_block % jcp.ur == 0); - assert(jcp.reduce_dim % jcp.reduce_block == 0); - - jcp.ur_tail = jcp.bcast_dim % jcp.ur; - - jcp.nb_bcast_blocking = bcast_blocking / jcp.bcast_block; - jcp.nb_bcast_blocking_max = bcast_blocking_max / jcp.bcast_block; - jcp.nb_load_blocking = load_blocking / jcp.load_block; - jcp.nb_load_blocking_max = load_blocking_max / jcp.load_block; - jcp.nb_reduce_blocking = reduce_blocking / jcp.reduce_block; - jcp.nb_reduce_blocking_max = reduce_blocking_max / jcp.reduce_block; - - jcp.nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block); - jcp.nb_load = div_up(jcp.load_dim, jcp.load_block); - jcp.nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block); - - jcp.is_oc_scale = 0; -#if 0 - const auto &oscales = attr.output_scales_; - jcp.is_oc_scale = oscales.mask_ == 1 << 1; - assert(utils::implication(!jcp.is_oc_scale, oscales.mask_ == 0)); -#endif - return SaberSuccess; -} - - -} // namespace jit -} // namespace saber -} // namespace anakin - diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv_kernel.h deleted file mode 100644 index e0ba75040..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv_kernel.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_1X1_CONV_KERNEL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_1X1_CONV_KERNEL_H - -#include "saber/funcs/impl/impl_base.h" -#include "saber/core/tensor.h" -#include "saber/saber_types.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "jit_uni_1x1_conv_utils.h" -#include "jit_generator.h" - -namespace anakin { -namespace saber { -namespace jit { - -struct jit_avx512_core_u8s8s32x_conv1x1_kernel : public jit_generator { - jit_avx512_core_u8s8s32x_conv1x1_kernel(jit_1x1_conv_conf_t ajcp) : jcp(ajcp) { - this->generate(); - jit_ker = (void (*)(jit_1x1_conv_call_t *)) this->getCode(); - } - - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8s8s32x_conv1x1_kernel) - - static SaberStatus init_conf(jit_1x1_conv_conf_t &jcp, conv_1x1_desc &conv_d, - int nthreads, bool reduce_src = false); - - jit_1x1_conv_conf_t jcp; - void (*jit_ker)(jit_1x1_conv_call_t *); - - private: - using reg64_t = const Xbyak::Reg64; - using zmm_t = const Xbyak::Zmm; - using mask_t = const Xbyak::Opmask; - - reg64_t reg_bcast_data = r8; - reg64_t reg_ptr_scales = r8; - reg64_t reg_output_data = r9; - reg64_t reg_load_data = r10; - reg64_t reg_ptr_sum_scale = r10; - reg64_t reg_reduce_loop_work = r11; - reg64_t reg_bias_data = r12; - reg64_t reg_scratch = r13; - reg64_t aux_reg_bcast_data = r14; - reg64_t aux_reg_load_data = r15; - reg64_t imm_addr64 = r15; - reg64_t reg_reduce_pos_flag = rax; - reg64_t aux1_reg_bcast_data = rbx; - reg64_t reg_bcast_loop_work = rbx; - reg64_t bcast_loop_iter = rdx; // FIXME - reg64_t reg_load_loop_work = rsi; - reg64_t aux_reg_output_data = abi_not_param1; - reg64_t reduce_loop_iter = abi_param1; - - reg64_t reg_last_load = r8; - mask_t ktail_mask = k6; - mask_t vmask = k7; - - Xbyak::Zmm zmm_tmp = Xbyak::Zmm(28); - Xbyak::Zmm zmm_one = Xbyak::Zmm(29); - Xbyak::Zmm zmm_zero = Xbyak::Zmm(30); - Xbyak::Zmm zmm_bcast = Xbyak::Zmm(31); - - int bcast_loop_work_offt = 0; - int reg_bias_data_offt = 8; - int reg_bcast_data_off = 16; - int reg_load_data_off = 24; - int reg_ptr_sum_scale_off = 32; - int reg_last_load_off = 40; - int stack_space_needed = 48; - - bool maybe_relu(int position, const float* post_sum); - void bcast_loop(int load_loop_blk); - void reduce_loop(int load_loop_blk, int ur, int substep, bool wraparound); - void generate(); - static void balance(jit_1x1_conv_conf_t &jcp, int nthreads); - void cvt2ps(DataType type_in, zmm_t zmm_in, const Xbyak::Operand &op, bool mask_flag); -}; - -} // namespace jit -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_JIT_AVX512_CORE_U8S8S32X_CONV1X1_ACT_KERNEL_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.cpp deleted file mode 100644 index f8c643731..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.cpp +++ /dev/null @@ -1,291 +0,0 @@ -#include "saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.h" -#include "saber/funcs/impl/x86/x86_utils.h" -#include "anakin_thread.h" - -namespace anakin { 
-namespace saber { - -using namespace jit; - -SaberStatus JitAvx512U8S8S32XConv::init(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx) { - this->_ctx = &ctx; - ConvParam *conv_param = &(param.conv_param); - const Tensor *weights = conv_param->weight(); - Shape wgt_shape(weights->shape()); - bool depthwise = (conv_param->group > 1) && (wgt_shape[1] == 1); - - // reorder weights - // TODO check weights, do scale or not? - Tensor *weights_reorder = conv_param->mutable_weight(); - if (weights_internal_ != nullptr) { - delete weights_internal_; - weights_internal_ = nullptr; - } - weights_internal_ = new Tensor(weights_reorder->shape(), AK_INT8); - weights_internal_->set_scale(weights_reorder->get_scale()); - if (depthwise) { - weight_reorder_Goihw16g(*weights_reorder, *weights_internal_); - } else if (conv_param->group == 1) { - weight_reorder_OIhw4i16o4i(*weights_reorder, *weights_internal_, weights_reorder->get_scale()); - } else { - return SaberUnImplError; - } - - return create(inputs, outputs, param, ctx); -} - -SaberStatus JitAvx512U8S8S32XConv::create(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx) { - SaberStatus status = SaberSuccess; - - ConvParam *conv_param = &(param.conv_param); - jit_conv_conf_t jcp; - - status = init_conf(jcp, inputs, outputs, param); - if (status != SaberSuccess) { - return status; - } - - // TODO check bias, do scale or not? - Tensor *bias_src = conv_param->mutable_bias(); - if (bias_internal_ != nullptr) { - delete bias_internal_; - bias_internal_ = nullptr; - } - if (bias_src != nullptr) { - bias_internal_ = new Tensor(bias_src->shape(), AK_INT32); - bias_internal_->set_scale(bias_src->get_scale()); - bias_reorder_nchw(*bias_src, *bias_internal_, bias_src->get_scale()); - } - - float scale_in = inputs[0]->get_scale()[0]; - float scale_out = outputs[0]->get_scale()[0]; - auto scale_w = weights_internal_->get_scale(); - std::vector().swap(scale_); - for (int i = 0; i < scale_w.size(); i++) { - this->scale_.push_back((scale_w[i] * scale_in) / scale_out); - } - - return status; -} - -SaberStatus JitAvx512U8S8S32XConv::dispatch(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m) { - ConvParam *conv_param = &(param.conv_param); - const Tensor *bias = conv_param->bias(); - - // check input and output data type, do scale or not - CHECK_EQ(inputs[0]->get_dtype(), AK_UINT8) << "only support uint8 input type"; - const unsigned char *ptr_src = reinterpret_cast(inputs[0]->data()); - const char *ptr_weights = reinterpret_cast(weights_internal_->data()); - const int32_t *ptr_bias = nullptr; - if (bias_internal_ != nullptr) { - ptr_bias = reinterpret_cast(bias_internal_->data()); - } - char *ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); - int dst_type_size = type_length(outputs[0]->get_dtype()); - - const auto &jcp = kernel_->jcp; - const auto oscale = scale_; - - parallel(0, [&](const int ithr, const int nthr) { - int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; - int ic_chunks = jcp.nb_ic / jcp.nb_ic_blocking; - int nb_groups = jcp.nb_ch; - int group_block = jcp.ch_block; - - int start{0}, end{0}; - int work_amount = jcp.mb * nb_groups * oc_chunks * jcp.oh; - balance211(work_amount, nthr, ithr, start, end); - - auto p = jit_conv_call_t(); - - size_t src_h_stride = jcp.iw * jcp.ic; - size_t dst_h_stride = jcp.ow * jcp.oc; - size_t wht_h_stride = jcp.kw * jcp.ic_block * jcp.oc_block; - size_t wht_ic_stride = jcp.kh * jcp.kw * jcp.ic_block * 
jcp.oc_block; - if (jcp.is_dw) { - src_h_stride = jcp.iw * jcp.ic * jcp.ngroups; - dst_h_stride = jcp.ow * jcp.oc * jcp.ngroups; - wht_h_stride = jcp.kw * jcp.ch_block; - wht_ic_stride = jcp.kh * jcp.kw * jcp.ch_block; - } - - int n{0}, gb{0}, occ{0}, oh_s{0}; - if (jcp.loop_order == loop_cgn) { - utils::nd_iterator_init(start, occ, oc_chunks, gb, nb_groups, n, jcp.mb, oh_s, jcp.oh); - } else if (jcp.loop_order == loop_gnc) { - utils::nd_iterator_init(start, gb, nb_groups, n, jcp.mb, occ, oc_chunks, oh_s, jcp.oh); - } else if (jcp.loop_order == loop_ngc) { - utils::nd_iterator_init(start, n, jcp.mb, gb, nb_groups, occ, oc_chunks, oh_s, jcp.oh); - } else { - assert(!"unsupported loop order"); - } - - while (start < end) { - int ocb = occ * jcp.nb_oc_blocking; - int g = gb * group_block; - int g_oc = (g * jcp.nb_oc + ocb) * jcp.oc_block; - - int g_ic = g * jcp.nb_ic * jcp.oc_block; - - int work_rem = end - start; - int ih_s = -jcp.t_pad + oh_s * jcp.stride_h; - int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem; - - size_t bias_blk_off = g_oc; - size_t dst_blk_off = n * jcp.oc * jcp.oh * jcp.ow + - oh_s * jcp.ow * jcp.oc + g_oc; - size_t src_blk_off = n * jcp.ic * jcp.ih * jcp.iw + - ih_s * jcp.iw * jcp.ic + g_ic; - size_t weight_blk_off = ocb * jcp.ic * jcp.kh * jcp.kw * jcp.oc_block; - if (jcp.is_dw) { - dst_blk_off = n * nb_groups *jcp.oh * jcp.ow * jcp.ch_block + g_oc + oh_s * jcp.ow * nb_groups * jcp.ch_block; - src_blk_off = n * nb_groups *jcp.ih * jcp.iw * jcp.ch_block + g_ic + ih_s * jcp.iw * nb_groups * jcp.ch_block; - weight_blk_off = gb * jcp.kh * jcp.kw * jcp.ch_block + ocb * jcp.kh * jcp.kw * jcp.ch_block; - } - auto bias_w = ptr_bias ? ptr_bias + bias_blk_off : 0; - auto dst_w = ptr_dst + dst_blk_off * dst_type_size; - auto src_w = ptr_src + src_blk_off; - auto wht_w = ptr_weights + weight_blk_off; - - for (int oj = oh_s, ij = ih_s; - oj < oh_e; ++oj, ij += jcp.stride_h) { - int dilate_h = jcp.dilate_h + 1; - int i_t_overflow = utils::div_up(utils::max(0, -ij), dilate_h); - int i_b_overflow = utils::div_up(utils::max(0, ij - jcp.ih + (jcp.kh - 1) * dilate_h + 1), - dilate_h); - int kh_padding = utils::max(0, - jcp.kh - i_t_overflow - i_b_overflow); - - p.src = src_w + i_t_overflow * dilate_h * src_h_stride; - p.dst = dst_w; - p.filt = wht_w + i_t_overflow * wht_h_stride; - p.bias = bias_w; - p.oc_blocks = jcp.is_dw ? 
gb : ocb; - p.kh_padding = kh_padding; - p.scales = &oscale[jcp.is_oc_scale * g_oc]; - kernel_->jit_ker(&p); - - src_w += src_h_stride * jcp.stride_h; - dst_w += dst_h_stride * dst_type_size; - } - - if (jcp.loop_order == loop_cgn) { - utils::nd_iterator_jump(start, end, occ, oc_chunks, gb, nb_groups, n, - jcp.mb, oh_s, jcp.oh); - } else if (jcp.loop_order == loop_gnc) { - utils::nd_iterator_jump(start, end, gb, nb_groups, n, jcp.mb, occ, - oc_chunks, oh_s, jcp.oh); - } else if (jcp.loop_order == loop_ngc) { - utils::nd_iterator_jump(start, end, n, jcp.mb, gb, nb_groups, occ, - oc_chunks, oh_s, jcp.oh); - } else { - assert(!"unsupported loop order"); - } - } - }); - - return SaberSuccess; -} - -SaberStatus JitAvx512U8S8S32XConv::init_conf(jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m) { - SaberStatus status; - ConvParam *conv_param = &(param.conv_param); - EltwiseParam *eltwise_param = &(param.eltwise_param); - ActivationParam *act_param = &(conv_param->activation_param); - const Tensor *weights = conv_param->weight(); - const Tensor *bias = conv_param->bias(); - Tensor *input = inputs[0]; - Tensor *output = outputs[0]; - Shape src_shape(input->shape()); - Shape dst_shape(output->shape()); - Shape wgt_shape(weights->shape()); - - // init conf - const bool with_groups = (conv_param->group > 1); - jcp.ngroups = with_groups ? conv_param->group : 1; - - jcp.mb = src_shape[0]; - jcp.ic = src_shape[3]/jcp.ngroups; - jcp.ic_without_padding = jcp.ic; - jcp.ih = src_shape[1]; - jcp.iw = src_shape[2]; - jcp.oc = dst_shape[3]/jcp.ngroups; - jcp.oc_without_padding = jcp.oc; - jcp.oh = dst_shape[1]; - jcp.ow = dst_shape[2]; - - jcp.kh = wgt_shape[2]; - jcp.kw = wgt_shape[3]; - - jcp.stride_h = conv_param->stride_h; - jcp.stride_w = conv_param->stride_w; - jcp.t_pad = conv_param->pad_h; - jcp.l_pad = conv_param->pad_w; - jcp.b_pad = conv_param->pad_h; - jcp.r_pad = conv_param->pad_w; - jcp.dilate_h = conv_param->dilation_h <= 0 ? 0 : (conv_param->dilation_h - 1); - jcp.dilate_w = conv_param->dilation_w <= 0 ? 
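The per-row bookkeeping in dispatch() above (i_t_overflow, i_b_overflow, kh_padding) decides how many kernel taps actually hit the image when the current output row reaches into the top or bottom padding, with dilation taken into account; note that init_conf() stores jcp.dilate_h as dilation - 1, so the step between taps is dilate_h + 1. A scalar version of that computation:

    #include <algorithm>

    struct KhClip {
        int top_skip;    // taps skipped above the image (i_t_overflow)
        int kh_padding;  // taps that fall inside the image
    };

    // ij is the first input row of this output row and may be negative
    // because of top padding; dilate_h uses the jcp convention (dilation - 1).
    KhClip clip_kh(int ij, int ih, int kh, int dilate_h) {
        auto div_up = [](int a, int b) { return (a + b - 1) / b; };
        int step = dilate_h + 1;
        int i_t_overflow = div_up(std::max(0, -ij), step);
        int i_b_overflow = div_up(std::max(0, ij - ih + (kh - 1) * step + 1), step);
        return { i_t_overflow, std::max(0, kh - i_t_overflow - i_b_overflow) };
    }

The caller then advances the source pointer by top_skip * (dilate_h + 1) rows and the weight pointer by top_skip kernel rows, exactly as dispatch() does with src_h_stride and wht_h_stride.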
0 : (conv_param->dilation_w - 1); - - if (bias != nullptr) { - jcp.bia_dt = bias->get_dtype(); - } - jcp.dst_dt = output->get_dtype(); - jcp.rm = conv_param->rm; - jcp.ur_h = 1; - - jcp.with_bias = (bias != NULL); - jcp.with_relu = conv_param->activation_param.has_active; - if (jcp.with_relu) { - jcp.relu_negative_slope = static_cast(act_param->negative_slope); - } - - jcp.is_dw = with_groups && (jcp.ic == 1); - - jcp.with_sum = eltwise_param->has_eltwise && (eltwise_param->operation == Eltwise_sum); - if (jcp.with_sum) { - jcp.sum_scale = eltwise_param->coeff[1]; - } - - status = jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jcp); - if (status == SaberSuccess) { - if (kernel_ != nullptr) { - delete kernel_; - kernel_ = nullptr; - } - kernel_ = new jit_avx512_core_u8s8s32x_fwd_kernel(jcp); - } else { - return SaberUnImplError; - } - - const int nthreads = omp_get_max_threads(); - ws_per_thread_ = jcp.oh * jcp.ow * jcp.oc; - ws_ = (int *)zmalloc(nthreads * ws_per_thread_ * sizeof(int), 4096); - if (!ws_) { - LOG(ERROR) << "workspace allocation failed"; - delete kernel_; - kernel_ = nullptr; - return SaberOutOfMem; - } - return SaberSuccess; -} - -SaberStatus JitAvx512U8S8S32XConv::check_conf(const jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m) { - return SaberSuccess; -} - -} // namespace saber -} // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.h b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.h deleted file mode 100644 index f695ee0f8..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2018 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_CONV_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_CONV_H - -#include "anakin_config.h" -#include "saber/funcs/impl/impl_base.h" -#include "saber/funcs/impl/impl_macro.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv_kernel.h" - -namespace anakin { -namespace saber { - -using namespace jit; - -class JitAvx512U8S8S32XConv : - public ImplBase< - X86, - AK_INT8, - ConvEltwiseParam > { -public: - typedef typename DataTrait::Dtype OpDataType; - - JitAvx512U8S8S32XConv() - : kernel_(nullptr), weights_internal_(nullptr), - bias_internal_(nullptr), ws_(nullptr), ws_per_thread_(0) { - } - - ~JitAvx512U8S8S32XConv() { - if (kernel_ != nullptr) { - delete kernel_; - kernel_ = nullptr; - } - - if (bias_internal_ != nullptr) { - delete bias_internal_; - bias_internal_ = nullptr; - } - - if (weights_internal_ != nullptr) { - delete weights_internal_; - weights_internal_ = nullptr; - } - - if (ws_ != nullptr) { - delete ws_; - ws_ = nullptr; - } - - std::vector().swap(scale_); - } - - virtual SaberStatus init(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx); - - virtual SaberStatus create(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m, - Context &ctx); - - virtual SaberStatus dispatch(const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m); - - -private: - jit_avx512_core_u8s8s32x_fwd_kernel *kernel_; - Tensor *weights_internal_; - Tensor *bias_internal_; - int *ws_; - size_t ws_per_thread_; - - // quantization scale(s) - std::vector scale_; - - virtual SaberStatus init_conf(jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m); - - virtual SaberStatus check_conf(const jit_conv_conf_t &jcp, - const std::vector*> &inputs, - std::vector*> &outputs, - ConvEltwiseParam ¶m); -}; - -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_U8S8S32X_CONV_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv_kernel.cpp b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv_kernel.cpp deleted file mode 100644 index fe1a4070e..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv_kernel.cpp +++ /dev/null @@ -1,572 +0,0 @@ -#include "saber/funcs/impl/x86/x86_utils.h" -#include "jit_avx512_core_u8s8s32x_conv_kernel.h" - -namespace anakin { -namespace saber { -namespace jit { - -#define GET_OFF(field) offsetof(jit_conv_call_t, field) -using namespace Xbyak; - -static inline void pick_loop_order(jit_conv_conf_t &jcp) { - jcp.loop_order = loop_cgn; - if (jcp.ngroups > 1) { - jcp.loop_order = loop_ngc; - } -} - -bool jit_avx512_core_u8s8s32x_fwd_kernel::maybe_relu(int position, const float *post_sum) { - if (position == 0) { - /* if do sum, then skip relu before sum */ - if (post_sum) { - return false; - } - return false || jcp.with_relu; - } else if (position == 1) { - /* relu after sum */ - if (post_sum == nullptr) { - return false; - } - - return false || - jcp.dst_dt == AK_UINT8 || - jcp.with_relu; - } - - return false; -} - -void jit_avx512_core_u8s8s32x_fwd_kernel::prepare_output(int ur_w) { - for (int k = 0; k < jcp.nb_oc_blocking; k++) { - for (int j = 0; j < ur_w; j++) { - Zmm zmm = zmm_out(j, k); - vpxord(zmm, zmm, zmm); - } - } -} - -void jit_avx512_core_u8s8s32x_fwd_kernel::cvt2ps(DataType type_in, - zmm_t 
zmm_in, - const Xbyak::Operand &op, - bool mask_flag) { - zmm_t zmm = mask_flag ? zmm_in | ktail_mask | T_z : zmm_in; - switch (type_in) { - case AK_FLOAT: - case AK_INT32: - vmovups(zmm, op); - break; - case AK_INT8: - vpmovsxbd(zmm, op); - break; - case AK_UINT8: - vpmovzxbd(zmm, op); - break; - default: - assert(!"unsupported data type"); - } - if (type_in != AK_FLOAT) { - vcvtdq2ps(zmm_in, zmm_in); - } -} - -void jit_avx512_core_u8s8s32x_fwd_kernel::store_output(int ur_w, - int last_oc_block_flag) { - int nb_oc_block = jcp.nb_oc_blocking; - - mov(reg_bias, ptr[param1 + GET_OFF(bias)]); - mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]); - - const float *p_sum_scale = nullptr; - if (jcp.with_sum) { - p_sum_scale = &(jcp.sum_scale); - } - - if (p_sum_scale && *p_sum_scale != 1.f) { - mov(reg_ptr_sum_scale, (size_t)p_sum_scale); - } - - vpxord(zmm_zero, zmm_zero, zmm_zero); - for (int k = 0; k < nb_oc_block; k++) { - const bool mask_flag = last_oc_block_flag == 1 && k == nb_oc_block - 1; - int scale_offset = jcp.is_oc_scale * (sizeof(float) * k * jcp.oc_block); - auto zmm_bias = zmm_tmp; - if (jcp.with_bias) { - int bias_offset = jcp.typesize_bia * k * jcp.oc_block; - auto bias_addr = EVEX_compress_addr(reg_bias, bias_offset); - - cvt2ps(jcp.bia_dt, zmm_bias, bias_addr, mask_flag); - } - for (int j = 0; j < ur_w; j++) { - int aux_output_offset = jcp.typesize_out * - (k * jcp.oc_block + j * jcp.oc_without_padding * jcp.ngroups); - auto addr = EVEX_compress_addr(reg_out, aux_output_offset); - - Zmm zmm = zmm_out(j, k); - vcvtdq2ps (zmm, zmm); - if (jcp.with_bias) { - vaddps(zmm, zmm, zmm_bias); - } - - zmm_t mask_zmm = mask_flag ? zmm | ktail_mask | T_z : zmm; - vmulps(mask_zmm, zmm, EVEX_compress_addr(reg_ptr_scales, scale_offset)); - if (maybe_relu(0, p_sum_scale)) { - vmaxps(zmm, zmm_zero, zmm); - } - if (p_sum_scale) { // post_op: sum - auto zmm_prev_dst = zmm_bcast; - - cvt2ps(jcp.dst_dt, zmm_prev_dst, addr, mask_flag); - - if (*p_sum_scale == 1.f) { - vaddps(zmm, zmm_prev_dst); - } else { - vfmadd231ps(zmm, zmm_prev_dst, zword_b[reg_ptr_sum_scale]); - } - } - - if (maybe_relu(1, p_sum_scale)) { - vmaxps(zmm, zmm_zero, zmm); - } - - if (jcp.dst_dt != AK_FLOAT) { - if (jcp.rm == round_mode::nearest) { - vcvtps2dq(zmm | T_rn_sae, zmm); - } else if (jcp.rm == round_mode::down) { - vcvtps2dq(zmm | T_rd_sae, zmm); - } else { - assert(!"unimplemented"); - } - } - } - for (int j = 0; j < ur_w; j++) { - int aux_output_offset = jcp.typesize_out * (k * jcp.oc_block - + j * jcp.oc_without_padding * jcp.ngroups); - auto addr = EVEX_compress_addr(reg_out, aux_output_offset); - - Zmm zmm = zmm_out(j, k); - zmm_t r_zmm = mask_flag ? 
zmm | ktail_mask : zmm; - switch (jcp.dst_dt) { - case AK_FLOAT: - case AK_INT32: - vmovups(addr, r_zmm); - break; - case AK_INT8: - vpmovsdb(addr, r_zmm); - break; - case AK_UINT8: - vpmovusdb(addr, r_zmm); - break; - default: - assert(!"unknown dst_dt"); - } - } - } -} - -void jit_avx512_core_u8s8s32x_fwd_kernel::compute_ker(int ur_w, - int pad_l, - int pad_r, - int last_ic_block_flag) { - int kw = jcp.kw; - int stride_w = jcp.stride_w; - int ic_block = jcp.ic_block; - int oc_block = jcp.oc_block; - int ch_block_all = jcp.ch_block * ic_block * oc_block; - - int nb_oc_block = jcp.nb_oc_blocking; - - Label kh_label; - Label skip_kh_loop; - - int shift_kernel_ptr = jcp.typesize_in * jcp.kw * ch_block_all; - int shift_input_ptr = jcp.typesize_in * (jcp.dilate_h + 1) * jcp.iw * - jcp.ic_without_padding * jcp.ngroups; - - auto input_offset = [=](int oi, int ic, int ki) { - return jcp.typesize_in * - ((ki * (jcp.dilate_w + 1) + oi * stride_w - pad_l) * - jcp.ic_without_padding * jcp.ngroups + 4 * ic); - }; - auto kernel_offset = [=](int ii, int ic, int ki) { - return jcp.typesize_in * - ((ii * jcp.nb_ic * jcp.kh * jcp.kw + ki) * ch_block_all + 4 * ic * oc_block); - }; - auto compute = [=](Zmm vreg_acc, Zmm vreg_wei, Zmm vreg_src) { - if (jcp.ver == ver_vnni) { - // also okay for depthwise since src is zero-extended - vpdpbusd(vreg_acc, vreg_src, vreg_wei); - } else if (jcp.is_dw) { - vpmulld(zmm_tmp, vreg_src, vreg_wei); - vpaddd(vreg_acc, vreg_acc, zmm_tmp); - } else { - vpmaddubsw(zmm_tmp, vreg_src, vreg_wei); - vpmaddwd(zmm_tmp, zmm_tmp, zmm_one); - vpaddd(vreg_acc, vreg_acc, zmm_tmp); - } - }; - - mov(aux_reg_inp, reg_inp); - mov(aux_reg_ker, reg_ker); - - mov(reg_kj, reg_kh); - if ((jcp.kh - 1) * (jcp.dilate_h + 1) < std::max(jcp.t_pad, jcp.b_pad)) { - cmp(reg_kj, 0); - je(skip_kh_loop, T_NEAR); - } - L(kh_label); { - for (int ki = 0; ki < kw; ki++) { - int jj_start = get_ow_start(ki, pad_l); - int jj_end = get_ow_end(ur_w, ki, pad_r); - int tail_size = jcp.ic_without_padding % 4; - /* Skip the last loads of input if (ic%16)/4 < ic_block/4 */ - int icb = jcp.is_dw - ? 1 - : (last_ic_block_flag != no_last_block) - ? 
utils::div_up((jcp.ic_without_padding % ic_block), 4) - : ic_block / 4; - for (int ic = 0; ic < icb; ic++) { - for (int jj = jj_start; jj < jj_end; jj++) { - int aux_input_offset = input_offset(jj, ic, ki); - if (jcp.is_dw) { - vpmovzxbd(zmm_inp(jj, nb_oc_block), - EVEX_compress_addr( - aux_reg_inp, aux_input_offset)); - } else if (last_ic_block_flag == last_sp_block && - tail_size != 0 && ic == icb - 1) { - Xmm xmm_tmp = Xmm(zmm_inp(jj, nb_oc_block).getIdx()); - for (int r = 0; r < tail_size; ++r) { - vpinsrb(xmm_tmp, xmm_tmp, - ptr[aux_reg_inp + aux_input_offset + r], r); - } - vpbroadcastd(zmm_inp(jj, nb_oc_block), xmm_tmp); - } else { - vpbroadcastd(zmm_inp(jj, nb_oc_block), - EVEX_compress_addr(aux_reg_inp, - aux_input_offset)); - } - } - - for (int ii = 0; ii < nb_oc_block; ii++) { - int aux_kernel_offset = kernel_offset(ii, ic, ki); - if (jj_end - jj_start > 0) { - if (jcp.is_dw) { - vpmovsxbd(zmm_wei, EVEX_compress_addr(aux_reg_ker, - aux_kernel_offset)); - } else { - vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker, - aux_kernel_offset)); - } - } - for (int jj = jj_start; jj < jj_end; jj++) { - compute(zmm_out(jj, ii), zmm_wei, - zmm_inp(jj, nb_oc_block)); - } - } - } - } - add(aux_reg_ker, shift_kernel_ptr); - add(aux_reg_inp, shift_input_ptr); - dec(reg_kj); - cmp(reg_kj, 0); - jg(kh_label, T_NEAR); - } - L(skip_kh_loop); -} - -void jit_avx512_core_u8s8s32x_fwd_kernel::compute_loop(int ur_w, - int pad_l, - int pad_r, - bool is_last_sp_block) { - prepare_output(ur_w); - - // IC loop - Label icb_label; - mov(reg_icb, jcp.nb_ic); - L(icb_label); - if (jcp.ic_without_padding != jcp.ic) { - Label common_ker; - Label end_ker; - - cmp(reg_icb, 1); // The last IC block - jne(common_ker, T_NEAR); - - compute_ker(ur_w, pad_l, pad_r, - is_last_sp_block ? 
last_sp_block : last_ic_block); - jmp(end_ker, T_NEAR); - - L(common_ker); - compute_ker(ur_w, pad_l, pad_r, no_last_block); - - L(end_ker); - } else { - compute_ker(ur_w, pad_l, pad_r, no_last_block); - } - // End of IC Loop - int inp_step = jcp.ic_block; - int ker_step = jcp.kh * jcp.kw * jcp.oc_block * jcp.ic_block; - add(reg_inp, jcp.typesize_in * inp_step); - add(reg_ker, jcp.typesize_in * ker_step); - - dec(reg_icb); - cmp(reg_icb, 0); - jg(icb_label, T_NEAR); - - sub(reg_inp, jcp.typesize_in * inp_step * jcp.nb_ic); - sub(reg_ker, jcp.typesize_in * ker_step * jcp.nb_ic); - - if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) { - Label common_store; - Label end_store; - - if (jcp.is_dw) { - cmp(reg_oc_blocks, jcp.nb_ch - 1); - } else { - cmp(reg_oc_blocks, jcp.nb_oc - jcp.nb_oc_blocking); - } - - jne(common_store, T_NEAR); - - store_output(ur_w, 1); - jmp(end_store, T_NEAR); - - L(common_store); - store_output(ur_w, 0); - - L(end_store); - } else { - store_output(ur_w, 0); - } -} - -void jit_avx512_core_u8s8s32x_fwd_kernel::generate() { - int inp_shift_pad = jcp.typesize_in * (jcp.ur_w * jcp.stride_w - jcp.l_pad) * - jcp.ic_without_padding * jcp.ngroups; - - int inp_shift = jcp.typesize_in * - (jcp.ur_w * jcp.stride_w * jcp.ic_without_padding * jcp.ngroups); - - int out_shift = jcp.typesize_out * - (jcp.ur_w * jcp.oc_without_padding * jcp.ngroups); - - preamble(); - - xor_(reg_scratch, reg_scratch); - Reg16 _t = reg_scratch.cvt16(); - mov(_t, 0x1); - vpbroadcastw(zmm_one, _t); - - mov(reg_inp, ptr[param1 + GET_OFF(src)]); - mov(reg_out, ptr[param1 + GET_OFF(dst)]); - mov(reg_ker, ptr[param1 + GET_OFF(filt)]); - mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]); - - if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) { - int tail_size = jcp.is_dw - ? 
jcp.ngroups % jcp.ch_block - : jcp.oc_without_padding % jcp.oc_block; - int mask = (1 << tail_size) - 1; - mov(reg_oc_blocks, ptr[param1 + GET_OFF(oc_blocks)]); - Reg32 regw_tmp = reg_oi.cvt32(); - mov(regw_tmp, mask); - kmovw(ktail_mask, regw_tmp); - } - - int r_pad = std::max(0, (jcp.ow - 1) * jcp.stride_w + - (jcp.kw - 1) * (jcp.dilate_w + 1) - - (jcp.iw + jcp.l_pad - 1)); - int n_oi = jcp.ow / jcp.ur_w; - int r_pad1 = (jcp.ur_w * n_oi - 1) * jcp.stride_w + - (jcp.kw - 1) * (jcp.dilate_w + 1) - - (jcp.iw + jcp.l_pad - 1); - if (r_pad1 > 0 || jcp.ur_w_tail == 0) { - n_oi--; - } - - xor_(reg_oi, reg_oi); - if (jcp.ow == jcp.ur_w) { - compute_loop(jcp.ur_w, jcp.l_pad, r_pad, true); - } else { - if (n_oi == 0) { - compute_loop(jcp.ur_w, jcp.l_pad, r_pad1, jcp.ur_w_tail == 0); - add(reg_inp, inp_shift_pad); - add(reg_out, out_shift); - if (jcp.ur_w_tail != 0) { - compute_loop(jcp.ur_w_tail, 0, r_pad, true); - } - } else { - if (jcp.l_pad > 0) { - compute_loop(jcp.ur_w, jcp.l_pad, 0, false); - add(reg_inp, inp_shift_pad); - add(reg_out, out_shift); - - inc(reg_oi); - } - if ((jcp.l_pad <= 0 && n_oi > 0) || (jcp.l_pad > 0 && n_oi > 1)) { - Label ow_loop_label; - L(ow_loop_label); { - compute_loop(jcp.ur_w, 0, 0, false); - add(reg_inp, inp_shift); - add(reg_out, out_shift); - - inc(reg_oi); - cmp(reg_oi, n_oi); - jl(ow_loop_label, T_NEAR); - } - } - if (r_pad1 > 0 || jcp.ur_w_tail == 0) { - compute_loop(jcp.ur_w, 0, r_pad1, jcp.ur_w_tail == 0); - add(reg_inp, inp_shift); - add(reg_out, out_shift); - } - if (jcp.ur_w_tail != 0) { - compute_loop(jcp.ur_w_tail, 0, r_pad, true); - } - } - } - - postamble(); -} - -SaberStatus jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp) { - SaberStatus ret = SaberUnImplError; - - const int regs = 28; - - // TODO - /* - if (!(mayiuse(avx512_core) && - src_d.data_type() == data_type::u8 - && weights_d.data_type() == data_type::s8 - && one_of(dst_d.data_type(), data_type::f32, data_type::s32, - data_type::s8, data_type::u8))) - return status::unimplemented; - - if (!implication(with_relu, relu_negative_slope == 0.)) - return status::unimplemented; - */ - - using namespace utils; - if (jcp.is_dw) { - jcp.ch_block = 16; - jcp.ic_block = 1; - jcp.oc_block = 1; - if (jcp.ngroups % jcp.ch_block != 0) { - return ret; - } - } else { - jcp.ch_block = 1; - jcp.ic_block = 16; - jcp.oc_block = 16; - - if (jcp.ngroups == 1) { - jcp.oc = rnd_up(jcp.oc, jcp.oc_block); - jcp.ic = rnd_up(jcp.ic, jcp.ic_block); - } - - if (jcp.ic % jcp.ic_block != 0) { - return ret; - } - } - - jcp.ver = ver_avx512_core; - if (mayiuse(avx512_core_vnni)) { - jcp.ver = ver_vnni; - } - -/*TOTO - const auto w_format = with_groups - ? (jcp.is_dw ? Goihw16g : gOIhw4i16o4i) : OIhw4i16o4i; - if (weights_d.format() == any) - CHECK(weights_pd.set_format(w_format)); - if (weights_d.format() != w_format) - return status::unimplemented; - - if (dst_d.format() == any) - CHECK(dst_pd.set_format(nhwc)); - if (dst_d.format() != nhwc) - return status::unimplemented; - if (src_d.format() == any) - CHECK(src_pd.set_format(nhwc)); - if (src_d.format() != nhwc) - return status::unimplemented; - if (jcp.with_bias) { - if (bias_d.format() == any) - CHECK(bias_pd.set_format(x)); - if (bias_d.format() != x) - return status::unimplemented; - } - - jcp.bia_dt = jcp.with_bias ? 
cd.bias_desc.data_type : data_type::undef; - jcp.dst_dt = cd.dst_desc.data_type; - - jcp.typesize_in = types::data_type_size(src_d.data_type()); - jcp.typesize_out = types::data_type_size(dst_d.data_type()); - jcp.typesize_acc = sizeof(int32_t); - jcp.typesize_bia = jcp.with_bias - ? types::data_type_size(bias_d.data_type()) - : 0; -*/ - - jcp.typesize_in = 1; - jcp.typesize_out = datatype_size(jcp.dst_dt); - jcp.typesize_acc = sizeof(int32_t); - jcp.typesize_bia = jcp.with_bias - ? datatype_size(jcp.bia_dt) - : 0; - - jcp.nb_ch = div_up(jcp.ngroups, jcp.ch_block); - jcp.nb_ic = jcp.ic / jcp.ic_block; - jcp.nb_oc = jcp.oc / jcp.oc_block; - - // If OC blocking is incommensurate with the number of OC blocks (general - // requirement for all convolutions), or if it results in an unrolling - // factor smaller than the left padding (special requirement for SSD:fc6), - // then search for a smaller OC blocking that satisfies both constraints. - jcp.nb_oc_blocking = std::min(4, jcp.nb_oc); - for (; jcp.nb_oc_blocking > 1; jcp.nb_oc_blocking--) { - if (jcp.nb_oc % jcp.nb_oc_blocking == 0 - && jcp.l_pad <= regs / (jcp.nb_oc_blocking + 1)) - break; - } - - jcp.ur_w = regs / (jcp.nb_oc_blocking + 1); - if (jcp.ow < jcp.ur_w) { - jcp.ur_w = jcp.ow; - } - jcp.ur_w_tail = jcp.ow % jcp.ur_w; - - bool args_ok = true - && jcp.oc % jcp.oc_block == 0 - && jcp.l_pad <= jcp.ur_w - && implication(!jcp.is_1stconv, jcp.ic % jcp.ic_block == 0); - if (!args_ok) { - return ret; - } - - int r_pad_no_tail = std::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + - (jcp.kw - 1) * (jcp.dilate_w + 1) - - (jcp.iw + jcp.l_pad - 1)); - if (r_pad_no_tail > jcp.ur_w) { - return ret; - } - - pick_loop_order(jcp); - - jcp.nb_ic_L2 = jcp.nb_ic; - - jcp.is_oc_scale = 1; - /* TODO - const auto &oscales = attr.output_scales_; - jcp.is_oc_scale = oscales.mask_ == 1 << 1; - - assert(utils::implication(!jcp.is_oc_scale, oscales.mask_ == 0)); - */ - - return SaberSuccess; -} - -} // namespace jit -} // namespace saber -} // namespace anakin - -// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv_kernel.h b/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv_kernel.h deleted file mode 100644 index 277ccc54d..000000000 --- a/saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv_kernel.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_CONV_KERNEL_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_AVX512_CORE_U8S8S32X_CONV_KERNEL_H - -#include -#include - -#include "saber/funcs/impl/x86/kernel/jit_generator.h" -#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" -#include "saber/saber_types.h" -#include "saber/funcs/impl/x86/x86_utils.h" - -namespace anakin { -namespace saber { -namespace jit { - -struct jit_avx512_core_u8s8s32x_fwd_kernel : public jit_generator { -public: - jit_avx512_core_u8s8s32x_fwd_kernel(jit_conv_conf_t ajcp) : jcp(ajcp) { - generate(); - jit_ker = (void (*)(jit_conv_call_t *))getCode(); - } - - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8s8s32x_conv_fwd_ker_t) - - jit_conv_conf_t jcp; - static SaberStatus init_conf(jit_conv_conf_t &jcp); - void (*jit_ker)(jit_conv_call_t *); - -private: - using reg64_t = const Xbyak::Reg64; - using zmm_t = const Xbyak::Zmm; - using xmm_t = const Xbyak::Xmm; - enum { - typesize = sizeof(float), - ker_reg_base_idx = 28, - }; - enum { - no_last_block, - last_ic_block, - last_sp_block, - }; - - reg64_t reg_inp = r8; - reg64_t reg_ker = r9; - reg64_t 
reg_out = r10; - reg64_t aux_reg_inp = r11; - reg64_t reg_ptr_sum_scale = r11; - reg64_t aux_reg_ker = r12; - reg64_t reg_scratch = r14; - reg64_t reg_kj = rax; - reg64_t reg_ptr_scales = rax; - reg64_t reg_oi = rbx; - reg64_t reg_bias = rdx; - reg64_t reg_kh = abi_not_param1; - reg64_t param = abi_param1; - reg64_t reg_tmp = rbp; - reg64_t imm_addr64 = r15; - reg64_t reg_oc_blocks = rsi; - reg64_t reg_icb = reg_bias; - - Xbyak::Opmask ktail_mask = Xbyak::Opmask(2); - - zmm_t zmm_tmp = zmm_t(28); - zmm_t zmm_one = zmm_t(29); - zmm_t zmm_scales = zmm_t(30); - zmm_t zmm_bcast = zmm_t(30); - zmm_t zmm_zero = zmm_t(31); - zmm_t zmm_wei = zmm_t(31); - - zmm_t zmm_out(int i_ur, int i_oc) { - int idx = i_ur + i_oc * jcp.ur_w; - assert(idx < ker_reg_base_idx); - return zmm_t(idx); - } - xmm_t xmm_out(int i_ur, int i_oc) { - int idx = i_ur + i_oc * jcp.ur_w; - assert(idx < ker_reg_base_idx); - return xmm_t(idx); - } - zmm_t zmm_inp(int i_ic, int nb_x_blocking) { - int idx = i_ic + nb_x_blocking * jcp.ur_w; - assert(idx < 31); - return zmm_t(idx); - } - int get_ow_start(int ki, int pad_l) { - return std::max(0, - utils::div_up(pad_l - ki * (jcp.dilate_w + 1), jcp.stride_w)); - } - int get_ow_end(int ur_w, int ki, int pad_r) { - return ur_w - std::max(0, utils::div_up(pad_r - (jcp.kw - 1 - ki) * (jcp.dilate_w + 1), - jcp.stride_w)); - } - bool maybe_relu(int position, const float *post_sum); - void prepare_output(int ur_w); - void store_output(int ur_w, int last_oc_block_flag); - void compute_ker(int ur_w, int pad_l, int pad_r, int last_ic_block_flag); - void compute_loop(int ur_w, int pad_l, int pad_r, bool is_last_spatial_block); - void generate(); - void cvt2ps(DataType type_in, zmm_t zmm_in, const Xbyak::Operand &op, - bool mask_flag); -}; - -} // namespace jit -} // namespace saber -} // namespace anakin - -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_AVX512_CORE_U8S8S32_CONV_ACT_KERNEL_H diff --git a/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h b/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h index 5bfdca4cd..57078e599 100644 --- a/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h +++ b/saber/funcs/impl/x86/kernel/jit_avx512_rtus_driver.h @@ -184,7 +184,7 @@ inline void init_rtus_driver(rtus_driver_t **p_rtus_driver, conv_1x1_desc &conv_d, size_t &ws_per_thread, Dtype **p_scratch) { - const int max_threads = omp_get_max_threads(); + const int max_threads = anakin_get_max_threads(); size_t factor = 0; factor = jcp.nb_reduce; diff --git a/saber/funcs/impl/x86/kernel/jit_call_conf.h b/saber/funcs/impl/x86/kernel/jit_call_conf.h index 1f67ddd8c..ed10d1d53 100644 --- a/saber/funcs/impl/x86/kernel/jit_call_conf.h +++ b/saber/funcs/impl/x86/kernel/jit_call_conf.h @@ -17,7 +17,7 @@ #define ANAKIN_SABER_FUNCS_IMPL_X86_JIT_CALL_CONF_H #include -#include +#include #include "saber/saber_types.h" #include "stddef.h" @@ -43,6 +43,25 @@ enum { FLAG_REDUCE_FIRST = 1 << 8, FLAG_REDUCE_LAST = 1 << 9, }; +struct jit_int8_packed_fc_call_t { + const void *src{nullptr}; + const void *weights{nullptr}; + const void *output_data{nullptr}; + + size_t lda{0}; // used in backward_weights only + size_t ldb{0}; + size_t ldc{0}; + size_t k_block{0}; + +}; + +struct jit_int8_packed_fc_config_t { + size_t m_block_size{0}; + size_t n_block_size{0}; + size_t k_block_number{0}; +}; + + struct jit_1x1_conv_call_t { const void *bcast_data; const void *load_data; @@ -50,6 +69,7 @@ struct jit_1x1_conv_call_t { const void *bias_data; // used in forward and backward_weights only const void *acc_s32; const void *scales; + 
const void *compensation;
     size_t load_dim;
     size_t bcast_dim;
@@ -60,180 +80,344 @@
 };
 
 struct jit_conv_call_t {
-    const void *src; /* hack, non-const for backward_data */
-    const void *dst; /* hack, non-const for forward */
-    const void *filt; /* hack, non-const for backward_weights */
-    const void *bias; /* hack, non-const for backward_bias */
-    const void *src_prf;
-    const void *dst_prf;
-    const void *filt_prf;
-    const void *bias_prf;
-    const void *scales;
-    const void *acc_s32;
-    size_t kd_padding;
-    size_t kd_padding_prf;
-    size_t kh_padding;
-    size_t kh_padding_prf;
-    size_t kw_padding;
-    size_t channel;
-    size_t channel_prf;
-    size_t oc_blocks;
-    size_t ur_w;
-    size_t ur_str_w;
-    size_t ch_blocks;
-    int flags;
+    const void *src{nullptr}; /* hack, non-const for backward_data */
+    const void *dst{nullptr}; /* hack, non-const for forward */
+    const void *filt{nullptr}; /* hack, non-const for backward_weights */
+    const void *bias{nullptr}; /* hack, non-const for backward_bias */
+    const void *src_prf{nullptr};
+    const void *dst_prf{nullptr};
+    const void *filt_prf{nullptr};
+    const void *bias_prf{nullptr};
+    const void *scales{nullptr};
+    const void *acc_s32{nullptr};
+    const void *compensation{nullptr};
+    size_t kd_padding{0};
+    size_t kd_padding_prf{0};
+    size_t kh_padding{0};
+    size_t kh_padding_prf{0};
+    size_t kw_padding{0};
+    size_t channel{0};
+    size_t channel_prf{0};
+    size_t oc_blocks{0};
+    size_t ur_w{0};
+    size_t ur_str_w{0};
+    size_t ch_blocks{0};
+    size_t t_overflow{0};
+    size_t b_overflow{0};
+    int flags{0};
+};
+
+struct jit_wino_transform_call_s {
+    size_t tile_block;
+    size_t tile_block_ur;
+    size_t nb_tile_block_ur;
+    size_t tile_count;
+    size_t tj;
+    size_t ti;
+    void *src;
+    void *dst;
+    void *Mw;
+    void *M;
+    void *T;
+    void *G;
+    void *bias;
 };
 
 struct jit_conv_conf_t {
-    conv_version_t ver;
-    conv_loop_order_t loop_order;
-    LayoutType src_fmt;
-    int ndims;
-    int mb;
-    int ngroups, ic, oc, oc_without_padding, ic_without_padding;
-    int id, ih, iw, od, oh, ow;
-    int f_pad, l_pad, t_pad;
-    int back_pad, r_pad, b_pad;
-    int kd, kh, kw;
-    int stride_d, stride_h, stride_w;
-    int dilate_d, dilate_h, dilate_w;
-    bool with_bias, with_relu;
-    float relu_negative_slope;
-    bool with_sum;
-    bool is_dw;
-    int idp, ihp, iwp, ohp, owp;
-    int nb_ic, ic_block;
-    int nb_oc, oc_block;
-    int nb_g, g_block;
-    int nb_ic_blocking, nb_oc_blocking; // blocking of nb_ic and nb_ic
-    int nb_ic_blocking_max;
-    int nb_ic_L2;
-    int nb_oc_L2;
-    int ur_h, ur_w;
-    int ur_w_tail;
-    bool is_1stconv;
+    conv_version_t ver{ver_unused};
+    conv_loop_order_t loop_order{loop_cgn};
+    LayoutType src_fmt{Layout_invalid};
+    int ndims{0};
+    int mb{0};
+    int ngroups{0};
+    int ic{0};
+    int oc{0};
+    int oc_without_padding{0};
+    int ic_without_padding{0};
+    int id{0};
+    int ih{0};
+    int iw{0};
+    int od{0};
+    int oh{0};
+    int ow{0};
+    int f_pad{0};
+    int l_pad{0};
+    int t_pad{0};
+    int back_pad{0};
+    int r_pad{0};
+    int b_pad{0};
+    int kd{0};
+    int kh{0};
+    int kw{0};
+    int stride_d{0};
+    int stride_h{0};
+    int stride_w{0};
+    int dilate_d{0};
+    int dilate_h{0};
+    int dilate_w{0};
+    bool with_bias{false};
+    bool with_relu{false};
+    float relu_negative_slope{0.f};
+    bool with_sum{false};
+    bool is_dw{false};
+    bool is_dw_int8{false};
+    int idp{0};
+    int ihp{0};
+    int iwp{0};
+    int ohp{0};
+    int owp{0};
+    int nb_ic{0};
+    int ic_block{0};
+    int nb_oc{0};
+    int oc_block{0};
+    int nb_g{0};
+    int g_block{0};
+    int nb_ic_blocking{0};
+    int nb_oc_blocking{0}; // blocking of nb_ic and nb_ic
+    int
nb_ic_blocking_max{0}; + int nb_ic_L2{0}; + int nb_oc_L2{0}; + int ur_h{0}; + int ur_w{0}; + int ur_w_tail{0}; + bool is_1stconv{0}; /* fma avx512_core */ - conv_kernel_kind_t kernel_kind; + conv_kernel_kind_t kernel_kind{embd_bcast}; /* 4fma */ - int tr_iw; - int tr_src_num_guard_elems; + int tr_iw{0}; + int tr_src_num_guard_elems{0}; /* 1st conv: 4fma */ - int tr_ld; - int kh_step; + int tr_ld{0}; + int kh_step{0}; /* 4vnni */ - int typesize_in; - int typesize_out; - int typesize_bia; - int typesize_acc; - int tr_ow; + int typesize_in{0}; + int typesize_out{0}; + int typesize_bia{0}; + int typesize_acc{0}; + int tr_ow{0}; /* avx512_u8s8u8 */ - int ic_nb1, ic_nb2; - int oc_nb1; - int ur_ow_max, ur_ow, ur_ow_tail; - int ur_ow_nsteps; - DataType bia_dt; - DataType dst_dt; + int ic_nb1{0}; + int ic_nb2{0}; + int oc_nb1{0}; + int ur_ow_max{0}; + int ur_ow{0}; + int ur_ow_tail{0}; + int ur_ow_nsteps{0}; + DataType bia_dt{AK_INVALID}; + DataType dst_dt{AK_INVALID}; + DataType sum_dt{AK_INVALID}; /* avx512: max possible value is nregs(32) - aux_regs(4) */ - int src_offsets[28]; - int src_count; - bool expl_bcast; - bool large_spatial; - int is_oc_scale; + int src_offsets[28]{0}; + int src_count{0}; + bool expl_bcast{false}; + bool large_spatial{false}; + int is_oc_scale{0}; + bool signed_input{false}; + float wei_adj_scale{0.f}; // gemm conv - int is, os, ks; + int is{0}; + int os{0}; + int ks{0}; ptrdiff_t im2col_sz; - bool need_im2col; - int nthr; + bool need_im2col{false}; + int nthr{0}; // dw conv - int nb_ch, ch_block, nb_ch_blocking; - round_mode rm; + int nb_ch{0}; + int ch_block{0}; + int nb_ch_blocking{0}; + round_mode rm{nearest}; // pooling - PoolingType pool_alg; - int pool_kw; + bool with_partial_pool=false; + PoolingType pool_alg{Pooling_unknow}; + int pool_kw{0}; //the scale for post sum - float sum_scale; + float sum_scale{0.f}; // output layout nhwc - bool output_nhwc; + bool output_nhwc{false}; }; struct jit_1x1_conv_conf_t { + conv_version_t ver{ver_unused}; + + int mb{0}; + int ngroups{0}; + int ic{0}; + int oc{0}; + int oc_without_padding{0}; + int ic_without_padding{0}; + int iw{0}; + int ih{0}; + int ow{0}; + int oh{0}; + int l_pad{0}; + int t_pad{0}; + int kh{0}; + int kw{0}; + int stride_h{0}; + int stride_w{0}; + bool with_bias{false}; + bool with_relu{false}; + float relu_negative_slope{0.f}; + bool with_sum{false}; + + int is{0}; + int os{0}; + int ic_block{0}; + int oc_block{0}; + + int ur{0}; + int ur_tail{0}; + + int reduce_dim{0}; + int reduce_block{0}; + int nb_reduce{0}; + int nb_reduce_blocking{0}; + int nb_reduce_blocking_max{0}; + int load_dim{0}; + int load_block{0}; + int nb_load{0}; + int nb_load_blocking{0}; + int nb_load_blocking_max{0}; + int bcast_dim{0}; + int bcast_block{0}; + int nb_bcast{0}; + int nb_bcast_blocking{0}; + int nb_bcast_blocking_max{0}; + + int reduce_loop_unroll{0}; + int reduce_loop_bcast_step{0}; + int reduce_loop_load_step{0}; + int load_loop_load_step{0}; + int load_loop_iter_step{0}; + int bcast_loop_output_step{0}; + int bcast_loop_output_substep{0}; + int bcast_loop_bcast_step{0}; + int bcast_loop_bcast_substep{0}; + int fma_step{0}; + int load_grp_count{0}; + conv_1x1_loop_order_t loop_order{loop_rbl}; + bool use_vmovntps{false}; + /* avx512 core */ + bool expl_bcast{false}; + /* 4vnni */ + int typesize_in{0}; + int typesize_out{0}; + int typesize_bia{0}; + int typesize_acc{0}; + /* 4fma */ + bool transpose_src{false}; + int tr_is{0}; + int nthr{0}; + int nthr_mb{0}; + int nthr_g{0}; + int nthr_oc_b{0}; + int nthr_ic_b{0}; + 
int is_oc_scale{0}; + DataType bia_dt{AK_INVALID}; + DataType src_dt{AK_INVALID}; + DataType dst_dt{AK_INVALID}; + DataType sum_dt{AK_INVALID}; + round_mode rm{nearest}; + bool signed_input{false}; + float wei_adj_scale{0.f}; + + //the scale for post sum + float sum_scale{0.f}; +}; + +struct jit_conv_conf_2x3_wino_t { conv_version_t ver; + int m; + int r; + int alpha; + int tile_h, tile_w; + int mb; - int ngroups, ic, oc, oc_without_padding, ic_without_padding;; - int iw, ih, ow, oh; + int ngroups, ic, oc, oc_without_padding; + int ih, iw, oh, ow; int l_pad, t_pad; + int r_pad, b_pad; int kh, kw; int stride_h, stride_w; - bool with_bias, with_relu; - float relu_negative_slope; - bool with_sum; + int dilate_h, dilate_w; - int is, os; - int ic_block, oc_block; - - int ur, ur_tail; - - int reduce_dim, reduce_block, nb_reduce, - nb_reduce_blocking, nb_reduce_blocking_max; - int load_dim, load_block, nb_load, - nb_load_blocking, nb_load_blocking_max; - int bcast_dim, bcast_block, nb_bcast, - nb_bcast_blocking, nb_bcast_blocking_max; - - int reduce_loop_unroll, reduce_loop_bcast_step, reduce_loop_load_step; - int load_loop_load_step, load_loop_iter_step; - int bcast_loop_output_step, bcast_loop_output_substep; - int bcast_loop_bcast_step, bcast_loop_bcast_substep; - int fma_step; - int load_grp_count; - conv_1x1_loop_order_t loop_order; - bool use_vmovntps; - /* avx512 core */ - bool expl_bcast; - /* 4vnni */ + int nb_ic, ic_block; + int nb_oc, oc_block; + + int w_block_size, h_block_size; + + DataType bia_dt; + DataType dst_dt; + + int is_oc_scale; int typesize_in; int typesize_out; int typesize_bia; int typesize_acc; - /* 4fma */ - bool transpose_src; - int tr_is; - int nthr, nthr_mb, nthr_g, nthr_oc_b, nthr_ic_b; - int is_oc_scale; - DataType bia_dt; - DataType dst_dt; - round_mode rm; - //the scale for post sum + bool with_bias, with_relu; + float relu_negative_slope; + bool with_sum; + bool small_mb; + + int xb, yb; + int inp_stride; + int out_stride; + int wei_stride; + int bia_stride; + + int M, N, K; + int m_block, n_block, k_block; + int n2_block, n_chunks; + int k2_block, k_chunks; + + round_mode rm; float sum_scale; }; + // pooling struct jit_pool_conf_t { - int ndims; - int mb, c; - int id, ih, iw, od, oh, ow; - int stride_d, stride_h, stride_w; - int kd, kh, kw; - int f_pad, t_pad, l_pad; - PoolingType alg; - bool pad_w_is_null; - bool simple_alg; - DataType ind_dt; - - int c_block, c_tail, nb_c; - int ur_c, ur_c_tail; - int ur_w; - int ur_w_tail; - size_t tail[4]; - DataType src_dt; - DataType dst_dt; + int ndims{0}; + int mb{0}; + int c{0}; + int id{0}; + int ih{0}; + int iw{0}; + int od{0}; + int oh{0}; + int ow{0}; + int stride_d{0}; + int stride_h{0}; + int stride_w{0}; + int kd{0}; + int kh{0}; + int kw{0}; + int f_pad{0}; + int t_pad{0}; + int l_pad{0}; + PoolingType alg{Pooling_unknow}; + bool pad_w_is_null{0}; + bool simple_alg{0}; + DataType ind_dt{AK_INVALID}; + LayoutType src_fmt{Layout_invalid}; + + int c_block{0}; + int c_tail{0}; + int nb_c{0}; + int ur_c{0}; + int ur_c_tail{0}; + int ur_w{0}; + int ur_w_tail{0}; + size_t tail[4]{0,0,0,0}; + DataType src_dt{AK_INVALID}; + DataType dst_dt{AK_INVALID}; }; struct jit_pool_call_t { @@ -305,16 +489,123 @@ struct jit_axpy_call_t { }; struct jit_axpy_conf_t { + int n_inputs; int bs; int h, w; int oc; int n; DataType dt; int typesize; - int block; // u8: 64, s32: 16 + int block_size; // u8: 64, s32: 16 int bits_size; // 128, 256, 512 : xmm, ymm, zmm }; +struct jit_eltwise_call_t { + const void **src; + const void *dst; + 
size_t work_amount; +}; + +struct jit_eltwise_conf_t { + int n_inputs; + DataType dt; + int typesize; + bool with_relu; + const float *scales; +}; + +struct jit_priorbox_call_t{ + const void *dst; + float start; + const void *start_offset; + float offset; + float step; + float box_length; + float img_length; + size_t work_amount; + float block = 8.0f; +}; + +struct jit_priorbox_conf_t{ + bool is_add; +}; + +// gemm conv +struct jit_gemm_deconv_conf_t { + int mb; + int ic, ih, iw, oc, oh, ow; + int stride_h, stride_w; + int kh, kw; + int f_pad, t_pad, l_pad; + int dilate_d, dilate_h, dilate_w; +}; + +struct jit_deconv_conf_t { + conv_version_t ver{ver_unused}; + LayoutType src_fmt{Layout_invalid}; + int ndims{0}; + int mb{0}; + int ngroups{0}; + int ic{0}; + int oc{0}; + int oc_without_padding{0}; + int ic_without_padding{0}; + int ih{0}; + int iw{0}; + int oh{0}; + int ow{0}; + int l_pad{0}; + int t_pad{0}; + int back_pad{0}; + int r_pad{0}; + int b_pad{0}; + int kh{0}; + int kw{0}; + int stride_h{0}; + int stride_w{0}; + int dilate_h{0}; + int dilate_w{0}; + bool with_bias{false}; + bool with_relu{false}; + float relu_negative_slope{0.f}; + bool with_sum{false}; + int nb_ic{0}; + int ic_block{0}; + int nb_oc{0}; + int oc_block{0}; + int nb_g{0}; + int g_block{0}; + int nb_ic_blocking{0}; + int nb_oc_blocking{0}; // blocking of nb_ic and nb_ic + int nb_ic_blocking_max{0}; + int nb_ic_L2{0}; + int nb_oc_L2{0}; + int ur_h{0}; + int ur_w{0}; + int ur_w_tail{0}; + int typesize_in{0}; + int typesize_out{0}; + + /* fma avx512_core */ + conv_kernel_kind_t kernel_kind{embd_bcast}; +}; + +struct jit_deconv_call_t { + const void *src{nullptr}; /* hack, non-const for backward_data */ + const void *dst{nullptr}; /* hack, non-const for forward */ + const void *filt{nullptr}; /* hack, non-const for backward_weights */ + const void *bias{nullptr}; /* hack, non-const for backward_bias */ + const void *src_prf{nullptr}; + const void *dst_prf{nullptr}; + const void *filt_prf{nullptr}; + const void *bias_prf{nullptr}; + const void *scales{nullptr}; + size_t kh_padding{0}; + size_t kh_padding_prf{0}; + size_t channel{0}; + size_t channel_prf{0}; +}; + } // namespace jit } // namespace saber } // namespace anakin diff --git a/saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.cpp b/saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.cpp new file mode 100644 index 000000000..58df9ec8e --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.cpp @@ -0,0 +1,381 @@ +#include "saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.h" +#include "saber/funcs/impl/x86/saber_conv.h" +#include "saber/funcs/impl/x86/saber_pooling.h" + +namespace anakin { +namespace saber { + +using namespace jit; + +template <> +SaberStatus JitConvPoolingNormal::allocate_buf(Shape buf_shape, std::vector scale) { + SaberStatus ret = SaberMemAllocFailed; + + Tensor *b_info = new Tensor(buf_shape, AK_FLOAT); + if (buf_shape.get_layout() == Layout_NHWC) { + delete b_info; + b_info = new Tensor(buf_shape, AK_UINT8); + } + if (b_info) { + b_info->set_scale(scale); + buf_.push_back(b_info); + ret = SaberSuccess; + } + return ret; +} + +template <> +void JitConvPoolingNormal::release_buf() { + + for (int i = 0; i < this->buf_.size(); i++) { + delete buf_[i]; + buf_[i] = nullptr; + } + std::vector *> ().swap(buf_); + return; +} + +template <> +SaberStatus JitConvPoolingNormal:: + prepare_buf(Shape pool_shape, PoolingParam pool_param, std::vector scale) { + + SaberStatus ret = SaberMemAllocFailed; + + // calculate the shape of buf + 
Shape buf_shape({pool_shape[0], pool_shape[1], + (pool_shape[2] - 1) * pool_param.stride_h + pool_param.window_h - 2 * pool_param.pad_h, + (pool_shape[3] - 1) * pool_param.stride_w + pool_param.window_w - 2 * pool_param.pad_w, + 16}, Layout_NCHW_C16); + + LayoutType layout = pool_shape.get_layout(); + if (layout == Layout_NCHW_C16||layout == Layout_NCHW_C16R) { + Shape buf_tmp({pool_shape[0], pool_shape[1], + (pool_shape[2] - 1) * pool_param.stride_h + pool_param.window_h - 2 * pool_param.pad_h, + (pool_shape[3] - 1) * pool_param.stride_w + pool_param.window_w - 2 * pool_param.pad_w, + 16}, Layout_NCHW_C16); + buf_shape = buf_tmp; + } else if (layout == Layout_NHWC) { + Shape buf_tmp({pool_shape[0], + (pool_shape[1] - 1) * pool_param.stride_h + pool_param.window_h - 2 * pool_param.pad_h, + (pool_shape[2] - 1) * pool_param.stride_w + pool_param.window_w - 2 * pool_param.pad_w, + pool_shape[3]}, Layout_NHWC); + buf_shape = buf_tmp; + } else { + assert(!"not supported."); + } + + // make sure allocate buf is successfully + if (buf_.size() > 0 && buf_[0]->valid_shape() == buf_shape) { + return SaberSuccess; + } + + // release buf first + this->release_buf(); + + // allocate the buf according to the shape + ret = allocate_buf(buf_shape, scale); + return ret; +} + +template <> +SaberStatus JitConvPoolingNormal::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + SaberStatus ret = SaberUnImplError; + + this->_ctx = &ctx; + ConvParam conv_param(param.conv_param); + PoolingParam pool_param = param.pooling_param; + + auto out_scale = outputs[0]->get_scale(); + DataType dtype_out = outputs[0]->get_dtype(); + DataType dtype_in = inputs[0]->get_dtype(); + // check layout info + Shape out_shape = outputs[0]->valid_shape(); + Shape in_shape = inputs[0]->valid_shape(); + + LayoutType layout_in = in_shape.get_layout(); + LayoutType layout_out = out_shape.get_layout(); + if (!(((dtype_in == AK_FLOAT) && (layout_in == Layout_NCHW) && + ((layout_out == Layout_NCHW_C16) || (layout_out == Layout_NHWC))) || + ((dtype_in == AK_FLOAT) && (dtype_out == AK_FLOAT) && + (layout_in == Layout_NCHW_C16) && (layout_out == Layout_NCHW_C16)))) { + return ret; + } + + if (!this->conv_impl_ || !this->pool_impl_) { + LOG(ERROR) << "impl is NULL"; + return SaberNotInitialized; + } + + // prepare buf + ret = this->prepare_buf(out_shape, pool_param, out_scale); + if (ret != SaberSuccess) { + return ret; + } + + // create conv act op + ret = this->conv_impl_->create(inputs, buf_, conv_param, ctx); + if (ret != SaberSuccess) { + return ret; + } + + // create pooling op + ret = this->pool_impl_->create(buf_, outputs, pool_param, ctx); + return ret; +} + +template <> +SaberStatus JitConvPoolingNormal::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + SaberStatus ret = SaberUnImplError; + + this->_ctx = &ctx; + ConvParam conv_param(param.conv_param); + PoolingParam pool_param = param.pooling_param; + + auto out_scale = outputs[0]->get_scale(); + DataType dtype_out = outputs[0]->get_dtype(); + DataType dtype_in = inputs[0]->get_dtype(); + // check layout info + Shape out_shape = outputs[0]->valid_shape(); + Shape in_shape = inputs[0]->valid_shape(); + + LayoutType layout_in = in_shape.get_layout(); + LayoutType layout_out = out_shape.get_layout(); + + if (!(((dtype_in == AK_FLOAT) && (layout_in == Layout_NCHW) && + ((layout_out == Layout_NCHW_C16) || (layout_out == Layout_NHWC))) || + ((dtype_in == AK_FLOAT) && (dtype_out 
== AK_FLOAT) && + (layout_in == Layout_NCHW_C16) && (layout_out == Layout_NCHW_C16)))) { + return ret; + } + // prepare buf + ret = this->prepare_buf(out_shape, pool_param, out_scale); + if (ret != SaberSuccess) { + return ret; + } + + // init conv op + if (this->conv_impl_) { + delete this->conv_impl_; + } + this->conv_impl_ = new SaberConv2D; + ret = this->conv_impl_->init(inputs, buf_, conv_param, ctx); + if (ret != SaberSuccess) { + LOG(INFO) << "init convact impl error"; + return ret; + } + + // init pool op + if (this->pool_impl_) { + delete this->pool_impl_; + } + + if ((dtype_out == AK_FLOAT) && (layout_out == Layout_NCHW_C16 || layout_out == Layout_NCHW_C16R)) { + this->pool_impl_ = new SaberPooling; + } else if ((dtype_out != AK_FLOAT) && (layout_out == Layout_NHWC)) { + this->pool_impl_ = (Impl_pool_t*) new SaberPooling; + } else { + LOG(INFO) << "not implemented."; + ret = SaberUnImplError; + return ret; + } + ret = this->pool_impl_->init(buf_, outputs, pool_param, ctx); + + return ret; +} + +template <> +SaberStatus JitConvPoolingNormal::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param) { + SaberStatus ret = SaberSuccess; + if (!this->conv_impl_ || !this->pool_impl_) { + LOG(ERROR) << "impl is NULL"; + return SaberNotInitialized; + } + ConvParam conv_param(param.conv_param); + PoolingParam pool_param = param.pooling_param; + + ret = this->conv_impl_->dispatch(inputs, buf_, conv_param); + if (ret != SaberSuccess) { + return ret; + } + + ret = this->pool_impl_->dispatch(buf_, outputs, pool_param); + return ret; +} + + +template <> +SaberStatus JitConvPoolingNormal::allocate_buf(Shape buf_shape, std::vector scale) { + SaberStatus ret = SaberMemAllocFailed; + + Tensor *b_info = new Tensor(buf_shape, AK_UINT8); + if (b_info) { + b_info->set_scale(scale); + buf_.push_back(b_info); + ret = SaberSuccess; + } + return ret; +} + +template <> +void JitConvPoolingNormal::release_buf() { + + for (int i = 0; i < this->buf_.size(); i++) { + delete buf_[i]; + buf_[i] = nullptr; + } + std::vector *> ().swap(buf_); + return; +} + +template <> +SaberStatus JitConvPoolingNormal::prepare_buf(Shape pool_shape, PoolingParam pool_param, std::vector scale) { + + SaberStatus ret = SaberMemAllocFailed; + + // calculate the shape of buf + Shape buf_shape({pool_shape[0], + (pool_shape[1] - 1) * pool_param.stride_h + pool_param.window_h - 2 * pool_param.pad_h, + (pool_shape[2] - 1) * pool_param.stride_w + pool_param.window_w - 2 * pool_param.pad_w, + pool_shape[3]}, Layout_NHWC); + + // make sure allocate buf is successfully + if (buf_.size() > 0 && buf_[0]->valid_shape() == buf_shape) { + return SaberSuccess; + } + + // release buf first + this->release_buf(); + + // allocate the buf according to the shape + ret = allocate_buf(buf_shape, scale); + return ret; +} + +template <> +SaberStatus JitConvPoolingNormal::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + SaberStatus ret = SaberUnImplError; + + this->_ctx = &ctx; + ConvParam conv_param(param.conv_param); + PoolingParam pool_param = param.pooling_param; + + auto out_scale = outputs[0]->get_scale(); + DataType dtype_out = outputs[0]->get_dtype(); + DataType dtype_in = inputs[0]->get_dtype(); + // check layout info + Shape out_shape = outputs[0]->valid_shape(); + Shape in_shape = inputs[0]->valid_shape(); + + LayoutType layout_in = in_shape.get_layout(); + LayoutType layout_out = out_shape.get_layout(); + if (!((dtype_in != AK_FLOAT) && (dtype_out != 
AK_FLOAT) && + (layout_in == Layout_NHWC) && (layout_out == Layout_NHWC))) { + return ret; + } + + if (!this->conv_impl_ || !this->pool_impl_) { + LOG(FATAL) << "impl is NULL"; + return SaberNotInitialized; + } + + // prepare buf + ret = this->prepare_buf(out_shape, pool_param, out_scale); + if (ret != SaberSuccess) { + return ret; + } + + // create conv act op + ret = this->conv_impl_->create(inputs, buf_, conv_param, ctx); + if (ret != SaberSuccess) { + return ret; + } + + // create pooling op + ret = this->pool_impl_->create(buf_, outputs, pool_param, ctx); + return ret; +} + +template <> +SaberStatus JitConvPoolingNormal::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + SaberStatus ret = SaberUnImplError; + + this->_ctx = &ctx; + ConvParam conv_param(param.conv_param); + PoolingParam pool_param = param.pooling_param; + + auto out_scale = outputs[0]->get_scale(); + DataType dtype_out = outputs[0]->get_dtype(); + DataType dtype_in = inputs[0]->get_dtype(); + // check layout info + Shape out_shape = outputs[0]->valid_shape(); + Shape in_shape = inputs[0]->valid_shape(); + + LayoutType layout_in = in_shape.get_layout(); + LayoutType layout_out = out_shape.get_layout(); + + if (!((dtype_in != AK_FLOAT) && (dtype_out != AK_FLOAT) && + (layout_in == Layout_NHWC) && (layout_out == Layout_NHWC))) { + return ret; + } + + // prepare buf + ret = this->prepare_buf(out_shape, pool_param, out_scale); + if (ret != SaberSuccess) { + return ret; + } + // init conv op + if (this->conv_impl_) { + delete this->conv_impl_; + } + this->conv_impl_ = new SaberConv2D; + ret = this->conv_impl_->init(inputs, buf_, conv_param, ctx); + if (ret != SaberSuccess) { + LOG(FATAL) << "init convact impl error"; + return ret; + } + + // init pool op + if (this->pool_impl_) { + delete this->pool_impl_; + } + + this->pool_impl_ = new SaberPooling; + ret = this->pool_impl_->init(buf_, outputs, pool_param, ctx); + + return ret; +} + +template <> +SaberStatus JitConvPoolingNormal::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param) { + SaberStatus ret = SaberSuccess; + if (!this->conv_impl_ || !this->pool_impl_) { + LOG(FATAL) << "impl is NULL"; + return SaberNotInitialized; + } + ConvParam conv_param(param.conv_param); + PoolingParam pool_param = param.pooling_param; + + ret = this->conv_impl_->dispatch(inputs, buf_, conv_param); + if (ret != SaberSuccess) { + return ret; + } + + ret = this->pool_impl_->dispatch(buf_, outputs, pool_param); + return ret; +} + + +} +} diff --git a/saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.h b/saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.h new file mode 100644 index 000000000..b80837ffa --- /dev/null +++ b/saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_CONV_POOLING_NORMAL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_KERNEL_JIT_CONV_POOLING_NORMAL_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_macro.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" + +namespace anakin { +namespace saber { + +template +class JitConvPoolingNormal : public ImplBase< + X86, OpDtype, ConvPoolingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + typedef ImplBase > Impl_conv_t; + typedef ImplBase > Impl_pool_t; + + JitConvPoolingNormal() + : conv_impl_(nullptr) + , pool_impl_(nullptr){ + } + + ~JitConvPoolingNormal() { + if (conv_impl_ != nullptr) { + delete conv_impl_; + conv_impl_ = nullptr; + } + if (pool_impl_ != nullptr) { + delete pool_impl_; + pool_impl_ = nullptr; + } + + release_buf(); + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context &ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvPoolingParam& param); + +private: + SaberStatus prepare_buf(Shape pool_shape, PoolingParam pool_param, std::vector scale); + SaberStatus allocate_buf(Shape buf_shape, std::vector scale); + void release_buf(); + + Impl_conv_t* conv_impl_; + Impl_pool_t* pool_impl_; + + std::vector *> buf_; +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_JIT_CONV_POOLING_NORMAL_H diff --git a/saber/funcs/impl/x86/kernel/jit_generator.h b/saber/funcs/impl/x86/kernel/jit_generator.h index 07545c928..75bb2f6aa 100644 --- a/saber/funcs/impl/x86/kernel/jit_generator.h +++ b/saber/funcs/impl/x86/kernel/jit_generator.h @@ -8,10 +8,15 @@ /* in order to make selinux happy memory that would be marked with X-bit should * be obtained with mmap */ #define XBYAK_USE_MMAP_ALLOCATOR +#ifdef USE_SGX +#undef XBYAK_USE_MMAP_ALLOCATOR +#endif +#include "anakin_config.h" #include "xbyak/xbyak.h" #include "xbyak/xbyak_util.h" #include "x86_utils.h" +#include "anakin_thread.h" #define DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_name) \ const char *name() const override { return #jit_name; } \ @@ -23,15 +28,15 @@ namespace jit { static Xbyak::util::Cpu cpu; typedef enum { - isa_any, - sse42, - avx, - avx2, - avx512_common, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, + isa_any, + sse42, + avx, + avx2, + avx512_common, + avx512_core, + avx512_core_vnni, + avx512_mic, + avx512_mic_4ops, } cpu_isa_t; // Instruction set architecture template @@ -39,23 +44,23 @@ struct cpu_isa_traits {}; /* ::vlen -> 32 (for avx2) */ template <> struct cpu_isa_traits { - static constexpr int vlen_shift = 4; - static constexpr int vlen = 16; - static constexpr int n_vregs = 16; + static constexpr int vlen_shift = 4; + static constexpr int vlen = 16; + static constexpr int n_vregs = 16; }; template <> struct cpu_isa_traits { - static constexpr int vlen_shift = 5; - static constexpr int vlen = 32; - static constexpr int n_vregs = 16; + static constexpr int vlen_shift = 5; + static constexpr int vlen = 32; + static constexpr int n_vregs = 16; }; template <> struct cpu_isa_traits { - static constexpr int vlen_shift = 6; - static constexpr int vlen = 64; - static constexpr int n_vregs = 32; + static constexpr int vlen_shift = 6; + static constexpr int vlen = 64; + static constexpr int n_vregs = 32; }; 
template <> @@ -69,40 +74,50 @@ struct cpu_isa_traits : public cpu_isa_traits { }; static inline bool mayiuse(const cpu_isa_t cpu_isa) { - using namespace Xbyak::util; + using namespace Xbyak::util; - switch (cpu_isa) { + switch (cpu_isa) { case sse42: - return cpu.has(Cpu::tSSE42); + return cpu.has(Cpu::tSSE42); + case avx: - return cpu.has(Cpu::tAVX); + return cpu.has(Cpu::tAVX); + case avx2: - return cpu.has(Cpu::tAVX2); + return cpu.has(Cpu::tAVX2); + case avx512_common: - return cpu.has(Cpu::tAVX512F); + // return false;//for can`t pass test of jit + return cpu.has(Cpu::tAVX512F); + case avx512_core: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && - cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ); + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && + cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ); + case avx512_core_vnni: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && - cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) && - cpu.has(Cpu::tAVX512_VNNI); + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && + cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) && + cpu.has(Cpu::tAVX512_VNNI); + case avx512_mic: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) && - cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF); + return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) && + cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF); + case avx512_mic_4ops: - return true && mayiuse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) && - cpu.has(Cpu::tAVX512_4VNNIW); + return true && mayiuse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) && + cpu.has(Cpu::tAVX512_4VNNIW); + case isa_any: - return true; - } - return false; + return true; + } + + return false; } static inline int float2int(float x) { union { - float vfloat; - int vint; + float vfloat; + int vint; } cvt; cvt.vfloat = x; return cvt.vint; @@ -110,34 +125,41 @@ static inline int float2int(float x) { inline unsigned int get_cache_size(int level, bool per_core = true) { - unsigned int l = level - 1; - // Currently, if XByak is not able to fetch the cache topology - // we default to 32KB of L1, 512KB of L2 and 1MB of L3 per core. - if (cpu.data_cache_levels == 0) { - const int L1_cache_per_core = 32000; - const int L2_cache_per_core = 512000; - const int L3_cache_per_core = 1024000; - int num_cores = per_core ? 1 : omp_get_max_threads(); - switch (l) { - case (0): - return L1_cache_per_core * num_cores; - case (1): - return L2_cache_per_core * num_cores; - case (2): - return L3_cache_per_core * num_cores; - default: + unsigned int l = level - 1; + + // Currently, if XByak is not able to fetch the cache topology + // we default to 32KB of L1, 512KB of L2 and 1MB of L3 per core. + if (cpu.data_cache_levels == 0) { + const int L1_cache_per_core = 32000; + const int L2_cache_per_core = 512000; + const int L3_cache_per_core = 1024000; + int num_cores = per_core ? 1 : anakin_get_max_threads(); + + switch (l) { + case (0): + return L1_cache_per_core * num_cores; + + case (1): + return L2_cache_per_core * num_cores; + + case (2): + return L3_cache_per_core * num_cores; + + default: + return 0; + } + } + + if (l < cpu.data_cache_levels) { + if (cpu.cores_sharing_data_cache[l] > 0) { + return cpu.data_cache_size[l] / + (per_core ? 
cpu.cores_sharing_data_cache[l] : 1); + } else { + return cpu.data_cache_size[l]; + } + } else { return 0; } - } - if (l < cpu.data_cache_levels) { - if (cpu.cores_sharing_data_cache[l] > 0){ - return cpu.data_cache_size[l] / - (per_core ? cpu.cores_sharing_data_cache[l] : 1); - }else{ - return cpu.data_cache_size[l]; - } - } else - return 0; } #ifdef XBYAK64 @@ -154,14 +176,31 @@ constexpr Xbyak::Operand::Code abi_save_gpr_regs[] = { #endif }; +constexpr Xbyak::Operand::Code common_save_gpr_regs[] = { + Xbyak::Operand::RAX, + Xbyak::Operand::RCX, + Xbyak::Operand:: RDX, + Xbyak::Operand:: RBX, + Xbyak::Operand:: RSI, + Xbyak::Operand:: RDI, + Xbyak::Operand:: R8, + Xbyak::Operand:: R9, + Xbyak::Operand:: R10, + Xbyak::Operand:: R11, + Xbyak::Operand:: R12, + Xbyak::Operand:: R13, + Xbyak::Operand:: R14, + Xbyak::Operand:: R15, +}; + #ifdef _WIN static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RCX), - abi_param2(Xbyak::Operand::RDX), abi_param3(Xbyak::Operand::R8), - abi_param4(Xbyak::Operand::R9), abi_not_param1(Xbyak::Operand::RDI); + abi_param2(Xbyak::Operand::RDX), abi_param3(Xbyak::Operand::R8), + abi_param4(Xbyak::Operand::R9), abi_not_param1(Xbyak::Operand::RDI); #else static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RDI), - abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), - abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); + abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), + abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX); #endif #endif @@ -179,112 +218,201 @@ class jit_tagged_label_base { public: enum { maxlen = len }; template ::value>> - jit_tagged_label_base(const char (&base)[n], Tags... tags) { - // XXX: This code is ugly but useful - constexpr size_t ntags = sizeof...(tags); - static_assert(n + ntags < maxlen, "resulting label may be too long"); - // paste tags first in case base has unexpected null chars - paste_tags(tags...); - for (size_t i = 0; i < n; i++) - label_name_[ntags + i] = base[i]; - // don't assume that the base string is 0-terminated - label_name_[ntags + n] = '\0'; - } - operator const char*() const { return label_name_; } - const char *c_str() const { return label_name_; } + typename = std::enable_if::value>> + jit_tagged_label_base(const char (&base)[n], Tags... tags) { + // XXX: This code is ugly but useful + constexpr size_t ntags = sizeof...(tags); + static_assert(n + ntags < maxlen, "resulting label may be too long"); + // paste tags first in case base has unexpected null chars + paste_tags(tags...); + + for (size_t i = 0; i < n; i++) { + label_name_[ntags + i] = base[i]; + } + + // don't assume that the base string is 0-terminated + label_name_[ntags + n] = '\0'; + } + operator const char* () const { + return label_name_; + } + const char* c_str() const { + return label_name_; + } private: char label_name_[maxlen]; void paste_tags() { } template - void paste_tags(char tag, Tags... tags) { - label_name_[sizeof...(tags)] = tag; - paste_tags(tags...); - } + void paste_tags(char tag, Tags... 
tags) { + label_name_[sizeof...(tags)] = tag; + paste_tags(tags...); + } }; typedef jit_tagged_label_base<> jit_tagged_label; +extern "C" Xbyak::uint8 __jit_start; +extern "C" Xbyak::uint8 __jit_end; + class jit_generator : public Xbyak::CodeGenerator { private: - const size_t xmm_len = 16; + const size_t xmm_reg_numbers = 8; + const size_t ymm_reg_numbers = 16; + const size_t zmm_reg_numbers = 32; + const size_t xmm_len = 16; + const size_t ymm_len = 32; + const size_t zmm_len = 64; #ifdef _WIN - const size_t xmm_to_preserve_start = 6; - const size_t xmm_to_preserve = 10; + const size_t xmm_to_preserve_start = 6; + const size_t xmm_to_preserve = 10; #else - const size_t xmm_to_preserve_start = 0; - const size_t xmm_to_preserve = 0; + const size_t xmm_to_preserve_start = 0; + const size_t xmm_to_preserve = 0; #endif - const size_t num_abi_save_gpr_regs = - sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); + const size_t num_abi_save_gpr_regs = + sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); + + const size_t num_common_save_gpr_regs = + sizeof(common_save_gpr_regs) / sizeof(common_save_gpr_regs[0]); - const size_t size_of_abi_save_regs = - num_abi_save_gpr_regs * rax.getBit() / 8 + xmm_to_preserve * xmm_len; + const size_t size_of_abi_save_regs = + num_abi_save_gpr_regs * rax.getBit() / 8 + xmm_to_preserve * xmm_len; public: - enum { - _cmp_eq_oq = 0u, - _cmp_lt_os = 1u, - _cmp_le_os = 2u, - _cmp_neq_uq = 4u, - _cmp_nlt_us = 5u, - _cmp_nle_us = 6u, - }; + enum { + _cmp_eq_oq = 0u, + _cmp_lt_os = 1u, + _cmp_le_os = 2u, + _cmp_neq_uq = 4u, + _cmp_nlt_us = 5u, + _cmp_nle_us = 6u, + }; + + Xbyak::Reg64 param1 = abi_param1; + const int EVEX_max_8b_offt = 0x200; + const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; + + inline size_t get_size_of_abi_save_regs() { + return size_of_abi_save_regs; + } + + void preamble() { + if (xmm_to_preserve) { + sub(rsp, xmm_to_preserve * xmm_len); + + for (size_t i = 0; i < xmm_to_preserve; ++i) { + movdqu(ptr[rsp + i * xmm_len], Xbyak::Xmm(xmm_to_preserve_start + i)); + } + } - Xbyak::Reg64 param1 = abi_param1; - const int EVEX_max_8b_offt = 0x200; - const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; + for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) { + push(Xbyak::Reg64(abi_save_gpr_regs[i])); + } + + if (mayiuse(avx512_common)) { + mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); + } + } + void save_common_regs() { + if (mayiuse(avx512_core)) { + sub(rsp, zmm_reg_numbers * zmm_len); + + for (size_t i = 0; i < zmm_reg_numbers; ++i) { + vmovdqu32(ptr[rsp + i * zmm_len], Xbyak::Zmm(i)); + } + } else if (mayiuse(avx)) { + sub(rsp, ymm_reg_numbers * ymm_len); + + for (size_t i = 0; i < ymm_reg_numbers; ++i) { + vmovdqu(ptr[rsp + i * ymm_len], Xbyak::Ymm(i)); + } - inline size_t get_size_of_abi_save_regs() { return size_of_abi_save_regs; } + } else { + sub(rsp, xmm_reg_numbers * xmm_len); + + for (size_t i = 0; i < xmm_reg_numbers; ++i) { + movdqu(ptr[rsp + i * xmm_len], Xbyak::Xmm(i)); + } + } - void preamble() { - if (xmm_to_preserve) { - sub(rsp, xmm_to_preserve * xmm_len); - for (size_t i = 0; i < xmm_to_preserve; ++i) - movdqu(ptr[rsp + i * xmm_len], Xbyak::Xmm(xmm_to_preserve_start + i)); + for (size_t i = 0; i < num_common_save_gpr_regs; ++i) { + push(Xbyak::Reg64(common_save_gpr_regs[i])); + } } - for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) - push(Xbyak::Reg64(abi_save_gpr_regs[i])); - if (mayiuse(avx512_common)) { - mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); + + void restore_common_regs() { + for (size_t i = 0; i < 
num_common_save_gpr_regs; ++i) { + pop(Xbyak::Reg64(common_save_gpr_regs[num_common_save_gpr_regs - 1 - i])); + } + + if (mayiuse(avx512_core)) { + for (size_t i = 0; i < zmm_reg_numbers; ++i) { + vmovdqu32(Xbyak::Zmm(i), ptr[rsp + i * zmm_len]); + } + + add(rsp, zmm_reg_numbers * zmm_len); + } else if (mayiuse(avx)) { + for (size_t i = 0; i < ymm_reg_numbers; ++i) { + vmovdqu(Xbyak::Ymm(i), ptr[rsp + i * ymm_len]); + } + + add(rsp, ymm_reg_numbers * ymm_len); + } else { + for (size_t i = 0; i < xmm_reg_numbers; ++i) { + movdqu(Xbyak::Xmm(i), ptr[rsp + i * xmm_len]); + } + + add(rsp, xmm_reg_numbers * xmm_len); + } } - } - void mic_prefetcht0(Xbyak::Address a) { - if (mayiuse(avx512_mic)) - prefetcht0(a); + + void mic_prefetcht0(Xbyak::Address a) { + if (mayiuse(avx512_mic)) { + prefetcht0(a); + } } - void mic_prefetcht1(Xbyak::Address a) { - if (mayiuse(avx512_mic)) - prefetcht1(a); + void mic_prefetcht1(Xbyak::Address a) { + if (mayiuse(avx512_mic)) { + prefetcht1(a); + } } - void mic_prefetcht2(Xbyak::Address a) { - if (mayiuse(avx512_mic)) - prefetcht2(a); + void mic_prefetcht2(Xbyak::Address a) { + if (mayiuse(avx512_mic)) { + prefetcht2(a); + } } - void uni_vzeroupper() { - if (mayiuse(avx) && !mayiuse(avx512_mic)) - vzeroupper(); + void uni_vzeroupper() { + if (mayiuse(avx) && !mayiuse(avx512_mic)) { + vzeroupper(); + } } - void postamble() { - for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) - pop(Xbyak::Reg64(abi_save_gpr_regs[num_abi_save_gpr_regs - 1 - i])); - if (xmm_to_preserve) { - for (size_t i = 0; i < xmm_to_preserve; ++i) - movdqu(Xbyak::Xmm(xmm_to_preserve_start + i), ptr[rsp + i * xmm_len]); - add(rsp, xmm_to_preserve * xmm_len); + void postamble() { + for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) { + pop(Xbyak::Reg64(abi_save_gpr_regs[num_abi_save_gpr_regs - 1 - i])); + } + + if (xmm_to_preserve) { + for (size_t i = 0; i < xmm_to_preserve; ++i) { + movdqu(Xbyak::Xmm(xmm_to_preserve_start + i), ptr[rsp + i * xmm_len]); + } + + add(rsp, xmm_to_preserve * xmm_len); + } + + uni_vzeroupper(); + ret(); } - uni_vzeroupper(); - ret(); - } - Xbyak::Address make_safe_addr(const Xbyak::Reg64 ®_out, size_t offt, - const Xbyak::Reg64 &tmp_reg, bool bcast = false) { + + + Xbyak::Address make_safe_addr(const Xbyak::Reg64& reg_out, size_t offt, + const Xbyak::Reg64& tmp_reg, bool bcast = false) { if (offt > INT_MAX) { mov(tmp_reg, offt); return bcast ? 
ptr_b[reg_out + tmp_reg] : ptr[reg_out + tmp_reg]; @@ -293,8 +421,8 @@ class jit_generator : public Xbyak::CodeGenerator { } } - Xbyak::Address EVEX_compress_addr_safe(const Xbyak::Reg64 &base, - size_t raw_offt, const Xbyak::Reg64 ®_offt, bool bcast = false) { + Xbyak::Address EVEX_compress_addr_safe(const Xbyak::Reg64& base, + size_t raw_offt, const Xbyak::Reg64& reg_offt, bool bcast = false) { if (raw_offt > INT_MAX) { return make_safe_addr(base, raw_offt, reg_offt, bcast); } else { @@ -302,8 +430,8 @@ class jit_generator : public Xbyak::CodeGenerator { } } - void safe_add(const Xbyak::Reg64 &base, size_t raw_offt, - const Xbyak::Reg64 ®_offt) { + void safe_add(const Xbyak::Reg64& base, size_t raw_offt, + const Xbyak::Reg64& reg_offt) { if (raw_offt > INT_MAX) { mov(reg_offt, raw_offt); add(base, reg_offt); @@ -312,8 +440,8 @@ class jit_generator : public Xbyak::CodeGenerator { } } - void safe_sub(const Xbyak::Reg64 &base, size_t raw_offt, - const Xbyak::Reg64 ®_offt) { + void safe_sub(const Xbyak::Reg64& base, size_t raw_offt, + const Xbyak::Reg64& reg_offt) { if (raw_offt > INT_MAX) { mov(reg_offt, raw_offt); sub(base, reg_offt); @@ -322,436 +450,516 @@ class jit_generator : public Xbyak::CodeGenerator { } } - void uni_vpxor(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vpxor(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { assert(x1.getIdx() == x2.getIdx()); pxor(x2, op); - } + } - void uni_vpxor(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, - const Xbyak::Operand &op) { + void uni_vpxor(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + const Xbyak::Operand& op) { if (mayiuse(avx2)) { vpxor(x1, x2, op); } else { vxorps(x1, x2, op); } - } + } - void uni_vpxor(const Xbyak::Zmm &x1, const Xbyak::Zmm &x2, - const Xbyak::Operand &op) { + void uni_vpxor(const Xbyak::Zmm& x1, const Xbyak::Zmm& x2, + const Xbyak::Operand& op) { vpxord(x1, x2, op); - } + } - void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Xmm &x) { + void uni_vmovdqu(const Xbyak::Address& addr, const Xbyak::Xmm& x) { movdqu(addr, x); - } + } - void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Ymm &x) { + void uni_vmovdqu(const Xbyak::Address& addr, const Xbyak::Ymm& x) { vmovdqu(addr, x); - } + } - void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Zmm &x) { + void uni_vmovdqu(const Xbyak::Address& addr, const Xbyak::Zmm& x) { vmovdqu32(addr, x); - } + } - void uni_vmovdqu(const Xbyak::Xmm &x, const Xbyak::Address &addr) { + void uni_vmovdqu(const Xbyak::Xmm& x, const Xbyak::Address& addr) { movdqu(x, addr); - } + } - void uni_vmovdqu(const Xbyak::Ymm &x, const Xbyak::Address &addr) { + void uni_vmovdqu(const Xbyak::Ymm& x, const Xbyak::Address& addr) { vmovdqu(x, addr); - } + } - void uni_vmovdqu(const Xbyak::Zmm &x, const Xbyak::Address &addr) { + void uni_vmovdqu(const Xbyak::Zmm& x, const Xbyak::Address& addr) { vmovdqu32(x, addr); - } + } - void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Xmm &x) { + void uni_vmovups(const Xbyak::Address& addr, const Xbyak::Xmm& x) { movups(addr, x); - } + } - void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Ymm &x) { + void uni_vmovups(const Xbyak::Address& addr, const Xbyak::Ymm& x) { vmovups(addr, x); - } + } - void uni_vmovups(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + void uni_vmovups(const Xbyak::Xmm& x, const Xbyak::Operand& op) { movups(x, op); - } + } - void uni_vmovups(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + void uni_vmovups(const Xbyak::Ymm& x, const 
Xbyak::Operand& op) { vmovups(x, op); - } + } - void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Xmm &x) { + void uni_vmovntps(const Xbyak::Address& addr, const Xbyak::Xmm& x) { movntps(addr, x); - } + } - void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Ymm &x) { + void uni_vmovntps(const Xbyak::Address& addr, const Xbyak::Ymm& x) { vmovntps(addr, x); - } + } - void uni_vbroadcastss(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + void uni_vbroadcastss(const Xbyak::Xmm& x, const Xbyak::Operand& op) { movss(x, op); shufps(x, x, 0x0); - } + } - void uni_vbroadcastss(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + void uni_vbroadcastss(const Xbyak::Ymm& x, const Xbyak::Operand& op) { if (mayiuse(avx2)) { vbroadcastss(x, op); } else { Xbyak::Xmm t(x.getIdx()); - if (t.getIdx() != op.getIdx()) movss(t, op); + + if (t.getIdx() != op.getIdx()) { + movss(t, op); + } + vinsertf128(x, x, t, 1); vshufps(x, x, x, 0); - } - } + } + } - void uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + void uni_vpbroadcastd(const Xbyak::Xmm& x, const Xbyak::Operand& op) { movsd(x, op); pshufd(x, x, 0x0); - } + } - void uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + void uni_vpbroadcastd(const Xbyak::Ymm& x, const Xbyak::Operand& op) { if (mayiuse(avx2)) { vpbroadcastd(x, op); } else { Xbyak::Xmm t(x.getIdx()); - if (t.getIdx() != op.getIdx()) movsd(t, op); + + if (t.getIdx() != op.getIdx()) { + movsd(t, op); + } + vinsertf128(x, x, t, 1); vshufps(x, x, x, 0); - } - } + } + } - void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vdivps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); divps(x, op2); - } + } - void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vdivps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { vdivps(x, op1, op2); - } - - void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2, const Xbyak::Xmm &buf) { - movups(buf, op1); - divps(buf, op2); - if (x.getIdx() != buf.getIdx()) { - movups(x, buf); - } - } - - void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2, const Xbyak::Ymm &buf) { - vdivps(x, op1, op2); - } - - void uni_vaddps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + } + + void uni_vdivps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2, const Xbyak::Xmm& buf) { + movups(buf, op1); + divps(buf, op2); + + if (x.getIdx() != buf.getIdx()) { + movups(x, buf); + } + } + + void uni_vdivps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2, const Xbyak::Ymm& buf) { + vdivps(x, op1, op2); + } + + void uni_vaddps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); addps(x, op2); - } + } - void uni_vaddps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vaddps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { vaddps(x, op1, op2); - } + } - void uni_vpsignd(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + void uni_vpsignd(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, const Xbyak::Operand& op) { 
assert(x1.getIdx() == x2.getIdx()); psignd(x1, op); - } - void uni_vpsignd(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + } + void uni_vpsignd(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, const Xbyak::Operand& op) { vpsignd(x1, x2, op); - } + } - void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vsubps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); subps(x, op2); - } + } + + void uni_vsubps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { + vsubps(x, op1, op2); + } + + void uni_vsubps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2, const Xbyak::Xmm& buf) { + movups(buf, op1); + subps(buf, op2); + + if (x.getIdx() != buf.getIdx()) { + movups(x, buf); + } + } - void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vsubps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2, const Xbyak::Ymm& buf) { vsubps(x, op1, op2); - } - - void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2, const Xbyak::Xmm &buf) { - movups(buf, op1); - subps(buf, op2); - if (x.getIdx() != buf.getIdx()) { - movups(x, buf); - } - } - - void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2, const Xbyak::Ymm &buf) { - vsubps(x, op1, op2); - } - - void uni_vmulps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + } + + void uni_vmulps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); mulps(x, op2); - } + } - void uni_vmulps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vmulps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { vmulps(x, op1, op2); - } + } - void uni_vfmadd213ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vfmadd213ps(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { mulps(x1, x2); addps(x1, op); - } + } - void uni_vfmadd213ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, - const Xbyak::Operand &op) { + void uni_vfmadd213ps(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + const Xbyak::Operand& op) { vfmadd213ps(x1, x2, op); - } + } - void uni_vfmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vfmadd231ps(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { mulps(x2, op); addps(x1, x2); - } + } - void uni_vfmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, - const Xbyak::Operand &op) { + void uni_vfmadd231ps(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + const Xbyak::Operand& op) { vfmadd231ps(x1, x2, op); - } + } - void uni_vfnmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vfnmadd231ps(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { mulps(x2, op); subps(x1, x2); - } + } - void uni_vfnmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, - const Xbyak::Operand &op) { + void uni_vfnmadd231ps(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + const Xbyak::Operand& op) { vfnmadd231ps(x1, x2, op); - } + } - void uni_vsqrtps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + void 
uni_vsqrtps(const Xbyak::Xmm& x, const Xbyak::Operand& op) { sqrtps(x, op); - } + } - void uni_vsqrtps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + void uni_vsqrtps(const Xbyak::Ymm& x, const Xbyak::Operand& op) { vsqrtps(x, op); - } + } - void uni_vpaddd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vpaddd(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { assert(x1.getIdx() == x2.getIdx()); paddd(x2, op); - } + } - void uni_vpaddd(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vpaddd(const Xbyak::Ymm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { vpaddd(x1, x2, op); - } + } - void uni_vandps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vandps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); andps(x, op2); - } + } - void uni_vandps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vandps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { vandps(x, op1, op2); - } + } - void uni_vorps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vorps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); orps(x, op2); - } + } - void uni_vorps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vorps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { vorps(x, op1, op2); - } + } - void uni_vpslld(const Xbyak::Xmm &x, const Xbyak::Operand &op, + void uni_vpslld(const Xbyak::Xmm& x, const Xbyak::Operand& op, const int imm) { assert(x.getIdx() == op.getIdx()); pslld(x, imm); - } + } - void uni_vpslld(const Xbyak::Ymm &x, const Xbyak::Operand &op, + void uni_vpslld(const Xbyak::Ymm& x, const Xbyak::Operand& op, const int imm) { vpslld(x, op, imm); - } + } - void uni_vpsrld(const Xbyak::Xmm &x, const Xbyak::Operand &op, + void uni_vpsrld(const Xbyak::Xmm& x, const Xbyak::Operand& op, const int imm) { assert(x.getIdx() == op.getIdx()); psrld(x, imm); - } + } - void uni_vpsrld(const Xbyak::Ymm &x, const Xbyak::Operand &op, + void uni_vpsrld(const Xbyak::Ymm& x, const Xbyak::Operand& op, const int imm) { vpsrld(x, op, imm); - } + } - void uni_vmaxps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vmaxps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); maxps(x, op2); - } + } - void uni_vmaxps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vmaxps(const Xbyak::Ymm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { vmaxps(x, op1, op2); - } + } - void uni_vminps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vminps(const Xbyak::Xmm& x, const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { assert(x.getIdx() == op1.getIdx()); minps(x, op2); - } + } - void uni_vminps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, - const Xbyak::Operand &op2 = Xbyak::Operand()) { + void uni_vminps(const Xbyak::Ymm& x, 
const Xbyak::Operand& op1, + const Xbyak::Operand& op2 = Xbyak::Operand()) { vminps(x, op1, op2); - } + } - void uni_vcmpgtps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vcmpgtps(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { assert(x1.getIdx() == x2.getIdx()); cmpps(x1, op, 0x6); - } + } - void uni_vcmpgtps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, - const Xbyak::Operand &op) { + void uni_vcmpgtps(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + const Xbyak::Operand& op) { vcmpgtps(x1, x2, op); - } + } - void uni_vblendvps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op, const Xbyak::Xmm &msk) { + void uni_vblendvps(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op, const Xbyak::Xmm& msk) { assert(x1.getIdx() == x2.getIdx()); blendvps(x1, op); - } + } - void uni_vblendvps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, - const Xbyak::Operand &op, const Xbyak::Ymm &msk) { + void uni_vblendvps(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + const Xbyak::Operand& op, const Xbyak::Ymm& msk) { vblendvps(x1, x2, op, msk); - } + } - void uni_vroundps(const Xbyak::Xmm &x, const Xbyak::Operand &op, + void uni_vroundps(const Xbyak::Xmm& x, const Xbyak::Operand& op, const int imm) { roundps(x, op, imm); - } + } - void uni_vroundps(const Xbyak::Ymm &x, const Xbyak::Operand &op, + void uni_vroundps(const Xbyak::Ymm& x, const Xbyak::Operand& op, const int imm) { vroundps(x, op, imm); - } + } - void uni_vcvtps2dq(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + void uni_vcvtps2dq(const Xbyak::Xmm& x, const Xbyak::Operand& op) { cvtps2dq(x, op); - } + } - void uni_vcvtps2dq(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + void uni_vcvtps2dq(const Xbyak::Ymm& x, const Xbyak::Operand& op) { vcvtps2dq(x, op); - } + } - void uni_vcvtdq2ps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + void uni_vcvtdq2ps(const Xbyak::Xmm& x, const Xbyak::Operand& op) { cvtdq2ps(x, op); - } + } - void uni_vcvtdq2ps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + void uni_vcvtdq2ps(const Xbyak::Ymm& x, const Xbyak::Operand& op) { vcvtdq2ps(x, op); - } + } - void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Xmm &x2) { + void uni_vmovmskps(const Xbyak::Reg& x1, const Xbyak::Xmm& x2) { movmskps(x1.cvt64(), x2); - } + } - void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Ymm &x2) { + void uni_vmovmskps(const Xbyak::Reg& x1, const Xbyak::Ymm& x2) { vmovmskps(x1, x2); - } - template - Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, - T raw_offt, - bool bcast = false) { - using Xbyak::Zmm; - using Xbyak::Reg64; - using Xbyak::Address; - using Xbyak::RegExp; - - auto offt = static_cast(raw_offt); - - int scale = 0; - - if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { - offt = offt - 2 * EVEX_max_8b_offt; - scale = 1; - } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { - offt = offt - 4 * EVEX_max_8b_offt; - scale = 2; - } - - auto re = RegExp() + base + offt; - if (scale) re = re + reg_EVEX_max_8b_offt * scale; - - if (bcast) - return zword_b[re]; - else - return zword[re]; - } - - void L(const char *label) { Xbyak::CodeGenerator::L(label); } - void L(const Xbyak::Label &label) { Xbyak::CodeGenerator::L(label); } - - void dump_code(const Xbyak::uint8 *code) const { - if (code) { - static int counter = 0; -#define MAX_FNAME_LEN 256 - char fname[MAX_FNAME_LEN + 1]; - snprintf(fname, MAX_FNAME_LEN, "jit_dump_%s.%d.bin", name(), counter); - counter++; + } + template + 
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, + T raw_offt, + bool bcast = false) { + using Xbyak::Zmm; + using Xbyak::Reg64; + using Xbyak::Address; + using Xbyak::RegExp; + + auto offt = static_cast(raw_offt); + + int scale = 0; + + if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { + offt = offt - 2 * EVEX_max_8b_offt; + scale = 1; + } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { + offt = offt - 4 * EVEX_max_8b_offt; + scale = 2; + } + + auto re = RegExp() + base + offt; + + if (scale) { + re = re + reg_EVEX_max_8b_offt * scale; + } - FILE *fp = fopen(fname, "w+"); - // Failure to dump code is not fatal - if (fp) { - fwrite(code, getSize(), 1, fp); - fclose(fp); - } + if (bcast) { + return zword_b[re]; + } else { + return zword[re]; + } + } + + void L(const char* label) { + Xbyak::CodeGenerator::L(label); + } + void L(const Xbyak::Label& label) { + Xbyak::CodeGenerator::L(label); } + + void dump_code(const Xbyak::uint8* code) const { + if (code) { + static int counter = 0; +#define MAX_FNAME_LEN 256 + char fname[MAX_FNAME_LEN + 1]; + snprintf(fname, MAX_FNAME_LEN, "jit_dump_%s.%d.bin", name(), counter); + counter++; + + FILE* fp = fopen(fname, "w+"); + + // Failure to dump code is not fatal + if (fp) { + fwrite(code, getSize(), 1, fp); + fclose(fp); + } + } + #undef MAX_FNAME_LEN - } + } public: - jit_generator(void *code_ptr = nullptr, size_t code_size = 256 * 1024) - : Xbyak::CodeGenerator(code_size, code_ptr) {} + static constexpr size_t max_code_size = 256 * 4096; - virtual const char *name() const = 0; - virtual const char *source_file() const = 0; +#ifdef USE_SGX +private: + struct SGXAllocator : Xbyak::Allocator { + Xbyak::uint8* const jit_start; + const size_t meta_size; + std::unique_ptr meta; + + SGXAllocator(Xbyak::uint8* jit_start, Xbyak::uint8* jit_end) + : Xbyak::Allocator(), jit_start(jit_start), + meta_size((jit_end - jit_start) / max_code_size), + meta(new bool[meta_size]) { + memset(meta.get(), 0, sizeof(bool) * meta_size); + } + + Xbyak::uint8* alloc(size_t size) override { + if (size != max_code_size) { + abort(); + } + + for (int i = 0; i < meta_size; ++i) { + if (!meta[i]) { + meta[i] = true; + return jit_start + i * size; + } + } + + abort(); + return nullptr; + } - // XXX: use normal_case name and update all callees (?) - const Xbyak::uint8 *getCode() { - const Xbyak::uint8 *code = CodeGenerator::getCode(); + void free(Xbyak::uint8* p) { + size_t dis = p - jit_start; + + if (dis % max_code_size) { + abort(); + } + + meta[dis / max_code_size] = false; + } + + bool useProtect() const override { + return false; + } + }; + + static Xbyak::Allocator* get_jit_allocator() { + static SGXAllocator _allocator(&__jit_start, &__jit_end); + return &_allocator; + }; +#else +#define get_jit_allocator() nullptr +#endif + +public: + jit_generator() + : Xbyak::CodeGenerator(max_code_size, nullptr, get_jit_allocator()) {} + + virtual const char* name() const = 0; + virtual const char* source_file() const = 0; + + // XXX: use normal_case name and update all callees (?) 
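+    // Descriptive note: getCode() below simply forwards to
+    // Xbyak::CodeGenerator::getCode(); when the build enables WITH_DUMP_CODE and
+    // util::env::jit_dump_code() is set, the generated machine code is also
+    // written out via dump_code() as jit_dump_<name>.<counter>.bin.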
+ const Xbyak::uint8* getCode() { + const Xbyak::uint8* code = CodeGenerator::getCode(); #ifdef WITH_DUMP_CODE - // only can dump code when cmake option is enabled - if (util::env::jit_dump_code()) dump_code(code); + + // only can dump code when cmake option is enabled + if (util::env::jit_dump_code()) { + dump_code(code); + } + #endif - return code; - } + return code; + } - template - const F getCode() { - // XXX (Roma): Xbyak code probably has a bug here - return (const F)getCode(); - } + template + const F getCode() { + // XXX (Roma): Xbyak code probably has a bug here + return (const F)getCode(); + } }; } diff --git a/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h b/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h index 7d7a9860f..ed9e3b6be 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h +++ b/saber/funcs/impl/x86/kernel/jit_uni_1x1_conv_utils.h @@ -44,38 +44,6 @@ inline int best_divider(int value, int min_divider, int max_divider, return x_divider; } - -template -inline U this_block_size(const T offset, const U max, const V block_size) { - assert(offset < max); - const T block_boundary = offset + block_size; - if (block_boundary > max) - return max - offset; - else - return block_size; -} - -template -inline T nd_iterator_init(T start) { return start; } - -template -inline T nd_iterator_init(T start, U &x, const W &X, Args &&... tuple) { - start = nd_iterator_init(start, utils::forward(tuple)...); - x = start % X; - return start / X; -} - -inline bool nd_iterator_step() { return true; } - -template -inline bool nd_iterator_step(U &x, const W &X, Args &&... tuple) { - if (nd_iterator_step(utils::forward(tuple)...)) { - x = (x + 1) % X; - return x == 0; - } - return false; -} - } // namepsace jit #define JIT_TENSOR_MAX_DIMS 12 diff --git a/saber/funcs/impl/x86/kernel/jit_uni_dwconv.cpp b/saber/funcs/impl/x86/kernel/jit_uni_dwconv.cpp index dc71b3366..a9da7b8b7 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_dwconv.cpp +++ b/saber/funcs/impl/x86/kernel/jit_uni_dwconv.cpp @@ -10,29 +10,29 @@ using namespace jit; template <> SaberStatus JitUniDWConv::check_conf( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m) { - ConvParam *conv_param = &(param.conv_param); - const Tensor *weights = conv_param->weight(); - const Tensor *bias = conv_param->bias(); + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + ConvParam* conv_param = &(param.conv_param); + const Tensor* weights = conv_param->weight(); + const Tensor* bias = conv_param->bias(); const jit_conv_conf_t jcp = kernel->jcp; - Tensor *input = inputs[0]; - Tensor *output = outputs[0]; - - // check format -// if (!(std::is_same::value && -// std::is_same::value && -// std::is_same::value && -// inDtype == AK_FLOAT)) { -// LOG(ERROR) << "wrong format"; -// return SaberUnImplError; -// } - if ((inputs[0]->get_layout() != Layout_NCHW_C16) - || (outputs[0]->get_layout() != Layout_NCHW_C16) - || (conv_param->weight()->get_layout() != Layout_NCHW)) { - - LOG(ERROR) << "wrong format"; + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; + + bool layout_c16 = true + && input->get_layout() == Layout_NCHW_C16R + && output->get_layout() == Layout_NCHW_C16R + && mayiuse(avx512_common); + bool layout_c8 = true + && (input->get_layout() == Layout_NCHW_C8 || input->get_layout() == Layout_NCHW_C8R) + && (output->get_layout() == Layout_NCHW_C8 || output->get_layout() == Layout_NCHW_C8R) + && mayiuse(avx2); + + + if (((!layout_c16) && (!layout_c8)) + || 
(conv_param->weight()->get_layout() != Layout_NCHW)) { + LOG(ERROR) << "wrong format"; return SaberUnImplError; } @@ -44,8 +44,8 @@ SaberStatus JitUniDWConv::check_conf( && jcp.r_pad == conv_param->pad_w && jcp.stride_h == conv_param->stride_h && jcp.stride_w == conv_param->stride_w - && jcp.dilate_h == conv_param->dilation_h - && jcp.dilate_w == conv_param->dilation_w; + && jcp.dilate_h == conv_param->dilation_h - 1 + && jcp.dilate_w == conv_param->dilation_w - 1; // check shape bool shape_ok = true @@ -63,35 +63,37 @@ SaberStatus JitUniDWConv::check_conf( if (param_ok && shape_ok) { return SaberSuccess; } else { - LOG(INFO) << "param or shape changed, re-init kernel"; + LOG(INFO) << "param or shape changed, re-init kernel"; return SaberNotInitialized; } } template <> SaberStatus JitUniDWConv::create( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m, - Context &ctx) { + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + Context& ctx) { SaberStatus status; - ConvParam *conv_param = &(param.conv_param); - ActivationParam *act_param = nullptr; - const Tensor *weights = conv_param->weight(); - const Tensor *bias = conv_param->bias(); - Tensor *input = inputs[0]; - Tensor *output = outputs[0]; + ConvParam* conv_param = &(param.conv_param); + ActivationParam* act_param = nullptr; + const Tensor* weights = conv_param->weight(); + const Tensor* bias = conv_param->bias(); + Tensor* input = inputs[0]; + Tensor* output = outputs[0]; // check conf if (kernel) { status = check_conf(inputs, outputs, param); + if (status != SaberNotInitialized) { return status; } } // init conf - conf.ngroups = weights->num(); + conf.src_fmt = input->get_layout(); + conf.ngroups = conv_param->group; conf.mb = input->num(); conf.ic = input->channel(); conf.ih = input->height(); @@ -112,9 +114,13 @@ SaberStatus JitUniDWConv::create( conf.dilate_h = conv_param->dilation_h <= 0 ? 0 : (conv_param->dilation_h - 1); conf.dilate_w = conv_param->dilation_w <= 0 ? 
0 : (conv_param->dilation_w - 1); - - conf.with_bias = (bias != NULL); + conf.with_sum = false; + if (param.eltwise_param.has_eltwise){ + conf.with_sum = true; + } + conf.with_bias = (bias != nullptr && bias->valid_size()>0); conf.with_relu = conv_param->activation_param.has_active; + if (conf.with_relu) { act_param = &(conv_param->activation_param); conf.relu_negative_slope = static_cast(act_param->negative_slope); @@ -125,51 +131,66 @@ SaberStatus JitUniDWConv::create( && conf.oc == conf.ngroups && conf.ic == conf.ngroups && conf.is_dw; + if (!ok) { - LOG(ERROR) << "dw conv init fail, return UnImplError"; + LOG(FATAL) << "dw conv init fail, return UnImplError, oc = " << conf.oc << ", ngroup" + << conf.ngroups << ", weight_channel " << weights->valid_shape(); return SaberUnImplError; } - status = jit_uni_dwconv_kernel_f32::init_conf(conf); - if (status == SaberSuccess) { - if (kernel != nullptr) { - delete kernel; - kernel = nullptr; - } - kernel = new jit_uni_dwconv_kernel_f32(conf); + if (kernel != nullptr) { + delete kernel; + kernel = nullptr; + } + + if ((conf.src_fmt == Layout_NCHW_C16 || conf.src_fmt == Layout_NCHW_C16R) && + jit_dwconv_kernel_f32::init_conf(conf) == SaberSuccess) { + kernel = new jit_dwconv_kernel_f32(conf); + } else if ((conf.src_fmt == Layout_NCHW_C8 || conf.src_fmt == Layout_NCHW_C8R) && + jit_dwconv_kernel_f32::init_conf(conf) == SaberSuccess) { + kernel = new jit_dwconv_kernel_f32(conf); } else { + LOG(FATAL) << "not support this config"; return SaberUnImplError; } // reorder weights - Tensor *weights_reorder = conv_param->mutable_weight(); + Tensor* weights_reorder = conv_param->mutable_weight(); weights_internal.reset(new Tensor(weights_reorder->valid_shape())); - weight_reorder_Goihw16g(*weights_reorder, *weights_internal); - return status; + if ((conf.src_fmt == Layout_NCHW_C16 || conf.src_fmt == Layout_NCHW_C16R)) { + weight_reorder_Goihw16g(*weights_reorder, *weights_internal); + } else if ((conf.src_fmt == Layout_NCHW_C8 || conf.src_fmt == Layout_NCHW_C8R)) { + weight_reorder_Goihw8g(*weights_reorder, *weights_internal); + } else { + LOG(FATAL) << "not support this config"; + } + + return SaberSuccess; } template <> SaberStatus JitUniDWConv::init( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m, - Context &ctx) { - ConvParam *conv_param = &(param.conv_param); - -// if (!(std::is_same::value && -// std::is_same::value && -// std::is_same::value && -// OpDtype == AK_FLOAT)) { -// return SaberUnImplError; -// } - if ((inputs[0]->get_layout() != Layout_NCHW_C16) - || (outputs[0]->get_layout() != Layout_NCHW_C16) - || (conv_param->weight()->get_layout() != Layout_NCHW)) { - - LOG(ERROR) << "wrong format"; + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param, + Context& ctx) { + ConvParam* conv_param = &(param.conv_param); + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType output_layout = outputs[0]->get_layout(); + bool ok_layout = + (input_layout == Layout_NCHW_C8R && output_layout == Layout_NCHW_C8R) || + (input_layout == Layout_NCHW_C8 && output_layout == Layout_NCHW_C8) || + (input_layout == Layout_NCHW_C16 && output_layout == Layout_NCHW_C16) || + (input_layout == Layout_NCHW_C16R && output_layout == Layout_NCHW_C16R); + bool ok_weights = conv_param->weight()->get_layout() == Layout_NCHW; + + if (!ok_layout || !ok_weights) { + + LOG(ERROR) << "wrong format"; return SaberUnImplError; } + this->_ctx = &ctx; return create(inputs, outputs, param, ctx); @@ -177,23 +198,20 @@ 
SaberStatus JitUniDWConv::init( template <> SaberStatus JitUniDWConv::dispatch( - const std::vector*>& inputs, - std::vector*>& outputs, - ConvEltwiseParam ¶m) { - - ConvParam *conv_param = &(param.conv_param); - const Tensor *bias = conv_param->bias(); - - const float *ptr_src = reinterpret_cast(inputs[0]->data()); - const float *ptr_weights = reinterpret_cast(weights_internal->data()); - const float *ptr_bias = nullptr; - if(bias) { - ptr_bias=reinterpret_cast(bias->data()); - } - auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param) { + + ConvParam* conv_param = &(param.conv_param); + const Tensor* bias = conv_param->bias(); - const auto &jcp = kernel->jcp; + const float* ptr_src = reinterpret_cast(inputs[0]->data()); + const float* ptr_weights = reinterpret_cast(weights_internal->data()); + const float* ptr_bias = bias ? reinterpret_cast(bias->data()) : nullptr; + auto ptr_dst = reinterpret_cast(outputs[0]->mutable_data()); + const auto& jcp = kernel->jcp; + int blk_size = (jcp.src_fmt == Layout_NCHW_C16 || jcp.src_fmt == Layout_NCHW_C16R) ? 16 : 8; int dil_h = jcp.dilate_h + 1; int dil_w = jcp.dilate_w + 1; int str_h = jcp.stride_h; @@ -204,25 +222,26 @@ SaberStatus JitUniDWConv::dispatch( const size_t work_amount = MB * chb_work * jcp.oh; auto kernel_params = [&](int ur_w_step, int ow, int oh, int ih, int kh, - int kh_padding, int ch, int ch_num, int n) { + int kh_padding, int ch, int ch_num, int n) { jit_conv_call_t par_conv; const int i_l_overflow = utils::max(0, (jcp.l_pad - ow * str_w)); const int i_r_overflow = utils::max(jcp.iw, (ow * str_w - + (jcp.kw - 1)*dil_w - jcp.l_pad + 1)) - jcp.iw; + + (jcp.kw - 1) * dil_w - jcp.l_pad + 1)) - jcp.iw; - const int iw = utils::max((ow*str_w - jcp.l_pad - + utils::div_up(i_l_overflow, dil_w)*dil_w), 0); + const int iw = utils::max((ow * str_w - jcp.l_pad + + utils::div_up(i_l_overflow, dil_w) * dil_w), 0); const int kw = utils::div_up(i_l_overflow, dil_w); const int kw_padding = jcp.kw - utils::div_up(i_l_overflow, dil_w) - utils::div_up(i_r_overflow, dil_w); - par_conv.src = ptr_src + n * jcp.ic * jcp.iw * jcp.ih + ch * jcp.iw * jcp.ih * 16 + ih * jcp.iw * 16 + iw * 16; - par_conv.dst = ptr_dst + n * jcp.oc * jcp.ow * jcp.oh + ch * jcp.ow * jcp.oh * 16 + oh * jcp.ow * 16 + ow * 16; + par_conv.src = ptr_src + n * jcp.ic * jcp.iw * jcp.ih + ch * jcp.iw * jcp.ih * blk_size + ih * + jcp.iw * blk_size + iw * blk_size; + par_conv.dst = ptr_dst + n * jcp.oc * jcp.ow * jcp.oh + ch * jcp.ow * jcp.oh * blk_size + oh * + jcp.ow * blk_size + ow * blk_size; + par_conv.filt = ptr_weights + (ch * jcp.kh * jcp.kw + kh * jcp.kw + kw) * blk_size; - //par_conv.filt = &weights[weights_d.blk_off(ch, 0, 0, kh, kw)]; - par_conv.filt = ptr_weights + (ch * jcp.kh * jcp.kw + kh * jcp.kw + kw) *16; if (bias) { par_conv.bias = ptr_bias + ch * jcp.ch_block; } @@ -239,20 +258,21 @@ SaberStatus JitUniDWConv::dispatch( auto ker = [&](const int ithr, const int nthr) { size_t start{0}, end{0}; - utils::balance211(work_amount, nthr, ithr, start, end); + balance211(work_amount, nthr, ithr, start, end); size_t n{0}, chb{0}, oh{0}; - utils::nd_iterator_init(start, n, MB, chb, chb_work, oh, jcp.oh); + nd_iterator_init(start, n, MB, chb, chb_work, oh, jcp.oh); + for (size_t iwork = start; iwork < end; ++iwork) { int ch = chb * jcp.nb_ch_blocking; int ch_num = jcp.nb_ch_blocking; - const int i_t_overflow = utils::max(0, (int)(jcp.t_pad - oh*str_h)); + const int i_t_overflow = utils::max(0, 
(int)(jcp.t_pad - oh * str_h)); const int i_b_overflow = utils::max(jcp.ih, - (int)(oh*str_h + (jcp.kh - 1)*dil_h - jcp.t_pad + 1)) - jcp.ih; + (int)(oh * str_h + (jcp.kh - 1) * dil_h - jcp.t_pad + 1)) - jcp.ih; - const int ih = utils::max((int)(oh*str_h - jcp.t_pad - + utils::div_up(i_t_overflow, dil_h)*dil_h), 0); + const int ih = utils::max((int)(oh * str_h - jcp.t_pad + + utils::div_up(i_t_overflow, dil_h) * dil_h), 0); const int kh = utils::div_up(i_t_overflow, dil_h); const int kh_padding = jcp.kh - utils::div_up(i_t_overflow, dil_h) - utils::div_up(i_b_overflow, dil_h); @@ -261,18 +281,20 @@ SaberStatus JitUniDWConv::dispatch( int ow = 0; int l_border = utils::min(utils::div_up(jcp.l_pad, str_w), jcp.ow); int ur_w_step = 1; + for (; ow < l_border; ow++) { jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, - kh, kh_padding, ch, ch_num, n); + kh, kh_padding, ch, ch_num, n); kernel->jit_ker(&par_conv); } // main loop - ur_w_step = (jcp.iw - (jcp.kw - 1)*dil_w + jcp.l_pad - 1) / jcp.stride_w - ow + 1; + ur_w_step = (jcp.iw - (jcp.kw - 1) * dil_w + jcp.l_pad - 1) / jcp.stride_w - ow + 1; + if (ur_w_step > 0) { jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, - kh, kh_padding, ch, ch_num, n); + kh, kh_padding, ch, ch_num, n); kernel->jit_ker(&par_conv); @@ -281,20 +303,21 @@ SaberStatus JitUniDWConv::dispatch( // right border ur_w_step = 1; + for (; ow < jcp.ow; ow++) { jit_conv_call_t par_conv = kernel_params(ur_w_step, ow, oh, ih, - kh, kh_padding, ch, ch_num, n); + kh, kh_padding, ch, ch_num, n); kernel->jit_ker(&par_conv); } - utils::nd_iterator_step(n, MB, chb, chb_work, oh, jcp.oh); + nd_iterator_step(n, MB, chb, chb_work, oh, jcp.oh); } }; -#pragma omp parallel + #pragma omp parallel { - ker(omp_get_thread_num(), omp_get_num_threads()); + ker(anakin_get_thread_num(), anakin_get_num_threads()); } return SaberSuccess; diff --git a/saber/funcs/impl/x86/kernel/jit_uni_dwconv.h b/saber/funcs/impl/x86/kernel/jit_uni_dwconv.h index be5d8928c..f25cf8b48 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_dwconv.h +++ b/saber/funcs/impl/x86/kernel/jit_uni_dwconv.h @@ -26,7 +26,7 @@ namespace saber { using namespace jit; -template +template class JitUniDWConv : public ImplBase< X86, OpDtype, ConvEltwiseParam > { public: @@ -57,7 +57,7 @@ class JitUniDWConv : public ImplBase< private: jit_conv_conf_t conf; - jit_uni_dwconv_kernel_f32 *kernel = nullptr; + jit_uni_dwconv_kernel_f32 *kernel = nullptr; std::shared_ptr > weights_internal; SaberStatus check_conf(const std::vector*>& inputs, std::vector*>& outputs, diff --git a/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.cpp b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.cpp index 6efcd57bb..8b116e00d 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.cpp +++ b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.cpp @@ -12,7 +12,7 @@ namespace jit { using namespace Xbyak; template -void jit_uni_dwconv_kernel_f32::load_src(int ur_ch_blocks, int ur_w) { +void jit_dwconv_kernel_f32::load_src(int ur_ch_blocks, int ur_w) { int repeats = isa == sse42 ? 
2 : 1; for (int i = 0; i < repeats; i++) { for (int ch = 0; ch < ur_ch_blocks; ch++) { @@ -37,7 +37,7 @@ void jit_uni_dwconv_kernel_f32::load_src(int ur_ch_blocks, int ur_w) { } template -void jit_uni_dwconv_kernel_f32::apply_filter( +void jit_dwconv_kernel_f32::apply_filter( int ur_ch_blocks, int ur_w) { int ch_blk = jcp.ch_block; int dilate_h = jcp.dilate_h + 1; @@ -88,7 +88,7 @@ void jit_uni_dwconv_kernel_f32::apply_filter( } template -void jit_uni_dwconv_kernel_f32::apply_filter_unrolled(int ur_ch_blocks, int ur_w) { +void jit_dwconv_kernel_f32::apply_filter_unrolled(int ur_ch_blocks, int ur_w) { int ch_blk = jcp.ch_block; int dilate_h = jcp.dilate_h + 1; int dilate_w = jcp.dilate_w + 1; @@ -129,7 +129,7 @@ void jit_uni_dwconv_kernel_f32::apply_filter_unrolled(int ur_ch_blocks, int } template -void jit_uni_dwconv_kernel_f32::apply_activation(int ur_ch_blocks, int ur_w) { +void jit_dwconv_kernel_f32::apply_activation(int ur_ch_blocks, int ur_w) { if (this->jcp.with_relu) { uni_vpxor(vmm_zero, vmm_zero, vmm_zero); if (jcp.relu_negative_slope == 0) { @@ -167,7 +167,7 @@ void jit_uni_dwconv_kernel_f32::apply_activation(int ur_ch_blocks, int ur_w } template -void jit_uni_dwconv_kernel_f32::store_dst( +void jit_dwconv_kernel_f32::store_dst( int ur_ch_blocks, int ur_w) { int ch_blk = jcp.ch_block; int repeats = isa == sse42 ? 2 : 1; @@ -183,7 +183,7 @@ void jit_uni_dwconv_kernel_f32::store_dst( } template -void jit_uni_dwconv_kernel_f32::loop_body(int ur_ch_blocks) { +void jit_dwconv_kernel_f32::loop_body(int ur_ch_blocks) { Label unrolled_w_label; Label tail_w_label; Label exit_label; @@ -221,7 +221,7 @@ void jit_uni_dwconv_kernel_f32::loop_body(int ur_ch_blocks) { } template -void jit_uni_dwconv_kernel_f32::generate() { +void jit_dwconv_kernel_f32::generate() { this->preamble(); mov(reg_input, ptr[this->param1 + GET_OFF(src)]); @@ -251,7 +251,7 @@ void jit_uni_dwconv_kernel_f32::generate() { template -SaberStatus jit_uni_dwconv_kernel_f32::init_conf(jit_conv_conf_t &jcp) { +SaberStatus jit_dwconv_kernel_f32::init_conf(jit_conv_conf_t &jcp) { if (!mayiuse(isa) && isa == avx512_common) { LOG(ERROR) << "Init an AVX512 kernel in a non-avx512 machine is not permitted"; return SaberUnImplError; @@ -271,8 +271,8 @@ SaberStatus jit_uni_dwconv_kernel_f32::init_conf(jit_conv_conf_t &jcp) { return SaberSuccess; } -template struct jit_uni_dwconv_kernel_f32; - +template struct jit_dwconv_kernel_f32; +template struct jit_dwconv_kernel_f32; } } } diff --git a/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h index 5b2576840..82d269127 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h +++ b/saber/funcs/impl/x86/kernel/jit_uni_dwconv_kernel_f32.h @@ -12,21 +12,35 @@ namespace anakin { namespace saber { namespace jit { + +struct jit_uni_dwconv_kernel_f32 { + + jit_uni_dwconv_kernel_f32() {} + + jit_uni_dwconv_kernel_f32(jit_conv_conf_t ajcp): jcp(ajcp) { + } + + jit_conv_conf_t jcp; + + virtual ~jit_uni_dwconv_kernel_f32() {} + + void (*jit_ker)(jit_conv_call_t *); +}; + + template -struct jit_uni_dwconv_kernel_f32 : public jit_generator { +struct jit_dwconv_kernel_f32 : public jit_uni_dwconv_kernel_f32, public jit_generator { public: - jit_uni_dwconv_kernel_f32(jit_conv_conf_t ajcp) : jcp(ajcp) { + jit_dwconv_kernel_f32(jit_conv_conf_t ajcp) : jit_uni_dwconv_kernel_f32(ajcp), jit_generator() { generate(); jit_ker = (void (*)(jit_conv_call_t *))getCode(); } - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_dwconv_kernel_f32); + 
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_dwconv_kernel_f32); static SaberStatus init_conf(jit_conv_conf_t &jcp); - jit_conv_conf_t jcp; - void (*jit_ker)(jit_conv_call_t *); private: using Vmm = typename utils::conditional3 -bool jit_uni_pool_kernel_f32::init_conf(jit_pool_conf_t &jpp) { +bool jit_pool_kernel_f32::init_conf(jit_pool_conf_t &jpp) { + bool layout_c16 = (jpp.src_fmt == Layout_NCHW_C16||jpp.src_fmt==Layout_NCHW_C16R) && mayiuse(avx512_common); + bool layout_c8 = (jpp.src_fmt == Layout_NCHW_C8||jpp.src_fmt ==Layout_NCHW_C8R) && mayiuse(avx2); + bool ok = true && (layout_c16 || layout_c8); + if (!ok) { + return false; + } + + int simd_w; + if (layout_c16) + simd_w = 16; + else if (layout_c8) + simd_w = 8; + else + return false; - bool args_ok = true; - if (!args_ok) { + jpp.simple_alg = false; + jpp.c_block = simd_w; + jpp.nb_c = jpp.c / jpp.c_block; + if (jpp.alg == Pooling_max) { + jpp.ur_w = 16; + if (layout_c8) + jpp.ur_w = 4; + } else { + jpp.ur_w = 24; + if (layout_c8) + jpp.ur_w = 12; + } + + if (jpp.ow < jpp.ur_w) { + jpp.ur_w = jpp.ow; + } + if (jpp.l_pad > jpp.ur_w) { return false; } + jpp.ur_w_tail = jpp.ow % jpp.ur_w; return true; } template -inline void jit_uni_pool_kernel_f32::maybe_recalculate_divisor(int jj, +inline void jit_pool_kernel_f32::maybe_recalculate_divisor(int jj, int ur_w, int pad_l, int pad_r) { if (jpp.alg == Pooling_average_exclude_padding) { int kw = jpp.kw; @@ -41,7 +71,7 @@ inline void jit_uni_pool_kernel_f32::maybe_recalculate_divisor(int jj, } template -inline void jit_uni_pool_kernel_f32::avg_step(int ur_w, int pad_l, +inline void jit_pool_kernel_f32::avg_step(int ur_w, int pad_l, int pad_r, const char* kh_label) { int iw = jpp.iw; @@ -85,7 +115,7 @@ inline void jit_uni_pool_kernel_f32::avg_step(int ur_w, int pad_l, } template -inline void jit_uni_pool_kernel_f32::max_step_fwd(int ur_w, int pad_l, +inline void jit_pool_kernel_f32::max_step_fwd(int ur_w, int pad_l, int pad_r, const char *kh_label) { int iw = jpp.iw; int kw = jpp.kw; @@ -143,7 +173,7 @@ inline void jit_uni_pool_kernel_f32::max_step_fwd(int ur_w, int pad_l, } template -inline void jit_uni_pool_kernel_f32::max_step_bwd(int ur_w, int pad_l, +inline void jit_pool_kernel_f32::max_step_bwd(int ur_w, int pad_l, int pad_r, const char *kh_label) { int iw = jpp.iw; int kw = jpp.kw; @@ -220,7 +250,7 @@ inline void jit_uni_pool_kernel_f32::max_step_bwd(int ur_w, int pad_l, } template -void jit_uni_pool_kernel_f32::maybe_zero_diff_src() { +void jit_pool_kernel_f32::maybe_zero_diff_src() { assert(jpp.c_block * sizeof(float) % cpu_isa_traits::vlen == 0); Label l_skip, l_zero; @@ -249,7 +279,7 @@ void jit_uni_pool_kernel_f32::maybe_zero_diff_src() { } template -void jit_uni_pool_kernel_f32::generate() { +void jit_pool_kernel_f32::generate() { this->preamble(); int ow = jpp.ow; @@ -367,7 +397,8 @@ void jit_uni_pool_kernel_f32::generate() { this->postamble(); } -template struct jit_uni_pool_kernel_f32; +template struct jit_pool_kernel_f32; +template struct jit_pool_kernel_f32; } } diff --git a/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h b/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h index a26cc5b47..b151e1662 100644 --- a/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h +++ b/saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h @@ -31,18 +31,31 @@ namespace jit { using namespace Xbyak; -template -struct jit_uni_pool_kernel_f32: public jit_generator { +struct jit_uni_pool_kernel_f32{ + + jit_uni_pool_kernel_f32() {} + jit_uni_pool_kernel_f32(jit_pool_conf_t ajpp): jpp(ajpp) { - 
this->generate(); - jit_ker = (decltype(jit_ker))this->getCode(); } jit_pool_conf_t jpp; - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_pool_kernel_f32); - + virtual ~jit_uni_pool_kernel_f32() {} void operator()(jit_pool_call_t *arg) { jit_ker(arg); } + +protected: + void (*jit_ker)(jit_pool_call_t *); +}; + +template +struct jit_pool_kernel_f32: public jit_uni_pool_kernel_f32, public jit_generator { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_pool_kernel_f32); + + jit_pool_kernel_f32(jit_pool_conf_t ajpp): jit_uni_pool_kernel_f32(ajpp), jit_generator() { + this->generate(); + jit_ker = (decltype(jit_ker))this->getCode(); + } + static bool init_conf(jit_pool_conf_t &jpp); private: @@ -91,7 +104,6 @@ struct jit_uni_pool_kernel_f32: public jit_generator { Xbyak::Reg32 reg_shuf_mask = esi; int prev_kw; - void (*jit_ker)(jit_pool_call_t *); void maybe_recalculate_divisor(int jj, int ur_w, int pad_l, int pad_r); void avg_step(int ur_w, int pad_l, int pad_r, const char *kh_label); diff --git a/saber/funcs/impl/x86/mkl_gemm.cpp b/saber/funcs/impl/x86/mkl_gemm.cpp new file mode 100644 index 000000000..3fb2dd089 --- /dev/null +++ b/saber/funcs/impl/x86/mkl_gemm.cpp @@ -0,0 +1,322 @@ +#include "saber/funcs/impl/x86/mkl_gemm.h" +#include "saber/funcs/timer.h" +#include "debug.h" +namespace anakin { + +namespace saber { +#define MKL_GEMM_TIMER 0 +template <> +SaberStatus MklDnnGemm::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx, const float* ptr_b, MKLGemmMode gemm_mode) { + _gemm_mode = gemm_mode; + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + _trans_a = trans_a ? 'T' : 'N'; + _trans_b = trans_b ? 'T' : 'N'; + + if (gemm_mode == PACKED_MKLGEMM) { + if (_weights_packed_ptr_fp32 != nullptr) { + cblas_sgemm_free(_weights_packed_ptr_fp32); + } + + _weights_packed_ptr_fp32 = cblas_sgemm_alloc(CblasBMatrix, m, n, k); + + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + trans_b ? CblasTrans : CblasNoTrans, + m, n, k, + 1.0, + ptr_b, n, + _weights_packed_ptr_fp32); + } + + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::dispatch( + const float alpha, const float beta, int m, + const float* ptr_a, const float* ptr_b, float* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_b != nullptr); + CHECK(ptr_c != nullptr); + + // LOG(INFO)<<"it is mkldnn gemm"; +#if MKL_GEMM_TIMER + Context ctx(0, 0, 0); + SaberTimer timer; + timer.start(ctx); +#endif + + if (_gemm_mode == PACKED_MKLGEMM) { + // LOG(INFO)<<"MklDnnGemm dispatch "<<_m<<","<<_n<<","<<_k; + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + m, _n, _k, + ptr_a, _k, + _weights_packed_ptr_fp32, _n, + beta, + ptr_c, _n); + } else { + CBLAS_TRANSPOSE trans_a = + (_trans_a == 'T') ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE trans_b = + (_trans_b == 'T') ? 
CblasTrans : CblasNoTrans; + CHECK(ptr_b != nullptr); + cblas_sgemm(CblasRowMajor, trans_a, trans_b, m, _n, _k, alpha, ptr_a, _lda, ptr_b, _ldb, beta, + ptr_c, _ldc); + } + +#if MKL_GEMM_TIMER + timer.end(ctx); + double ms = timer.get_average_ms(); + double work_load = (double)_m * _n * _k * 2; + double speed = work_load / ms / 1000.0 / 1000.0; + LOG(INFO) << "mkldnn_gemm_f32f32f32 [" << _gemm_mode << "] " << _m << "," << _n << "," << _k << "," + << ms << "," << speed; +#endif + + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx, const char* ptr_b, MKLGemmMode gemm_mode) { + _gemm_mode = gemm_mode; + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + _trans_a = trans_a ? 'T' : 'N'; + _trans_b = trans_b ? 'T' : 'N'; + + auto s8_a = true; + auto packed_b = gemm_mode == PACKED_MKLGEMM; + char oc_mode = 'R'; + auto ocsize = oc_mode == 'R' ? n : oc_mode == 'C' ? m : 1; + _oc_offset.re_alloc(Shape({1, 1, 1, ocsize}), AK_INT32); + fill_tensor_const(_oc_offset, 0); + auto status = _packed_s8s8s32_gemm.init(ptr_b, _oc_offset.data(), &_s8s8s32_handle, oc_mode, + m, n, k, 0, 0, s8_a, packed_b, trans_a, trans_b, + 0.f, 1.f, _lda, _ldb, _ldc); + CHECK_EQ(status, SaberSuccess); + + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::dispatch( + const float alpha, const float beta, int m, + const char* ptr_a, const char* ptr_b, int* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_b != nullptr); + CHECK(ptr_c != nullptr); + + if (_gemm_mode == PACKED_MKLGEMM) { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c); + CHECK_EQ(status, SaberSuccess); + } else { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c, ptr_b); + CHECK_EQ(status, SaberSuccess); + } + + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx, const int8_t* ptr_b, MKLGemmMode gemm_mode) { + _gemm_mode = gemm_mode; + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + _trans_a = trans_a ? 'T' : 'N'; + _trans_b = trans_b ? 'T' : 'N'; + + auto s8_a = true; + auto packed_b = gemm_mode == PACKED_MKLGEMM; + char oc_mode = 'R'; + auto ocsize = oc_mode == 'R' ? n : oc_mode == 'C' ? 
m : 1; + _oc_offset.re_alloc(Shape({1, 1, 1, ocsize}), AK_INT32); + fill_tensor_const(_oc_offset, 0); + auto status = _packed_s8s8s32_gemm.init(ptr_b, _oc_offset.data(), &_s8s8s32_handle, oc_mode, + m, n, k, 0, 0, s8_a, packed_b, trans_a, trans_b, + 0.f, 1.f, _lda, _ldb, _ldc); + CHECK_EQ(status, SaberSuccess); + + + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::dispatch( + const float alpha, const float beta, int m, + const int8_t* ptr_a, const int8_t* ptr_b, int* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_c != nullptr); +#if MKL_GEMM_TIMER + Context ctx(0, 0, 0); + SaberTimer timer; + timer.start(ctx); +#endif + + if (_gemm_mode == PACKED_MKLGEMM) { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c); + CHECK_EQ(status, SaberSuccess); + } else { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c, ptr_b); + CHECK_EQ(status, SaberSuccess); + } + +#if MKL_GEMM_TIMER + timer.end(ctx); + double ms = timer.get_average_ms(); + double work_load = (double)_m * _n * _k * 2; + double speed = work_load / ms / 1000.0 / 1000.0; + LOG(INFO) << "mkldnn_gemm_s8s8s32 " << _m << "," << _n << "," << _k << "," << ms << "," << speed; +#endif + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx, const int8_t* ptr_b, MKLGemmMode gemm_mode) { + _gemm_mode = gemm_mode; + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + _trans_a = trans_a ? 'T' : 'N'; + _trans_b = trans_b ? 'T' : 'N'; + + auto s8_a = false; + auto packed_b = gemm_mode == PACKED_MKLGEMM; + char oc_mode = 'R'; + auto ocsize = oc_mode == 'R' ? n : oc_mode == 'C' ? m : 1; + _oc_offset.re_alloc(Shape({1, 1, 1, ocsize}), AK_INT32); + fill_tensor_const(_oc_offset, 0); + auto status = _packed_s8s8s32_gemm.init(ptr_b, _oc_offset.data(), &_s8s8s32_handle, oc_mode, + m, n, k, 0, 0, s8_a, packed_b, trans_a, trans_b, + 0.f, 1.f, _lda, _ldb, _ldc); + CHECK_EQ(status, SaberSuccess); + + + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::dispatch( + const float alpha, const float beta, int m, + const uint8_t* ptr_a, const int8_t* ptr_b, int* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_c != nullptr); +#if MKL_GEMM_TIMER + Context ctx(0, 0, 0); + SaberTimer timer; + timer.start(ctx); +#endif + + if (_gemm_mode == PACKED_MKLGEMM) { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c); + CHECK_EQ(status, SaberSuccess); + } else { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c, ptr_b); + CHECK_EQ(status, SaberSuccess); + } + +#if MKL_GEMM_TIMER + timer.end(ctx); + double ms = timer.get_average_ms(); + double work_load = (double)_m * _n * _k * 2; + double speed = work_load / ms / 1000.0 / 1000.0; + LOG(INFO) << "mkldnn_gemm_s8s8s32 " << _m << "," << _n << "," << _k << "," << ms << "," << speed; +#endif + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::init( + const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx, const char* ptr_b, MKLGemmMode gemm_mode) { + _gemm_mode = gemm_mode; + _lda = (!trans_a) ? k : m; + _ldb = (!trans_b) ? n : k; + _ldc = n; + _m = m; + _n = n; + _k = k; + _trans_a = trans_a ? 'T' : 'N'; + _trans_b = trans_b ? 'T' : 'N'; + + auto s8_a = false; + auto packed_b = gemm_mode == PACKED_MKLGEMM; + char oc_mode = 'R'; + auto ocsize = oc_mode == 'R' ? n : oc_mode == 'C' ? 
m : 1; + _oc_offset.re_alloc(Shape({1, 1, 1, ocsize}), AK_INT32); + fill_tensor_const(_oc_offset, 0); + auto status = _packed_s8s8s32_gemm.init(ptr_b, _oc_offset.data(), &_s8s8s32_handle, oc_mode, + m, n, k, 0, 0, s8_a, packed_b, trans_a, trans_b, + 0.f, 1.f, _lda, _ldb, _ldc); + CHECK_EQ(status, SaberSuccess); + + + return SaberSuccess; +} + +template <> +SaberStatus MklDnnGemm::dispatch( + const float alpha, const float beta, int m, + const unsigned char* ptr_a, const char* ptr_b, int* ptr_c) { + CHECK(ptr_a != nullptr); + CHECK(ptr_c != nullptr); +#if MKL_GEMM_TIMER + Context ctx(0, 0, 0); + SaberTimer timer; + timer.start(ctx); +#endif + + if (_gemm_mode == PACKED_MKLGEMM) { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c); + CHECK_EQ(status, SaberSuccess); + } else { + auto status = _packed_s8s8s32_gemm.execute(_s8s8s32_handle, m, ptr_a, ptr_c, ptr_b); + CHECK_EQ(status, SaberSuccess); + } + +#if MKL_GEMM_TIMER + timer.end(ctx); + double ms = timer.get_average_ms(); + double work_load = (double)_m * _n * _k * 2; + double speed = work_load / ms / 1000.0 / 1000.0; + LOG(INFO) << "mkldnn_gemm_s8s8s32 " << _m << "," << _n << "," << _k << "," << ms << "," << speed; +#endif + return SaberSuccess; +} + + + + + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/mkl_gemm.h b/saber/funcs/impl/x86/mkl_gemm.h new file mode 100644 index 000000000..854d64a67 --- /dev/null +++ b/saber/funcs/impl/x86/mkl_gemm.h @@ -0,0 +1,72 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_MKL_GEMM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_MKL_GEMM_H + +#include "saber/core/tensor.h" +#include "saber/funcs/gemm.h" +#include "saber/funcs/impl/x86/mkl_gemm_int8.h" + +namespace anakin { +namespace saber { + + +enum MKLGemmMode : int{ + NORMAL_MKLGEMM=0, + PACKED_MKLGEMM +}; + +template +class MklDnnGemm{ + +public: + + + MklDnnGemm():_s8s8s32_handle(nullptr){}; + ~MklDnnGemm() { + if (_weights_packed_ptr_fp32 != nullptr){ + cblas_sgemm_free(_weights_packed_ptr_fp32); + } + if (_s8s8s32_handle!= nullptr){ + _packed_s8s8s32_gemm.release(_s8s8s32_handle); + } + } + + SaberStatus init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, + Context ctx,const inDtype_B* ptr_b= nullptr,MKLGemmMode gemm_mode = PACKED_MKLGEMM); + + SaberStatus dispatch(const float alpha, const float beta,int m, + const inDtype_A* a, const inDtype_B* b, + outDtype* c); + +private: + MKLGemmMode _gemm_mode{NORMAL_MKLGEMM}; + float* _weights_packed_ptr_fp32{nullptr}; + int _m{-1}; + int _n{-1}; + int _k{-1}; + int _lda{-1}; + int _ldb{-1}; + int _ldc{-1}; + float _alpha{1.f}; + float _beta{0.f}; + char _trans_a{'N'}; + char _trans_b{'N'}; + char _b_pack{'T'}; + char _offset_c_flag{'F'}; + int8_t _offset_a{0}; + int8_t _offset_b{0}; + int32_t _offset_c{0}; + + MKLGEMM _packed_s8s8s32_gemm; + Tensor _oc_offset; + void* _s8s8s32_handle{nullptr}; + + +}; + + +} +} + +#endif \ No newline at end of file diff --git a/saber/funcs/impl/x86/mkl_gemm_int8.cpp b/saber/funcs/impl/x86/mkl_gemm_int8.cpp new file mode 100644 index 000000000..a374e052b --- /dev/null +++ b/saber/funcs/impl/x86/mkl_gemm_int8.cpp @@ -0,0 +1,378 @@ +#include "saber/funcs/impl/x86/mkl_gemm_int8.h" +#include "saber/funcs/impl/x86/x86_utils.h" + +namespace anakin { +namespace saber { + +template<> +SaberStatus MKLGEMM::mem_a_s82u8(const int8_t* src, size_t length) { + if (src == nullptr) { + LOG(FATAL) << "wrong empty pointer !"; + return SaberInvalidValue; + } + + utils::try_expand_tensor(_inner_u8_matrix_a, length); + + 
uint8_t* inner_u8_ptr = static_cast(_inner_u8_matrix_a.data()); + uint8_t* scr_pointer = (uint8_t*)src; +#pragma omp parallel for + + for (auto i = 0; i < length; i++) { + inner_u8_ptr[i] = scr_pointer[i] + 128; + } + + return SaberSuccess; +} + +template<> +void* MKLGEMM::mem_oc_s8a_compute(void* handle) { + if (handle == nullptr) { + LOG(FATAL) << "wrong empty pointer !"; + return nullptr; + } + + auto args = static_cast(handle); + auto b_mem = static_cast(args->matrix_b); + + if (b_mem == nullptr) { + LOG(FATAL) << "wrong empty pointer !"; + return nullptr; + } + + if (args->s8_a) { + auto dim_k = args->k; + auto dim_n = args->n; + auto ob = args->ob; + auto dst = static_cast(calloc(dim_n, sizeof(int32_t))); + auto oc_mem = args->matrix_oc + ? static_cast(args->matrix_oc) + : nullptr; + auto fix_oc = oc_mem ? oc_mem[0] : 0; + auto alpha = args->alpha; + auto scale = args->alpha * -128; + + auto thread_num = omp_max_thread; + + if (dim_n <= 2) { + thread_num = 1; + } else if (dim_n < omp_max_thread) { + thread_num = dim_n; + } + + if (args->oc_mode == 'F') { + if (args->trans_b) { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto i = 0; i < dim_n; i++) { + int32_t b_dim_k_sum = 0; +#pragma omp simd + for (auto j = 0; j < dim_k; j++) { + b_dim_k_sum += b_mem[i * dim_k + j] + ob; + } + + dst[i] += scale * b_dim_k_sum + fix_oc; + } + } else { + for (auto i = 0; i < dim_k; i++) { + if (i == 0) { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto j = 0; j < dim_n; j++) { + dst[j] += scale * (b_mem[i * dim_n + j] + ob) + fix_oc; + } + } else { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto j = 0; j < dim_n; j++) { + dst[j] += scale * (b_mem[i * dim_n + j] + ob); + } + } + + } + } + } else if (args->oc_mode == 'R') { + if (args->trans_b) { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto i = 0; i < dim_n; i++) { + int32_t b_dim_k_sum = 0; + #pragma omp simd + + for (auto j = 0; j < dim_k; j++) { + b_dim_k_sum += b_mem[i * dim_k + j] + ob; + } + + dst[i] += scale * b_dim_k_sum + oc_mem[i]; + } + } else { + for (auto i = 0; i < dim_k; i++) { + if (i == 0) { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto j = 0; j < dim_n; j++) { + dst[j] += scale * (b_mem[i * dim_n + j] + ob) + oc_mem[j]; + } + } else { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto j = 0; j < dim_n; j++) { + dst[j] += scale * (b_mem[i * dim_n + j] + ob); + } + } + } + } + } else if (args->oc_mode == 'C') { + if (args->trans_b) { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto i = 0; i < dim_n; i++) { + int32_t b_dim_k_sum = 0; +#pragma omp simd + for (auto j = 0; j < dim_k; j++) { + b_dim_k_sum += b_mem[i * dim_k + j] + ob; + } + + dst[i] += scale * b_dim_k_sum; + } + } else { + for (auto i = 0; i < dim_k; i++) { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto j = 0; j < dim_n; j++) { + dst[j] += scale * (b_mem[i * dim_n + j] + ob); + } + } + } + } + + return dst; + } + + return nullptr; +} +template<> +void MKLGEMM::add_mem_oc_s8a(char oc_mode, const int* oc_mem, + const void* b_in, int8_t ob, + size_t m, size_t k, size_t n, float alpha, bool trans_b) { + CHECK_EQ(oc_mode, 'R') << "only support C offset"; + CHECK_EQ(trans_b, false) << "only support no trans b now"; + auto thread_num = omp_max_thread; + + if (m <= 2) { + thread_num = 1; + } else if (m < omp_max_thread) { + thread_num = m; + } + + int8_t* b_mem = (int8_t*)b_in; + 
int scale = (int)round(alpha * -128); + int* oc_offset = (int*)_inner_c_offset.mutable_data(); + memset(oc_offset, 0, sizeof(int)*n); + + for (auto i = 0; i < k; i++) { + if (i == 0) { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto j = 0; j < n; j++) { + oc_offset[j] += scale * (b_mem[i * n + j] + ob) + oc_mem[j]; + } + } else { +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto j = 0; j < n; j++) { + oc_offset[j] += scale * (b_mem[i * n + j] + ob); + } + } + } + +} +template<> +SaberStatus MKLGEMM::add_mem_oc_s8a(bool a_s82u8, char oc_mode, + const void* in, void* out, + size_t dim_m, size_t dim_n) { + if (a_s82u8 && oc_mode == 'C') { + if (in == nullptr || out == nullptr) { + LOG(FATAL) << "wrong empty pointer !"; + return SaberInvalidValue; + } + + auto src = static_cast(in); + auto dst = static_cast(out); + + auto thread_num = omp_max_thread; + + if (dim_m <= 2) { + thread_num = 1; + } else if (dim_m < omp_max_thread) { + thread_num = dim_m; + } + +#pragma omp parallel for collapse(1) num_threads(thread_num) + for (auto h = 0; h < dim_m; h++) { +#pragma omp simd + for (auto w = 0; w < dim_n; w++) { + dst[h * dim_n + w] += src[w]; + } + } + } else{ + DLOG(INFO)<<"do nothing"; + } + + return SaberSuccess; +} + +template<> +void* MKLGEMM::pack_mem(const void* mem_in, + const bool pack_b, + const bool trans, + const size_t m, + const size_t n, + const size_t k, + const size_t stride, + const float alpha) { + CHECK_EQ(mem_in != nullptr, true) << "wrong empty pointer !"; + + void* mem_out = nullptr; + auto identifier = pack_b ? CblasBMatrix : CblasAMatrix; + auto need_trans = trans ? CblasTrans : CblasNoTrans; + auto length = cblas_gemm_s8u8s32_pack_get_size(identifier, m, n, k); + mem_out = malloc(length); + cblas_gemm_s8u8s32_pack(CblasRowMajor, + identifier, + need_trans, + m, + n, + k, + mem_in, + stride, + mem_out); + + return mem_out; +} + +template<> +SaberStatus MKLGEMM::execute(const void* mem_a, + const void* mem_b, + const void* mem_oc, + void* mem_c, + const bool s8_a, + const size_t m, + const size_t n, + const size_t k, + const int8_t oa, + const int8_t ob, + const size_t lda, + const size_t ldb, + const size_t ldc, + const bool pack_b, + const bool trans_a, + const bool trans_b, + const float beta, + const float alpha, + const char offset_mode) { + auto status = execute_check(mem_a, mem_b, mem_oc, mem_c, oa, ob, offset_mode); + + if (status != SaberSuccess) { + LOG(ERROR) << "check failed"; + return status; + } + + auto dst = static_cast(mem_c); + auto offset = static_cast(mem_oc); + auto a_trans = trans_a ? CblasTrans : CblasNoTrans; + auto b_trans = trans_b ? CblasTrans : CblasNoTrans; + auto b_mode = pack_b ? 
(CBLAS_TRANSPOSE)CblasPacked : b_trans; + auto oc_mode = CblasFixOffset; + + if (offset_mode == 'F') { + oc_mode = CblasFixOffset; + } else if (offset_mode == 'R') { + oc_mode = CblasRowOffset; + } else if (offset_mode == 'C') { + oc_mode = CblasColOffset; + } + + + if (pack_b) { + + cblas_gemm_s8u8s32_compute(CblasRowMajor, + a_trans, + b_mode, + oc_mode, + m, + n, + k, + alpha, + mem_a, + lda, + oa, + mem_b, + ldb, + ob, + beta, + dst, + ldc, + offset); + + } else { + cblas_gemm_s8u8s32(CblasRowMajor, + a_trans, + b_trans, + oc_mode, + m, + n, + k, + alpha, + mem_a, + lda, + oa, + mem_b, + ldb, + ob, + beta, + dst, + ldc, + offset); + } + + return SaberSuccess; +}; + +template<> +SaberStatus MKLGEMM::execute(const void* handle, const int m, const void* a_matrix, void* c_matrix, + const void* b_matrix) { + auto args = static_cast(handle); + + auto status = SaberSuccess; + + CHECK(args->pack_b || b_matrix != nullptr); + ((gemm_param*)(handle))->m=m; + + if (args->s8_a) { + mem_a_s82u8(static_cast(a_matrix), args->m * args->k); + + if (args->pack_b) { + CHECK_EQ(args->oc_mode, 'R'); + status = execute(_inner_u8_matrix_a.data(), args->pack_b ? args->packed_mem : b_matrix, + args->oc_mem_s8a, + c_matrix, args->s8_a, args->m, args->n, args->k, args->oa, + args->ob, args->lda, args->ldb, args->ldc, args->pack_b, + args->trans_a, args->trans_b, args->beta, args->alpha, + args->s8a_oc_mode); + } else { + CHECK_EQ(args->oc_mode, 'R'); + add_mem_oc_s8a(args->oc_mode, (int*)args->matrix_oc, b_matrix, args->ob, args->m, args->k, args->n, + args->alpha, args->trans_b); + status = execute(_inner_u8_matrix_a.data(), args->pack_b ? args->packed_mem : b_matrix, + (int*)_inner_c_offset.mutable_data(), + c_matrix, args->s8_a, args->m, args->n, args->k, args->oa, + args->ob, args->lda, args->ldb, args->ldc, args->pack_b, + args->trans_a, args->trans_b, args->beta, args->alpha, + args->s8a_oc_mode); + } + } else { + status = execute(a_matrix, args->pack_b ? args->packed_mem : b_matrix, + args->matrix_oc, c_matrix, args->s8_a, args->m, args->n, args->k, + args->oa, args->ob, args->lda, args->ldb, args->ldc, args->pack_b, + args->trans_a, args->trans_b, args->beta, args->alpha, args->oc_mode); + } + + if (status != SaberSuccess) { + return status; + } + + return SaberSuccess; +} + +} // namespace saber +} // namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/x86/mkl_gemm_int8.h b/saber/funcs/impl/x86/mkl_gemm_int8.h new file mode 100644 index 000000000..46df10b91 --- /dev/null +++ b/saber/funcs/impl/x86/mkl_gemm_int8.h @@ -0,0 +1,291 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_MKL_GEMM_INT8_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_MKL_GEMM_INT8_H + +#include "mkl.h" + +#include "saber/saber_types.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include "saber/funcs/impl/x86/anakin_thread.h" + +namespace anakin { +namespace saber { + +template +class MKLGEMM { + +public: + typedef typename DataTrait::Dtype OP_DType; + + MKLGEMM() + : omp_max_thread(anakin_get_max_threads()) + {} + + ~MKLGEMM() {} + + SaberStatus init(const void* mem_b, + const void* mem_oc, + void** handle, + const char oc_mode, + const size_t m, + const size_t n, + const size_t k, + const int8_t oa, + const int8_t ob, + const bool s8_a, + const bool pack_b, + const bool trans_a, + const bool trans_b, + const float beta, + const float alpha, + const size_t lda, + const size_t ldb, + const size_t ldc); + + SaberStatus execute(const void* handle,const int m, + const void* a_matrix, + void* c_matrix, const void* b_matrix=nullptr); + + SaberStatus release(void* handle); + + void* pack_mem(const void* mem_in, + const bool pack_b, + const bool trans, + const size_t m, + const size_t n, + const size_t k, + const size_t stride, + const float alpha); + + SaberStatus execute(const void* a_matrix, + const void* b_matrix, + const void* oc_matrix, + void* c_matrix, + const bool s8_a, + const size_t m, + const size_t n, + const size_t k, + const int8_t oa, + const int8_t ob, + const size_t lda, + const size_t ldb, + const size_t ldc, + const bool pack_b, + const bool trans_a, + const bool trans_b, + const float beta, + const float alpha, + const char oc_mode); + +private: + size_t omp_max_thread; + + struct gemm_param { + const void* matrix_b{nullptr}; + const void* matrix_oc{nullptr}; + void* packed_mem{nullptr}; + void* oc_mem_s8a{nullptr}; + char oc_mode{' '}; + char s8a_oc_mode{' '}; + size_t m{0}; + size_t n{0}; + size_t k{0}; + size_t lda{0}; + size_t ldb{0}; + size_t ldc{0}; + size_t oa{0}; + int8_t ob{0}; + bool s8_a{false}; + bool pack_b{false}; + bool trans_a{false}; + bool trans_b{false}; + float beta{0.f}; + float alpha{0.f}; + }; + + SaberStatus init_check(const void* mem_b, + const void* mem_oc, + const size_t oa, + const int8_t ob, + const char oc_mode); + + SaberStatus execute_check(const void* mem_a, + const void* mem_b, + const void* mem_oc, + void* mem_c, + const size_t oa, + const int8_t ob, + const char oc_mode); + + SaberStatus mem_a_s82u8(const int8_t* src, size_t length); + + void* mem_oc_s8a_compute(void* handle); + + SaberStatus add_mem_oc_s8a(bool a_s82u8, char oc_mode, const void* in, + void* out, size_t m, size_t n); + void add_mem_oc_s8a(char oc_mode, const int* oc_mem, const void* b_in,int8_t ob, + size_t m, size_t k, size_t n, float alpha, bool trans_b); + Tensor _inner_u8_matrix_a; + Tensor _inner_c_offset; +}; + +template +SaberStatus MKLGEMM::init_check(const void* mem_b, + const void* mem_oc, + const size_t oa, + const int8_t ob, + const char oc_mode) { + if (mem_b == nullptr || mem_oc == nullptr) { + LOG(ERROR) << "wrong empty pointer !"; + return SaberInvalidValue; + } + + if (oc_mode != 'F' && + oc_mode != 'C' && + oc_mode != 'R') { + LOG(ERROR) << "wrong mem_oc mode !"; + return SaberInvalidValue; + } + + if (op_dtype == AK_FLOAT && (oa != 0 || ob != 0)) { + LOG(ERROR) << "don't support offset a,b for float op!"; + return SaberInvalidValue; + } + + return SaberSuccess; +}; + +template +SaberStatus MKLGEMM::execute_check(const void* mem_a, + const void* mem_b, + const void* mem_oc, + void* mem_c, 
+ const size_t oa, + const int8_t ob, + const char oc_mode) { + if (mem_a == nullptr || + mem_b == nullptr || + mem_c == nullptr || + mem_oc == nullptr) { + LOG(FATAL) << "wrong empty pointer !"; + return SaberInvalidValue; + } + + if (oc_mode != 'F' && + oc_mode != 'C' && + oc_mode != 'R') { + LOG(FATAL) << "wrong mem_oc mode !"; + return SaberInvalidValue; + } + + if (op_dtype == AK_FLOAT && (oa != 0 || ob != 0)) { + LOG(FATAL) << "don't support offset a,b for float op!"; + return SaberInvalidValue; + } + + return SaberSuccess; +}; + +template +SaberStatus MKLGEMM::init(const void* mem_b, + const void* mem_oc, + void** handle, + const char oc_mode, + const size_t m, + const size_t n, + const size_t k, + const int8_t oa, + const int8_t ob, + const bool s8_a, + const bool pack_b, + const bool trans_a, + const bool trans_b, + const float beta, + const float alpha, + const size_t lda, + const size_t ldb, + const size_t ldc) { + auto status = init_check(mem_b, mem_oc, oa, ob, oc_mode); + + if (status != SaberSuccess) { + return status; + } + + auto args = new gemm_param; + + args->s8_a = op_dtype == AK_INT8 ? s8_a : false; + args->oc_mode = oc_mode; + args->s8a_oc_mode = args->oc_mode == 'C' ? 'C' : 'R'; + args->m = m; + args->n = n; + args->k = k; + args->oa = oa; + args->ob = ob; + args->lda = lda; + args->ldb = ldb; + args->ldc = ldc; + args->pack_b = pack_b; + args->trans_a = trans_a; + args->trans_b = trans_b; + args->beta = beta; + args->alpha = alpha; + + args->matrix_b = mem_b; + args->matrix_oc = mem_oc; + args->packed_mem = nullptr; + args->oc_mem_s8a = nullptr; + + if (args->pack_b) { + args->packed_mem = pack_mem(args->matrix_b, true, args->trans_b, + args->m, args->n, args->k, args->ldb, args->alpha); + } + if (args->s8_a){ + _inner_u8_matrix_a.re_alloc(Shape({1,1,m,k}), AK_UINT8); + _inner_c_offset.re_alloc(Shape({1,1,1,n}),AK_INT32); + } + + args->oc_mem_s8a = mem_oc_s8a_compute(args); + + *handle = args; + args = nullptr; + return SaberSuccess; +}; + +template +SaberStatus MKLGEMM::release(void* handle) { + auto args = static_cast(handle); + + if (args->packed_mem) { + free(args->packed_mem); + args->packed_mem = nullptr; + } + + if (args->oc_mem_s8a) { + free(args->oc_mem_s8a); + args->oc_mem_s8a = nullptr; + } + + delete (args); + return SaberSuccess; +} + +} +} + + +#endif //ANAKIN_MKL_GEMM_INT8_H diff --git a/saber/funcs/impl/x86/mkl_packed_int8_gemm.cpp b/saber/funcs/impl/x86/mkl_packed_int8_gemm.cpp new file mode 100644 index 000000000..d38ae023d --- /dev/null +++ b/saber/funcs/impl/x86/mkl_packed_int8_gemm.cpp @@ -0,0 +1,100 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "saber/funcs/impl/x86/mkl_packed_int8_gemm.h" +#include "saber/funcs/impl/x86/x86_utils.h" + +namespace anakin { +namespace saber { + +SaberStatus PackedMKLInt8Gemm::init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, Tensor& b, float scale_a) { + _scale.clear(); + if (b.get_dtype() == AK_FLOAT) { + _int8_weights_wx.re_alloc(Shape({1, 1, k, n}), AK_INT8); + utils::ScaleUtils::scale_gemm_xw_weights_to_nchw_host(_int8_weights_wx, b, !trans_b); + _wx_gemm.init(trans_a, trans_b, m, n, k, 0, (int8_t*)_int8_weights_wx.data(), PACKED_MKLGEMM); + } else if (b.get_dtype() == AK_INT8){ + _int8_weights_wx.set_scale(b.get_scale()); + _wx_gemm.init(trans_a, trans_b, m, n, k, 0, (int8_t*)b.data(), PACKED_MKLGEMM); + } else{ + LOG(FATAL)<<"not support"; + } + for (auto i:_int8_weights_wx.get_scale()){ + _scale.push_back(i * scale_a); + } + + + _scale_in.re_alloc(Shape({1, 1, m, k}, Layout_NCHW), AK_INT8); + _m = m; + _n = n; + _k = k; + return SaberSuccess; +} +SaberStatus PackedMKLInt8Gemm::dispatch(const float alpha, const float beta, int m, + const Tensor& a, Tensor& c, Tensor* bias) { + if (a.get_dtype() == AK_FLOAT && c.get_dtype() == AK_INT32) { + CHECK(bias == nullptr || bias->valid_size() == 0); + CHECK_EQ(a.get_layout(), Layout_NCHW); + utils::try_expand_tensor(_scale_in, m * _k); + utils::ScaleUtils::scale_fp32_int8(_scale_in, a); + _wx_gemm.dispatch(alpha, beta, m, (int8_t*)_scale_in.data(), nullptr, (int32_t*)c.data()); + } else if (a.get_dtype() == AK_FLOAT && c.get_dtype() == AK_FLOAT) { + CHECK_EQ(a.get_layout(), Layout_NCHW); + utils::try_expand_tensor(_scale_in, m * _k); + utils::ScaleUtils::scale_fp32_int8(_scale_in, a); + _wx_gemm.dispatch(alpha, beta, m, (int8_t*)_scale_in.data(), nullptr, (int32_t*)c.data()); + CHECK(_int8_weights_wx.get_scale().size() > 0); + float* out_fp32 = static_cast(c.mutable_data()); + const float* scale_vec = _scale.data(); + int32_t* in_epi32 = static_cast(c.data()); + if (bias == nullptr || bias->valid_size() == 0) { + if (_scale.size() == _n) { + for (int i = 0; i < m * _n; i++) { + out_fp32[i] = (float) in_epi32[i] * scale_vec[i % _n]; + } + } else if (_scale.size() == 1) { + float scale = scale_vec[0]; + + for (int i = 0; i < m * _n; i++) { + out_fp32[i] = (float) in_epi32[i] * scale; + } + } + } else { + CHECK_EQ(bias->get_dtype(), AK_FLOAT); + const float* bias_ptr = static_cast(bias->data()); + if (_scale.size() == _n) { + for (int i = 0; i < m * _n; i++) { + out_fp32[i] = (float) in_epi32[i] * scale_vec[i % _n] + bias_ptr[i % _n]; + } + } else if (_scale.size() == 1) { + float scale = scale_vec[0]; + + for (int i = 0; i < m * _n; i++) { + out_fp32[i] = (float) in_epi32[i] * scale + bias_ptr[i % _n]; + } + } + } + } else if (a.get_dtype() == AK_INT8 && c.get_dtype() == AK_INT32) { + CHECK(bias == nullptr || bias->valid_size() == 0); + _wx_gemm.dispatch(alpha, beta, m, (int8_t*)a.data(), nullptr, (int32_t*)c.data()); + } else{ + LOG(FATAL)<<"not support "; + } + return SaberSuccess; +} + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/mkl_packed_int8_gemm.h b/saber/funcs/impl/x86/mkl_packed_int8_gemm.h new file mode 100644 index 000000000..e068471f7 --- /dev/null +++ b/saber/funcs/impl/x86/mkl_packed_int8_gemm.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_MKL_PACKED_INT8_GEMM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_MKL_PACKED_INT8_GEMM_H + +#include "saber/funcs/impl/x86/mkl_gemm.h" +#include "saber/core/tensor.h" +namespace anakin { +namespace saber { +class PackedMKLInt8Gemm { +public: + SaberStatus init(const bool trans_a, const bool trans_b, + const int m, const int n, const int k, Tensor& b, float scale_a = 1.f); + SaberStatus dispatch(const float alpha, const float beta, int m, + const Tensor& a, Tensor& c, Tensor* bias = nullptr); + +private: + MklDnnGemm _wx_gemm; + Tensor _int8_weights_wx; + Tensor _scale_in; + Tensor _scale_out; + std::vector _scale; + int _m; + int _n; + int _k; +}; +} +} +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_MKL_PACKED_INT8_GEMM_H diff --git a/saber/funcs/impl/x86/mkldnn_helper.cpp b/saber/funcs/impl/x86/mkldnn_helper.cpp new file mode 100644 index 000000000..2ba3f6eaf --- /dev/null +++ b/saber/funcs/impl/x86/mkldnn_helper.cpp @@ -0,0 +1,93 @@ +#include "anakin_config.h" +#ifndef USE_SGX +#include "saber/funcs/impl/x86/mkldnn_helper.h" + +namespace anakin{ +namespace saber{ + +mkldnn_mem_format get_mkldnn_format(LayoutType layout){ + switch (layout){ + case Layout_NCHW: + return mkldnn_mem_format::nchw; + case Layout_NCHW_C8R: + return mkldnn_mem_format::nChw8c; + default : + return mkldnn_mem_format::nchw; + } +} +mkldnn_mem_format get_mkldnn_format(LayoutType in_layout, LayoutType out_layout){ + if (in_layout == Layout_NCHW){ + switch (out_layout){ + case Layout_NCHW: + return mkldnn_mem_format::oihw; + case Layout_NCHW_C8R: + return mkldnn_mem_format::Oihw8o; + default: + return mkldnn_mem_format::format_undef; + } + + } + if (in_layout == Layout_NCHW_C8R){ + switch (out_layout){ + case Layout_NCHW: + return mkldnn_mem_format::oIhw8i; + case Layout_NCHW_C8R: + return mkldnn_mem_format::OIhw8i8o; + default: + return mkldnn_mem_format::format_undef; + } + } + return mkldnn_mem_format::format_undef; +} +mkldnn_mem_dtype get_mkldnn_dtype(DataType dtype){ + switch (dtype){ + case AK_FLOAT: + return mkldnn_mem_dtype::f32; + case AK_INT8: + return mkldnn_mem_dtype::u8; + default: + return mkldnn_mem_dtype::f32; + } +} +desc create_mkldnn_memory_desc( + const std::vector& dims, + mkldnn_mem_dtype dtype, + mkldnn_mem_format layout){ + mkldnn_mem_dim tz = dims; + return desc({tz}, dtype, layout); +} + +mkldnn_mem_ptr create_mkldnn_memory(Tensor* tensor, mkldnn::engine e){ + + mkldnn_mem_format mft = get_mkldnn_format(tensor -> get_layout()); + mkldnn_mem_dtype dt = get_mkldnn_dtype(tensor -> get_dtype()); + mkldnn_mem_dim dim = tensor -> shape(); + + return mkldnn_mem_ptr(new mkldnn_mem({ { {dim}, dt, mft}, e}, tensor->mutable_data())); +} +mkldnn_mem_ptr create_mkldnn_memory_no_data(const Tensor* tensor, mkldnn::engine e){ + + mkldnn_mem_format mft = get_mkldnn_format(tensor -> get_layout()); + mkldnn_mem_dtype dt = get_mkldnn_dtype(tensor -> get_dtype()); + mkldnn_mem_dim dim = tensor -> shape(); + + return mkldnn_mem_ptr(new mkldnn_mem({ { {dim}, dt, mft}, e})); +} +mkldnn_mem_ptr create_mkldnn_memory(Tensor* tensor, const std::vector& sh, mkldnn::engine e){ + 
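+    // This overload aliases the tensor's existing buffer but takes the memory
+    // dims from the caller-supplied shape `sh` rather than from the tensor's
+    // own shape, so the same data can be handed to a primitive that expects
+    // different dims; the data type and mkldnn format still come from the tensor.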
mkldnn_mem_format mft = get_mkldnn_format(tensor -> get_layout()); + mkldnn_mem_dtype dt = get_mkldnn_dtype(tensor -> get_dtype()); + mkldnn_mem_dim dim = sh; + + return mkldnn_mem_ptr(new mkldnn_mem({ { {dim}, dt, mft}, e}, tensor->mutable_data())); +} + +mkldnn_mem_ptr create_mkldnn_memory(Tensor* tensor,const std::vector& sh, + mkldnn_mem_format mft, mkldnn_mem_dtype dt, mkldnn::engine e){ + mkldnn_mem_dim dim = sh; + return mkldnn_mem_ptr(new mkldnn_mem({ { {dim}, dt, mft}, e}, tensor->mutable_data())); +} + + +} +} +#endif diff --git a/saber/funcs/impl/x86/mkldnn_helper.h b/saber/funcs/impl/x86/mkldnn_helper.h new file mode 100644 index 000000000..b75c36afb --- /dev/null +++ b/saber/funcs/impl/x86/mkldnn_helper.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_MKLDNN_HELPER_H +#define ANAKIN_SABER_MKLDNN_HELPER_H + +#include "anakin_config.h" +#include "saber/core/common.h" +#include "saber/saber_types.h" +#include "saber/core/tensor.h" + +#include "mkldnn.hpp" + +namespace anakin{ + +namespace saber{ + +typedef mkldnn::memory::data_type mkldnn_mem_dtype; +typedef mkldnn::memory::format mkldnn_mem_format; +typedef mkldnn::memory::dims mkldnn_mem_dim; +typedef mkldnn::memory mkldnn_mem; +typedef std::shared_ptr mkldnn_mem_ptr; +typedef mkldnn::deconvolution_forward mkldnn_deconv; +typedef mkldnn::convolution_forward mkldnn_conv; +typedef mkldnn::eltwise_forward mkldnn_relu; + +template +using desc = typename T::desc; +template +using pdesc = typename T::primitive_desc; + +mkldnn_mem_format get_mkldnn_format(LayoutType layout); +mkldnn_mem_format get_mkldnn_format(LayoutType in_layout, LayoutType out_layout); +mkldnn_mem_dtype get_mkldnn_dtype(DataType dtype); + +desc create_mkldnn_memory_desc( + const std::vector& dims, + mkldnn_mem_dtype dtype, + mkldnn_mem_format layout); + +template +desc create_mkldnn_memory_desc(const std::vector& sh, + mkldnn_mem_format fmt = mkldnn_mem_format::any){ + mkldnn_mem_dim tz = sh; + mkldnn_mem_dtype dt = get_mkldnn_dtype(Dtype); + return desc({tz}, dt, fmt); +} + +mkldnn_mem_ptr create_mkldnn_memory(Tensor* tensor, mkldnn::engine e); + +mkldnn_mem_ptr create_mkldnn_memory(Tensor* tensor, const std::vector& sh, mkldnn::engine e); + +mkldnn_mem_ptr create_mkldnn_memory(Tensor* tensor, const std::vector& sh, + mkldnn_mem_format mft, mkldnn_mem_dtype dt, mkldnn::engine e); + +mkldnn_mem_ptr create_mkldnn_memory_no_data(const Tensor* tensor, mkldnn::engine e); + + +} // namespace mkldnn +} // namespace anakin + +#endif //SABER_MKLDNN_HELPER_H diff --git a/saber/funcs/impl/x86/saber_activation.cpp b/saber/funcs/impl/x86/saber_activation.cpp index ff4c60783..e91d724b8 100644 --- a/saber/funcs/impl/x86/saber_activation.cpp +++ b/saber/funcs/impl/x86/saber_activation.cpp @@ -1,5 +1,13 @@ #include "saber/funcs/impl/x86/saber_activation.h" +#include "saber/funcs/impl/x86/saber_normal_activation.h" +#include "mkl.h" +#include "saber/funcs/impl/x86/saber_avx512_funcs.h" 
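+// A minimal sketch (illustrative only, not called anywhere in this file) of the
+// dispatch pattern used below: the 8-wide kernels from saber_avx2_funcs.h are
+// compiled in only under __AVX2__/__FMA__, and each dispatch keeps a scalar
+// fallback for other build targets.
+namespace anakin { namespace saber {
+#if defined(__AVX2__) and defined(__FMA__)
+void avx2_vector_relu(const float* in, int length, float* out);  // from saber_avx2_funcs.h, included just below
+#endif
+static inline void relu_dispatch_sketch(const float* in, int len, float* out) {
+#if defined(__AVX2__) and defined(__FMA__)
+    avx2_vector_relu(in, len, out);          // vectorized path, 8 floats per step
+#else
+    for (int i = 0; i < len; ++i) {          // portable scalar fallback
+        out[i] = in[i] > 0.f ? in[i] : 0.f;
+    }
+#endif
+}
+} } // namespace anakin::saber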
+#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#include +#if defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif #include namespace anakin{ @@ -25,6 +33,94 @@ SaberStatus SaberActivation::create( return SaberSuccess; } +static void excute_prelu(const std::vector*>& inputs, + std::vector*>& outputs, + ActivationParam& param) { + LayoutType in_layout = inputs[0]->get_layout(); + LayoutType out_layout = outputs[0]->get_layout(); + PreluParam prelu = param.prelu_param; + +#if defined(__AVX2__) and defined(__FMA__) + + if (prelu.channel_shared) { + for (size_t i = 0; i < inputs.size(); i++) { + const float* input_data = (float*)inputs[i]->data(); + float* output_data = (float*)outputs[i]->mutable_data(); + int size = inputs[i]->valid_size(); + float* slope_ptr = (float*)prelu.slope->data(); + float alpha = slope_ptr[0]; + const __m256 prelu_alpha = _mm256_set1_ps(alpha); + int round_length = size/8*8; + int remainder = size % 8; + if (alpha > 1.f) { +#pragma omp parallel for + + for (int index = 0; index < round_length; index += 8) { + __m256 temp = _mm256_load_ps(&input_data[index]); + __m256 temp_mul = _mm256_mul_ps(temp, prelu_alpha); + temp = _mm256_min_ps(temp, temp_mul); + _mm256_store_ps(&output_data[index], temp); + } + if (remainder > 0) { + __m256i _vec_mask = _m256_continue_mask_m256i(remainder); + __m256 temp = _mm256_maskload_ps(&input_data[round_length], _vec_mask); + __m256 temp_mul = _mm256_mul_ps(temp, prelu_alpha); + __m256 _vec_mask_m256 = _m256_continue_mask_m256(remainder); + temp = _mm256_min_ps(temp, temp_mul); + _mm256_maskstore_ps(&output_data[round_length], _vec_mask, temp); + } + } else { +#pragma omp parallel for + + for (int index = 0; index < round_length; index += 8) { + __m256 temp = _mm256_load_ps(&input_data[index]); + __m256 temp_mul = _mm256_mul_ps(temp, prelu_alpha); + temp = _mm256_max_ps(temp, temp_mul); + _mm256_store_ps(&output_data[index], temp); + } + if (remainder > 0) { + __m256i _vec_mask = _m256_continue_mask_m256i(remainder); + __m256 temp = _mm256_maskload_ps(&input_data[round_length], _vec_mask); + __m256 temp_mul = _mm256_mul_ps(temp, prelu_alpha); + __m256 _vec_mask_m256 = _m256_continue_mask_m256(remainder); + temp = _mm256_max_ps(temp, temp_mul); + _mm256_maskstore_ps(&output_data[round_length], _vec_mask, temp); + } + } + } + return; + } + +#endif + + + for (size_t i = 0; i < inputs.size(); i++) { + const float* input_data = (float*)inputs[i]->data(); + float* output_data = (float*)outputs[i]->mutable_data(); + Shape shin = inputs[i]->valid_shape(); + int num = shin[0]; + int channel = shin[1]; + int size = shin[2] * shin[3]; + + for (int n = 0; n < num; n++) { + const float* in_ptr = input_data + n * channel * size; + float* out_ptr = output_data + n * channel * size; + float* slope_ptr = (float*)prelu.slope->data(); + + for (int c = 0; c < channel; c++) { + const float* in_ch_ptr = in_ptr + c * size; + float* out_ch_ptr = out_ptr + c * size; + float slope = prelu.channel_shared ? slope_ptr[0] : slope_ptr[c]; + + for (int k = 0; k < size; k++) { + out_ch_ptr[k] = in_ch_ptr[k] > 0 ? 
in_ch_ptr[k] : in_ch_ptr[k] * slope; + } + } + } + } + +} + template SaberStatus SaberActivation::dispatch( const std::vector*>& inputs, @@ -37,12 +133,15 @@ SaberStatus SaberActivation::dispatch( size_t len = inputs[vc]->valid_size(); OpDataType *input_data = (OpDataType*)inputs[vc]->mutable_data(); OpDataType *output_data = (OpDataType*)outputs[vc]->mutable_data(); - + outputs[vc]->set_posstive_flag(true); +#if defined(__AVX2__) and defined(__FMA__) + avx2_vector_relu(input_data,len,output_data); +#else +#pragma omp parallel for schedule(static) for (size_t i = 0; i < len; i++) { - *output_data = *input_data > (OpDataType)0 ? *input_data : (OpDataType)0; - input_data++; - output_data++; + output_data[i] = input_data[i] > (OpDataType)0 ? input_data[i] : (OpDataType)0; } +#endif } } @@ -64,24 +163,33 @@ SaberStatus SaberActivation::dispatch( for ( size_t i = 0; i < inputs.size() ; i++) { size_t len = inputs[i]->valid_size(); const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + outputs[i]->set_posstive_flag(true); OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); - +#if defined(__AVX512F__) + avx512_vector_sigmoid(input_data, len, output_data); +#elif defined(__AVX2__) and defined(__FMA__) + avx2_vector_sigmoid(input_data, len, output_data); +#else for (size_t j = 0; j < len; j++) { output_data[j] = 1.0f / (1.0f + exp(-input_data[j])); } +#endif } } // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) if (param.active == Active_tanh) { + for (size_t i = 0; i < inputs.size(); i++) { + size_t len = inputs[i]->valid_size(); + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); - - for (size_t j = 0; j < len; j++) { - output_data[j] = tanh(input_data[j]); - } + vsTanh(len,input_data,output_data); +// for (size_t j = 0; j < len; j++) { +// output_data[j] = tanh(input_data[j]); +// } } } @@ -94,12 +202,26 @@ SaberStatus SaberActivation::dispatch( size_t len = inputs[i]->valid_size(); const OpDataType *input_data = (OpDataType*)inputs[i]->data(); OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); - + outputs[i]->set_posstive_flag(true); for(size_t j = 0; j < len; j++){ output_data[j] = input_data[j] > 0 ? input_data[j] : 0; output_data[j] = output_data[j] < threshold ? output_data[j] : threshold; } } + + } + //swish: x /(1 + exp(-(b * x))) + if (param.active == Active_swish) { + for (size_t i = 0; i < inputs.size(); i++) { + const OpDataType beta = param.coef; + size_t len = inputs[i]->valid_size(); + const OpDataType *input_data = (OpDataType*)inputs[i]->data(); + OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); + + for (size_t j = 0; j < len; j++) { + output_data[j] = input_data[j] / (1.0f + exp(-input_data[j] * beta)); + } + } } //elu: x > 0 ? x : coef * (exp(x) - 1) @@ -115,31 +237,26 @@ SaberStatus SaberActivation::dispatch( } } } - //prelu: x > 0 ? 
x : slope[c] * x - if (param.active == Active_prelu) { - PreluParam prelu = param.prelu_param; + + //gelu: y = x * (0.5 * erf(x/sqrt(2)) + 1) + if (param.active == Active_gelu) { for (size_t i = 0; i < inputs.size(); i++) { + size_t len = inputs[i]->valid_size(); const OpDataType *input_data = (OpDataType*)inputs[i]->data(); OpDataType *output_data = (OpDataType*)outputs[i]->mutable_data(); - Shape shin = inputs[i]->valid_shape(); - int num = shin[0]; - int channel = shin[1]; - int size = shin[2] * shin[3]; - for (int n = 0; n < num; n++){ - const OpDataType *in_ptr = input_data + n * channel * size; - OpDataType *out_ptr = output_data + n * channel * size; - OpDataType *slope_ptr = (OpDataType*)prelu.slope->data(); - for (int c = 0; c < channel; c++){ - const OpDataType *in_ch_ptr = in_ptr + c * size; - OpDataType *out_ch_ptr = out_ptr + c * size; - OpDataType slope = prelu.channel_shared ? slope_ptr[0]: slope_ptr[c]; - for (int k = 0; k < size; k++){ - out_ch_ptr[k] = in_ch_ptr[k] > 0 ? in_ch_ptr[k] : in_ch_ptr[k] * slope; - } - } + + for(size_t j = 0; j < len; j++){ + OpDataType x = input_data[j]; + OpDataType coeff = 0.5 * (std::erf(x/sqrt(2)) + 1); + + output_data[j] = x * coeff; } } } + //prelu: x > 0 ? x : slope[c] * x + if (param.active == Active_prelu) { + excute_prelu(inputs, outputs, param); + } for (size_t i = 0; i < inputs.size(); i++) { outputs[i]->set_seq_offset(inputs[i]->get_seq_offset()); } diff --git a/saber/funcs/impl/x86/saber_affine_channel.cpp b/saber/funcs/impl/x86/saber_affine_channel.cpp index 85ed4fc62..af20977c4 100644 --- a/saber/funcs/impl/x86/saber_affine_channel.cpp +++ b/saber/funcs/impl/x86/saber_affine_channel.cpp @@ -16,23 +16,28 @@ SaberStatus SaberAffineChannel::dispatch(\ const std::vector *>& inputs, \ std::vector *>& outputs, \ AffineChannelParam& param) { + outputs[0]->reshape(outputs[0]->valid_shape()); const OpDataType* src = (const OpDataType*)inputs[0]->data(); - const OpDataType* scale = (const OpDataType*)inputs[1]->data(); - const OpDataType* bias = (const OpDataType*)inputs[2]->data(); + const OpDataType* scale = (const OpDataType*)param.weight()->data(); + const OpDataType* bias = (const OpDataType*)param.bias()->data(); OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); int channel_idx = inputs[0]->channel_index(); int channel = inputs[0]->channel(); - CHECK_EQ(inputs[1]->valid_size(), channel) << "affine channel input scale dims are not valid"; - CHECK_EQ(inputs[2]->valid_size(), channel) << "affine channel input bias dims are not valid"; + CHECK_EQ(param.weight()->valid_size(), channel) << "affine channel input scale dims are not valid"; + CHECK_EQ(param.bias()->valid_size(), channel) << "affine channel input bias dims are not valid"; int outer_num = inputs[0]->count_valid(0, channel_idx); int inner_num = inputs[0]->count_valid(channel_idx+1, inputs[0]->dims()); int id = 0; + //for (int i = 0; i < outputs[0]->valid_size(); i++) { + // dst[i] = 0.1f; + //} for (int i = 0; i < outer_num; i++) { for (int j = 0; j < channel; j++) { for (int k = 0; k < inner_num; k++) { dst[id] = src[id] * scale[j] + bias[j]; id++; + //LOG(INFO) << "id" << id << " channel:" << channel << "inner_num: " << inner_num << " j: " << j; } } } diff --git a/saber/funcs/impl/x86/saber_aligned_mat_mul.cpp b/saber/funcs/impl/x86/saber_aligned_mat_mul.cpp new file mode 100644 index 000000000..28b21e829 --- /dev/null +++ b/saber/funcs/impl/x86/saber_aligned_mat_mul.cpp @@ -0,0 +1,70 @@ +#include "saber/funcs/impl/x86/saber_aligned_mat_mul.h" +#include "mkl.h" +#if 
defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberAlignedMatMul::init( + const std::vector*>& inputs, + std::vector*>& outputs, + AlignedMatMulParam ¶m, + Context &ctx) { + _alpha = param.scale; + _beta = 0.f; + _trans_a = param.is_transpose_X ? CblasTrans : CblasNoTrans; + _trans_b = param.is_transpose_Y ? CblasTrans : CblasNoTrans; + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberAlignedMatMul::create( + const std::vector*>& inputs, + std::vector*>& outputs, + AlignedMatMulParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberAlignedMatMul::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + AlignedMatMulParam ¶m) { + const OpDataType* src0 = (OpDataType*)inputs[0]->data(); + const OpDataType* src1 = (OpDataType*)inputs[1]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); + auto seq_offset_0 = inputs[0]->get_seq_offset()[0]; + auto seq_offset_1 = inputs[1]->get_seq_offset()[0]; + int inner_A = inputs[0]->count_valid(1, inputs[0]->dims()); + int inner_B = inputs[1]->count_valid(1, inputs[1]->dims()); + int batch_A = seq_offset_0[1]; + int batch_B = seq_offset_1[1]; + int M = param.is_transpose_X ? inner_A : batch_A; + int N = param.is_transpose_Y ? batch_B: inner_B; + int K_A = param.is_transpose_X ? batch_A : inner_A; + int K_B = param.is_transpose_Y ? inner_B : batch_B; + CHECK_EQ(K_A, K_B) << "mat mul two inputs K is not equal"; + int K = K_A; + int lda = param.is_transpose_X ? M : K; + int ldb = param.is_transpose_Y ? K : N; + int ldc = N; + int seq_num = seq_offset_0.size() - 1; + for (int i = 0; i < seq_num; i++) { + cblas_sgemm(CblasRowMajor, _trans_a, _trans_b, M, N, K_A, _alpha, src0 + i * batch_A * inner_A, lda, src1 + i * batch_B * inner_B, ldb, _beta, dst + i * M * N, ldc); + } + + return SaberSuccess; +} + +template class SaberAlignedMatMul; +DEFINE_OP_TEMPLATE(SaberAlignedMatMul, AlignedMatMulParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberAlignedMatMul, AlignedMatMulParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_aligned_mat_mul.h b/saber/funcs/impl/x86/saber_aligned_mat_mul.h new file mode 100644 index 000000000..bf60c61e1 --- /dev/null +++ b/saber/funcs/impl/x86/saber_aligned_mat_mul.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ALIGNED_MAT_MUL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ALIGNED_MAT_MUL_H + +#include "saber/funcs/impl/impl_aligned_mat_mul.h" +#include "mkl.h" + +namespace anakin { +namespace saber { + +template +class SaberAlignedMatMul : + public ImplBase< + X86, OpDtype, + AlignedMatMulParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberAlignedMatMul() {} + + ~SaberAlignedMatMul() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + AlignedMatMulParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + AlignedMatMulParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + AlignedMatMulParam ¶m) override; + +private: + CBLAS_LAYOUT _layout; //CblasRowMajor or CblasColMajor + CBLAS_TRANSPOSE _trans_a; //matrix A whether to tranpose. + CBLAS_TRANSPOSE _trans_b; //matrix B whether to tranpose. + float _alpha{1.0f}; + float _beta{0.0f}; + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_anchor_generator.cpp b/saber/funcs/impl/x86/saber_anchor_generator.cpp new file mode 100644 index 000000000..822bdae41 --- /dev/null +++ b/saber/funcs/impl/x86/saber_anchor_generator.cpp @@ -0,0 +1,74 @@ +#include "saber/funcs/impl/x86/saber_anchor_generator.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include +namespace anakin { +namespace saber { + + +/** + * @brief formula: (k + alpha * sigma((x(i))^2)) ^ beta. + * where, + * local_size = 5(default), means 5 channels in succession. + * sigma((x(i))^2): sum of x^2 of k channels in succession. + * + * + */ +template +SaberStatus SaberAnchorGenerator::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + AnchorGeneratorParam& param) { + + const OpDataType* src = (const OpDataType*)inputs[0]->data(); + OpDataType* dst = (OpDataType*)outputs[0]->mutable_data(); + OpDataType* var = (OpDataType*)outputs[1]->mutable_data(); + auto anchor_sizes = param.anchor_sizes; + auto aspect_ratios = param.aspect_ratios; + auto stride = param.stride; + auto variances = param.variances; + auto offset = param.offset; + int height = inputs[0]->height(); + int width = inputs[0]->width(); + int stride_w = stride[0]; + int stride_h = stride[1]; + auto anchor_tmp = dst; + auto var_tmp = var; + for (int h_idx = 0; h_idx < height; h_idx++) { + for (int w_idx = 0; w_idx < width; w_idx++) { + OpDataType x_ctr = (w_idx * stride_w) + offset * (stride_w - 1); + OpDataType y_ctr = (h_idx * stride_h) + offset * (stride_h - 1); + for (size_t r = 0; r < aspect_ratios.size(); r++) { + auto ar = aspect_ratios[r]; + for (size_t s = 0; s < anchor_sizes.size(); s++) { + auto anchor_size = anchor_sizes[s]; + OpDataType area = stride_w * stride_h; + OpDataType area_ratios = area / ar; + OpDataType base_w = round(sqrt(area_ratios)); + OpDataType base_h = round(base_w * ar); + OpDataType scale_w = anchor_size / stride_w; + OpDataType scale_h = anchor_size / stride_h; + OpDataType half_width = 0.5 * (scale_w * base_w - 1); + OpDataType half_height = 0.5 * (scale_h * base_h - 1); + anchor_tmp[0] = x_ctr - half_width; + anchor_tmp[1] = y_ctr - half_height; + anchor_tmp[2] = x_ctr + half_width; + anchor_tmp[3] = y_ctr + half_height; + var_tmp[0] = variances[0]; + var_tmp[1] = variances[1]; + var_tmp[2] = variances[2]; + var_tmp[3] = variances[3]; + anchor_tmp += 4; + var_tmp += 4; + } + } + } + } + + return SaberSuccess; +} + +template 
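+// Worked example of the anchor arithmetic above (a sketch for orientation):
+// with stride = (16, 16), offset = 0.5, aspect ratio = 1.0 and anchor_size = 32
+// at cell (0, 0), the centre is x_ctr = y_ctr = 0.5 * 15 = 7.5; base_w =
+// round(sqrt(256 / 1.0)) = 16, base_h = 16, scale_w = scale_h = 32 / 16 = 2,
+// so half_width = half_height = 0.5 * (2 * 16 - 1) = 15.5 and the emitted
+// anchor is [-8, -8, 23, 23], i.e. a 32x32 box (inclusive coordinates)
+// centred on the cell.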
class SaberAnchorGenerator; +DEFINE_OP_TEMPLATE(SaberAnchorGenerator, AnchorGeneratorParam, X86, AK_INT16); +DEFINE_OP_TEMPLATE(SaberAnchorGenerator, AnchorGeneratorParam, X86, AK_INT8); +} +} diff --git a/saber/funcs/impl/x86/saber_fake_quantize_abs_max.h b/saber/funcs/impl/x86/saber_anchor_generator.h similarity index 69% rename from saber/funcs/impl/x86/saber_fake_quantize_abs_max.h rename to saber/funcs/impl/x86/saber_anchor_generator.h index 63382ccc5..df132791c 100644 --- a/saber/funcs/impl/x86/saber_fake_quantize_abs_max.h +++ b/saber/funcs/impl/x86/saber_anchor_generator.h @@ -13,27 +13,27 @@ limitations under the License. */ -#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_FAKE_QUANTIZE_ABS_MAX_H -#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_FAKE_QUANTIZE_ABS_MAX_H +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ANCHOR_GENERATOR_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ANCHOR_GENERATOR_H -#include "saber/funcs/impl/impl_fake_quantize_abs_max.h" +#include "saber/funcs/impl/impl_anchor_generator.h" namespace anakin{ namespace saber{ template -class SaberFakeQuantizeAbsMax: public ImplBase > { +class SaberAnchorGenerator: public ImplBase > { public: typedef typename DataTrait::Dtype OpDataType; - SaberFakeQuantizeAbsMax() {} - ~SaberFakeQuantizeAbsMax() {} + SaberAnchorGenerator() {} + ~SaberAnchorGenerator() {} virtual SaberStatus init(const std::vector *>& inputs, std::vector *>& outputs, - FakeQuantizeAbsMaxParam ¶m, + AnchorGeneratorParam ¶m, Context &ctx) { this->_ctx = &ctx; return create(inputs, outputs, param, ctx); @@ -41,14 +41,14 @@ class SaberFakeQuantizeAbsMax: public ImplBase *>& inputs, std::vector *>& outputs, - FakeQuantizeAbsMaxParam &crop_param, + AnchorGeneratorParam &crop_param, Context &ctx) { return SaberSuccess; } virtual SaberStatus dispatch(const std::vector *>& inputs, std::vector *>& outputs, - FakeQuantizeAbsMaxParam ¶m); + AnchorGeneratorParam ¶m); private: }; @@ -57,4 +57,4 @@ class SaberFakeQuantizeAbsMax: public ImplBase + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberArithmetic::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberArithmetic::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberArithmetic::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam ¶m) { + const OpDataType *input_data_0 = (const OpDataType*)inputs[0]->data(); + const OpDataType *input_data_1 = (const OpDataType*)inputs[1]->data(); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + auto seq_offset_0 = inputs[0]->get_seq_offset()[0]; + auto seq_offset_1 = inputs[1]->get_seq_offset()[0]; + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + int inner_size = inputs[0]->count_valid(1, inputs[0]->dims()); + + + // out[j] = input_0[j] + input_1[j] if j < count_0 && j < count_1; + // out[j] = input_0[j] if j < count_0 && j >= count_1; + if (param.op_type == SUM) { + size_t len = inputs[0]->valid_size(); + for (int i = 0; i < seq_num; i++) { + int len_0 = (seq_offset_0[i+1] - seq_offset_0[i]) * inner_size; + int len_1 = (seq_offset_1[i+1] - seq_offset_1[i]) * inner_size; + auto input_0 = input_data_0 + seq_offset_0[i] * inner_size; + auto input_1 = input_data_1 + seq_offset_1[i] * inner_size; + auto out = output_data + seq_offset_0[i] * 
inner_size; + int len = std::min(len_0, len_1); +#if defined(__AVX2__) and defined(__FMA__) + avx2_vector_sum(input_0, input_1, len, out); +#else +#pragma omp parallel for schedule(static) + for (int j = 0; j < len; j++) { + out[j] = input_0[j] + input_1[j]; + } +#endif + if (len_0 > len) { + memcpy(out + len, input_0 + len, sizeof(OpDataType) * (len_0 -len)); + } + + } + } + + // out[j] = input_0[j] - input_1[j] if j < count_0 && j < count_1; + // out[j] = input_0[j] if j < count_0 && j >= count_1; + if (param.op_type == SUB) { + size_t len = inputs[0]->valid_size(); + for (int i = 0; i < seq_num; i++) { + int len_0 = (seq_offset_0[i+1] - seq_offset_0[i]) * inner_size; + int len_1 = (seq_offset_1[i+1] - seq_offset_1[i]) * inner_size; + auto input_0 = input_data_0 + seq_offset_0[i] * inner_size; + auto input_1 = input_data_1 + seq_offset_1[i] * inner_size; + auto out = output_data + seq_offset_0[i] * inner_size; + int len = std::min(len_0, len_1); +#if defined(__AVX2__) and defined(__FMA__) + avx2_vector_sub(input_0, input_1, len, out); +#else +#pragma omp parallel for schedule(static) + for (int j = 0; j < len; j++) { + out[j] = input_0[j] - input_1[j]; + } +#endif + if (len_0 > len) { + memcpy(out + len, input_0 + len, sizeof(OpDataType) * (len_0 -len)); + } + } + } + // out[j] = input_0[j] * input_1[j] if j < count_0 && j < count_1; + // out[j] = input_0[j] if j < count_0 && j >= count_1; + if (param.op_type == MUL) { + size_t len = inputs[0]->valid_size(); + for (int i = 0; i < seq_num; i++) { + int len_0 = (seq_offset_0[i+1] - seq_offset_0[i]) * inner_size; + int len_1 = (seq_offset_1[i+1] - seq_offset_1[i]) * inner_size; + auto input_0 = input_data_0 + seq_offset_0[i] * inner_size; + auto input_1 = input_data_1 + seq_offset_1[i] * inner_size; + auto out = output_data + seq_offset_0[i] * inner_size; + int len = std::min(len_0, len_1); +#if defined(__AVX2__) and defined(__FMA__) + avx2_vector_mul(input_0, input_1, len, out); +#else +#pragma omp parallel for schedule(static) + for (int j = 0; j < len; j++) { + out[j] = input_0[j] * input_1[j]; + } +#endif + if (len_0 > len) { + memcpy(out + len, input_0 + len, sizeof(OpDataType) * (len_0 -len)); + } + } + } + + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; +} + +template class SaberArithmetic; +DEFINE_OP_TEMPLATE(SaberArithmetic, ArithmeticParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberArithmetic, ArithmeticParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_arithmetic.h b/saber/funcs/impl/x86/saber_arithmetic.h new file mode 100644 index 000000000..9cf60574f --- /dev/null +++ b/saber/funcs/impl/x86/saber_arithmetic.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ARITHMETIC_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ARITHMETIC_H + +#include "saber/funcs/impl/impl_arithmetic.h" + +namespace anakin { +namespace saber { + +template +class SaberArithmetic : + public ImplBase< + X86, OpDtype, + ArithmeticParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberArithmetic() {} + + ~SaberArithmetic() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ArithmeticParam ¶m) override; + +private: + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_attension_lstm.cpp b/saber/funcs/impl/x86/saber_attension_lstm.cpp index 43bfe71c9..d696af933 100644 --- a/saber/funcs/impl/x86/saber_attension_lstm.cpp +++ b/saber/funcs/impl/x86/saber_attension_lstm.cpp @@ -1,5 +1,3 @@ - -#include #include #include "saber_types.h" #include "saber/funcs/impl/x86/saber_attension_lstm.h" @@ -97,7 +95,7 @@ void sequence_pool(const Dtype* data, const Dtype* weight, std::vector& seq for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) { Dtype scale = weight[j]; - Dtype* tmp_data = data + j * dim; + const Dtype* tmp_data = data + j * dim; for (int k = 0; k < dim; k++) { tmp_out[k] += scale * tmp_data[k]; @@ -337,4 +335,4 @@ DEFINE_OP_TEMPLATE(SaberAttensionLstm, AttensionLstmParam, X86, AK_HALF); DEFINE_OP_TEMPLATE(SaberAttensionLstm, AttensionLstmParam, X86, AK_INT8); } -} \ No newline at end of file +} diff --git a/saber/funcs/impl/x86/saber_attension_lstm.h b/saber/funcs/impl/x86/saber_attension_lstm.h index 3aefdb7dd..11064920b 100644 --- a/saber/funcs/impl/x86/saber_attension_lstm.h +++ b/saber/funcs/impl/x86/saber_attension_lstm.h @@ -83,4 +83,4 @@ class SaberAttensionLstm: public ImplBase < } // namespace saber } // namespace anakin -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ATTENSION_LSTM_H \ No newline at end of file +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ATTENSION_LSTM_H diff --git a/saber/funcs/impl/x86/saber_attention_padding_mask.cpp b/saber/funcs/impl/x86/saber_attention_padding_mask.cpp new file mode 100644 index 000000000..73b4cd550 --- /dev/null +++ b/saber/funcs/impl/x86/saber_attention_padding_mask.cpp @@ -0,0 +1,66 @@ + +#include "saber/funcs/impl/x86/saber_attention_padding_mask.h" +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberAttentionPaddingMask::init( + const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberAttentionPaddingMask::create( + const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberAttentionPaddingMask::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam ¶m) { + auto src_offset = inputs[1]->get_seq_offset()[0]; + auto attn_offset = inputs[0]->get_seq_offset()[0]; + int src_len = inputs[1]->count_valid(1, inputs[1]->dims()); + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = src_offset[1]; + CHECK_EQ(attn_seq_num % 
src_seq_num, 0) << "Missmatch batch size"; + + size_t count = inputs[0]->valid_size(); + OpDataType *attn_data = (OpDataType*)inputs[0]->mutable_data(); + OpDataType *src_data = (OpDataType*)inputs[1]->mutable_data(); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + memcpy(output_data, attn_data, count * sizeof(OpDataType)); + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_output_data = output_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx+1]-src_offset[src_seq_idx]; + auto tmp_src_data = src_data + src_seq_idx * src_seq_len; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_output_data[k] = param.mask; + } + } + } + + return SaberSuccess; +} + +template class SaberAttentionPaddingMask; +DEFINE_OP_TEMPLATE(SaberAttentionPaddingMask, AttentionPaddingMaskParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberAttentionPaddingMask, AttentionPaddingMaskParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_attention_padding_mask.h b/saber/funcs/impl/x86/saber_attention_padding_mask.h new file mode 100644 index 000000000..f57cb13db --- /dev/null +++ b/saber/funcs/impl/x86/saber_attention_padding_mask.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ATTENTION_PADDING_MASK_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ATTENTION_PADDING_MASK_H + +#include "saber/funcs/impl/impl_attention_padding_mask.h" + +namespace anakin { +namespace saber { + +template +class SaberAttentionPaddingMask : + public ImplBase< + X86, OpDtype, + AttentionPaddingMaskParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberAttentionPaddingMask() {} + + ~SaberAttentionPaddingMask() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam ¶m) override; + +private: + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_avx2_expand.h b/saber/funcs/impl/x86/saber_avx2_expand.h index 1a753ac01..0fff52143 100644 --- a/saber/funcs/impl/x86/saber_avx2_expand.h +++ b/saber/funcs/impl/x86/saber_avx2_expand.h @@ -2,6 +2,7 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX2_EXPAND_H #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX2_EXPAND_H #if defined(__AVX2__) and defined(__FMA__) +#include namespace anakin { namespace saber { @@ -66,27 +67,23 @@ inline float _m256_self_max(const __m256& x) { inline float _m256_max_array(const float* in, int length) { __m256 max_vec = _mm256_set1_ps(-1e32); - - for (int j = 0; j < length; j += 8) { + int round_length = length/8*8; + int remainder = length % 8; + for (int j = 0; j < round_length; j += 8) { __m256 temp_in = _mm256_loadu_ps(&in[j]); max_vec = _mm256_max_ps(temp_in, max_vec); } - int remainder = length % 8; - if (remainder > 0) { - int iter = length / 8 * 8; __m256i _vec_mask = _m256_continue_mask_m256i(remainder); - __m256 temp_in = _mm256_maskload_ps(&in[iter], _vec_mask); + __m256 temp_in = _mm256_maskload_ps(&in[round_length], _vec_mask); __m256 _vec_mask_m256 = _m256_continue_mask_m256(remainder); max_vec = _mm256_blendv_ps(max_vec, _mm256_max_ps(temp_in, max_vec), _vec_mask_m256); } - return _m256_self_max(max_vec); } - } } diff --git a/saber/funcs/impl/x86/saber_avx2_funcs.cpp b/saber/funcs/impl/x86/saber_avx2_funcs.cpp index 51b0d30ab..e937c4dfe 100644 --- a/saber/funcs/impl/x86/saber_avx2_funcs.cpp +++ b/saber/funcs/impl/x86/saber_avx2_funcs.cpp @@ -1,4 +1,3 @@ - #include "saber_avx2_funcs.h" #include "saber/funcs/impl/x86/saber_normal_activation.h" #include "saber/funcs/debug.h" @@ -7,6 +6,137 @@ namespace anakin { namespace saber { +inline __m256 avx2_load_mask(const float* in, int length) { + __m256i vec_mask = _m256_continue_mask_m256i(length); + return _mm256_maskload_ps(in, vec_mask); +} + +inline void avx2_save_mask(__m256& in, float* out, int length) { + __m256i vec_mask = _m256_continue_mask_m256i(length); + _mm256_maskstore_ps(out, vec_mask, in); +} + +void avx2_vector_relu(const float* in, int length, float* out) { + int remainder = length % 8; + int round_length = length / 8 * 8; + __m256 zero = _mm256_setzero_ps(); + #pragma omp parallel for schedule(static) + + for (int i = 0; i < length; i += 8) { + __m256 temp = _mm256_loadu_ps(&in[i]); + _mm256_storeu_ps(&out[i], _mm256_max_ps(zero, temp)); + } + + if (remainder > 0) { + __m256i vec_mask = _m256_continue_mask_m256i(remainder); + __m256 temp = _mm256_maskload_ps(&in[round_length], vec_mask); + _mm256_maskstore_ps(&out[round_length], 
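+        // Tail handling used throughout these kernels: _m256_continue_mask_m256i(remainder)
+        // builds a mask whose low `remainder` lanes are enabled, so the masked
+        // load above and the masked store it feeds only touch the length % 8
+        // elements that remain past round_length.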
vec_mask, _mm256_max_ps(zero, temp)); + } + +}; + +void avx2_vector_sigmoid(const float* in, int length, float* out) { + int remainder = length % 8; + int round_length = length / 8 * 8; + #pragma omp parallel for schedule(static) + + for (int i = 0; i < length; i += 8) { + __m256 temp = _mm256_loadu_ps(&in[i]); + _mm256_storeu_ps(&out[i], Sigmoid(temp)); + } + + if (remainder > 0) { + __m256i vec_mask = _m256_continue_mask_m256i(remainder); + __m256 temp = _mm256_maskload_ps(&in[round_length], vec_mask); + _mm256_maskstore_ps(&out[round_length], vec_mask, Sigmoid(temp)); + } + +}; + +void avx2_vector_soft_sign(const float* in, int length, float* out) { + int remainder = length % 8; + int round_length = length / 8 * 8; + + __m256 one = _mm256_set1_ps(1.f); + __m256 zero = _mm256_setzero_ps(); + #pragma omp parallel for schedule(static) + + for (int i = 0; i < length; i += 8) { + __m256 src = _mm256_loadu_ps(&in[i]); + __m256 src_abs = _mm256_max_ps(src, -src); + __m256 denominator = _mm256_add_ps(src_abs, one); + _mm256_storeu_ps(&out[i], _mm256_div_ps(src, denominator)); + } + + if (remainder > 0) { + __m256i vec_mask = _m256_continue_mask_m256i(remainder); + __m256 src = _mm256_maskload_ps(&in[round_length], vec_mask); + __m256 src_abs = _mm256_max_ps(src, -src); + __m256 denominator = _mm256_add_ps(src_abs, one); + _mm256_maskstore_ps(&out[round_length], vec_mask, _mm256_div_ps(src, denominator)); + } + +}; + +void avx2_vector_softmax_stride(const float* in, int col, int row, float* out) { + int remainder_col = col % 8; + int round_col = col / 8 * 8; + + for (int col_id = 0; col_id < round_col; col_id += 8) { + + __m256 max_vec = _mm256_set1_ps(-1e20); + + for (int row_id = 0; row_id < row; row_id++) { + __m256 temp_in = _mm256_loadu_ps(&in[row_id * col + col_id]); + max_vec = _mm256_max_ps(max_vec, temp_in); + } + + __m256 exp_sum = _mm256_setzero_ps(); + + for (int row_id = 0; row_id < row; row_id++) { + __m256 temp_in = _mm256_loadu_ps(&in[row_id * col + col_id]); + __m256 temp_in_exp = exp256_ps_fma(temp_in - max_vec); + exp_sum = _mm256_add_ps(exp_sum, temp_in_exp); + _mm256_storeu_ps(&out[row_id * col + col_id], temp_in_exp); + } + + __m256 exp_sum_rev = _mm256_div_ps(_mm256_set1_ps(1), exp_sum); + + for (int row_id = 0; row_id < row; row_id++) { + __m256 temp_in = _mm256_loadu_ps(&out[row_id * col + col_id]); + _mm256_storeu_ps(&out[row_id * col + col_id], _mm256_mul_ps(temp_in, exp_sum_rev)); + } + } + + if (remainder_col > 0) { + + const __m256i vec_mask = _m256_continue_mask_m256i(remainder_col); + __m256 max_vec = _mm256_set1_ps(-1e20); + + for (int row_id = 0; row_id < row; row_id++) { + __m256 temp_in = _mm256_maskload_ps(&in[row_id * col + round_col], vec_mask); + max_vec = _mm256_max_ps(max_vec, temp_in); + } + + __m256 exp_sum = _mm256_setzero_ps(); + + for (int row_id = 0; row_id < row; row_id++) { + __m256 temp_in = _mm256_maskload_ps(&in[row_id * col + round_col], vec_mask); + __m256 temp_in_exp = exp256_ps_fma(temp_in - max_vec); + exp_sum = exp_sum + temp_in_exp; + _mm256_maskstore_ps(&out[row_id * col + round_col], vec_mask, temp_in_exp); + } + + __m256 exp_sum_rev = _mm256_div_ps(_mm256_set1_ps(1), exp_sum); + + for (int row_id = 0; row_id < row; row_id++) { + __m256 temp_in = _mm256_maskload_ps(&out[row_id * col + round_col], vec_mask); + _mm256_maskstore_ps(&out[row_id * col + round_col], vec_mask, _mm256_mul_ps(temp_in, exp_sum_rev)); + } + } +} + + void avx2_vector_softmax(const float* in, int length, float* out) { float max = _m256_max_array(in, length); 
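+    // The code below computes softmax in the numerically stable form
+    // exp(x - max) / sum(exp(x - max)); this patch additionally stores the
+    // masked tail and turns the final division into a multiply by 1/sum.
+    // A scalar reference, kept only as an illustrative sketch (assumes expf
+    // is visible through the existing includes):
+    auto scalar_softmax_ref = [](const float* x, int n, float* y) {
+        float m = x[0];
+        for (int i = 1; i < n; ++i) { if (x[i] > m) { m = x[i]; } }
+        float sum = 0.f;
+        for (int i = 0; i < n; ++i) { y[i] = expf(x[i] - m); sum += y[i]; }
+        float inv = 1.f / sum;
+        for (int i = 0; i < n; ++i) { y[i] *= inv; }
+    };
+    (void)scalar_softmax_ref;  // reference only; the AVX2 path below is what runs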
__m256 max_vec = _mm256_set1_ps(max); @@ -27,19 +157,20 @@ void avx2_vector_softmax(const float* in, int length, float* out) { __m256 temp_in = _mm256_maskload_ps(&in[round_length], vec_mask); __m256 temp_exp = _mm256_blendv_ps(_mm256_setzero_ps(), exp256_ps_fma(temp_in - max_vec), vec_mask_m256); + _mm256_maskstore_ps(&out[round_length], vec_mask, temp_exp); exp_sum += temp_exp; float sum = _m256_self_sum(exp_sum); - __m256 sum_vec = _mm256_set1_ps(sum); + __m256 sum_vec = _mm256_set1_ps(1.f / sum); for (int j = 0; j < round_length; j += 8) { __m256 temp_in = _mm256_loadu_ps(&out[j]); - _mm256_storeu_ps(&out[j], temp_in / sum_vec); + _mm256_storeu_ps(&out[j], temp_in * sum_vec); } temp_in = _mm256_maskload_ps(&out[round_length], vec_mask); - _mm256_maskstore_ps(&out[round_length], vec_mask, temp_in / sum_vec); + _mm256_maskstore_ps(&out[round_length], vec_mask, temp_in * sum_vec); } else { for (int j = 0; j < round_length; j += 8) { @@ -50,11 +181,11 @@ void avx2_vector_softmax(const float* in, int length, float* out) { } float sum = _m256_self_sum(exp_sum); - __m256 sum_vec = _mm256_set1_ps(sum); + __m256 sum_vec = _mm256_set1_ps(1.f / sum); for (int j = 0; j < round_length; j += 8) { __m256 temp_in = _mm256_loadu_ps(&out[j]); - _mm256_storeu_ps(&out[j], temp_in / sum_vec); + _mm256_storeu_ps(&out[j], temp_in * sum_vec); } } @@ -206,7 +337,139 @@ void avx2_sequence_pool(const float* data, const float* weight, std::vector } } +void avx2_cos_sim(const float* in_0, + const float* in_1, + const int num, + const int len, + const float epsilon, + float* out) { + int round_dim = len / 8 * 8; + int remainder = len % 8; + __m256i mask_m256i = _m256_continue_mask_m256i(remainder); + + for (int n = 0; n < num; n++) { + __m256 aa_sum = _mm256_setzero_ps(); + __m256 bb_sum = _mm256_setzero_ps(); + __m256 ab_sum = _mm256_setzero_ps(); + + for (int k = 0; k < round_dim; k += 8) { + __m256 a = _mm256_loadu_ps(&in_0[k]); + __m256 b = _mm256_loadu_ps(&in_1[k]); + aa_sum = _mm256_fmadd_ps(a, a, aa_sum); + bb_sum = _mm256_fmadd_ps(b, b, bb_sum); + ab_sum = _mm256_fmadd_ps(a, b, ab_sum); + } + + if (remainder > 0) { + __m256 a = _mm256_maskload_ps(&in_0[round_dim], mask_m256i); + __m256 b = _mm256_maskload_ps(&in_1[round_dim], mask_m256i); + aa_sum = _mm256_fmadd_ps(a, a, aa_sum); + bb_sum = _mm256_fmadd_ps(b, b, bb_sum); + ab_sum = _mm256_fmadd_ps(a, b, ab_sum); + } + + float a_square_sum = _m256_self_sum(aa_sum); + float b_square_sum = _m256_self_sum(bb_sum); + float ab_prod_sum = _m256_self_sum(ab_sum); + float c = a_square_sum * b_square_sum; + + if (c < epsilon) { + out[n] = 0.f; + } else { + out[n] = ab_prod_sum / sqrt(c); + } + + in_0 += len; + in_1 += len; + } + +} + +void avx2_vector_sum(const float* in_0, + const int len, + float* out) { + int round_dim = len / 8 * 8; + int remainder = len % 8; + __m256i mask_m256i = _m256_continue_mask_m256i(remainder); + #pragma omp parallel for schedule(static) + + for (int k = 0; k < round_dim; k += 8) { + __m256 a = _mm256_loadu_ps(&in_0[k]); + __m256 b = _mm256_loadu_ps(&out[k]); + _mm256_storeu_ps(&out[k], _mm256_add_ps(a, b)); + } + + if (remainder > 0) { + __m256 a = _mm256_maskload_ps(&in_0[round_dim], mask_m256i); + __m256 b = _mm256_maskload_ps(&out[round_dim], mask_m256i); + _mm256_maskstore_ps(out + round_dim, mask_m256i, _mm256_add_ps(a, b)); + } +} + +void avx2_vector_sum(const float* in_0, + const float* in_1, + const int len, + float* out) { + int round_dim = len / 8 * 8; + int remainder = len % 8; + __m256i mask_m256i = 
_m256_continue_mask_m256i(remainder); + + for (int k = 0; k < round_dim; k += 8) { + __m256 a = _mm256_loadu_ps(&in_0[k]); + __m256 b = _mm256_loadu_ps(&in_1[k]); + _mm256_storeu_ps(&out[k], _mm256_add_ps(a, b)); + } + + if (remainder > 0) { + __m256 a = _mm256_maskload_ps(&in_0[round_dim], mask_m256i); + __m256 b = _mm256_maskload_ps(&in_1[round_dim], mask_m256i); + _mm256_maskstore_ps(out + round_dim, mask_m256i, _mm256_add_ps(a, b)); + } +} + +void avx2_vector_sub(const float* in_0, + const float* in_1, + const int len, + float* out) { + int round_dim = len / 8 * 8; + int remainder = len % 8; + __m256i mask_m256i = _m256_continue_mask_m256i(remainder); + + for (int k = 0; k < round_dim; k += 8) { + __m256 a = _mm256_loadu_ps(&in_0[k]); + __m256 b = _mm256_loadu_ps(&in_1[k]); + _mm256_storeu_ps(&out[k], _mm256_sub_ps(a, b)); + } + + if (remainder > 0) { + __m256 a = _mm256_maskload_ps(&in_0[round_dim], mask_m256i); + __m256 b = _mm256_maskload_ps(&in_1[round_dim], mask_m256i); + _mm256_maskstore_ps(out + round_dim, mask_m256i, _mm256_sub_ps(a, b)); + } +} + + +void avx2_vector_mul(const float* in_0, + const float* in_1, + const int len, + float* out) { + int round_dim = len / 8 * 8; + int remainder = len % 8; + __m256i mask_m256i = _m256_continue_mask_m256i(remainder); + + for (int k = 0; k < round_dim; k += 8) { + __m256 a = _mm256_loadu_ps(&in_0[k]); + __m256 b = _mm256_loadu_ps(&in_1[k]); + _mm256_storeu_ps(&out[k], _mm256_mul_ps(a, b)); + } + + if (remainder > 0) { + __m256 a = _mm256_maskload_ps(&in_0[round_dim], mask_m256i); + __m256 b = _mm256_maskload_ps(&in_1[round_dim], mask_m256i); + _mm256_maskstore_ps(out + round_dim, mask_m256i, _mm256_mul_ps(a, b)); + } +} } } -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/impl/x86/saber_avx2_funcs.h b/saber/funcs/impl/x86/saber_avx2_funcs.h index 3ae3656e7..641531db4 100644 --- a/saber/funcs/impl/x86/saber_avx2_funcs.h +++ b/saber/funcs/impl/x86/saber_avx2_funcs.h @@ -1,21 +1,87 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX2_FUNCS_H #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX2_FUNCS_H -#if defined(__AVX2__) and defined(__FMA__) + #include +#include "saber/funcs/impl/x86/kernel/jit_generator.h" namespace anakin { namespace saber { +inline bool avx2_is_compiled(){ +#if defined(__AVX2__) and defined(__FMA__) + return true; +#else + return false; +#endif +}; + +inline bool avx2_can_used(){ + return avx2_is_compiled()&&jit::mayiuse(jit::avx2); +}; +#if defined(__AVX2__) and defined(__FMA__) +void avx2_vector_softmax_stride(const float* in, int col, int row, float* out); void avx2_vector_softmax(const float* in, int length, float* out); +void avx2_vector_relu(const float* in, int length, float* out); +void avx2_vector_sigmoid(const float* in, int length, float* out); void avx2_sequence_softmax(const float* data, std::vector& seq_offset, float* out); void avx2_lstm_bias_and_act(const float* hidden_in, const float* bias_data, float* out, float* cell_data, const int seq_num, const int hidden_size, const int with_peephole); -void avx2_sequence_pool(const float* data, const float* weight, std::vector& seq_offset, int dim, + +void avx2_sequence_pool(const float* data, + const float* weight, + std::vector& seq_offset, + int dim, float* out); +void avx2_vector_soft_sign(const float* in, + int length, + float* out); + +/* Calculate the angle between two vectors + * cos(theta) = a'b / (|a| * |b|) + * output is cos(theta) + * */ +void avx2_cos_sim(const float* in_0, + const float* in_1, + const int num, + const int len, + const 
float epsilon, + float* out); + +/* Calculate the sum of two vectors + * y[i] += x[i] + * */ +void avx2_vector_sum(const float* in_0, + const int len, + float* out); + +/* Calculate the sum of two vectors + * z[i] = x[i] + y[i] + * */ +void avx2_vector_sum(const float* in_0, + const float* in_1, + const int len, + float* out); + +/* Calculate the sub of two vectors + * z[i] = x[i] - y[i] + * */ +void avx2_vector_sub(const float* in_0, + const float* in_1, + const int len, + float* out); + +/* Calculate the product of two vectors + * z[i] = x[i] * y[i] + * */ +void avx2_vector_mul(const float* in_0, + const float* in_1, + const int len, + float* out); +#endif } } -#endif + #endif //ANAKIN_SABER_AVX2_FUNCS_H diff --git a/saber/funcs/impl/x86/saber_avx512_expand.h b/saber/funcs/impl/x86/saber_avx512_expand.h new file mode 100644 index 000000000..cb9e32d6b --- /dev/null +++ b/saber/funcs/impl/x86/saber_avx512_expand.h @@ -0,0 +1,14 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX512_EXPAND_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX512_EXPAND_H +namespace anakin { +namespace saber { +#if defined(__AVX512F__) +inline __mmask16 __mm512_get_mask(int k) { + __mmask16 mask = 0xffff; + return mask >> (16 - k); +} +#endif +} +} + +#endif //ANAKIN_SABER_AVX512_EXPAND_H diff --git a/saber/funcs/impl/x86/saber_avx512_funcs.h b/saber/funcs/impl/x86/saber_avx512_funcs.h new file mode 100644 index 000000000..dd50d5198 --- /dev/null +++ b/saber/funcs/impl/x86/saber_avx512_funcs.h @@ -0,0 +1,36 @@ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX512_FUNCS_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_AVX512_FUNCS_H + +#if defined(__AVX512F__) +#include "saber_normal_activation.h" +namespace anakin { + +namespace saber { + +void avx512_vector_sigmoid(const float* in, int length, float* out) { + const int simd_length = 16; + int remainder = length % simd_length; + int round_length = length / simd_length * simd_length; + +#pragma omp parallel for schedule(static) + + for (int i = 0; i < length; i += simd_length) { + __m512 temp = Sigmoid(_mm512_loadu_ps(&in[i])); + _mm512_storeu_ps(&out[i], temp); + } + + if (remainder > 0) { + __mmask16 vec_mask = 0xffff; + vec_mask = vec_mask >> (simd_length - remainder); + __m512 temp; + temp = _mm512_mask_loadu_ps(temp, vec_mask, &in[round_length]); + _mm512_mask_storeu_ps(&out[round_length], vec_mask, Sigmoid(temp)); + } +}; + +} +} +#endif + +#endif //ANAKIN_SABER_AVX512_FUNCS_H diff --git a/saber/funcs/impl/x86/saber_box_clip.cpp b/saber/funcs/impl/x86/saber_box_clip.cpp new file mode 100644 index 000000000..b8871a10d --- /dev/null +++ b/saber/funcs/impl/x86/saber_box_clip.cpp @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "saber/funcs/impl/x86/saber_box_clip.h" + + +namespace anakin { + +namespace saber { + +template +SaberStatus SaberBoxClip::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, EmptyParam& param) { + + static constexpr int im_info_size = 3; + static constexpr int box_info_size = 4; + auto seq_offset = inputs[1]->get_seq_offset(); + CHECK_EQ(inputs.size(), 2) << "need two input"; + CHECK_EQ(seq_offset.size(), 1) << "need offset to cal batch"; + CHECK_GT(seq_offset[0].size(), 1) << "need offset to cal batch"; + auto offset = seq_offset[0]; + auto img = inputs[1]; + auto im_info = inputs[0]; + const float* im_info_ptr = static_cast(im_info->data()); + const float* box_ptr_in = static_cast(img->data()); + float* box_ptr_out = static_cast(outputs[0]->data()); + int batch_size = offset.size() - 1; + CHECK_EQ(batch_size * im_info_size, im_info->valid_size()) << "im_info should be valid"; + + for (int batch_id = 0; batch_id < batch_size; batch_id++) { + const float img_h = im_info_ptr[batch_id * im_info_size + 0]; + const float img_w = im_info_ptr[batch_id * im_info_size + 1]; + const float scale = im_info_ptr[batch_id * im_info_size + 2]; + const float img_h_scale = round(img_h / scale) - 1; + const float img_w_scale = round(img_w / scale) - 1; + const int start_in_batch = offset[batch_id]; + const int end_in_batch = offset[batch_id + 1]; + + for (int im_id = start_in_batch; im_id < end_in_batch; im_id++) { + const float* batch_box_ptr_in = &box_ptr_in[im_id * box_info_size]; + float* batch_box_ptr_out = &box_ptr_out[im_id * box_info_size]; + batch_box_ptr_out[0] = std::max(std::min(batch_box_ptr_in[0], img_w_scale), 0.f); + batch_box_ptr_out[1] = std::max(std::min(batch_box_ptr_in[1], img_h_scale), 0.f); + batch_box_ptr_out[2] = std::max(std::min(batch_box_ptr_in[2], img_w_scale), 0.f); + batch_box_ptr_out[3] = std::max(std::min(batch_box_ptr_in[3], img_h_scale), 0.f); + } + } + + return SaberSuccess; +} + +template class SaberBoxClip; +DEFINE_OP_TEMPLATE(SaberBoxClip, EmptyParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberBoxClip, EmptyParam, X86, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_box_clip.h b/saber/funcs/impl/x86/saber_box_clip.h new file mode 100644 index 000000000..96781d5f2 --- /dev/null +++ b/saber/funcs/impl/x86/saber_box_clip.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_BOX_CLIP_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_BOX_CLIP_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_box_clip.h" +#include "saber/core/tensor.h" + +namespace anakin { + +namespace saber { + +template +class SaberBoxClip : \ + public ImplBase < + X86, + OpDtype, + EmptyParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberBoxClip() = default; + ~SaberBoxClip() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + EmptyParam& param, Context& ctx) { + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + EmptyParam& param, Context& ctx) { + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + EmptyParam& param)override; + +private: + +}; + +} //namespace saber + +} //namespace anakin +#endif //ANAKIN_SABER_BOX_CLIP_H diff --git a/saber/funcs/impl/x86/saber_box_coder.cpp b/saber/funcs/impl/x86/saber_box_coder.cpp new file mode 100644 index 000000000..ead8e6614 --- /dev/null +++ b/saber/funcs/impl/x86/saber_box_coder.cpp @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "saber/funcs/impl/x86/saber_box_coder.h" + +namespace anakin { + +namespace saber { + +enum BOX_CODER_VAR { + FIX_SIZE_VAR = 0, + NO_VAR = 1, + FROM_INPUT_VAR = 2 +}; + +/** + * NOTE: Fluid box coder no exp clamp + * @tparam Dtype + * @tparam fix_size_var + * @param proposals + * @param anchors + * @param bbox_deltas + * @param variances + * @param param + */ +template +static inline void box_coder(Tensor* proposals, + const Tensor* anchors, + const Tensor* bbox_deltas, + const Tensor* variances, + BoxCoderParam& param + ) { + const size_t row = bbox_deltas->num(); + const size_t col = bbox_deltas->channel(); + const size_t anchor_nums = row * col; + const size_t len = anchors->valid_shape()[1]; + CHECK_EQ(len, 5) << "anchor length is 5"; + int out_len = 4; + int var_len = 4; + int delta_len = 4; + + const Dtype* anchor_data = (const Dtype*) anchors->data(); + const Dtype* bbox_deltas_data = (const Dtype*) bbox_deltas->data(); + Dtype* proposals_data = (Dtype*) proposals->data(); + const Dtype* variances_data = nullptr; + float normalized = !param.box_normalized ? 1.f : 0; + + if (variances) { + variances_data = (const Dtype*)variances->data(); + } + + for (int64_t row_id = 0; row_id < row; ++row_id) { + for (int64_t col_id = 0; col_id < col; ++col_id) { + size_t delta_offset = row_id * col * delta_len + col_id * delta_len; + int prior_box_offset = param.axis == 0 ? 
col_id * len : row_id * len; + auto anchor_data_tmp = anchor_data + prior_box_offset + 1; + auto bbox_deltas_data_tmp = bbox_deltas_data + delta_offset; + auto proposals_data_tmp = proposals_data + delta_offset; + auto anchor_width = anchor_data_tmp[2] - anchor_data_tmp[0] + normalized; + auto anchor_height = anchor_data_tmp[3] - anchor_data_tmp[1] + normalized; + auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; + auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; + Dtype bbox_center_x = 0, bbox_center_y = 0; + Dtype bbox_width = 0, bbox_height = 0; + + if (fix_size_var == FROM_INPUT_VAR) { + int var_offset = param.axis == 0 ? col_id * var_len : row_id * var_len; + auto variances_data_tmp = variances_data + var_offset; + bbox_center_x = + variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data_tmp[1] * + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(variances_data_tmp[2] * + bbox_deltas_data_tmp[2]) * anchor_width; + bbox_height = std::exp(variances_data_tmp[3] * + bbox_deltas_data_tmp[3]) * anchor_height; + } + + if (fix_size_var == FIX_SIZE_VAR) { + bbox_center_x = + variances_data[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[1] * + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(variances_data[2] * + bbox_deltas_data_tmp[2]) * anchor_width; + bbox_height = std::exp(variances_data[3] * + bbox_deltas_data_tmp[3]) * anchor_height; + + } else if (fix_size_var == NO_VAR) { + bbox_center_x = + bbox_deltas_data_tmp[0] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(bbox_deltas_data_tmp[2]) * anchor_width; + bbox_height = std::exp(bbox_deltas_data_tmp[3]) * anchor_height; + } + + proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; + proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; + proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized; + proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized; + } + } +} + +template +SaberStatus SaberBoxCoder::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, BoxCoderParam& param) { + Tensor* anchor = inputs[0]; + Tensor* delta = inputs[1]; + Tensor* variances = nullptr; + Tensor* proposal = outputs[0]; + + if (param.variance() != nullptr && param.variance()->valid_size() > 0) { + variances = param.variance(); + CHECK(variances->valid_size() == 4); + box_coder(proposal, anchor, delta, variances, param); + } else if (inputs.size() >= 3) { + variances = inputs[2]; + box_coder(proposal, anchor, delta, variances, param); + } else { + box_coder(proposal, anchor, delta, variances, param); + } + + return SaberSuccess; +} + +template class SaberBoxCoder; +DEFINE_OP_TEMPLATE(SaberBoxCoder, BoxCoderParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberBoxCoder, BoxCoderParam, X86, AK_INT8); +} //namespace anakin + +} //name diff --git a/saber/funcs/impl/x86/saber_box_coder.h b/saber/funcs/impl/x86/saber_box_coder.h new file mode 100644 index 000000000..d16906dcb --- /dev/null +++ b/saber/funcs/impl/x86/saber_box_coder.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2019 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_BOX_CODER_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_BOX_CODER_H +#include "anakin_config.h" +#include "saber/funcs/impl/impl_box_coder.h" +#include "saber/core/tensor.h" +namespace anakin { + +namespace saber { + +template +class SaberBoxCoder : \ + public ImplBase < + X86, + OpDtype, + BoxCoderParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberBoxCoder() = default; + ~SaberBoxCoder() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + BoxCoderParam& param, Context& ctx) { + //get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + BoxCoderParam& param, Context& ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + BoxCoderParam& param)override; + +private: +}; +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_BOX_CODER_H diff --git a/saber/funcs/impl/x86/saber_cast.cpp b/saber/funcs/impl/x86/saber_cast.cpp index 88d4d4a14..7bb0c2d6a 100644 --- a/saber/funcs/impl/x86/saber_cast.cpp +++ b/saber/funcs/impl/x86/saber_cast.cpp @@ -30,14 +30,14 @@ SaberStatus SaberCast::dispatch(const std::vector*>& i cast_kernel(in_data, out_data, count); } - } - - if(inputs[0]->get_dtype() == 5){//AK_INT32 + } else if (inputs[0]->get_dtype() == 5){//AK_INT32 const int* in_data = (const int*)inputs[0]->data(); float* out_data = (float*)outputs[0]->mutable_data(); if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { cast_kernel(in_data, out_data, count); } + } else { + outputs[0]->copy_from(*inputs[0]); } return SaberSuccess; diff --git a/saber/funcs/impl/x86/saber_col2im_deconv.cpp b/saber/funcs/impl/x86/saber_col2im_deconv.cpp index 00ccc17bd..b478e77bc 100644 --- a/saber/funcs/impl/x86/saber_col2im_deconv.cpp +++ b/saber/funcs/impl/x86/saber_col2im_deconv.cpp @@ -7,13 +7,16 @@ namespace saber { void fill_bias_relu(float* tensor, const float* bias, int channel, int channel_size, bool flag_relu) { float* data = tensor; + for (int j = 0; j < channel; ++j) { for (int i = 0; i < channel_size; i++) { data[i] += bias[j]; + if (flag_relu) { data[i] = data[i] > 0 ? data[i] : 0.f; } } + data += channel_size; } } @@ -21,12 +24,14 @@ void fill_bias_relu(float* tensor, const float* bias, int channel, int channel_s void fill_relu(float* tensor, int channel, int channel_size, bool flag_relu) { float* data = tensor; + for (int j = 0; j < channel; ++j) { for (int i = 0; i < channel_size; i++) { if (flag_relu) { data[i] = data[i] > 0 ? 
data[i] : 0.f; } } + data += channel_size; } } @@ -64,10 +69,112 @@ void col2im(const Dtype* data_col, const int channels, if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; } + data_col++; input_col += stride_w; } } + + input_row += stride_h; + } + } + } + } +} + +template +void col2im_par(const Dtype* data_col, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + Dtype* data_im, bool with_bias = false, const Dtype* bias = nullptr) { + int dil_patch_h = (kernel_h - 1) * dilation_h + 1; + int dil_patch_w = (kernel_w - 1) * dilation_w + 1; + int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1; + long chunk_len = kernel_h * kernel_w; + + if (with_bias) { + int channel_size = width * height; + #pragma omp parallel for schedule(static) if(channels>1) + + for (int j = 0; j < channels; ++j) { + float* data_out = data_im + j * channel_size; + float value = bias[j]; + + for (int i = 0; i < channel_size; i++) { + data_out[i] = value; + } + } + } else { + memset(data_im, 0, height * width * channels * sizeof(Dtype)); + } + + #pragma omp parallel for schedule(static) + + for (int idx = 0; idx < channels; ++idx) { + for (int inner_idx = 0; inner_idx < chunk_len; ++inner_idx) { + int c = idx * chunk_len + inner_idx; + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; + + const int hc0 = h_offset * dilation_h - pad_h; + const int wc0 = w_offset * dilation_w - pad_w; + + for (int h = 0; h < height_col; ++h) { + for (int w = 0; w < width_col; ++w) { + int h_pad = h * stride_h + hc0; + const int srow_offset = (c_im * height + h_pad) * width; + const int row_offset = (c * height_col + h) * width_col; + int w_pad = w * stride_w + wc0; + + if ((((unsigned)h_pad) < ((unsigned)height)) && (((unsigned)w_pad) < ((unsigned)width))) { + data_im[srow_offset + w_pad] += data_col[row_offset + w]; + } + } + } + } + } +} + +template +void col2im_par_me(const Dtype* data_col, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + Dtype* data_im) { + + memset(data_im, 0, height * width * channels * sizeof(Dtype)); + const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channel_size = height * width; + #pragma omp parallel for schedule(static) + + for (int channel = channels; channel > 0 ; channel--) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_row = -pad_h + kernel_row * dilation_h; + float* data_im_to = data_im + (channels - channel) * channel_size; + + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + data_col += output_w; + } else { + int input_col = -pad_w + kernel_col * dilation_w; + + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + data_im_to[input_row * width + input_col] += *data_col; + } + + data_col++; + input_col 
+= stride_w; + } + } + input_row += stride_h; } } @@ -77,8 +184,8 @@ void col2im(const Dtype* data_col, const int channels, template <> SaberStatus SaberCol2ImDeconv::create(const std::vector *>& inputs, - std::vector*>& outputs, - ConvParam ¶m, Context&ctx) { + std::vector*>& outputs, + ConvParam& param, Context& ctx) { this->_ctx = &ctx; int win = inputs[0]->width(); @@ -97,6 +204,7 @@ SaberStatus SaberCol2ImDeconv::create(const std::vector *> CHECK_EQ(chin % param.group, 0) << "input channel or group size error"; CHECK_EQ(chout % param.group, 0) << "output channel or group size error"; } + Shape workspace_shape({1, 1, 1, param.group* _m * _n}); workspace_tensor.re_alloc(workspace_shape, AK_FLOAT); @@ -106,16 +214,16 @@ SaberStatus SaberCol2ImDeconv::create(const std::vector *> template <> SaberStatus SaberCol2ImDeconv::init(const std::vector *>& inputs, - std::vector*>& outputs, - ConvParam ¶m, Context&ctx) { + std::vector*>& outputs, + ConvParam& param, Context& ctx) { this->_ctx = &ctx; return create(inputs, outputs, param, ctx); } template <> SaberStatus SaberCol2ImDeconv::dispatch(const std::vector *>& inputs, - std::vector*>& outputs, - ConvParam ¶m) { + std::vector*>& outputs, + ConvParam& param) { bool bias_term = param.bias() != nullptr && param.bias()->valid_size() > 0; int win = inputs[0]->width(); int hin = inputs[0]->height(); @@ -130,16 +238,10 @@ SaberStatus SaberCol2ImDeconv::dispatch(const std::vector int _m = chout * _kw * _kh / param.group; int _n = hin * win; - int _k = chin / param.group; - int group = param.group; int group_size_in = win * hin * chin / group; - int group_size_out = wout * hout * chout / group; int group_size_coldata = _m * _n; int group_size_weights = chin * chout * _kw * _kh / (group * group); - bool flag_1x1s1p1 = (_kw == 1) && (_kh == 1) && (param.stride_h == 1) && \ - (param.stride_w == 1) && (param.pad_w == 1) && (param.pad_h == 1) && \ - (param.dilation_w == 1) && (param.dilation_h == 1); bool with_relu = (param.activation_param.active == Active_relu); const float* din = static_cast(inputs[0]->data()); @@ -152,25 +254,31 @@ SaberStatus SaberCol2ImDeconv::dispatch(const std::vector float* dout_batch = dout + i * chout * hout * wout; float* col_data = workspace_ptr; + for (int g = 0; g < param.group; ++g) { const float* din_group = din_batch + g * group_size_in; const float* weights_group = weights + g * group_size_weights; float* coldata_group = col_data + g * group_size_coldata; _gemm.dispatch(1.f, 0.f, weights_group, din_group, coldata_group); } - col2im(col_data, chout, hout, wout, _kh, _kw, param.pad_h, param.pad_w, \ - param.stride_h, param.stride_w, param.dilation_h, param.dilation_w, \ - dout_batch); - //! 
add bias if (bias_term) { - fill_bias_relu(dout_batch, static_cast(param.bias()->data()), chout, wout * hout, - with_relu); + col2im_par(col_data, chout, hout, wout, _kh, _kw, param.pad_h, param.pad_w, \ + param.stride_h, param.stride_w, param.dilation_h, param.dilation_w, \ + dout_batch, bias_term, static_cast(param.bias()->data())); } else { + col2im_par(col_data, chout, hout, wout, _kh, _kw, param.pad_h, param.pad_w, \ + param.stride_h, param.stride_w, param.dilation_h, param.dilation_w, \ + dout_batch); + } + + if (with_relu) { fill_relu(dout_batch, chout, wout * hout, with_relu); } } + + return SaberSuccess; } } } diff --git a/saber/funcs/impl/x86/saber_concat.cpp b/saber/funcs/impl/x86/saber_concat.cpp index 8319492ab..fa5b28d5f 100644 --- a/saber/funcs/impl/x86/saber_concat.cpp +++ b/saber/funcs/impl/x86/saber_concat.cpp @@ -1,8 +1,8 @@ #include "saber/funcs/impl/x86/saber_concat.h" -namespace anakin{ +namespace anakin { -namespace saber{ +namespace saber { template void concat_kernel(const int len, const dtype* src, dtype* dst) { @@ -10,10 +10,19 @@ void concat_kernel(const int len, const dtype* src, dtype* dst) { memcpy(dst, src, sizeof(dtype) * len); } } +template <> +SaberStatus SaberConcat::create(const std::vector*>& inputs, + std::vector*>& outputs, + ConcatParam &param, Context &ctx){ -template -SaberStatus SaberConcat::dispatch(const std::vector*>& inputs, - std::vector*>& outputs, ConcatParam &param) { + _num_concats = inputs[0]->count_valid(0, param.axis); + _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); + return SaberSuccess; +} + +template <> +SaberStatus SaberConcat::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, ConcatParam& param) { int input_size = inputs.size(); //! get output data, valid shape and stride shape @@ -21,6 +30,39 @@ SaberStatus SaberConcat::dispatch(const std::vector*>& Shape out_shape = outputs[0]->valid_shape(); const int out_concat_axis = out_shape[param.axis]; + if (inputs[0]->get_layout() == Layout_NCHW_C8R) { + for (int i = 1; i < input_size; i++) { + CHECK_EQ(inputs[i]->get_layout(), Layout_NCHW_C8R) << "concat input layout should be equal"; + } + + CHECK_EQ(outputs[0]->get_layout(), Layout_NCHW_C8R) << "concat output layout should be equal"; + + if (inputs.size() == 1) { + outputs[0]->copy_from(*inputs[0]); + return SaberSuccess; + } + + OpDataType* dout = (OpDataType*)outputs[0]->mutable_data(); + + for (int i = 0; i < input_size; ++i) { + Shape sh_in = inputs[i]->valid_shape(); + const OpDataType* din = (const OpDataType*)inputs[i]->data(); + const int in_concat_axis = sh_in[param.axis]; + + for (int n = 0; n < _num_concats; ++n) { + concat_kernel(in_concat_axis * _concat_input_size, + din + n * in_concat_axis * _concat_input_size, + dout + (n * out_concat_axis + offset_concat_axis) + * _concat_input_size); + } + + offset_concat_axis += in_concat_axis; + } + + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; + } + if (inputs.size() == 1) { outputs[0]->copy_from(*inputs[0]); return SaberSuccess; @@ -32,21 +74,72 @@ SaberStatus SaberConcat::dispatch(const std::vector*>& Shape sh_in = inputs[i]->valid_shape(); const OpDataType* din = (const OpDataType*)inputs[i]->data(); const int in_concat_axis = sh_in[param.axis]; + for (int n = 0; n < _num_concats; ++n) { concat_kernel(in_concat_axis * _concat_input_size, - din + n * in_concat_axis * _concat_input_size, - dout + (n * out_concat_axis + offset_concat_axis) - * _concat_input_size); + din + n * in_concat_axis * _concat_input_size, +
dout + (n * out_concat_axis + offset_concat_axis) + * _concat_input_size); } + offset_concat_axis += in_concat_axis; } + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); return SaberSuccess; } +template <> +SaberStatus SaberConcat::create(const std::vector*>& inputs, + std::vector*>& outputs, + ConcatParam ¶m, + Context &ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberConcat::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConcatParam ¶m) { + + return SaberSuccess; +} + +template +SaberStatus SaberConcat::init_conf(jit::jit_concat_conf_t &jpp, + const std::vector*> &inputs, + std::vector*> &outputs, + ConcatParam ¶m){ + return SaberSuccess; +}; + +template +SaberStatus SaberConcat::check_conf(const jit::jit_concat_conf_t &jpp, + const std::vector*> &inputs, + std::vector*> &outputs, + ConcatParam ¶m){ + return SaberSuccess; +}; +template <> +SaberStatus SaberConcat::init_conf(jit::jit_concat_conf_t &jpp, + const std::vector*> &inputs, + std::vector*> &outputs, + ConcatParam ¶m) { + return SaberSuccess; +} + +template <> +SaberStatus SaberConcat::check_conf(const jit::jit_concat_conf_t &jpp, + const std::vector*> &inputs, + std::vector*> &outputs, + ConcatParam ¶m) { + return SaberSuccess; +} + + template class SaberConcat; DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, X86, AK_HALF); -DEFINE_OP_TEMPLATE(SaberConcat, ConcatParam, X86, AK_INT8); } //namespace anakin } //namespace anakin diff --git a/saber/funcs/impl/x86/saber_concat.h b/saber/funcs/impl/x86/saber_concat.h index 1566d3d33..3d5a1e2f1 100644 --- a/saber/funcs/impl/x86/saber_concat.h +++ b/saber/funcs/impl/x86/saber_concat.h @@ -20,6 +20,7 @@ #include "saber/funcs/impl/impl_concat.h" #include "saber/core/tensor.h" +#include "saber/funcs/impl/x86/kernel/jit_call_conf.h" namespace anakin{ namespace saber{ @@ -33,8 +34,45 @@ class SaberConcat : \ public: typedef typename DataTrait::Dtype OpDataType; - SaberConcat() = default; - ~SaberConcat() {} + SaberConcat() : _num_concats(0), _concat_input_size(0), + dst_data_(nullptr), + srcs_data_(nullptr), src_with_offset_(nullptr), + tail_(nullptr), ic_(nullptr), + nb_ic_(nullptr), scale_(nullptr), + block_(nullptr){ + + }; + ~SaberConcat() { + + if (srcs_data_ != nullptr) { + delete srcs_data_; + srcs_data_ = nullptr; + } + if (src_with_offset_ != nullptr) { + delete src_with_offset_; + src_with_offset_ = nullptr; + } + if (tail_ != nullptr) { + delete tail_; + tail_ = nullptr; + } + if (ic_ != nullptr) { + delete ic_; + ic_ = nullptr; + } + if (nb_ic_ != nullptr) { + delete nb_ic_; + nb_ic_ = nullptr; + } + if (scale_ != nullptr) { + delete scale_; + scale_ = nullptr; + } + if (block_ != nullptr) { + delete block_; + block_ = nullptr; + } + } virtual SaberStatus init(const std::vector*>& inputs, std::vector*>& outputs, @@ -46,20 +84,34 @@ class SaberConcat : \ virtual SaberStatus create(const std::vector*>& inputs, std::vector*>& outputs, - ConcatParam ¶m, Context &ctx){ - - _num_concats = inputs[0]->count_valid(0, param.axis); - _concat_input_size = inputs[0]->count_valid(param.axis + 1, inputs[0]->dims()); - return SaberSuccess; - } + ConcatParam ¶m, Context &ctx)override; virtual SaberStatus dispatch(const std::vector*>& inputs, std::vector*>& outputs, ConcatParam ¶m)override; + private: int _num_concats; int _concat_input_size; + + unsigned long* tail_; + unsigned int* ic_; + unsigned int* nb_ic_; + unsigned int* block_; + float* scale_; + unsigned char* dst_data_; + const unsigned char** srcs_data_; + const unsigned char** src_with_offset_; 
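+    // Note (hedged): the raw buffers above appear to back the jit-based int8 concat path,
+    // whose create()/dispatch() specializations in saber_concat.cpp are still stubs in this
+    // patch. They are released with scalar `delete` in ~SaberConcat(), so if the (not shown)
+    // allocations ever use `new[]`, the destructor would need `delete[]` instead.
+    // For the AK_FLOAT path, dispatch() copies one contiguous slice per source tensor using
+    //   src offset = n * in_concat_axis * _concat_input_size
+    //   dst offset = (n * out_concat_axis + offset_concat_axis) * _concat_input_size,
+    // where _num_concats = count_valid(0, axis) and _concat_input_size = count_valid(axis + 1, dims).
+    // Hypothetical example: concatenating NCHW tensors [2, 3, 4, 4] and [2, 5, 4, 4] along
+    // axis 1 gives out_concat_axis = 8 and _concat_input_size = 16, so batch n of the second
+    // input is written starting at dst offset (n * 8 + 3) * 16.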
+ virtual SaberStatus init_conf(jit::jit_concat_conf_t &jpp, + const std::vector*> &inputs, + std::vector*> &outputs, + ConcatParam ¶m); + + virtual SaberStatus check_conf(const jit::jit_concat_conf_t &jpp, + const std::vector*> &inputs, + std::vector*> &outputs, + ConcatParam ¶m); }; } //namespace saber diff --git a/saber/funcs/impl/x86/saber_conv.cpp b/saber/funcs/impl/x86/saber_conv.cpp index 9d1048f65..251092de7 100644 --- a/saber/funcs/impl/x86/saber_conv.cpp +++ b/saber/funcs/impl/x86/saber_conv.cpp @@ -2,13 +2,15 @@ #include "saber/funcs/impl/x86/saber_conv.h" #include "saber/funcs/impl/x86/saber_im2col_conv.h" #include "saber/funcs/impl/x86/kernel/jit_avx2_conv.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_group_conv.h" #include "saber/funcs/impl/x86/kernel/jit_uni_dwconv.h" #include "saber/funcs/impl/x86/kernel/jit_avx512_conv1x1.h" #include "saber/funcs/impl/x86/kernel/jit_avx512_conv.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_conv.h" -#include "saber/funcs/impl/x86/kernel/jit_avx512_core_u8s8s32x_1x1_conv.h" -#include "saber/funcs/impl/x86/gemm_u8s8s32x_conv.h" - +#include "saber/funcs/impl/x86/gemm_x8s8s32x_conv.h" +#include "saber/funcs/impl/x86/saber_conv_1x1.h" +#include "saber/funcs/impl/x86/kernel/jit_uni_dwconv.h" +#include "saber/funcs/impl/x86/winograd.h" +#include "saber/funcs/debug.h" namespace anakin { namespace saber { @@ -23,7 +25,23 @@ SaberStatus SaberConv2D::create(const std::vector *>& elt_param.has_eltwise = false; ConvEltwiseParam conv_elt_param(param, elt_param); - return this->impl->create(inputs, outputs, conv_elt_param, ctx); + if (_input_trans) { + int in = inputs[0]->num(); + int ic = inputs[0]->channel(); + int ih = inputs[0]->height(); + int iw = inputs[0]->width(); + utils::try_expand_tensor(_input_trans_tensor, Shape({in, ic, ih, iw}, + _input_trans_tensor.get_layout())); + _input_trans_tensor.set_seq_offset(inputs[0]->get_seq_offset()); + } + + if (_input_trans) { + return this->impl->create(_fake_input_vec, outputs, conv_elt_param, ctx); + } else { + return this->impl->create(inputs, outputs, conv_elt_param, ctx); + } + + return SaberSuccess; } template <> @@ -34,24 +52,82 @@ SaberStatus SaberConv2D::init(const std::vector *>& i EltwiseParam elt_param(Eltwise_sum); elt_param.has_eltwise = false; ConvEltwiseParam conv_elt_param(param, elt_param); - bool use_avx512 = false;//mayiuse(avx512_common); + bool use_avx512 = mayiuse(avx512_common); bool use_avx2 = mayiuse(avx2); - - if (use_avx512 && param.group == inputs[0]->channel() && param.group == outputs[0]->channel()) { - this->impl = new JitUniDWConv; - } else if (use_avx512 && param.weight()->height() == 1 && param.weight()->width() == 1) { - this->impl = new JitAvx512Conv1x1; - } else if (use_avx512 && outputs[0]->get_layout() == Layout_NCHW_C16) { - this->impl = new JitAvx512Conv; - } else if (use_avx2 && (outputs[0]->get_layout() == Layout_NCHW_C8)) { - this->impl = new JitAvx2Conv; + int group = param.group; + int oc = outputs[0]->channel(); + int ic = inputs[0]->channel(); + int kh = param.weight()->height(); + int kw = param.weight()->width(); + int pad_h = param.pad_h; + int pad_w = param.pad_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + int dilation_h = param.dilation_h; + int dilation_w = param.dilation_w; + int ih = inputs[0]->height(); + int iw = inputs[0]->width(); + int in = inputs[0]->num(); + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType out_layout = outputs[0]->get_layout(); + + bool conv_1x1_flag = (kh == 1 
&& kw == 1) && (pad_h == 0 && pad_w == 0) && (stride_h == 1 + && stride_w == 1) && group == 1; + bool is_c16 = (input_layout == Layout_NCHW_C16R) && (out_layout == Layout_NCHW_C16R) ; + bool is_strict_c16 = is_c16 && (ic % 16 == 0 && oc % 16 == 0); + bool is_first_c16 = (input_layout == Layout_NCHW) && (ic == 1 || ic == 3) + && (out_layout == Layout_NCHW_C16R || out_layout == Layout_NHWC); + bool is_c8 = (input_layout == Layout_NCHW_C8R) && (out_layout == Layout_NCHW_C8R); + bool is_strict_c8 = is_c8 && (ic % 8 == 0 && oc % 8 == 0); + bool is_c8_in = (input_layout == Layout_NCHW_C8R); + bool is_strict_c8_in = is_c8_in && (ic % 8 == 0 && oc % 8 == 0); + bool is_c8_out = (out_layout == Layout_NCHW_C8R); + bool is_strict_c8_out = is_c8_out && (ic % 8 == 0 && oc % 8 == 0); + + bool is_winorgrad = (kh == 3 && kw == 3) && (stride_h == 1 && stride_w == 1) && (dilation_h == 1 + && dilation_w == 1) && group == 1; +#ifndef USE_SGX + + if (is_winorgrad && (oc >= 16 && ic >= 16 && ih >= 12 && iw >= 12) + && (((input_layout == Layout_NCHW) && (out_layout == Layout_NCHW)))) { + this->impl = new SaberConvWinograd; + } else +#endif + if (conv_1x1_flag && (input_layout == Layout_NCHW) && (out_layout == Layout_NCHW)) { + this->impl = new SaberConv1X1; + } else if ((use_avx2 || use_avx512) && (oc == group && ic == group) && (is_strict_c8_out + || is_strict_c16)) { + if (is_strict_c8_out && input_layout != Layout_NCHW_C8R) { + _input_trans = true; + _input_trans_tensor.re_alloc(Shape({in, ic, ih, iw}, Layout_NCHW_C8R)); + _input_trans_tensor.set_seq_offset(inputs[0]->get_seq_offset()); + } + + this->impl = new JitUniDWConv; + } else if (use_avx512 && conv_1x1_flag && is_strict_c16) { + this->impl = new JitAvx512Conv1x1; + } else if (use_avx512 && param.group == 1 && (is_strict_c16 || is_first_c16)) { + this->impl = new JitAvx512Conv; + } else if (use_avx2 && param.group == 1 && pad_w <= 3) { + this->impl = new JitAvx2Conv; + } else if (use_avx2 && param.group != 1 && is_strict_c8_in && pad_w <= 3) { + this->impl = new JitAvx2GroupConv; + } else if (input_layout == Layout_NCHW && out_layout == Layout_NCHW) { + this->impl = new SaberIm2colConv; + } else { + LOG(FATAL) << "not support conv for in shape = " << inputs[0]->valid_shape() << ", out shape " + << outputs[0]->valid_shape() << ", group = " << group; + } + + _fake_input_vec.push_back(&_input_trans_tensor); + + if (_input_trans) { + return this->impl->init(_fake_input_vec, outputs, conv_elt_param, ctx); } else { - this->impl = new SaberIm2colConv; + return this->impl->init(inputs, outputs, conv_elt_param, ctx); } - this->impl->init(inputs, outputs, conv_elt_param, ctx); - return create(inputs, outputs, param, ctx); - + return SaberSuccess; } template <> @@ -62,7 +138,16 @@ dispatch(const std::vector *>& inputs, EltwiseParam elt_param(Eltwise_sum); elt_param.has_eltwise = false; ConvEltwiseParam conv_elt_param(param, elt_param); - return this->impl->dispatch(inputs, outputs, conv_elt_param); + + if (_input_trans) { + _input_trans_tensor.set_seq_offset(inputs[0]->get_seq_offset()); + input_reorder_nChwc8(*inputs[0], _input_trans_tensor); + return this->impl->dispatch(_fake_input_vec, outputs, conv_elt_param); + } else { + return this->impl->dispatch(inputs, outputs, conv_elt_param); + } + + return SaberSuccess; } @@ -71,12 +156,8 @@ SaberStatus SaberConv2D::\ create(const std::vector *>& inputs, std::vector *>& outputs, ConvParam& param, Context& ctx) { - this->_ctx = &ctx; - EltwiseParam elt_param(Eltwise_sum); - elt_param.has_eltwise = false; - 
ConvEltwiseParam conv_elt_param(param, elt_param); - return this->impl->create(inputs, outputs, conv_elt_param, ctx); + return SaberSuccess; } template <> @@ -84,28 +165,9 @@ SaberStatus SaberConv2D::\ init(const std::vector *>& inputs, std::vector *>& outputs, ConvParam& param, Context& ctx) { - this->_ctx = &ctx; - EltwiseParam elt_param(Eltwise_sum); - elt_param.has_eltwise = false; - ConvEltwiseParam conv_elt_param(param, elt_param); - ConvParam* conv_param = &(param); - - int kernel_h = param.weight()->height(); - int kernel_w = param.weight()->width(); - Shape src_shape(inputs[0]->shape()); - Shape dst_shape(outputs[0]->shape()); - int ic = src_shape[3], oc = dst_shape[3]; - - if (ic & 0xf || oc & 0xf) { - this->impl = new GemmU8S8S32XConv(); - } else if (kernel_h == 1 && kernel_w == 1 && conv_param->pad_h == 0 && conv_param->pad_w == 0 - && conv_param->stride_h == 1 && conv_param->stride_w == 1 && conv_param->group == 1) { - this->impl = new JitAvx512u8s8s32xConv1x1(); - } else { - this->impl = new JitAvx512U8S8S32XConv(); - } - return this->impl->init(inputs, outputs, conv_elt_param, ctx); + + return SaberSuccess; } template <> @@ -113,13 +175,12 @@ SaberStatus SaberConv2D::\ dispatch(const std::vector *>& inputs, std::vector *>& outputs, ConvParam& param) { - EltwiseParam elt_param(Eltwise_sum); - elt_param.has_eltwise = false; - ConvEltwiseParam conv_elt_param(param, elt_param); - return this->impl->dispatch(inputs, outputs, conv_elt_param); + + return SaberSuccess; } + DEFINE_OP_TEMPLATE(SaberConv2D, ConvParam, X86, AK_HALF); } } diff --git a/saber/funcs/impl/x86/saber_conv.h b/saber/funcs/impl/x86/saber_conv.h index 63e1cdbfc..e53f686ef 100644 --- a/saber/funcs/impl/x86/saber_conv.h +++ b/saber/funcs/impl/x86/saber_conv.h @@ -58,7 +58,14 @@ class SaberConv2D : public ImplBase< } private: + std::vector*> _fake_input_vec; + Tensor _input_trans_tensor; + bool _input_trans{false}; Impl_t* impl; + Tensor _input_scale; + Tensor _output_scale; + std::vector *> _input_vec; + std::vector *> _output_vec; }; } // namespace saber diff --git a/saber/funcs/impl/x86/saber_conv_1x1.cpp b/saber/funcs/impl/x86/saber_conv_1x1.cpp new file mode 100644 index 000000000..0d09b1929 --- /dev/null +++ b/saber/funcs/impl/x86/saber_conv_1x1.cpp @@ -0,0 +1,115 @@ +#include "saber/funcs/impl/x86/saber_conv_1x1.h" +#include "mkl_cblas.h" +#include "saber/funcs/timer.h" + +namespace anakin { +namespace saber { +//inline +static inline void gemm(const bool trans_a, const bool transb, int m, int n, int k, + const float alpha, + const float* a, const float* b, const float beta, float* c) { + // cout << "(" << m << "," << n << "," << k << ")" << endl; + int lda = (!trans_a/* == CblasNoTrans*/) ? k : m; + int ldb = (!transb/* == CblasNoTrans*/) ? n : k; + CBLAS_TRANSPOSE cblas_transa = + (!trans_a/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE cblas_transb = + (!transb/* == CblasNoTrans*/) ? 
CblasNoTrans : CblasTrans; + // LOG(INFO)<<"m "< +SaberStatus SaberConv1X1::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + ConvParam* conv_param = ¶m.conv_param; + _out_c = conv_param->weight()->num(); + _in_c = conv_param->weight()->channel(); + int h = inputs[0]->height(); + int w = inputs[0]->width(); + _in_inner_size = h * w; + _num_input = inputs[0]->num(); + _num_size_in = _in_c * h * w; + _num_size_out = _out_c * h * w; + + _add_output = 0.f; + + if (param.eltwise_param.has_eltwise) { + _add_output = 1.f; + } + + DLOG(INFO) << "flag :" << _flag_bias << "," << _flag_relu << "," << _flag_neg; + return SaberSuccess; +} + +template <> +SaberStatus SaberConv1X1::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + ConvParam* conv_param = ¶m.conv_param; + EltwiseParam* elt_param = ¶m.eltwise_param; + _flag_bias = (conv_param->bias() != nullptr) && (conv_param->bias()->valid_size() > 0); + + if (conv_param->activation_param.active == Active_relu) { + _flag_relu = true; + _flag_neg = conv_param->activation_param.negative_slope != 0.f; + _neg_slope = conv_param->activation_param.negative_slope; + } else if (elt_param->activation_param.active == Active_relu) { + _flag_relu = true; + _flag_neg = elt_param->activation_param.negative_slope != 0.f; + _neg_slope = elt_param->activation_param.negative_slope; + } else { + _flag_relu = false; + _flag_neg = false; + _neg_slope = 0.f; + } + + _bias_utils.reset(_flag_bias, _flag_relu, _flag_neg); + + + + return create(inputs, outputs, param, ctx); + +} + +template <> +SaberStatus SaberConv1X1::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param) { + + ConvParam* conv_param = ¶m.conv_param; + const float* weights_data = static_cast(conv_param->weight()->data()); + const float* in_data = static_cast(inputs[0]->data()); + float* out_data = static_cast(outputs[0]->mutable_data()); + + + // SaberTimer timer; + // timer.start(*this->_ctx); + for (int batch_id = 0; batch_id < inputs[0]->num(); batch_id++) { + gemm(false, false, _out_c, _in_inner_size, _in_c, 1.f, weights_data, + &in_data[0 + batch_id * _in_c * _in_inner_size], _add_output, + &out_data[0 + batch_id * _out_c * _in_inner_size]); + } + + // timer.end(*this->_ctx); + // double use_ms=timer.get_average_ms(); + // double work_load=(double)_out_c*_in_inner_size*_in_c*2; + // double speed=work_load/use_ms/1000.0/1000.0; + // LOG(INFO)<<"speed "<(conv_param->bias()->data()); + } + + _bias_utils.run(out_data, _bias, _num_input, _out_c, _in_inner_size, _neg_slope); + + return SaberSuccess; +} + +} +} diff --git a/saber/funcs/impl/x86/saber_conv_1x1.h b/saber/funcs/impl/x86/saber_conv_1x1.h new file mode 100644 index 000000000..3b15dcee8 --- /dev/null +++ b/saber/funcs/impl/x86/saber_conv_1x1.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_1X1_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_1X1_H + +#include "saber/funcs/impl/impl_conv.h" +#include "saber/core/tensor.h" + +namespace anakin { +namespace saber { + +class BiasReluUtis { +public: + BiasReluUtis() { + + } + void reset(bool flag_bias, bool flag_relu, bool neg_relu) { + if (flag_bias && flag_relu && neg_relu) { + func = bias_relu; + } else if (flag_bias && flag_relu && !neg_relu) { + func = bias_relu; + } else if (flag_bias && !flag_relu && !neg_relu) { + func = bias_relu; + } else if (!flag_bias && flag_relu && neg_relu) { + func = bias_relu; + } else if (!flag_bias && flag_relu && !neg_relu) { + func = bias_relu; + } else if (!flag_bias && !flag_relu){ + func = bias_relu; + }else{ + LOG(FATAL) << "invalid init BiasReluUtis"; + } + } + + void run(float* output, const float* bias, int batch_size, int out_c, int out_stride, + float negative_slope) { + + func(output, bias, batch_size, out_c, out_stride, negative_slope); + } + + + template + static void bias_relu(float* output, const float* bias, int batch_size, int out_c, int out_stride, + float negative_slope) { + int batch_stride = out_c * out_stride; + if (flag_bias && !flag_relu) { + #pragma omp parallel for collapse(3) schedule(static) + + for (int i = 0; i < batch_size; i++) { + for (int oc = 0; oc < out_c; ++oc) { + for (int inner_id = 0; inner_id < out_stride; ++inner_id) { + int id = i * batch_stride + oc * out_stride + inner_id; + output[id] += bias[oc]; + } + } + } + } else if (!flag_bias && flag_relu) { + #pragma omp parallel for collapse(3) schedule(static) + + for (int i = 0; i < batch_size; i++) { + for (int oc = 0; oc < out_c; ++oc) { + for (int inner_id = 0; inner_id < out_stride; ++inner_id) { + int id = i * batch_stride + oc * out_stride + inner_id; + + if (neg_relu) { + if (output[id] < 0.f) { + output[id] = output[id] * negative_slope; + } + } else { + if (output[id] < 0.f) { + output[id] = 0.f; + } + } + } + } + } + } else if (flag_bias && flag_relu) { + #pragma omp parallel for collapse(3) schedule(static) + + for (int i = 0; i < batch_size; i++) { + for (int oc = 0; oc < out_c; ++oc) { + for (int inner_id = 0; inner_id < out_stride; ++inner_id) { + int id = i * batch_stride + oc * out_stride + inner_id; + float temp = output[id]; + temp += bias[oc]; + + if (neg_relu) { + if (temp < 0.f) { + temp = temp * negative_slope; + } + } else { + if (temp < 0.f) { + temp = 0.f; + } + } + + output[id] = temp; + } + } + } + } + } + + +private: + std::function func; + // void (*func)(float* output,const float* bias,int batch_size,int out_c, int out_stride,float negative_slope); + +}; + +template +class SaberConv1X1: public ImplBase < + X86, OpDtype, ConvEltwiseParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberConv1X1() + {} + + ~SaberConv1X1() { + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvEltwiseParam& param); + +private: + BiasReluUtis _bias_utils; + + bool _flag_relu; + bool _flag_neg; + bool _flag_bias; + float _neg_slope; + + int _out_c; + int _in_c; + int h; + int w; + int _in_inner_size; + int _num_input; + int _num_size_in; + int _num_size_out; + float _add_output; + const OpDataType* _bias; +}; + + +} // 
namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_SABER_CONV_H diff --git a/saber/funcs/impl/x86/saber_conv_eltwise.cpp b/saber/funcs/impl/x86/saber_conv_eltwise.cpp index 9343ecc77..d0e9b1d9d 100644 --- a/saber/funcs/impl/x86/saber_conv_eltwise.cpp +++ b/saber/funcs/impl/x86/saber_conv_eltwise.cpp @@ -1,13 +1,40 @@ -#include "saber/funcs/impl/x86/saber_conv.h" -#include "saber/funcs/impl/x86/saber_eltwise.h" #include "saber/funcs/impl/x86/saber_conv_eltwise.h" #include "saber/funcs/calibrate.h" #include "saber_conv_eltwise.h" +#include "saber/funcs/impl/x86/saber_im2col_conv.h" +#include "saber/funcs/impl/x86/kernel/jit_avx2_conv.h" +#include "saber/funcs/impl/x86/kernel/jit_uni_dwconv.h" +#include "saber/funcs/impl/x86/kernel/jit_avx512_conv1x1.h" +#include "saber/funcs/impl/x86/kernel/jit_avx512_conv.h" +#include "saber/funcs/impl/x86/gemm_x8s8s32x_conv.h" +#include "saber/funcs/impl/x86/saber_conv_1x1.h" +#include "saber/funcs/impl/x86/kernel/jit_uni_dwconv.h" + + namespace anakin { namespace saber { +template +SaberStatus SaberConvEltwise::trans_weights(Tensor &target_weights, Tensor &target_bias, + int pad_h, int pad_w, int dilation_h, int dilation_w, + int stride_h, int stride_w, int group){ + return SaberSuccess; +}; +template <> +SaberStatus SaberConvEltwise::trans_weights(Tensor &target_weights, Tensor &target_bias, + int pad_h, int pad_w, int dilation_h, int dilation_w, + int stride_h, int stride_w, int group){ + return SaberSuccess; +}; +//template <> +//SaberStatus SaberConvEltwise::trans_weights(Tensor &target_weights, Tensor &target_bias, +// int pad_h, int pad_w, int dilation_h, int dilation_w, +// int stride_h, int stride_w, int group){ +// return SaberSuccess; +//}; + template <> SaberStatus SaberConvEltwise::\ create(const std::vector *>& inputs, @@ -17,6 +44,24 @@ SaberStatus SaberConvEltwise::\ _ctx = &ctx; _inner_shape = conv_compute_shape(inputs[0]->valid_shape(), param.conv_param); + //choose impl kernel + bool use_avx512 = false;//mayiuse(avx512_common); + bool use_avx2 = mayiuse(avx2); + int group = param.conv_param.group; + int oc = outputs[0]->channel(); + int ic = inputs[0]->channel(); + int kh = _kernel_height; + int kw = _kernel_width; + int pad_h = param.conv_param.pad_h; + int pad_w = param.conv_param.pad_w; + int stride_h = param.conv_param.stride_h; + int stride_w = param.conv_param.stride_w; + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType out_layout = outputs[0]->get_layout(); + if (_do_in_impl){ + this->_impl->create(inputs, outputs, param, ctx); + } + return SaberSuccess; } @@ -31,13 +76,58 @@ SaberStatus SaberConvEltwise:: _kernel_height = param.conv_param.weight()->height(); _kernel_width = param.conv_param.weight()->width(); - { + //choose impl kernel + bool use_avx512 = false;//mayiuse(avx512_common); + bool use_avx2 = mayiuse(avx2); + int group = param.conv_param.group; + int oc = outputs[0]->channel(); + int ic = inputs[0]->channel(); + int kh = _kernel_height; + int kw = _kernel_width; + int pad_h = param.conv_param.pad_h; + int pad_w = param.conv_param.pad_w; + int stride_h = param.conv_param.stride_h; + int stride_w = param.conv_param.stride_w; + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType out_layout = outputs[0]->get_layout(); + + if ((kh == 1 && kw == 1) && (pad_h == 0 && pad_w == 0) && (stride_h == 1 && stride_w == 1) && + (input_layout == Layout_NCHW) && (out_layout == Layout_NCHW) && group == 1) { + _do_in_impl = true; + this->_impl = new SaberConv1X1; + 
this->_impl->init(inputs, outputs, param, ctx); + } else { + _do_in_impl = false; _inner_tensor.re_alloc(_inner_shape, AK_FLOAT); _inner_tensor_v.resize(2); _inner_tensor_v[0] = &_inner_tensor; _conv.init(inputs, _inner_tensor_v, param.conv_param, ctx); _eltwise.init(_inner_tensor_v, outputs, param.eltwise_param, ctx); } + //TODO:add some impl for eltwise + /* + else if (use_avx2 && input_layout == Layout_NCHW_C8R && out_layout == Layout_NCHW_C8R + && (oc == group && ic == group && oc % 8 == 0)) { + this->_impl = new JitUniDWConv; + } else if (use_avx512 && param.conv_param.group == inputs[0]->channel() + && param.conv_param.group == outputs[0]->channel()) { + this->_impl = new JitUniDWConv; + } else if (use_avx512 && param.conv_param.weight()->height() == 1 + && param.conv_param.weight()->width() == 1) { + this->_impl = new JitAvx512Conv1x1; + } else if (use_avx512 && outputs[0]->get_layout() == Layout_NCHW_C16) { + this->_impl = new JitAvx512Conv; + } else if (use_avx2 && param.conv_param.group == 1) { + this->_impl = new JitAvx2Conv; + } else if (input_layout == Layout_NCHW && out_layout == Layout_NCHW) { + this->_impl = new SaberIm2colConv; + } else { + LOG(FATAL) << "not support conv for in shape = " << inputs[0]->valid_shape() << ", out shape " + << outputs[0]->valid_shape() << ", group = " << group; + } + */ + + return create(inputs, outputs, param, ctx); } @@ -46,55 +136,47 @@ SaberStatus SaberConvEltwise::dispatch( const std::vector*>& inputs, std::vector*>& outputs, ConvEltwiseParam& param) { - - const float* bias_data; - if (param.conv_param.bias()->size() > 0) { - bias_data = (const float*)param.conv_param.bias()->data(); + + if (_do_in_impl){ + _impl->dispatch(inputs, outputs, param); } else { - bias_data = nullptr; - } - Shape shape_in = inputs[0]->valid_shape(); - Shape shape_out = outputs[0]->valid_shape(); - int num = inputs[0]->num(); - int chin = inputs[0]->channel(); - int win = inputs[0]->width(); - int hin = inputs[0]->height(); - int chout = outputs[0]->channel(); - int wout = outputs[0]->width(); - int hout = outputs[0]->height(); - int in_stride = chin * win * hin; - int out_stride = chout * wout * hout; - { - _conv.dispatch(inputs, _inner_tensor_v, param.conv_param); + _conv.dispatch(inputs, _inner_tensor_v, param.conv_param); _inner_tensor_v[1] = outputs[0]; - _eltwise.dispatch(_inner_tensor_v, outputs, param.eltwise_param); + _eltwise.dispatch(_inner_tensor_v, outputs, param.eltwise_param); } + return SaberSuccess; } template <> -SaberStatus SaberConvEltwise::trans_weights( - Tensor &target_weights, Tensor &target_bias, - int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int stride_w, int group) { - return SaberSuccess; +SaberStatus SaberConvEltwise::\ +create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + + return this->_impl->create(inputs, outputs, param, ctx); } + template <> -SaberStatus SaberConvEltwise::trans_weights( - Tensor &target_weights, Tensor &target_bias, - int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int stride_w, int group) { - return SaberSuccess; +SaberStatus SaberConvEltwise::\ +init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + + return this->_impl->init(inputs, outputs, param, ctx); } + template <> -SaberStatus SaberConvEltwise::trans_weights( - Tensor &target_weights, Tensor &target_bias, - int pad_h, int pad_w, int dilation_h, int dilation_w, - int stride_h, int 
stride_w, int group) { - return SaberSuccess; +SaberStatus SaberConvEltwise::\ +dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param) { + return this->_impl->dispatch(inputs, outputs, param); } template class SaberConvEltwise; +template class SaberConvEltwise; DEFINE_OP_TEMPLATE(SaberConvEltwise, ConvEltwiseParam, X86, AK_HALF); -DEFINE_OP_TEMPLATE(SaberConvEltwise, ConvEltwiseParam, X86, AK_INT8); + } } diff --git a/saber/funcs/impl/x86/saber_conv_eltwise.h b/saber/funcs/impl/x86/saber_conv_eltwise.h index c06d7ca12..09dbef72a 100644 --- a/saber/funcs/impl/x86/saber_conv_eltwise.h +++ b/saber/funcs/impl/x86/saber_conv_eltwise.h @@ -31,12 +31,15 @@ class SaberConvEltwise : public ImplBase< X86, OpDtype, ConvEltwiseParam > { public: typedef typename DataTrait::Dtype OpDataType; - typedef ImplBase > Impl_conv_t; - typedef ImplBase > Impl_eltwise_t; + typedef ImplBase > Impl_t; - SaberConvEltwise() {} + SaberConvEltwise() : _impl(nullptr) {} - ~SaberConvEltwise() {} + ~SaberConvEltwise() { + if (_impl != nullptr){ + delete _impl; + } + } /** * [Create description] Init all cudnn resource here @@ -67,11 +70,13 @@ class SaberConvEltwise : public ImplBase< bool _extern_trans{false}; SaberEltwise _eltwise; SaberConv2D _conv; + Impl_t* _impl; Shape _inner_shape; Tensor _inner_tensor; std::vector *> _inner_tensor_v; int _kernel_height{0}; int _kernel_width{0}; + bool _do_in_impl{false}; }; } diff --git a/saber/funcs/impl/x86/saber_conv_pooling.cpp b/saber/funcs/impl/x86/saber_conv_pooling.cpp index 1aeba0560..40a6b2a99 100644 --- a/saber/funcs/impl/x86/saber_conv_pooling.cpp +++ b/saber/funcs/impl/x86/saber_conv_pooling.cpp @@ -4,6 +4,7 @@ #include "saber/funcs/impl/x86/saber_conv.h" #include "saber/core/tensor_op.h" #include "saber/funcs/funcs_utils.h" +#include "saber/funcs/impl/x86/kernel/jit_conv_pooling_normal.h" namespace anakin { namespace saber { @@ -55,6 +56,36 @@ SaberStatus SaberConv2DPooling::dispatch( template class SaberConv2DPooling; DEFINE_OP_TEMPLATE(SaberConv2DPooling, ConvPoolingParam, X86, AK_HALF); -DEFINE_OP_TEMPLATE(SaberConv2DPooling, ConvPoolingParam, X86, AK_INT8); + +template <> +SaberStatus SaberConv2DPooling::\ +create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + SaberStatus ret = SaberUnImplError; + + return ret; +} + +template <> +SaberStatus SaberConv2DPooling::\ +init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param, Context& ctx) { + SaberStatus ret = SaberSuccess; + return ret; +} + +template <> +SaberStatus SaberConv2DPooling::\ +dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvPoolingParam& param) { + SaberStatus ret = SaberSuccess; + + return ret; +} + + } } diff --git a/saber/funcs/impl/x86/saber_conv_pooling.h b/saber/funcs/impl/x86/saber_conv_pooling.h index 8222ff09d..8ec303b0b 100644 --- a/saber/funcs/impl/x86/saber_conv_pooling.h +++ b/saber/funcs/impl/x86/saber_conv_pooling.h @@ -30,10 +30,16 @@ class SaberConv2DPooling : public ImplBase< X86, OpDtype, ConvPoolingParam > { public: typedef typename DataTrait::Dtype OpDataType; + typedef ImplBase > Impl_conv_pool_t; - SaberConv2DPooling() {} + SaberConv2DPooling():conv_pool_impl_(nullptr) {} - ~SaberConv2DPooling() {} + ~SaberConv2DPooling() { + if (conv_pool_impl_ != nullptr) { + delete conv_pool_impl_; + conv_pool_impl_ = nullptr; + } + } /** * [Create description] Init all cudnn resource here @@ -67,6 +73,7 @@ class SaberConv2DPooling : public ImplBase< 
Shape _inner_shape; Tensor _inner_tensor; std::vector *> _inner_tensor_v; + Impl_conv_pool_t* conv_pool_impl_; }; } diff --git a/saber/funcs/impl/x86/saber_cos_sim.cpp b/saber/funcs/impl/x86/saber_cos_sim.cpp new file mode 100644 index 000000000..c49bd01f9 --- /dev/null +++ b/saber/funcs/impl/x86/saber_cos_sim.cpp @@ -0,0 +1,82 @@ + +#include "saber/funcs/impl/x86/saber_cos_sim.h" +#include "mkl.h" +#if defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberCosSim::init( + const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberCosSim::create( + const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberCosSim::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam ¶m) { + CHECK_EQ(inputs.size(), 2) << "CosSim input num need be 2, but is" << inputs.size(); + CHECK_EQ(outputs.size(), 1) << "CosSim input num need be 1, but is" << outputs.size(); + size_t count_0 = inputs[0]->valid_size(); + size_t count_1 = inputs[1]->valid_size(); + CHECK_EQ(count_0, count_1) << "input0 and input1 valid size is not equal"; + + size_t num = inputs[0]->num(); + size_t inner_size = count_0 / inputs[0]->num(); + const OpDataType *input0_data = (const OpDataType*)inputs[0]->data(); + const OpDataType *input1_data = (const OpDataType*)inputs[1]->data(); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); +#if defined(__AVX2__) and defined(__FMA__) + avx2_cos_sim(input0_data, input1_data, num, inner_size, param.epsilon, output_data); +#else + for (size_t n = 0; n < num; n++) { + auto input0_square_sum = (OpDataType)0; + auto input1_square_sum = (OpDataType)0; + auto input01_prod_sum = (OpDataType)0; +#pragma omp parallel for schedule(static) reduction(+:input0_square_sum, input1_square_sum, input01_prod_sum) + for (size_t i = 0; i < inner_size; i++) { + input0_square_sum += input0_data[i] * input0_data[i]; + input1_square_sum += input1_data[i] * input1_data[i]; + input01_prod_sum += input0_data[i] * input1_data[i]; + } + float bc = input0_square_sum * input1_square_sum; + if (bc < param.epsilon) { + output_data[n] = 0; + } else { + output_data[n] = input01_prod_sum / sqrt(bc); + } + input0_data += inner_size; + input1_data += inner_size; + } +#endif + + for (size_t i = 0; i < outputs.size(); i++) { + outputs[i]->set_seq_offset(inputs[i]->get_seq_offset()); + } + return SaberSuccess; +} + +template class SaberCosSim; +DEFINE_OP_TEMPLATE(SaberCosSim, CosSimParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberCosSim, CosSimParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_cos_sim.h b/saber/funcs/impl/x86/saber_cos_sim.h new file mode 100644 index 000000000..66dbb6730 --- /dev/null +++ b/saber/funcs/impl/x86/saber_cos_sim.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_COS_SIM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_COS_SIM_H + +#include "saber/funcs/impl/impl_cos_sim.h" + +namespace anakin { +namespace saber { + +template +class SaberCosSim : + public ImplBase< + X86, OpDtype, + CosSimParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberCosSim() {} + + ~SaberCosSim() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam ¶m) override; + +private: + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_crf_decoding.cpp b/saber/funcs/impl/x86/saber_crf_decoding.cpp index 0cd3e3e3d..8892df450 100644 --- a/saber/funcs/impl/x86/saber_crf_decoding.cpp +++ b/saber/funcs/impl/x86/saber_crf_decoding.cpp @@ -1,12 +1,11 @@ - #include "saber/funcs/impl/x86/saber_crf_decoding.h" #include "saber/saber_funcs_param.h" #include "saber/funcs/impl/x86/x86_utils.h" +#include "saber/funcs/impl/x86/anakin_thread.h" #include #include #include #include -#include "omp.h" namespace anakin { namespace saber { @@ -34,7 +33,7 @@ SaberStatus SaberCrfDecoding::create( this->_ctx = &ctx; _track.reshape(inputs[0]->valid_shape()); -#ifdef __AVX2__ +#if defined(__AVX2__) and defined(__FMA__) int tag_num = inputs[0]->channel(); _aligned_tag_num = (tag_num % 8) ? 
(tag_num / 8 + 1) * 8 : tag_num; // get transposed transition weight @@ -65,7 +64,7 @@ SaberStatus SaberCrfDecoding::create( template void decoding(Dtype* path, const Dtype* emission, const Dtype* transition, Dtype* alpha_value, int* track_value, int aligned_tag_num, int seq_len, int tag_num) { -#ifdef __AVX2__ +#if defined(__AVX2__) and defined(__FMA__) const Dtype* x = emission; const Dtype* w = transition; const int state_trans_base_idx = 2; @@ -193,7 +192,7 @@ SaberStatus SaberCrfDecoding::dispatch( const OpDataType *transition_ptr = (const OpDataType*)param.transition_weight()->data(); int slice_size = inputs[0]->channel() * inputs[0]->height() * inputs[0]->width(); -#ifdef __AVX2__ +#if defined(__AVX2__) and defined(__FMA__) if (tag_num % 8) { transition_ptr = (OpDataType*)_trans.data(); @@ -213,12 +212,12 @@ SaberStatus SaberCrfDecoding::dispatch( #endif OpDataType *decoded_path = (OpDataType*) outputs[0]->mutable_data(); int seq_num = seq_offset[0].size() - 1; - int nthreads = omp_get_max_threads(); + int nthreads = anakin_get_max_threads(); if (nthreads > seq_num) { nthreads = seq_num; } - #pragma omp parallel for num_threads(nthreads) if(seq_num > 1) +//#pragma omp parallel for num_threads(nthreads) if(seq_num > 1) for (int i = 0; i < seq_num; ++i) { int seq_len = seq_offset[0][i+1] - seq_offset[0][i]; // LOG(INFO) << "slice_size: " << slice_size << ", seq_num: " << seq_num << ", seq_len: " << seq_len; diff --git a/saber/funcs/impl/x86/saber_crop.h b/saber/funcs/impl/x86/saber_crop.h index 89e657482..5673eeeba 100644 --- a/saber/funcs/impl/x86/saber_crop.h +++ b/saber/funcs/impl/x86/saber_crop.h @@ -48,31 +48,45 @@ class SaberCrop : Context &ctx) { this->_ctx = &ctx; this->_param = ¶m; - CHECK_EQ(param.shape.size(),4); + std::vector shape; + if (inputs.size() == 2) { + shape = inputs.at(1)->valid_shape(); + } else { + shape = param.shape; + } + CHECK_EQ(shape.size(),4); + + // offset values may be omitted in the original model + // Caffe uses 0s as default values + auto offset_size = param.offset.size(); + if (offset_size == 0) { + param.offset.resize(4 - param.axis, 0); + } + if (param.axis == 1) { CHECK_EQ(param.offset.size(), 3); _c_off = param.offset[0]; _h_off = param.offset[1]; _w_off = param.offset[2]; - _c_end = param.shape[1]+_c_off; - _h_end = param.shape[2]+_h_off; - _w_end = param.shape[3]+_w_off; + _c_end = shape[1]+_c_off; + _h_end = shape[2]+_h_off; + _w_end = shape[3]+_w_off; } else if (param.axis == 2) { CHECK_EQ(param.offset.size(), 2); _c_off = 0; _h_off = param.offset[0]; _w_off = param.offset[1]; - _c_end = param.shape[1]; - _h_end = param.shape[2]+_h_off; - _w_end = param.shape[3]+_w_off; + _c_end = shape[1]; + _h_end = shape[2]+_h_off; + _w_end = shape[3]+_w_off; } else if (param.axis == 3) { CHECK_EQ(param.offset.size(), 1); _c_off = 0; _h_off = 0; _w_off = param.offset[0]; - _c_end = param.shape[1]; - _h_end = param.shape[2]; - _w_end = param.shape[3]+_w_off; + _c_end = shape[1]; + _h_end = shape[2]; + _w_end = shape[3]+_w_off; } else { return SaberInvalidValue; } diff --git a/saber/funcs/impl/x86/saber_deconv.cpp b/saber/funcs/impl/x86/saber_deconv.cpp index 769882b05..ea0f5f67b 100644 --- a/saber/funcs/impl/x86/saber_deconv.cpp +++ b/saber/funcs/impl/x86/saber_deconv.cpp @@ -1,15 +1,18 @@ #include "saber/funcs/impl/x86/saber_deconv.h" #include "saber/funcs/impl/x86/saber_col2im_deconv.h" - +#include "saber/funcs/impl/x86/kernel/jit_avx2_deconv.h" +#ifndef USE_SGX +#include "saber/funcs/impl/x86/vender_deconv.h" +#endif namespace anakin { namespace saber 
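// Note: the SaberCrop change above takes the crop shape from a second input
// when one is provided and fills omitted offsets with zeros (Caffe's default).
// Below is a small standalone sketch of that axis-to-offset mapping under the
// same NCHW convention; the CropWindow struct and make_crop_window name are
// illustrative, not part of the patch.
#include <vector>

struct CropWindow {
    int c_off, h_off, w_off;   // start of the crop in C/H/W
    int c_end, h_end, w_end;   // one past the end, i.e. shape[k] + offset
};

// shape: target NCHW shape (4 values); axis in {1, 2, 3}; offset may be empty,
// in which case 4 - axis zeros are assumed. Returns false for other axes,
// mirroring the SaberInvalidValue path above.
static bool make_crop_window(const std::vector<int>& shape, int axis,
                             std::vector<int> offset, CropWindow& win) {
    if (offset.empty()) {
        offset.assign(4 - axis, 0);
    }
    if (axis == 1) {
        win = {offset[0], offset[1], offset[2],
               shape[1] + offset[0], shape[2] + offset[1], shape[3] + offset[2]};
    } else if (axis == 2) {
        win = {0, offset[0], offset[1],
               shape[1], shape[2] + offset[0], shape[3] + offset[1]};
    } else if (axis == 3) {
        win = {0, 0, offset[0], shape[1], shape[2], shape[3] + offset[0]};
    } else {
        return false;
    }
    return true;
}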
{ template <> SaberStatus SaberDeconv2D::create( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context &ctx) { + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { _impl->create(inputs, outputs, param, ctx); return SaberSuccess; @@ -17,36 +20,43 @@ SaberStatus SaberDeconv2D::create( template <> SaberStatus SaberDeconv2D::init( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param, Context& ctx) { + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { this->_ctx = &ctx; - _impl = new SaberCol2ImDeconv; - _impl->init(inputs, outputs, param, ctx); - return create(inputs, outputs, param, ctx); + + if (inputs[0]->get_layout() == Layout_NCHW_C8R) { + _impl = new JitAvx2Deconv; + } else if (inputs[0]->get_layout() == Layout_NCHW && outputs[0]->get_layout() == Layout_NCHW) { + _impl = new SaberCol2ImDeconv; + } else { + LOG(FATAL) << "not support this layout"; + } + + return _impl->init(inputs, outputs, param, ctx); } template <> SaberStatus SaberDeconv2D::dispatch( - const std::vector *>& inputs, - std::vector *>& outputs, - ConvParam& param) { + const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { return _impl->dispatch(inputs, outputs, param); } template <> -SaberStatus SaberDeconv2D::trans_weights(Tensor &target_weights, - Tensor &target_bias, int in_channel, int out_channel, +SaberStatus SaberDeconv2D::trans_weights(Tensor& target_weights, + Tensor& target_bias, int in_channel, int out_channel, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group) { return SaberUnImplError; } template <> -SaberStatus SaberDeconv2D::trans_weights(Tensor &target_weights, - Tensor &target_bias, int in_channel, int out_channel, +SaberStatus SaberDeconv2D::trans_weights(Tensor& target_weights, + Tensor& target_bias, int in_channel, int out_channel, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group) { @@ -54,8 +64,8 @@ SaberStatus SaberDeconv2D::trans_weights(Tensor &target_weigh } template <> -SaberStatus SaberDeconv2D::trans_weights(Tensor &target_weights, - Tensor &target_bias, int in_channel, int out_channel, +SaberStatus SaberDeconv2D::trans_weights(Tensor& target_weights, + Tensor& target_bias, int in_channel, int out_channel, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group) { return SaberUnImplError; @@ -66,4 +76,4 @@ DEFINE_OP_TEMPLATE(SaberDeconv2D, ConvParam, X86, AK_HALF); DEFINE_OP_TEMPLATE(SaberDeconv2D, ConvParam, X86, AK_INT8); } -} \ No newline at end of file +} diff --git a/saber/funcs/impl/x86/saber_deconv.h b/saber/funcs/impl/x86/saber_deconv.h index 6b0148814..a680fc419 100644 --- a/saber/funcs/impl/x86/saber_deconv.h +++ b/saber/funcs/impl/x86/saber_deconv.h @@ -16,6 +16,7 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_DECONV_H #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_DECONV_H +#include "anakin_config.h" #include "saber/funcs/impl/impl_deconv.h" namespace anakin { diff --git a/saber/funcs/impl/x86/saber_detection_output.cpp b/saber/funcs/impl/x86/saber_detection_output.cpp index a84009577..cac0040a4 100644 --- a/saber/funcs/impl/x86/saber_detection_output.cpp +++ b/saber/funcs/impl/x86/saber_detection_output.cpp @@ -23,43 +23,92 @@ SaberStatus SaberDetectionOutput::dispatch(const std::vector* t_loc = inputs[0]; Tensor* t_conf = inputs[1]; - Tensor* t_prior = inputs[2]; - - const dtype* loc_data = 
static_cast(t_loc->data()); - const dtype* prior_data = static_cast(t_prior->data()); - const int num = t_loc->num(); - - // Decode predictions. - dtype* bbox_data = static_cast(_bbox_preds.mutable_data()); - const int loc_count = _bbox_preds.valid_size(); - decode_bboxes(loc_count, loc_data, prior_data, param.type, \ - param.variance_encode_in_target, _num_priors, param.share_location, \ - _num_loc_classes, param.background_id, bbox_data); - // Retrieve all decoded location predictions. - if (!param.share_location) { - dtype * bbox_permute_data = static_cast(_bbox_permute.mutable_data()); - permute_data(loc_count, bbox_data, _num_loc_classes, _num_priors, - 4, bbox_permute_data); - } - // Retrieve all confidences. - dtype* conf_permute_data = static_cast(_conf_permute.mutable_data()); - permute_data(t_conf->valid_size(), static_cast(t_conf->data()), \ - this->_num_classes, _num_priors, 1, conf_permute_data); + Tensor* t_prior; + std::vector priors; + CHECK_EQ(t_loc->get_dtype(), AK_FLOAT) << "input data type must be float"; + CHECK_EQ(t_conf->get_dtype(), AK_FLOAT) << "input data type must be float"; + + const float* bbox_data_cpu = nullptr; + const float* conf_data_cpu = nullptr; + + if (_shared_loc) { + //! for one stage + const int num = t_loc->num(); + for (int i = 0; i < num; ++i) { + priors.push_back(_num_priors / num); + } + + bool is_ssd = inputs.size() > 2; + + if (is_ssd) { + t_prior = inputs[2]; + int num_priors = _num_priors / num; - memcpy(_bbox_cpu_data, static_cast(_bbox_preds.data()), \ - _bbox_preds.valid_size() * sizeof(dtype)); - memcpy(_conf_cpu_data, static_cast(_conf_permute.data()), \ - _conf_permute.valid_size() * sizeof(dtype)); + const float* loc_data = static_cast(t_loc->data()); + const float* prior_data = static_cast(t_prior->data()); - std::vector result; + // Decode predictions. + float* bbox_data = static_cast(_bbox_preds.mutable_data()); + const int loc_count = _bbox_preds.valid_size(); + decode_bboxes(loc_count, loc_data, prior_data, param.type, \ + param.variance_encode_in_target, num_priors, param.share_location, \ + _num_loc_classes, param.background_id, bbox_data); + // Retrieve all decoded location predictions. + if (!param.share_location) { + float* bbox_permute_data = static_cast(_bbox_permute.mutable_data()); + permute_data(loc_count, bbox_data, _num_loc_classes, num_priors, + 4, bbox_permute_data); + } + // Retrieve all confidences. + float* conf_permute_data = static_cast(_conf_permute.mutable_data()); + permute_data(t_conf->valid_size(), static_cast(t_conf->data()), \ + this->_num_classes, num_priors, 1, conf_permute_data); + + bbox_data_cpu = bbox_data; + conf_data_cpu = conf_permute_data; + } else { //! multiclass_nms + bbox_data_cpu = static_cast(t_loc->data()); + conf_data_cpu = static_cast(t_conf->data()); + } + } else { + //! for two stage + //! sizeof seq offset is N + 1 + auto conf_permute = static_cast(_conf_permute.mutable_data()); + auto bbox_permute = static_cast(_bbox_permute.mutable_data()); + auto conf_ori = static_cast(t_conf->data()); + auto bbox_ori = static_cast(t_loc->data()); + //! for two stage + //! 
sizeof seq offset is N + 1 + auto offset = t_loc->get_seq_offset()[0]; + for (int i = 0; i < offset.size() - 1; ++i) { + int num_priors = offset[i + 1] - offset[i]; + priors.push_back(num_priors); + const float* conf_ori_batch = conf_ori + this->_num_classes * offset[i]; + const float* bbox_ori_batch = bbox_ori + this->_num_classes * 4 * offset[i]; + float* conf_permute_batch = conf_permute + this->_num_classes * offset[i]; + float* bbox_permute_batch = bbox_permute + this->_num_classes * 4 * offset[i]; + //! permute conf and bbox + //! input bbox layout is [M, C, 4], multi-batch view: [{priors0, C, 4}, {priors1, C, 4}, ...] + //! permute bbox data to [{C, priors0, 4}, {C, priors1, 4}, ...] + //! input conf layout is [M, C], multi-batch view: [{priors0, C}, {priors1, C}, ...] + //! permute conf data to [{C, priors0}, {C, priors1}, ...] + permute_data(num_priors * this->_num_classes, conf_ori_batch, + this->_num_classes, num_priors, 1, conf_permute_batch); + permute_data(num_priors * this->_num_classes * 4, bbox_ori_batch, + this->_num_classes, num_priors, 4, bbox_permute_batch); + } + bbox_data_cpu = bbox_permute; + conf_data_cpu = conf_permute; + } - nms_detect(_bbox_cpu_data, _conf_cpu_data, result, num, this->_num_classes, _num_priors, param.background_id, \ - param.keep_top_k, param.nms_top_k, param.conf_thresh, param.nms_thresh, param.nms_eta, param.share_location); + std::vector result; + nms_detect(bbox_data_cpu, conf_data_cpu, result, priors, this->_num_classes, param.background_id, \ + param.keep_top_k, param.nms_top_k, param.conf_thresh, param.nms_thresh, param.nms_eta, _shared_loc); if (result.size() == 0) { result.resize(7); for (int i = 0; i < 7; ++i) { - result[i] = (dtype)-1; + result[i] = (float)-1; } outputs[0]->reshape(Shape({1, 1, 1, 7})); } else { @@ -67,7 +116,7 @@ SaberStatus SaberDetectionOutput::dispatch(const std::vectormutable_data(), result.data(), \ - result.size() * sizeof(dtype)); + result.size() * sizeof(float)); return SaberSuccess; } diff --git a/saber/funcs/impl/x86/saber_detection_output.h b/saber/funcs/impl/x86/saber_detection_output.h index 01eebc68b..615d22d86 100644 --- a/saber/funcs/impl/x86/saber_detection_output.h +++ b/saber/funcs/impl/x86/saber_detection_output.h @@ -31,17 +31,9 @@ class SaberDetectionOutput : \ DetectionOutputParam > { public: - typedef typename DataTrait::Dtype dtype; SaberDetectionOutput() = default; - ~SaberDetectionOutput() { - if (_bbox_cpu_data) { - fast_free(_bbox_cpu_data); - } - if (_conf_cpu_data) { - fast_free(_conf_cpu_data); - } - } + ~SaberDetectionOutput() {} virtual SaberStatus init(const std::vector *>& inputs, std::vector *>& outputs, @@ -55,44 +47,56 @@ class SaberDetectionOutput : \ std::vector *>& outputs, DetectionOutputParam& param, Context &ctx) { - //! inputs[0]: location map, dims = 4 {N, boxes * 4, 1, 1} - //! inputs[1]: confidence map, dims = 4 {N, classes * boxes, 1, 1} - //! inputs[2]: prior boxes, dims = 4 {1, 1, 2, boxes * 4(xmin, ymin, xmax, ymax)} + _shared_loc = param.share_location; Shape sh_loc = inputs[0]->valid_shape(); Shape sh_conf = inputs[1]->valid_shape(); - Shape sh_box = inputs[2]->valid_shape(); - //! shape {1, 1, 2, boxes * 4(xmin, ymin, xmax, ymax)}, boxes = size / 2 / 4 - //! 
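// Note: permute_data above reorders each batch's predictions from
// (priors, classes, inner) to (classes, priors, inner) so that per-class NMS
// can scan contiguous memory. A minimal sketch of that transpose, assuming the
// source/destination layouts described in the comments above; permute_boxes is
// an illustrative name, not the Saber helper itself.
static void permute_boxes(const float* src, float* dst,
                          int num_classes, int num_priors, int inner_dim) {
    // src: [num_priors][num_classes][inner_dim]
    // dst: [num_classes][num_priors][inner_dim]
    for (int p = 0; p < num_priors; ++p) {
        for (int c = 0; c < num_classes; ++c) {
            for (int k = 0; k < inner_dim; ++k) {
                dst[(c * num_priors + p) * inner_dim + k] =
                    src[(p * num_classes + c) * inner_dim + k];
            }
        }
    }
}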
layout must be 4 dims, the priors is in the last dim - _num_priors = sh_box[2] / 4; - int num = inputs[0]->num(); - if (param.class_num == 0) { - _num_classes = inputs[1]->valid_size() / (num * _num_priors); - } else { - _num_classes = param.class_num; - } - if (param.share_location) { + Shape sh_box; + + //fixme, only support{xmin, ymin, xmax, ymax} style box + if (_shared_loc) { + //! for one stage detector + //! inputs[0]: location map, {N, boxes * 4} + //! inputs[1]: confidence map, ssd: {N, classes, boxes}, yolov3: {N, boxes, classes} + //! optional, ssd has 3 inputs, the last inputs is priorbox + //! inputs[2]: prior boxes, dims = 4 {1, 2, boxes * 4(xmin, ymin, xmax, ymax)} + CHECK_GE(inputs.size(), 2) << "detection_output op must has 2 inputs at least"; + bool is_ssd = inputs.size() > 2; + if (is_ssd) { + sh_box = inputs[2]->valid_shape(); + } + //! boxes = sh_loc / 4 + _num_priors = sh_loc.count() / 4; + if (param.class_num <= 0) { + _num_classes = sh_conf.count() / _num_priors; + } else { + _num_classes = param.class_num; + } _num_loc_classes = 1; + if (is_ssd) { + _bbox_preds.reshape(sh_loc); + _conf_permute.reshape(sh_conf); + } + } else { + //! for two stage detector + //! inputs[0]: tensor with offset, location, {M, C, 4} + //! inputs[1]: tensor with offset, confidence, {M, C} + CHECK_EQ(sh_loc[0], sh_conf[0]) << "boxes number must be the same"; + _num_priors = sh_loc[0]; + if (param.class_num <= 0) { + _num_classes = sh_conf.count() / _num_priors; + } else { + _num_classes = param.class_num; + } _num_loc_classes = _num_classes; _bbox_permute.reshape(sh_loc); + _conf_permute.reshape(sh_conf); } - _bbox_preds.reshape(sh_loc); - _conf_permute.reshape(sh_conf); - - CHECK_EQ(_num_priors * _num_loc_classes * 4, sh_loc[1]) << \ - "Number of priors must match number of location predictions."; - CHECK_EQ(_num_priors * _num_classes, sh_conf[1]) << \ - "Number of priors must match number of confidence predictions."; - - if (_conf_cpu_data != nullptr) { - fast_free(_conf_cpu_data); - } - if (_bbox_cpu_data != nullptr) { - fast_free(_bbox_cpu_data); - } - _conf_cpu_data = (dtype*)fast_malloc(sizeof(dtype) * sh_conf.count()); - _bbox_cpu_data = (dtype*)fast_malloc(sizeof(dtype) * sh_loc.count()); + CHECK_EQ(_num_priors * _num_loc_classes * 4, sh_loc.count()) << \ + "Number of boxes must match number of location predictions."; + CHECK_EQ(_num_priors * _num_classes, sh_conf.count()) << \ + "Number of boxes must match number of confidence predictions."; return SaberSuccess; } @@ -103,16 +107,15 @@ class SaberDetectionOutput : \ private: + bool _shared_loc{true}; int _num_classes; int _num_loc_classes; int _num_priors; Tensor _bbox_preds; Tensor _bbox_permute; Tensor _conf_permute; - dtype* _bbox_cpu_data{nullptr}; - dtype* _conf_cpu_data{nullptr}; }; -template class SaberDetectionOutput; + } //namespace saber } //namespace anakin diff --git a/saber/funcs/impl/x86/saber_eltwise.cpp b/saber/funcs/impl/x86/saber_eltwise.cpp index 55a296bc0..97644fb50 100644 --- a/saber/funcs/impl/x86/saber_eltwise.cpp +++ b/saber/funcs/impl/x86/saber_eltwise.cpp @@ -35,11 +35,11 @@ SaberStatus SaberEltwise::create( return SaberSuccess; } + template -template void SaberEltwise::simple_sum(const std::vector& inputs, std::vector& outputs, - EltwiseParam& param) { + EltwiseParam& param, bool with_relu) { const int input_num = inputs.size(); const size_t inner_size = inputs[0]->valid_size(); OpDataType* target = (OpDataType*) outputs[0]->mutable_data(); @@ -50,9 +50,10 @@ void SaberEltwise::simple_sum(const 
std::vector& i } const OpDataType* coeff = static_cast(param.coeff.data()); - //TODO:can be SIMD to improve cache efficient +#pragma omp parallel for schedule(static) for (int inner_id = 0; inner_id < inner_size; ++inner_id) { + OpDataType tmp = coeff[0] * in_ptrs[0][inner_id]; for (int input_id = 1; input_id < input_num; ++input_id) { @@ -68,10 +69,9 @@ void SaberEltwise::simple_sum(const std::vector& i } } template -template void SaberEltwise::simple_prod(const std::vector& inputs, std::vector& outputs, - EltwiseParam& param) { + EltwiseParam& param, bool with_relu) { const int input_num = inputs.size(); const size_t inner_size = inputs[0]->valid_size(); OpDataType* target = (OpDataType*) outputs[0]->mutable_data(); @@ -80,7 +80,7 @@ void SaberEltwise::simple_prod(const std::vector& for (int i = 0; i < input_num; ++i) { in_ptrs[i] = (OpDataType*) inputs[i]->data(); } - +#pragma omp parallel for schedule(static) for (int inner_id = 0; inner_id < inner_size; ++inner_id) { OpDataType tmp = in_ptrs[0][inner_id]; @@ -97,10 +97,9 @@ void SaberEltwise::simple_prod(const std::vector& } template -template void SaberEltwise::simple_max(const std::vector& inputs, std::vector& outputs, - EltwiseParam& param) { + EltwiseParam& param, bool with_relu) { const int input_num = inputs.size(); volatile const size_t inner_size = inputs[0]->valid_size(); OpDataType* target = (OpDataType*) outputs[0]->mutable_data(); @@ -109,7 +108,7 @@ void SaberEltwise::simple_max(const std::vector& i for (int i = 0; i < input_num; ++i) { in_ptrs[i] = (OpDataType*) inputs[i]->data(); } - +#pragma omp parallel for schedule(static) for (int inner_id = 0; inner_id < inner_size; ++inner_id) { OpDataType tmp = in_ptrs[0][inner_id]; @@ -125,6 +124,56 @@ void SaberEltwise::simple_max(const std::vector& i } } +template +void SaberEltwise::simple_div(const std::vector& inputs, + std::vector& outputs, + EltwiseParam& param, bool with_relu) { + const int input_num = inputs.size(); + volatile const size_t inner_size = inputs[0]->valid_size(); + OpDataType* target = (OpDataType*) outputs[0]->mutable_data(); + std::vector in_ptrs(input_num); + + for (int i = 0; i < input_num; ++i) { + in_ptrs[i] = (OpDataType*) inputs[i]->data(); + } + if (inputs[1]->valid_size() == inputs[0]->valid_size()) { +#pragma omp parallel for schedule(static) + for (int inner_id = 0; inner_id < inner_size; ++inner_id) { + OpDataType tmp = in_ptrs[0][inner_id]; + + for (int input_id = 1; input_id < input_num; ++input_id) { + tmp /= in_ptrs[input_id][inner_id]; + } + + if (with_relu) { + target[inner_id] = tmp > 0 ? tmp : 0; + } else { + target[inner_id] = tmp; + } + } + } else { + CHECK_EQ(inputs.size(), 2) << "elt with axis not support fusion"; + int outer_num = inputs[0]->count(0, param.axis); + int mid_num = outputs[0]->valid_size(); + int inner_num = inputs[0]->count(param.axis, inputs[0]->dims()) / mid_num; + for (int outer_id = 0; outer_id < outer_num; ++outer_id) { +#pragma omp parallel for schedule(static) + for (int mid_id = 0; mid_id < mid_num; mid_id++) { + OpDataType div_data = in_ptrs[1][mid_id]; + for (int inner_id = 0; inner_id < inner_num; inner_id++) { + int index = (outer_id * mid_num + mid_id) * inner_num + inner_id; + OpDataType tmp = in_ptrs[0][index] / div_data; + if (with_relu) { + target[index] = tmp > 0 ? 
tmp : 0; + } else { + target[index] = tmp; + } + } + } + + } + } +} template SaberStatus SaberEltwise::dispatch( @@ -132,33 +181,21 @@ SaberStatus SaberEltwise::dispatch( std::vector& outputs, EltwiseParam& param) { CHECK_EQ(outputs.size(), (size_t)1); - switch (param.operation) { case Eltwise_sum: - if (_with_relu) { - simple_sum(inputs, outputs, param); - } else { - simple_sum(inputs, outputs, param); - } - + simple_sum(inputs, outputs, param, _with_relu); break; case Eltwise_prod: - if (_with_relu) { - simple_prod(inputs, outputs, param); - } else { - simple_prod(inputs, outputs, param); - } - + simple_prod(inputs, outputs, param, _with_relu); break; case Eltwise_max: - if (_with_relu) { - simple_max(inputs, outputs, param); - } else { - simple_max(inputs, outputs, param); - } + simple_max(inputs, outputs, param, _with_relu); + break; + case Eltwise_div: + simple_div(inputs, outputs, param, _with_relu); break; default: diff --git a/saber/funcs/impl/x86/saber_eltwise.h b/saber/funcs/impl/x86/saber_eltwise.h index 735c5a4ca..9fba686c9 100644 --- a/saber/funcs/impl/x86/saber_eltwise.h +++ b/saber/funcs/impl/x86/saber_eltwise.h @@ -50,18 +50,18 @@ class SaberEltwise : public ImplBase< std::vector& outputs, EltwiseParam ¶m) override; private: - template void simple_sum(const std::vector& inputs, std::vector& outputs, - EltwiseParam ¶m); - template + EltwiseParam ¶m, bool with_relu); void simple_prod(const std::vector& inputs, std::vector& outputs, - EltwiseParam ¶m); - template + EltwiseParam ¶m, bool with_relu); void simple_max(const std::vector& inputs, std::vector& outputs, - EltwiseParam ¶m); + EltwiseParam ¶m, bool with_relu); + void simple_div(const std::vector& inputs, + std::vector& outputs, + EltwiseParam ¶m, bool with_relu); bool _with_relu; bool _other_activation; diff --git a/saber/funcs/impl/x86/saber_embedding.cpp b/saber/funcs/impl/x86/saber_embedding.cpp index a0bf18b00..e3ca79779 100644 --- a/saber/funcs/impl/x86/saber_embedding.cpp +++ b/saber/funcs/impl/x86/saber_embedding.cpp @@ -84,6 +84,7 @@ SaberStatus SaberEmbedding::dispatch( } } } + return SaberSuccess; } diff --git a/saber/funcs/impl/x86/saber_fake_quantize_abs_max.cpp b/saber/funcs/impl/x86/saber_fake_quantize_abs_max.cpp deleted file mode 100644 index 4653816a2..000000000 --- a/saber/funcs/impl/x86/saber_fake_quantize_abs_max.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "saber/funcs/impl/x86/saber_fake_quantize_abs_max.h" -#include "saber/funcs/impl/x86/x86_utils.h" -#include -namespace anakin { -namespace saber { - -/** - * @brief formula: - * scale = max(abs(X)) - * range = 2^{bit_length - 1} - 1 - * Out = round(X/scale * range) - * - * - */ -template -SaberStatus SaberFakeQuantizeAbsMax::dispatch(\ - const std::vector *>& inputs, \ - std::vector *>& outputs, \ - FakeQuantizeAbsMaxParam& param) { - const OpDataType* src = (const OpDataType*)inputs[0]->data(); - auto dst = outputs[0]->mutable_data(); - int valid_size = inputs[0]->valid_size(); - auto max_data = 0.f; - for (int i = 0; i < valid_size; i++) { - auto abs_data = src[i] > 0.f ? src[i] : -src[i]; - max_data = abs_data > max_data ? 
abs_data : max_data; - } - auto range = (1 << (param.bit_length - 1)) - 1; - auto scale = 1.f / max_data * range; - if (param.bit_length == 8) { - char* dst_tmp = (char*)dst; - for (int i = 0; i < valid_size; i++) { - dst_tmp[i] = round(src[i] * scale); - //LOG(INFO) << i << " " << int(dst_tmp[i]); - } - } else if (param.bit_length == 16) { - int16_t* dst_tmp = (int16_t*)dst; - for (int i = 0; i < valid_size; i++) { - dst_tmp[i] = round(src[i] * scale); - } - } else { - LOG(FATAL) <<"other bit length has not been supported"; - } - - return SaberSuccess; -} - -template class SaberFakeQuantizeAbsMax; -DEFINE_OP_TEMPLATE(SaberFakeQuantizeAbsMax, FakeQuantizeAbsMaxParam, X86, AK_INT16); -DEFINE_OP_TEMPLATE(SaberFakeQuantizeAbsMax, FakeQuantizeAbsMaxParam, X86, AK_INT8); -} -} diff --git a/saber/funcs/impl/x86/saber_generate_proposals.cpp b/saber/funcs/impl/x86/saber_generate_proposals.cpp new file mode 100644 index 000000000..fbd7441cc --- /dev/null +++ b/saber/funcs/impl/x86/saber_generate_proposals.cpp @@ -0,0 +1,447 @@ + +#include "saber/funcs/impl/x86/saber_generate_proposals.h" +#include +#include "saber/funcs/debug.h" + +namespace anakin{ +namespace saber { +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +template +SaberStatus SaberGenerateProposals::init( + const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberGenerateProposals::create( + const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} +/*NCHW->NHWC*/ +template +static inline void trans(Tensor* out, Tensor* in) { + auto shape = in->valid_shape(); + out->reshape(Shape({shape[0], shape[2], shape[3], shape[1]}, Layout_NCHW)); + auto stride = in->get_stride(); + auto dst = (Dtype*) out->mutable_data(); + auto src = (const Dtype*) in->data(); + for (auto i = 0; i < shape.count(); i++) { + int n = i / stride[0]; + int c = (i / stride[1]) % shape[1]; + int hw = i % (stride[1]); + int out_id = n * stride[0] + hw*shape[1] + c; + dst[out_id] = src[i]; + } +} + + +template +static inline void box_coder(Tensor* proposals, + const Tensor* anchors, + const Tensor* bbox_deltas, + const Tensor* variances, + std::vector& index + ) { + proposals->reshape(Shape({index.size(), 4, 1, 1}, Layout_NCHW)); + int anchor_nums = index.size(); + int len = anchors->shape()[3]; + CHECK_EQ(len, 4) << "anchor length is 4"; + auto anchor_data = (const Dtype*) anchors->data(); + auto bbox_deltas_data = (const Dtype*) bbox_deltas->data(); + auto proposals_data = (Dtype*) proposals->data(); + const Dtype *variances_data = nullptr; + if (variances) { + variances_data = (const Dtype*)variances->data(); + } + for (int i = 0; i < index.size(); i++) { + int offset = index[i] * len; + auto anchor_data_tmp = anchor_data + offset; + auto variances_data_tmp = variances_data + offset; + auto bbox_deltas_data_tmp = bbox_deltas_data + offset; + auto proposals_data_tmp = proposals_data + i * len; + auto anchor_width = anchor_data_tmp[2] - anchor_data_tmp[0] + 1.0; + auto anchor_height = anchor_data_tmp[3] - anchor_data_tmp[1] + 1.0; + auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; + auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; + Dtype bbox_center_x = 0, bbox_center_y = 0; + Dtype bbox_width = 0, bbox_height = 0; + if (variances) { + bbox_center_x = + variances_data_tmp[0] * 
bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data_tmp[1] * + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(variances_data_tmp[ 2] * + bbox_deltas_data_tmp[2], + kBBoxClipDefault)) * anchor_width; + bbox_height = std::exp(std::min(variances_data_tmp[3] * + bbox_deltas_data_tmp[3], + kBBoxClipDefault)) * anchor_height; + } else { + bbox_center_x = + bbox_deltas_data_tmp[0] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data_tmp[2], + kBBoxClipDefault)) * anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data_tmp[3], + kBBoxClipDefault)) * anchor_height; + } + proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; + proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; + proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - 1; + } +} + +template +static inline void clip_tiled_boxes(Tensor *boxes, const Tensor *im_info) { + Dtype *boxes_data = (Dtype*)boxes->mutable_data(); + auto im_info_data = (const Dtype*)im_info->data(); + Dtype zero(0); + for (int64_t i = 0; i < boxes->valid_size(); i += 4) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); //left + boxes_data[i+1] = + std::max(std::min(boxes_data[i+1], im_info_data[0] - 1), zero); //top + boxes_data[i+2] = + std::max(std::min(boxes_data[i+2], im_info_data[1] - 1), zero); // right + boxes_data[i+3] = + std::max(std::min(boxes_data[i+3], im_info_data[0] - 1), zero);//bottom + } +} + +template +void filter_boxes(std::vector& keep, + const Tensor *boxes, + const float min_size, + const Tensor *im_info) { + const Dtype *im_info_data = (const Dtype*)im_info->data(); + const Dtype *boxes_data = (const Dtype*)boxes->data(); + Dtype im_scale = im_info_data[2]; + auto min_size_final = std::max(min_size, 1.0f); + keep.clear(); + + for (int i = 0; i < boxes->valid_size(); i += 4 ) { + Dtype left = boxes_data[i]; + Dtype right = boxes_data[i+2]; + Dtype top = boxes_data[i+1]; + Dtype bottom = boxes_data[i+3]; + Dtype ws = right - left + 1; + Dtype hs = bottom - top + 1; + Dtype ws_origin_scale = + (right - left) / im_scale + 1; + Dtype hs_origin_scale = + (bottom - top) / im_scale + 1; + Dtype x_ctr = left + ws / 2; + Dtype y_ctr = top + hs / 2; + if (ws_origin_scale >= min_size_final && hs_origin_scale >= min_size_final && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { + keep.push_back(i/4); + } + } +} + +template +static inline std::vector> get_sorted_score_index( + const std::vector &scores) { + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &a, const std::pair &b) { + return a.first < b.first; + }); + return sorted_indices; +} + +template +static inline Dtype BBoxArea(const Dtype *box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + return static_cast(0.); + } else { + const Dtype w = box[2] - box[0]; + const Dtype h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + return (w + 1) * (h + 1); + } + } +} + +template +static inline Dtype jaccard_overlap(const Dtype *box1, const Dtype *box2, bool normalized) { + if (box2[0] > 
box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const Dtype inter_xmin = std::max(box1[0], box2[0]); + const Dtype inter_ymin = std::max(box1[1], box2[1]); + const Dtype inter_xmax = std::min(box1[2], box2[2]); + const Dtype inter_ymax = std::min(box1[3], box2[3]); + const Dtype inter_w = std::max(Dtype(0), inter_xmax - inter_xmin + 1); + const Dtype inter_h = std::max(Dtype(0), inter_ymax - inter_ymin + 1); + const Dtype inter_area = inter_w * inter_h; + const Dtype bbox1_area = BBoxArea(box1, normalized); + const Dtype bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline void NMS(std::vector& selected_indices, + Tensor *bbox, + std::vector& indices, + Dtype nms_threshold, + float eta) { + int64_t num_boxes = bbox->num(); +// 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->channel(); + + int selected_num = 0; + Dtype adaptive_threshold = nms_threshold; + const Dtype *bbox_data = (const Dtype*)(bbox->data()); + selected_indices.clear(); + for (int i = 0; i (bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + ++selected_num; + } + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + + +} + +template +void gather(Tensor* out, + const Tensor* in, + std::vector& index, + const int inner_dim) { + Shape shape = in->valid_shape(); + int index_num = index.size(); + shape[0] = index_num; + out->reshape(shape); + auto in_data = (const Dtype*) in->data(); + auto out_data = (Dtype*)out->data(); + for (int i = 0; i < index_num; i++) { + memcpy(out_data + i * inner_dim, in_data + index[i] * inner_dim, sizeof(Dtype) * inner_dim); + } +} + +template +void get_score_sorted_index(const Tensor* scores, + int sort_num, + std::vector& sorted_score, + std::vector& score_index) { + auto scores_data = (const Dtype*)scores->data(); + std::vector> index; + for (int i = 0; i < scores->valid_size(); i++) { + index.emplace_back(std::make_pair(scores_data[i], i)); + } + std::partial_sort(index.begin(), index.begin() + sort_num, index.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first;}); + //std::nth_element(index.begin(), index.begin() + sort_num, index.end(), + // [](const std::pair &a, const std::pair &b) { return a.first > b.first;}); + + sorted_score.resize(sort_num); + score_index.resize(sort_num); + for (int i = 0; i < sort_num; i++) { + sorted_score[i] = index[i].first; + score_index[i] = index[i].second; + } +} + +template +void proposal_for_one_image( + Tensor &proposals_sel, + Tensor &scores_sel, + Tensor &proposals, + const Tensor &im_info_slice,//[1, 3] + const Tensor &anchors_slice,//[H, W, A, 4] + const Tensor &variances_slice, //[H, W, A, 4] + const Tensor &bbox_deltas_slice, // [1, H, W, A*4] + const Tensor &scores_slice, // [1, H, W, A] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + + int scores_num = scores_slice.valid_size(); + int index_num = 0; + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_num) { + index_num = scores_num; + } else { + index_num = pre_nms_top_n; + } + std::vector scores_sorted; + std::vector index; + get_score_sorted_index(&scores_slice, index_num, scores_sorted, index); + + box_coder(&proposals, &anchors_slice, &bbox_deltas_slice, &variances_slice, index); + + 
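// Note: jaccard_overlap and NMS above are a standard IoU test plus greedy
// suppression over score-sorted candidates, with the threshold scaled by eta
// after each kept box while it stays above 0.5. The following is a condensed
// standalone sketch of that selection over plain float arrays; iou and
// greedy_nms are illustrative names, not the Saber functions themselves.
#include <algorithm>
#include <vector>

// box = [xmin, ymin, xmax, ymax], pixel convention (+1), as in BBoxArea above.
static float iou(const float* a, const float* b) {
    const float iw = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]) + 1.f);
    const float ih = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]) + 1.f);
    const float inter = iw * ih;
    const float area_a = (a[2] - a[0] + 1.f) * (a[3] - a[1] + 1.f);
    const float area_b = (b[2] - b[0] + 1.f) * (b[3] - b[1] + 1.f);
    return inter / (area_a + area_b - inter);
}

// boxes: N x 4 array; order: indices already sorted by descending score.
static std::vector<int> greedy_nms(const float* boxes, const std::vector<int>& order,
                                   float nms_thresh, float eta) {
    std::vector<int> keep;
    float adaptive = nms_thresh;
    for (int idx : order) {
        bool selected = true;
        for (int kept : keep) {
            if (iou(boxes + 4 * idx, boxes + 4 * kept) > adaptive) {
                selected = false;
                break;
            }
        }
        if (selected) {
            keep.push_back(idx);
            if (eta < 1.f && adaptive > 0.5f) {
                adaptive *= eta;
            }
        }
    }
    return keep;
}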
clip_tiled_boxes(&proposals, &im_info_slice); + + std::vector keep; + filter_boxes(keep, &proposals, min_size, &im_info_slice); + + //std::vector> filter_sort_index; + //for (int i = 0; i < keep.size(); i++) { + // filter_sort_index.emplace_back(std::make_pair(scores_sorted[index[keep[i]]], keep[i])); + //} + //std::stable_sort(filter_sort_index.begin(), filter_sort_index.begin() + keep.size(), + // [](const std::pair &a, const std::pair &b) { return a.first > b.first;}); + + //for (int i = 0; i < keep.size(); i++) { + // keep[i] = filter_sort_index[i].second; + //} + + + if (nms_thresh <= 0) { + gather(&proposals_sel, &proposals, keep, 4); + std::vector scores_index; + for (int i = 0; i < keep.size(); i++) { + scores_index[i] = index[keep[i]]; + } + gather(&scores_sel, &scores_slice, scores_index, 1); + return; + } + + std::vector keep_nms; + NMS(keep_nms, &proposals, keep, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.size()) { + keep_nms.resize(post_nms_top_n); + } + + std::vector scores_index(keep_nms.size()); + for (int id = 0; id < keep_nms.size(); id++) { + scores_index[id] = index[keep_nms[id]]; + } + gather(&scores_sel, &scores_slice, scores_index, 1); + gather(&proposals_sel, &proposals, keep_nms, 4); +} + +template +void AppendProposals(Tensor *dst, + int64_t offset, + const int im_id, + const Tensor *src) { + auto *out_data = (Dtype*)dst->data(); + auto *in_data = (const Dtype*)src->data(); + out_data += offset; + for (int i = 0; i < src->valid_size()/4; i++) { + out_data[0] = im_id; + std::memcpy(out_data + 1, in_data, 4* sizeof(Dtype)); + out_data += 5; + in_data += 4; + } +} + +template +void AppendScores(Tensor *dst, + int64_t offset, + const Tensor *src) { + auto *out_data = (Dtype*)dst->data(); + auto *in_data = (const Dtype*)src->data(); + out_data += offset; + std::memcpy(out_data, in_data, src->valid_size() * sizeof(Dtype)); +} + +template +SaberStatus SaberGenerateProposals::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam ¶m) { + typedef typename DataTrait::Dtype OpDataType; + auto anchors = *inputs[0]; + auto bbox_deltas = *inputs[1]; + auto im_info = *inputs[2]; + auto scores = *inputs[3]; + auto variances = *inputs[4]; + auto rpn_rois = outputs[0]; + auto rpn_roi_probs = outputs[1]; + int pre_nms_top_n = param.pre_nms_top_n;; + int post_nms_top_n = param.post_nms_top_n; + float nms_thresh = param.nms_thresh;; + float min_size = param.min_size;; + float eta = param.eta; + auto scores_shape = scores.valid_shape(); + auto bbox_shape = bbox_deltas.valid_shape(); + rpn_rois->reshape(Shape({im_info.num() * post_nms_top_n, 5, 1, 1}, Layout_NCHW)); + rpn_roi_probs->reshape(Shape({im_info.num() * post_nms_top_n, 1, 1, 1}, Layout_NCHW)); + + trans(&_scores_swap, &scores); + trans(&_bbox_deltas_swap, &bbox_deltas); + + int num_proposals = 0; + int img_num = scores_shape[0]; + Shape im_info_slice_shape = im_info.valid_shape(); + Shape bbox_deltas_slice_shape = bbox_deltas.valid_shape(); + Shape scores_slice_shape({scores.valid_size() / img_num, 1, 1, 1}, Layout_NCHW); + im_info_slice_shape[0] = 1; + bbox_deltas_slice_shape[0] = 1; + std::vector proposals_offset; + proposals_offset.push_back(0); + for (int i = 0; i < img_num; i++) { + Tensor im_info_slice((void*)((OpDataType*)im_info.mutable_data() + i * im_info.get_stride()[0]), X86(), this->_ctx->get_device_id(), im_info_slice_shape); + Tensor bbox_deltas_slice((void*)((OpDataType*)_bbox_deltas_swap.mutable_data() + i * bbox_deltas.get_stride()[0]), 
X86(), this->_ctx->get_device_id(), bbox_deltas_slice_shape); + Tensor scores_slice((void*)((OpDataType*)_scores_swap.mutable_data() + i * scores.get_stride()[0]), X86(), this->_ctx->get_device_id(), scores_slice_shape); + + proposal_for_one_image(_proposals_sel, + _scores_sel, + _proposals, + im_info_slice, + anchors, + variances, + bbox_deltas_slice, // [M, 4] + scores_slice, // [N, 1] + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + eta); + AppendProposals(rpn_rois, 5 * num_proposals, i, &_proposals_sel); + AppendScores(rpn_roi_probs, num_proposals, &_scores_sel); + num_proposals += _scores_sel.valid_size();; + proposals_offset.push_back(num_proposals); + } + rpn_roi_probs->reshape(Shape({num_proposals, 1, 1, 1}, Layout_NCHW)); + rpn_rois->reshape(Shape({num_proposals, 5, 1, 1}, Layout_NCHW)); + std::vector> out_offset; + out_offset.push_back(proposals_offset); + for (size_t i = 0; i < outputs.size(); i++) { + outputs[i]->set_seq_offset(out_offset); + } + return SaberSuccess; +} + +template class SaberGenerateProposals; +DEFINE_OP_TEMPLATE(SaberGenerateProposals, GenerateProposalsParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberGenerateProposals, GenerateProposalsParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_generate_proposals.h b/saber/funcs/impl/x86/saber_generate_proposals.h new file mode 100644 index 000000000..a3b1d775e --- /dev/null +++ b/saber/funcs/impl/x86/saber_generate_proposals.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_GENERATE_PROPOSALS_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_GENERATE_PROPOSALS_H + +#include "saber/funcs/impl/impl_generate_proposals.h" + +namespace anakin { +namespace saber { + +template +class SaberGenerateProposals : + public ImplBase< + X86, OpDtype, + GenerateProposalsParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberGenerateProposals() {} + + ~SaberGenerateProposals() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam ¶m) override; + +private: + Tensor _bbox_deltas_swap; + Tensor _scores_swap; + Tensor _proposals; + Tensor _proposals_sel; + Tensor _scores_sel; + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_gru.cpp b/saber/funcs/impl/x86/saber_gru.cpp index 8c7793912..3ec63ee91 100644 --- a/saber/funcs/impl/x86/saber_gru.cpp +++ b/saber/funcs/impl/x86/saber_gru.cpp @@ -1,11 +1,7 @@ - - #include "saber/funcs/impl/x86/saber_gru.h" #include "saber/core/tensor_op.h" #include "mkl_cblas.h" #include "saber/funcs/impl/x86/saber_normal_activation.h" -#include -#include "sys/time.h" namespace anakin { diff --git a/saber/funcs/impl/x86/saber_gru.h b/saber/funcs/impl/x86/saber_gru.h index e5e650b83..408552946 100644 --- a/saber/funcs/impl/x86/saber_gru.h +++ b/saber/funcs/impl/x86/saber_gru.h @@ -4,13 +4,15 @@ #define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_GRU_H #include "saber/funcs/impl/impl_gru.h" #include "saber/funcs/impl/x86/x86_utils.h" -#include #if defined(__AVX512F__) +#include #define SABER_X86_TYPE __m512 #elif defined(__AVX2__) and defined(__FMA__) +#include #define SABER_X86_TYPE __m256 #elif defined(__SSE4_2__) and defined(__FMA__) +#include #define SABER_X86_TYPE __m128 #else #define SABER_X86_TYPE float diff --git a/saber/funcs/impl/x86/saber_im2col_conv.cpp b/saber/funcs/impl/x86/saber_im2col_conv.cpp index 5a88254a2..850f82e7a 100644 --- a/saber/funcs/impl/x86/saber_im2col_conv.cpp +++ b/saber/funcs/impl/x86/saber_im2col_conv.cpp @@ -1,4 +1,3 @@ - #include "saber/funcs/impl/x86/saber_im2col_conv.h" namespace anakin { @@ -53,6 +52,43 @@ void im2col_cpu(const Dtype* data_im, const int channels, } } +template +void im2col_cpu_par(const Dtype* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + Dtype* data_col) { + int dil_kernel_h = (kernel_h - 1) * dilation_h + 1; + int dil_kernel_w = (kernel_w - 1) * dilation_w + 1; + int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1; + int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; + +#pragma omp parallel for + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; + + const int hc0 = h_offset * dilation_h - pad_h; + const int wc0 = w_offset * dilation_w - pad_w; + for (int h = 0; h < height_col; ++h) { + int h_pad = h * stride_h + hc0; + + const int row_offset = (c * height_col + h) * width_col; + const int srow_offset = (c_im * height + h_pad) * 
width; + for (int w = 0; w < width_col; ++w) { + int w_pad = w * stride_w + wc0; + if ((((unsigned)h_pad) < ((unsigned)height)) && (((unsigned)w_pad) < ((unsigned)width))) + data_col[row_offset + w] = data_im[srow_offset + w_pad]; + else { + data_col[row_offset + w] = 0.; + } + } + } + } +} template <> SaberStatus SaberIm2colConv::create(const std::vector *>& inputs, std::vector*>& outputs, @@ -73,6 +109,7 @@ SaberStatus SaberIm2colConv::create(const std::vector *>& _im2col_tensor.reshape(_im2col_shape); int out_stride = out_h * out_w; +// LOG(INFO)<<"im2col m,n,k "<<(out_c / conv_param->group)<<","<<(out_stride)<<","<<(in_c / conv_param->group * kernel_h * kernel_w); _gemm.init(false, false, out_c / conv_param->group, out_stride, in_c / conv_param->group * kernel_h * kernel_w, *(this->_ctx)); @@ -117,11 +154,16 @@ SaberStatus SaberIm2colConv::dispatch(const std::vector *> for (int i = 0; i < batch_size; i++) { for (int j = 0; j < group; j++) { - im2col_cpu(din, in_c / group, in_h, in_w, kernel_h, kernel_w, conv_param->pad_h, conv_param->pad_w, + im2col_cpu_par(din, in_c / group, in_h, in_w, kernel_h, kernel_w, conv_param->pad_h, conv_param->pad_w, conv_param->stride_h, conv_param->stride_w, conv_param->dilation_h, conv_param->dilation_w, (float*)_im2col_tensor.mutable_data()); - _gemm.dispatch(1.f, 0.f, weights_d + j * weight_size_per_group, (const float*)_im2col_tensor.data(), + float add_out = 0.f; + if (param.eltwise_param.has_eltwise){ + add_out = 1.f; + } + + _gemm.dispatch(1.f, add_out, weights_d + j * weight_size_per_group, (const float*)_im2col_tensor.data(), dout); din += in_c / group * in_stride; diff --git a/saber/funcs/impl/x86/saber_lstm.cpp b/saber/funcs/impl/x86/saber_lstm.cpp index fd05de123..b5181c390 100644 --- a/saber/funcs/impl/x86/saber_lstm.cpp +++ b/saber/funcs/impl/x86/saber_lstm.cpp @@ -1,5 +1,4 @@ #include "saber/funcs/impl/x86/saber_lstm.h" -#include "sys/time.h" #include "saber/funcs/impl/x86/saber_normal_activation.h" #include "mkl_cblas.h" @@ -9,29 +8,19 @@ namespace anakin { namespace saber { -//inline -static void gemm(const bool TransA, const bool TransB, int m, int n, int k, const float alpha, - const float* a, const float* b, const float beta, float* c) { - // cout << "(" << m << "," << n << "," << k << ")" << endl; - int lda = (!TransA/* == CblasNoTrans*/) ? k : m; - int ldb = (!TransB/* == CblasNoTrans*/) ? n : k; - CBLAS_TRANSPOSE cuTransA = - (!TransA/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE cuTransB = - (!TransB/* == CblasNoTrans*/) ? 
CblasNoTrans : CblasTrans; - cblas_sgemm(CblasRowMajor, cuTransA, cuTransB, m, n, k, alpha, a, k, b, n, beta, c, n); -}; - -template -static inline void cal_first_lstm_nullhidden(int emit_word_id_start,int emit_word_id_end,OpDataType* temp_wx,const OpDataType* weight_peephole, - OpDataType* hout,OpDataType* inner_cell,const BIT* b_i, const BIT* b_f, const BIT* b_c, const BIT* b_o, - ActiveType gate_activity, ActiveType cell_activity, ActiveType candi_activity,int hidden_size){ +template +static inline void cal_first_lstm_nullhidden(int emit_word_id_start, int emit_word_id_end, + OpDataType* temp_wx, const OpDataType* weight_peephole, + OpDataType* hout, OpDataType* inner_cell, const BIT* b_i, const BIT* b_f, const BIT* b_c, + const BIT* b_o, + ActiveType gate_activity, ActiveType cell_activity, ActiveType candi_activity, int hidden_size) { const int i_offset = 0; const int c_offset = 2; const int o_offset = 3; BIT(*gate_act)(const BIT) = Activate_inner(gate_activity); BIT(*cell_act)(const BIT) = Activate_inner(cell_activity); BIT(*candi_act)(const BIT) = Activate_inner(candi_activity); + for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { int emit_wx_offset = emit_word_id * hidden_size * 4; const BIT* w_x_i = (BIT*)(temp_wx + i_offset * hidden_size + emit_wx_offset); @@ -43,9 +32,10 @@ static inline void cal_first_lstm_nullhidden(int emit_word_id_start,int emit_wor BIT* gate_h_p = (BIT*)(hout + emit_id_offset * hidden_size); BIT* gate_c_p = (BIT*)(inner_cell + emit_id_offset * hidden_size); - if(with_peephole) { + if (with_peephole) { +#pragma omp parallel for schedule(static) for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); - ++frame_id) { + ++frame_id) { BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id]); BIT gate_c_s = cell_act(w_x_c[frame_id] + b_c[frame_id]); BIT gate_c = gate_i * gate_c_s; @@ -53,9 +43,10 @@ static inline void cal_first_lstm_nullhidden(int emit_word_id_start,int emit_wor gate_c_p[frame_id] = gate_c; gate_h_p[frame_id] = gate_o * candi_act(gate_c); } - } else{ + } else { +#pragma omp parallel for schedule(static) for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); - ++frame_id) { + ++frame_id) { BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id]); BIT gate_c_s = cell_act(w_x_c[frame_id] + b_c[frame_id]); BIT gate_c = gate_i * gate_c_s; @@ -67,10 +58,12 @@ static inline void cal_first_lstm_nullhidden(int emit_word_id_start,int emit_wor } } -template -static inline void cal_lstm_batch(int emit_word_id_start,int emit_word_id_end,OpDataType* temp_wx,const OpDataType* weight_peephole, - OpDataType* hout,OpDataType* inner_cell,const BIT* b_i, const BIT* b_f, const BIT* b_c, const BIT* b_o, - ActiveType gate_activity, ActiveType cell_activity, ActiveType candi_activity,int hidden_size){ +template +static inline void cal_lstm_batch(int emit_word_id_start, int emit_word_id_end, OpDataType* temp_wx, + const OpDataType* weight_peephole, + OpDataType* hout, OpDataType* inner_cell, const BIT* b_i, const BIT* b_f, const BIT* b_c, + const BIT* b_o, + ActiveType gate_activity, ActiveType cell_activity, ActiveType candi_activity, int hidden_size) { const int i_offset = 0; const int f_offset = 1; const int c_offset = 2; @@ -78,6 +71,7 @@ static inline void cal_lstm_batch(int emit_word_id_start,int emit_word_id_end,Op BIT(*gate_act)(const BIT) = Activate_inner(gate_activity); BIT(*cell_act)(const BIT) = Activate_inner(cell_activity); BIT(*candi_act)(const BIT) = 
Activate_inner(candi_activity); + for (int emit_word_id = emit_word_id_start; emit_word_id < emit_word_id_end; emit_word_id++) { int emit_wx_offset = emit_word_id * hidden_size * 4; const BIT* w_x_i = (BIT*)(temp_wx + i_offset * hidden_size + emit_wx_offset); @@ -94,9 +88,10 @@ static inline void cal_lstm_batch(int emit_word_id_start,int emit_word_id_end,Op BIT* gate_h_p = (BIT*)(hout + emit_id_offset * hidden_size); BIT* gate_c_p = (BIT*)(inner_cell + emit_id_offset * hidden_size); - if(with_peephole) { + if (with_peephole) { +#pragma omp parallel for schedule(static) for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); - ++frame_id) { + ++frame_id) { BIT c_1 = gate_c_p[frame_id]; BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id] + w_ci[frame_id] * c_1); BIT gate_f = gate_act(w_x_f[frame_id] + b_f[frame_id] + w_cf[frame_id] * c_1); @@ -107,9 +102,10 @@ static inline void cal_lstm_batch(int emit_word_id_start,int emit_word_id_end,Op gate_h_p[frame_id] = gate_o * candi_act(gate_c); } - }else{ + } else { +#pragma omp parallel for schedule(static) for (int frame_id = 0; frame_id < hidden_size / (sizeof(BIT) / sizeof(OpDataType)); - ++frame_id) { + ++frame_id) { BIT c_1 = gate_c_p[frame_id]; BIT gate_i = gate_act(w_x_i[frame_id] + b_i[frame_id]); BIT gate_f = gate_act(w_x_f[frame_id] + b_f[frame_id]); @@ -124,11 +120,11 @@ static inline void cal_lstm_batch(int emit_word_id_start,int emit_word_id_end,Op } template<> -template +template SaberStatus SaberLstm:: avx_dispatch(const std::vector*>& inputs, - std::vector*>& outputs, - LstmParam& param) { + std::vector*>& outputs, + LstmParam& param) { int loop_div = sizeof(BIT) / sizeof(OpDataType); const OpDataType* weight_h = (const OpDataType*)_aligned_weights_h2h.data(); @@ -139,29 +135,37 @@ avx_dispatch(const std::vector*>& inputs, BIT(*cell_act)(const BIT) = Activate_inner(param.cell_activity); BIT(*candi_act)(const BIT) = Activate_inner(param.candidate_activity); - std::vector offset_vec = inputs[0]->get_seq_offset()[inputs[0]->get_seq_offset().size()-1]; - std::vector length_vec(offset_vec.size() - 1); + std::vector offset_vec = inputs[0]->get_seq_offset()[inputs[0]->get_seq_offset().size() - 1]; + // std::vector length_vec(offset_vec.size() - 1); int batch_size = offset_vec.size() - 1; - int seqsum = 0; - int max_seq_len = 0; - bool is_hw2seq = offset_vec.size() > 2; - int word_sum = is_hw2seq ? offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); + int seqsum = inputs[0]->num(); + + if (param.skip_num > 1) { + CHECK_EQ(offset_vec.size() - 1, 1) << "only support batch = 1 in skip_mode"; + int word_sum = inputs[0]->num(); + CHECK_EQ(word_sum % param.skip_num, 0); + batch_size = param.skip_num; + } + + // int max_seq_len = 0; + // bool is_hw2seq = offset_vec.size() > 2; + // int word_sum = is_hw2seq ? 
offset_vec[offset_vec.size() - 1] : inputs[0]->channel(); utils::AlignedUtils aligned_utils; const OpDataType* h_init = nullptr; const OpDataType* cell_init = nullptr; const OpDataType* x = (const OpDataType*)inputs[0]->data(); - OpDataType* out = (OpDataType*)outputs[0]->mutable_data(); + OpDataType* out = (OpDataType*)outputs[0]->mutable_data(); bool is_reverse = param.is_reverse; if (inputs.size() > 1) { h_init = (const OpDataType*)inputs[1]->data(); - utils::try_expand_tensor(_aligned_init_hidden,batch_size * _aligned_hidden_size); + utils::try_expand_tensor(_aligned_init_hidden, batch_size * _aligned_hidden_size); aligned_utils.aligned_last_dim(h_init, (OpDataType*)_aligned_init_hidden.mutable_data(), batch_size * _hidden_size, _hidden_size, _aligned_hidden_size); h_init = (const OpDataType*)_aligned_init_hidden.data(); } else if (param.init_hidden() != nullptr) { - h_init =(const OpDataType*) param.init_hidden()->data(); + h_init = (const OpDataType*) param.init_hidden()->data(); //FIXME:is it correct? } else { // _aligned_init_hidden.try_expand_tensor(batch_size * _aligned_hidden_size); @@ -173,38 +177,43 @@ avx_dispatch(const std::vector*>& inputs, std::vector emit_offset_vec; int emit_length = 0; utils::SeqSortedseqTranseUtil transe_util(is_reverse); - bool transform = transe_util.get_sorted_map(offset_vec, emit_offset_vec, emit_length); + bool transform = transe_util.get_sorted_map(offset_vec, emit_offset_vec, emit_length, + param.skip_num); OpDataType* inner_h_out = out; OpDataType* inner_cell = nullptr; const OpDataType* inner_x = x; const OpDataType* inner_h_init = h_init; - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - length_vec[i] = len; - max_seq_len = max_seq_len > len ? max_seq_len : len; - seqsum += len; - } + // for (int i = 0; i < offset_vec.size() - 1; ++i) { + // int len = offset_vec[i + 1] - offset_vec[i]; + // length_vec[i] = len; + // max_seq_len = max_seq_len > len ? 
max_seq_len : len; + // seqsum += len; + // } - utils::try_expand_tensor(_temp_wx,seqsum * 4 * _aligned_hidden_size); - utils::try_expand_tensor(_temp_wh,batch_size * 4 * _aligned_hidden_size); - utils::try_expand_tensor(_temp_out,seqsum * _aligned_hidden_size * param.num_direction); - utils::try_expand_tensor(_temp_cell,batch_size * _aligned_hidden_size); + // LOG(INFO)<<"seqsum = "<*>& inputs, OpDataType* temp_wh = (OpDataType*)_temp_wh.mutable_data(); OpDataType* temp_wx = (OpDataType*)_temp_wx.mutable_data(); - gemm(false, false, seqsum, 4 * _aligned_hidden_size, _word_size, 1.f, inner_x, weight_w, 0.f, - temp_wx); + _wx_gemm_fp32.dispatch(1.f,0.f,seqsum, inner_x, weight_w,temp_wx); +// gemm(false, false, seqsum, 4 * _aligned_hidden_size, _word_size, 1.f, inner_x, weight_w, 0.f, +// temp_wx); const int i_offset = 0; const int f_offset = 1; @@ -225,6 +235,7 @@ avx_dispatch(const std::vector*>& inputs, const BIT* b_c = (BIT*)(bias + c_offset * _aligned_hidden_size); const BIT* b_o = (BIT*)(bias + o_offset * _aligned_hidden_size); + for (int word_id = 0; word_id < emit_length; word_id++) { int real_word_id = word_id; int last_word_id = word_id - 1; @@ -239,13 +250,15 @@ avx_dispatch(const std::vector*>& inputs, int emit_word_length = emit_word_id_end - emit_word_id_start; const float* hin; + // LOG(INFO)<<"emit_word_id_start "<(emit_word_id_start,emit_word_id_end,temp_wx,weight_peephole, - hout,inner_cell,b_i,b_f,b_c,b_o, - param.gate_activity, param.cell_activity, param.candidate_activity, _aligned_hidden_size); + cal_first_lstm_nullhidden(emit_word_id_start, emit_word_id_end, + temp_wx, weight_peephole, + hout, inner_cell, b_i, b_f, b_c, b_o, + param.gate_activity, param.cell_activity, param.candidate_activity, _aligned_hidden_size); continue; @@ -259,22 +272,28 @@ avx_dispatch(const std::vector*>& inputs, hout = emit_offset_vec[real_word_id] * _aligned_hidden_size + inner_h_out; //wh - gemm(false, false, emit_word_length, 4 * _aligned_hidden_size, _aligned_hidden_size, 1.0, hin, - weight_h, - 1.f, temp_wx+emit_word_id_start*4*_aligned_hidden_size); - cal_lstm_batch(emit_word_id_start,emit_word_id_end,temp_wx,weight_peephole, - hout,inner_cell,b_i,b_f,b_c,b_o, - param.gate_activity, param.cell_activity, param.candidate_activity, _aligned_hidden_size); +// gemm(false, false, emit_word_length, 4 * _aligned_hidden_size, _aligned_hidden_size, 1.0, hin, +// weight_h, +// 1.f, temp_wx + emit_word_id_start * 4 * _aligned_hidden_size); + + _wh_gemm_fp32.dispatch(1.f,1.f,emit_word_length,hin, weight_h,temp_wx + emit_word_id_start * 4 * _aligned_hidden_size); + + cal_lstm_batch(emit_word_id_start, emit_word_id_end, temp_wx, + weight_peephole, + hout, inner_cell, b_i, b_f, b_c, b_o, + param.gate_activity, param.cell_activity, param.candidate_activity, _aligned_hidden_size); } if (transform) { transe_util.sorted_seq_2_seq(inner_h_out, out, _hidden_size, _aligned_hidden_size); } else if (_hidden_size != _aligned_hidden_size) { - aligned_utils.unaligned_last_dim((OpDataType*)_temp_out.data(), out, seqsum * _hidden_size, _hidden_size, + aligned_utils.unaligned_last_dim((OpDataType*)_temp_out.data(), out, seqsum * _hidden_size, + _hidden_size, _aligned_hidden_size); } + return SaberSuccess; } @@ -290,14 +309,66 @@ dispatch(const std::vector*>& inputs, CHECK_EQ(param.num_layers, 1) << "only support param.num_layers==1"; if (param.with_peephole) { - avx_dispatch(inputs, outputs, param); + avx_dispatch(inputs, outputs, param); } else { - avx_dispatch(inputs, outputs, param); + avx_dispatch(inputs, 
outputs, param); } + return SaberSuccess; } DEFINE_OP_TEMPLATE(SaberLstm, LstmParam, X86, AK_HALF); -DEFINE_OP_TEMPLATE(SaberLstm, LstmParam, X86, AK_INT8); + +template<> +SaberStatus SaberLstm:: +dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) { + CHECK_EQ(inputs[0]->get_dtype(), AK_INT8); + auto seq_offset = inputs[0]->get_seq_offset()[0]; + int seq_num = seq_offset.size(); + + // _temp_wx + for (int seq_id = 0; seq_id < seq_num; seq_id++) { + int word_id_start = seq_offset[seq_id]; + int word_id_end = seq_offset[seq_id + 1]; + + for (int word_id = word_id_start; word_id < word_id_end; word_id++) { + + } + } + + LOG(FATAL)<<"not impl"; + return SaberSuccess; +}; + +template<> +SaberStatus SaberLstm::create(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) { + LOG(FATAL)<<"not impl"; + return SaberSuccess; +}; + + +template<> +SaberStatus SaberLstm::init(const std::vector*>& inputs, + std::vector*>& outputs, LstmParam& param, Context& ctx) { + if (param.with_peephole) { + _hidden_size = param.bias()->valid_size() / 7; + } else { + _hidden_size = param.bias()->valid_size() / 4; + } + + _word_size = (param.weight()->valid_size() - _hidden_size * _hidden_size * 4) / _hidden_size / 4; + + CHECK_EQ(_hidden_size % 16, 0); + CHECK_EQ(_word_size % 16, 0); + LOG(FATAL)<<"not impl"; + return SaberSuccess; +}; + + } } diff --git a/saber/funcs/impl/x86/saber_lstm.h b/saber/funcs/impl/x86/saber_lstm.h index 64f3fe876..06325d177 100644 --- a/saber/funcs/impl/x86/saber_lstm.h +++ b/saber/funcs/impl/x86/saber_lstm.h @@ -3,13 +3,16 @@ #include "saber/funcs/impl/impl_lstm.h" #include "saber_funcs_param.h" #include "saber/funcs/impl/x86/x86_utils.h" -#include +#include "saber/funcs/impl/x86/mkl_gemm.h" #if defined(__AVX512F__) +#include #define SABER_X86_TYPE __m512 #elif defined(__AVX2__) and defined(__FMA__) +#include #define SABER_X86_TYPE __m256 #elif defined(__SSE4_2__) and defined(__FMA__) +#include #define SABER_X86_TYPE __m128 #else #define SABER_X86_TYPE float @@ -78,6 +81,12 @@ class SaberLstm : weights_peephole_size,_hidden_size,_aligned_hidden_size); } + int seqsum = inputs[0]->num(); + const float* weight_h = (const float*)_aligned_weights_h2h.data(); + const float* weight_w = (const float*)_aligned_weights_i2h.data(); + _wx_gemm_fp32.init(false, false,seqsum, 4 * _aligned_hidden_size, _word_size,ctx,weight_w,PACKED_MKLGEMM); + _wh_gemm_fp32.init(false, false,seqsum, 4 * _aligned_hidden_size, _aligned_hidden_size,ctx,weight_h,PACKED_MKLGEMM); + return create(inputs,outputs,param,ctx); } ; @@ -121,6 +130,9 @@ class SaberLstm : Tensor _temp_out; Tensor _temp_h_init; + MklDnnGemm _wx_gemm_fp32; + MklDnnGemm _wh_gemm_fp32; + template SaberStatus avx_dispatch(const std::vector*>& inputs, std::vector*>& outputs, diff --git a/saber/funcs/impl/x86/saber_lstmp.cpp b/saber/funcs/impl/x86/saber_lstmp.cpp new file mode 100644 index 000000000..791934a29 --- /dev/null +++ b/saber/funcs/impl/x86/saber_lstmp.cpp @@ -0,0 +1,518 @@ +#include "saber/funcs/impl/x86/saber_lstmp.h" +#include "mkl_cblas.h" +#include "mkl.h" +#include "saber_normal_activation.h" +#include "debug.h" +#include "timer.h" + +#if defined(__AVX512F__) +#include +#define SABER_X86_TYPE __m512 +#elif defined(__AVX2__) and defined(__FMA__) +#include +#define SABER_X86_TYPE __m256 +#elif defined(__SSE4_2__) and defined(__FMA__) +#include +#define SABER_X86_TYPE __m128 +#else +#define SABER_X86_TYPE float +#endif + +namespace anakin { + +namespace saber { + +static void 
gemm(const bool TransA, const bool TransB, int m, int n, int k, const float alpha, + const float* a, const float* b, const float beta, float* c) { + // cout << "(" << m << "," << n << "," << k << ")" << endl; + int lda = (!TransA/* == CblasNoTrans*/) ? k : m; + int ldb = (!TransB/* == CblasNoTrans*/) ? n : k; + CBLAS_TRANSPOSE cu_trans_a = + (!TransA/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE cu_trans_b = + (!TransB/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; + Context ctx(0, 0, 0); + SaberTimer timer; + timer.start(ctx); + cblas_sgemm(CblasRowMajor, cu_trans_a, cu_trans_b, m, n, k, alpha, a, k, b, n, beta, c, n); + timer.end(ctx); + double ms = timer.get_average_ms(); + double work_load = (double)m * n * k * 2; + double speed = work_load / ms / 1000.0 / 1000.0; + LOG(INFO) << "mkl_cblas_sgemm " << m << "," << n << "," << k << "," << ms << "," << speed; +}; + +static void s8s8s32_gemm(const bool TransA, const bool TransB, int m, int n, int k, + const float alpha, + const int8_t* a, const int8_t* b, const float beta, int32_t* c) { + +}; + + +template<> +SaberStatus SaberLstmp:: create(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) { + return SaberSuccess; +}; + + +template<> +SaberStatus SaberLstmp::init(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) { + _inner_hidden_dim = param.cell_dim; + _output_hidden_dim = param.project_dim; + + CHECK_GT(param.cell_dim, 0); + CHECK_GT(param.project_dim, 0); + CHECK_EQ(param.cell_dim % (sizeof(SABER_X86_TYPE) / sizeof(float)), 0); + + int word_dim = inputs[0]->channel(); + const float* weights_x_ptr = static_cast(param.weight()->data()); + const float* weights_h_ptr = weights_x_ptr + word_dim * _inner_hidden_dim * 4; + const float* weights_project_ptr = weights_h_ptr + _output_hidden_dim * _inner_hidden_dim * 4; + int word_num = inputs[0]->num(); + const int skip_num = param.skip_num; + _wx_gemm_fp32.init(false, false,word_num, 4 * _inner_hidden_dim, word_dim,ctx,weights_x_ptr,PACKED_MKLGEMM); + _wh_gemm_fp32.init(false, false,skip_num, 4 * _inner_hidden_dim, _output_hidden_dim,ctx,weights_h_ptr,PACKED_MKLGEMM); + _wp_gemm_fp32.init(false, false,skip_num, _output_hidden_dim, _inner_hidden_dim,ctx,weights_project_ptr,PACKED_MKLGEMM); + return create(inputs, outputs, param, ctx); +} ; + +template +static inline void cal_lstm_batch(int emit_word_id_size, OpDataType* temp_wx, + const OpDataType* weight_peephole, + OpDataType* hout, OpDataType* inner_cell, const OpDataType* b_i_in, const OpDataType* b_f_in, + const OpDataType* b_c_in, + const OpDataType* b_o_in, int hidden_size) { + + const int inner_iter_num = hidden_size / (sizeof(BIT) / sizeof(OpDataType)); + const BIT* b_i = (BIT*)b_i_in; + const BIT* b_f = (BIT*)b_f_in; + const BIT* b_c = (BIT*)b_c_in; + const BIT* b_o = (BIT*)b_o_in; + const int max_thread_nums=anakin_get_max_threads(); + for (int emit_word_id = 0; emit_word_id < emit_word_id_size; emit_word_id++) { + int emit_wx_offset = emit_word_id * hidden_size * 4; + const BIT* w_x_i = (BIT*)(temp_wx + 0 * hidden_size + emit_wx_offset); + const BIT* w_x_f = (BIT*)(temp_wx + 1 * hidden_size + emit_wx_offset); + const BIT* w_x_c = (BIT*)(temp_wx + 2 * hidden_size + emit_wx_offset); + const BIT* w_x_o = (BIT*)(temp_wx + 3 * hidden_size + emit_wx_offset); + + const BIT* w_ci = (BIT*)(weight_peephole + 0 * hidden_size); + const BIT* w_cf = (BIT*)(weight_peephole + 1 * hidden_size); + const BIT* w_co = (BIT*)(weight_peephole + 2 * 
hidden_size); + + BIT* gate_h_p = (BIT*)(hout + emit_word_id * hidden_size); + BIT* gate_c_p = (BIT*)(inner_cell + emit_word_id * hidden_size); + + if (first_iter) { +#pragma omp parallel for schedule(static) if (max_thread_nums > 1) + for (int frame_id = 0; frame_id < inner_iter_num; ++frame_id) { + BIT gate_i = Sigmoid(w_x_i[frame_id] + b_i[frame_id]); + BIT gate_f = Sigmoid(w_x_f[frame_id] + b_f[frame_id]); + BIT gate_c_s = Tanh(w_x_c[frame_id] + b_c[frame_id]); + BIT gate_c = gate_i * gate_c_s; + BIT gate_o = Sigmoid(w_x_o[frame_id] + gate_c * w_co[frame_id] + b_o[frame_id]); + gate_c_p[frame_id] = gate_c; + gate_h_p[frame_id] = gate_o * Tanh(gate_c); + } + } else { +#pragma omp parallel for schedule(static) if (max_thread_nums > 1) + for (int frame_id = 0; frame_id < inner_iter_num; ++frame_id) { + BIT c_1 = gate_c_p[frame_id]; + BIT gate_i = Sigmoid(w_x_i[frame_id] + b_i[frame_id] + w_ci[frame_id] * c_1); + BIT gate_f = Sigmoid(w_x_f[frame_id] + b_f[frame_id] + w_cf[frame_id] * c_1); + BIT gate_c_s = Tanh(w_x_c[frame_id] + b_c[frame_id]); + BIT gate_c = gate_f * c_1 + gate_i * gate_c_s; + BIT gate_o = Sigmoid(w_x_o[frame_id] + b_o[frame_id] + gate_c * w_co[frame_id]); + + gate_c_p[frame_id] = gate_c; + gate_h_p[frame_id] = gate_o * Tanh(gate_c); + } + } + } +} + +template<> +SaberStatus SaberLstmp:: +dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) { + auto offset_vec = inputs[0]->get_seq_offset(); + CHECK_EQ(offset_vec.size(), 1); + auto offset = offset_vec[0]; + CHECK_EQ(offset.size(), 2); + const int skip_num = param.skip_num; + CHECK_GT(skip_num, 1); + int word_num = inputs[0]->num(); + int word_dim = inputs[0]->channel(); + int iter_num = utils::round_up(word_num, skip_num) / skip_num; + + utils::try_expand_tensor(_wx_tensor, word_num * 4 * _inner_hidden_dim); + utils::try_expand_tensor(_temp_hidden_tensor, skip_num * _inner_hidden_dim); + utils::try_expand_tensor(_temp_cell_tensor, skip_num * _inner_hidden_dim); + + float* wx_ptr = static_cast(_wx_tensor.mutable_data()); + const float* x_ptr = static_cast(inputs[0]->data()); + const float* weights_x_ptr = static_cast(param.weight()->data()); + const float* weights_h_ptr = weights_x_ptr + word_dim * _inner_hidden_dim * 4; + const float* weights_project_ptr = weights_h_ptr + _output_hidden_dim * _inner_hidden_dim * 4; + const float* weights_bias_ptr = static_cast(param.bias()->data()); + const float* weights_bias_i_ptr = weights_bias_ptr; + const float* weights_bias_f_ptr = weights_bias_i_ptr + _inner_hidden_dim; + const float* weights_bias_c_ptr = weights_bias_f_ptr + _inner_hidden_dim; + const float* weights_bias_o_ptr = weights_bias_c_ptr + _inner_hidden_dim; + const float* weights_peephole_ptr = weights_bias_ptr + _inner_hidden_dim * 4; + float* output_ptr = static_cast(outputs[0]->mutable_data()); + float* temp_hidden_out = static_cast(_temp_hidden_tensor.mutable_data()); + float* temp_cell_out = static_cast(_temp_cell_tensor.mutable_data()); +// gemm(false, false, word_num, 4 * _inner_hidden_dim, word_dim, 1.f, x_ptr, weights_x_ptr, 0.f, +// wx_ptr); + _wx_gemm_fp32.dispatch(1.f,0.f,word_num,x_ptr, weights_x_ptr,wx_ptr); + + for (int i = 0; i < iter_num; i++) { + const int run_batch_dim = (i == (iter_num - 1)) ? 
(word_num - skip_num * i) : skip_num; + float* wx_iter = wx_ptr + i * skip_num * 4 * _inner_hidden_dim; + + if (i >= 1) { + float* hidden_in = output_ptr + (i - 1) * skip_num * _output_hidden_dim; +// gemm(false, false, run_batch_dim, 4 * _inner_hidden_dim, _output_hidden_dim, 1.f, hidden_in, +// weights_h_ptr, +// 1.f, wx_iter); + _wh_gemm_fp32.dispatch(1.f,1.f,run_batch_dim,hidden_in,weights_h_ptr,wx_iter); + + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, + temp_hidden_out, temp_cell_out, weights_bias_i_ptr, weights_bias_f_ptr, weights_bias_c_ptr, + weights_bias_o_ptr, _inner_hidden_dim); + + } else { + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, + temp_hidden_out, temp_cell_out, weights_bias_i_ptr, weights_bias_f_ptr, weights_bias_c_ptr, + weights_bias_o_ptr, _inner_hidden_dim); + } + + float* hidden_out = output_ptr + i * skip_num * _output_hidden_dim; +// gemm(false, false, run_batch_dim, _output_hidden_dim, _inner_hidden_dim, 1.f, temp_hidden_out, +// weights_project_ptr, 0.f, hidden_out); + _wp_gemm_fp32.dispatch(1.f,0.f,run_batch_dim,temp_hidden_out,weights_project_ptr,hidden_out); + vsTanh(run_batch_dim * _output_hidden_dim, hidden_out, hidden_out); + } + + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; +} + +template<> +SaberStatus SaberLstmp:: create(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) { + + return SaberSuccess; +}; + + +template<> +SaberStatus SaberLstmp::init(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx) { + _inner_hidden_dim = param.cell_dim; + _output_hidden_dim = param.project_dim; + + CHECK_GT(param.cell_dim, 0); + CHECK_GT(param.project_dim, 0); + CHECK_EQ(param.cell_dim % (sizeof(SABER_X86_TYPE) / sizeof(float)), 0); + + int word_num = inputs[0]->num(); + int word_channel = inputs[0]->channel(); + float* weights_x_ptr = static_cast(param.weight()->data()); + float* weights_h_ptr = weights_x_ptr + word_channel * _inner_hidden_dim * 4; + float* weights_project_ptr = weights_h_ptr + _output_hidden_dim * _inner_hidden_dim * 4; + float* weights_bias_ptr = static_cast(param.bias()->data()); + Shape shape_x({1, 1, word_num, word_channel}); + Shape shape_h({1, 1, param.skip_num, _output_hidden_dim}); + Shape shape_wh({1, 1, param.skip_num, 4 * _inner_hidden_dim}); + Shape shape_iter_project({1, 1, param.skip_num, _inner_hidden_dim}); + Shape shape_weights_wx({1, 1, word_channel, 4 * _inner_hidden_dim}); + Shape shape_weights_wh({1, 1, _output_hidden_dim, 4 * _inner_hidden_dim}); + Shape shape_weights_project({1, 1, _inner_hidden_dim, _output_hidden_dim}); + _inner_x_int8.re_alloc(shape_x, AK_INT8); + _inner_h_int8.re_alloc(shape_h, AK_INT8); + _inner_wh_int32.re_alloc(shape_wh, AK_INT32); + _inner_project_scale.re_alloc(shape_iter_project, AK_INT8); + _int8_weights_wx.re_alloc(shape_weights_wx, AK_INT8); + _int8_weights_wh.re_alloc(shape_weights_wh, AK_INT8); + _int8_weights_project.re_alloc(shape_weights_project, AK_INT8); + utils::ScaleUtils::scale_gemm_xw_weights_to_nchw_host(_int8_weights_wx, + Tensor(static_cast(weights_x_ptr), X86(), 0, shape_weights_wx, AK_FLOAT)); + utils::ScaleUtils::scale_gemm_xw_weights_to_nchw_host(_int8_weights_wh, + Tensor(static_cast(weights_h_ptr), X86(), 0, shape_weights_wh, AK_FLOAT)); + utils::ScaleUtils::scale_gemm_xw_weights_to_nchw_host(_int8_weights_project, + Tensor(static_cast(weights_project_ptr), X86(), 0, shape_weights_project, AK_FLOAT)); + + auto input_scale = 
inputs[0]->get_scale(); + CHECK_EQ(input_scale.size(), 1); + + CHECK_EQ(_int8_weights_wx.get_scale().size(), 4 * _inner_hidden_dim); + + for (auto i : _int8_weights_wx.get_scale()) { + _inner_scale_wx.push_back(input_scale[0]*i); + } + + _inner_scale_wh.resize(4 * _inner_hidden_dim); + _inner_scale_project.resize(_output_hidden_dim); + //my intrinsic gemm init + int word_dim = inputs[0]->channel(); + _wx_gemm_me.init(4 * _inner_hidden_dim, word_dim, _int8_weights_wx); + _wh_gemm_me.init(4 * _inner_hidden_dim, _output_hidden_dim, _int8_weights_wh); + _project_gemm_me.init(_output_hidden_dim, _inner_hidden_dim, _int8_weights_project); + + + _temp_hidden_tensor.re_alloc(Shape({1, 1, param.skip_num, _inner_hidden_dim}), AK_FLOAT); + _temp_cell_tensor.re_alloc(Shape({1, 1, param.skip_num, _inner_hidden_dim}), AK_FLOAT); + + + int8_t* weights_x_int8_ptr = static_cast(_int8_weights_wx.data()); + int8_t* weights_h_int8_ptr = static_cast(_int8_weights_wh.data()); + int8_t* weights_p_int8_ptr = static_cast(_int8_weights_project.data()); + + if (jit::mayiuse(jit::avx512_core_vnni)) { + _wx_gemm.init(false, false, word_num, 4 * _inner_hidden_dim, word_dim, ctx, weights_x_int8_ptr,PACKED_MKLGEMM); + _wh_gemm.init(false, false, param.skip_num, 4 * _inner_hidden_dim, _output_hidden_dim, ctx, + weights_h_int8_ptr,PACKED_MKLGEMM); + _wp_gemm.init(false, false, param.skip_num, _output_hidden_dim, _inner_hidden_dim, ctx, + weights_p_int8_ptr,PACKED_MKLGEMM); + } + + LOG(INFO) << "create Lstmp"; + return create(inputs, outputs, param, ctx); +} ; + + +template<> +SaberStatus SaberLstmp:: +dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param) { + if (jit::mayiuse(jit::avx512_core_vnni)) { + auto offset_vec = inputs[0]->get_seq_offset(); + CHECK_EQ(offset_vec.size(), 1); + auto offset = offset_vec[0]; + CHECK_EQ(offset.size(), 2); + const int skip_num = param.skip_num; + CHECK_GT(skip_num, 1); + int word_num = inputs[0]->num(); + int word_dim = inputs[0]->channel(); + int iter_num = utils::round_up(word_num, skip_num) / skip_num; + + utils::try_expand_tensor(_wx_tensor, word_num * 4 * _inner_hidden_dim); + utils::try_expand_tensor(_temp_hidden_tensor, skip_num * _inner_hidden_dim); + utils::try_expand_tensor(_temp_cell_tensor, skip_num * _inner_hidden_dim); + + float* wx_ptr = static_cast(_wx_tensor.mutable_data()); + const float* x_ptr = static_cast(inputs[0]->data()); + const int8_t* weights_x_ptr = static_cast(_int8_weights_wx.data()); + const int8_t* weights_h_ptr = static_cast(_int8_weights_wh.data()); + const int8_t* weights_project_ptr_int8 = static_cast(_int8_weights_project.data()); + const float* weights_project_ptr = static_cast(param.weight()->data()) + + word_dim * _inner_hidden_dim * 4 + + _output_hidden_dim * _inner_hidden_dim * 4; + const float* weights_bias_ptr = static_cast(param.bias()->data()); + const float* weights_bias_i_ptr = weights_bias_ptr; + const float* weights_bias_f_ptr = weights_bias_i_ptr + _inner_hidden_dim; + const float* weights_bias_c_ptr = weights_bias_f_ptr + _inner_hidden_dim; + const float* weights_bias_o_ptr = weights_bias_c_ptr + _inner_hidden_dim; + const float* weights_peephole_ptr = weights_bias_ptr + _inner_hidden_dim * 4; + float* output_ptr = static_cast(outputs[0]->mutable_data()); + float* temp_hidden_out = static_cast(_temp_hidden_tensor.mutable_data()); + float* temp_cell_out = static_cast(_temp_cell_tensor.mutable_data()); + + if (inputs[0]->get_dtype() == AK_FLOAT) { + utils::ScaleUtils::scale_fp32_int8(_inner_x_int8, 
*inputs[0]); + const int8_t* x_int8_ptr = static_cast(_inner_x_int8.data()); + _wx_gemm.dispatch(1.f, 0.f,word_num, x_int8_ptr, weights_x_ptr, (int32_t*) wx_ptr); + utils::ScaleUtils::cvt_int32_fp32((int32_t*) wx_ptr, _inner_scale_wx, word_num, + 4 * _inner_hidden_dim); + } else { + LOG(FATAL) << "not impl"; + } + + for (int i = 0; i < iter_num; i++) { + const int run_batch_dim = (i == (iter_num - 1)) ? (word_num - skip_num * i) : skip_num; + float* wx_iter = wx_ptr + i * skip_num * 4 * _inner_hidden_dim; + + if (i >= 1) { + float* hidden_in = output_ptr + (i - 1) * skip_num * _output_hidden_dim; + utils::ScaleUtils::scale_fp32_int8(_inner_h_int8, hidden_in, run_batch_dim * _output_hidden_dim); + float scale_x = _inner_h_int8.get_scale()[0]; + std::vector scale_weights_h = _int8_weights_wh.get_scale(); + CHECK_EQ(scale_weights_h.size(), 4 * _inner_hidden_dim); + + for (int i = 0; i < 4 * _inner_hidden_dim; i++) { + _inner_scale_wh[i] = scale_x * scale_weights_h[i]; + } + + _wh_gemm.dispatch(1.f, 0.f,run_batch_dim, static_cast(_inner_h_int8.data()), weights_h_ptr, + static_cast(_inner_wh_int32.data())); + utils::ScaleUtils::cvt_int32_fp32(static_cast(_inner_wh_int32.data()), _inner_scale_wh, + run_batch_dim, + 4 * _inner_hidden_dim); + float* wh_fp32 = static_cast(_inner_wh_int32.data()); + + for (int i = 0; i < run_batch_dim * 4 * _inner_hidden_dim; i++) { + wx_iter[i] += wh_fp32[i]; + } + + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, + temp_hidden_out, temp_cell_out, weights_bias_i_ptr, + weights_bias_f_ptr, weights_bias_c_ptr, + weights_bias_o_ptr, _inner_hidden_dim); + + } else { + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, + temp_hidden_out, temp_cell_out, weights_bias_i_ptr, + weights_bias_f_ptr, weights_bias_c_ptr, + weights_bias_o_ptr, _inner_hidden_dim); + } + + float* hidden_out = output_ptr + i * skip_num * _output_hidden_dim; + + utils::ScaleUtils::scale_fp32_int8(_inner_project_scale, temp_hidden_out, + run_batch_dim * _inner_hidden_dim); + float scale_x = _inner_project_scale.get_scale()[0]; + std::vector scale_vec = _int8_weights_project.get_scale(); + + for (int i = 0; i < _output_hidden_dim; i++) { + _inner_scale_project[i] = scale_x * scale_vec[i]; + } + + + _wp_gemm.dispatch(1.f, 0.f,run_batch_dim, static_cast(_inner_project_scale.data()), + weights_project_ptr_int8, + (int*) hidden_out); + utils::ScaleUtils::cvt_int32_fp32((int*)(hidden_out), _inner_scale_project, + run_batch_dim, + _output_hidden_dim); + + vsTanh(run_batch_dim * _output_hidden_dim, hidden_out, hidden_out); + } + + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; + } else { + auto offset_vec = inputs[0]->get_seq_offset(); + CHECK_EQ(offset_vec.size(), 1); + auto offset = offset_vec[0]; + CHECK_EQ(offset.size(), 2); + const int skip_num = param.skip_num; + CHECK_GT(skip_num, 1); + int word_num = inputs[0]->num(); + int word_dim = inputs[0]->channel(); + int iter_num = utils::round_up(word_num, skip_num) / skip_num; + + + utils::try_expand_tensor(_wx_tensor, word_num * 4 * _inner_hidden_dim); + utils::try_expand_tensor(_temp_hidden_tensor, skip_num * _inner_hidden_dim); + utils::try_expand_tensor(_temp_cell_tensor, skip_num * _inner_hidden_dim); + + float* wx_ptr = static_cast(_wx_tensor.mutable_data()); + const float* x_ptr = static_cast(inputs[0]->data()); + const int8_t* weights_x_ptr = static_cast(_int8_weights_wx.data()); + const float* weights_h_ptr = static_cast(param.weight()->data()) + word_dim * + _inner_hidden_dim * 4; + const 
float* weights_project_ptr = weights_h_ptr + _output_hidden_dim * _inner_hidden_dim * 4; + const int8_t* weights_project_ptr_int8 = static_cast(_int8_weights_project.data()); + const float* weights_bias_ptr = static_cast(param.bias()->data()); + const float* weights_bias_i_ptr = weights_bias_ptr; + const float* weights_bias_f_ptr = weights_bias_i_ptr + _inner_hidden_dim; + const float* weights_bias_c_ptr = weights_bias_f_ptr + _inner_hidden_dim; + const float* weights_bias_o_ptr = weights_bias_c_ptr + _inner_hidden_dim; + const float* weights_peephole_ptr = weights_bias_ptr + _inner_hidden_dim * 4; + float* output_ptr = static_cast(outputs[0]->mutable_data()); + float* temp_hidden_out = static_cast(_temp_hidden_tensor.mutable_data()); + float* temp_cell_out = static_cast(_temp_cell_tensor.mutable_data()); + + if (inputs[0]->get_dtype() == AK_FLOAT) { + utils::ScaleUtils::scale_fp32_int8(_inner_x_int8, *inputs[0]); + _wx_gemm_me.dispatch(word_num, 4 * _inner_hidden_dim, word_dim, _inner_x_int8, _wx_tensor); + utils::ScaleUtils::cvt_int32_fp32((int32_t*)wx_ptr, _inner_scale_wx, word_num, + 4 * _inner_hidden_dim); + + } else { + LOG(FATAL) << "not impl"; + } + + for (int i = 0; i < iter_num; i++) { + const int run_batch_dim = (i == (iter_num - 1)) ? (word_num - skip_num * i) : skip_num; + float* wx_iter = wx_ptr + i * skip_num * 4 * _inner_hidden_dim; + + if (i >= 1) { + float* hidden_in = output_ptr + (i - 1) * skip_num * _output_hidden_dim; + utils::ScaleUtils::scale_fp32_int8(_inner_h_int8, hidden_in, run_batch_dim * _output_hidden_dim); + float scale_x = _inner_h_int8.get_scale()[0]; + std::vector scale_weights_h = _int8_weights_wh.get_scale(); + CHECK_EQ(scale_weights_h.size(), 4 * _inner_hidden_dim); + + for (int i = 0; i < 4 * _inner_hidden_dim; i++) { + _inner_scale_wh[i] = scale_x * scale_weights_h[i]; + } + + _wh_gemm_me.dispatch(run_batch_dim, 4 * _inner_hidden_dim, _output_hidden_dim, _inner_h_int8, + _inner_wh_int32); + + utils::ScaleUtils::cvt_int32_fp32(static_cast(_inner_wh_int32.data()), _inner_scale_wh, + run_batch_dim, + 4 * _inner_hidden_dim); + float* wh_fp32 = static_cast(_inner_wh_int32.data()); + + for (int i = 0; i < run_batch_dim * 4 * _inner_hidden_dim; i++) { + wx_iter[i] += wh_fp32[i]; + } + + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, + temp_hidden_out, temp_cell_out, weights_bias_i_ptr, weights_bias_f_ptr, weights_bias_c_ptr, + weights_bias_o_ptr, _inner_hidden_dim); + + } else { + cal_lstm_batch(run_batch_dim, wx_iter, weights_peephole_ptr, + temp_hidden_out, temp_cell_out, weights_bias_i_ptr, weights_bias_f_ptr, weights_bias_c_ptr, + weights_bias_o_ptr, _inner_hidden_dim); + } + + float* hidden_out = output_ptr + i * skip_num * _output_hidden_dim; + + utils::ScaleUtils::scale_fp32_int8(_inner_project_scale, temp_hidden_out, + run_batch_dim * _inner_hidden_dim); + float scale_x = _inner_project_scale.get_scale()[0]; + std::vector scale_vec = _int8_weights_project.get_scale(); + + for (int i = 0; i < _output_hidden_dim; i++) { + _inner_scale_project[i] = scale_x * scale_vec[i]; + } + + Tensor temp_tensor(hidden_out, X86(), 0, Shape({1, 1, run_batch_dim, _output_hidden_dim}), + AK_INT32); + _project_gemm_me.dispatch(run_batch_dim, _output_hidden_dim, _inner_hidden_dim, + _inner_project_scale, temp_tensor); + utils::ScaleUtils::cvt_int32_fp32((int*)(hidden_out), _inner_scale_project, + run_batch_dim, + _output_hidden_dim); + + vsTanh(run_batch_dim * _output_hidden_dim, hidden_out, hidden_out); + } + + 
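// Reviewer note (illustrative sketch, not part of the original patch): the int8 fallback
// above follows the usual dynamic-quantization recipe — quantize the fp32 activations with
// one runtime scale, run the s8s8s32 GEMM, then dequantize the int32 accumulators with a
// per-output-channel scale. Assuming the scales are stored as in the local code
// (_inner_scale_wh[i] = scale_x * scale_weights_h[i]), cvt_int32_fp32 is equivalent to
//     fp32[row][col] = int32_acc[row][col] * (scale_x * scale_w[col]);
// e.g. with scale_x = 0.02 and scale_w[col] = 0.001 an accumulator of 5000 maps back to
// 5000 * 0.00002 = 0.1f. Treat this as an illustration of the math, not additional API.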
outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + return SaberSuccess; + } +} + + +DEFINE_OP_TEMPLATE(SaberLstmp, LstmParam, X86, AK_HALF); + +} +} diff --git a/saber/funcs/impl/x86/saber_lstmp.h b/saber/funcs/impl/x86/saber_lstmp.h new file mode 100644 index 000000000..0ee5302df --- /dev/null +++ b/saber/funcs/impl/x86/saber_lstmp.h @@ -0,0 +1,98 @@ +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTMP_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTMP_H +#include "saber/funcs/impl/impl_lstmp.h" +#include "saber_funcs_param.h" +#include "saber/funcs/impl/x86/x86_utils.h" +#include "saber/funcs/impl/x86/saber_lstm.h" +#include "saber/funcs/impl/x86/mkl_gemm.h" +#include "saber/funcs/impl/x86/intrinsic_packed_fc.h" + +#if defined(__AVX512F__) +#include +#define SABER_X86_TYPE __m512 +#elif defined(__AVX2__) and defined(__FMA__) +#include +#define SABER_X86_TYPE __m256 +#elif defined(__SSE4_2__) and defined(__FMA__) +#include +#define SABER_X86_TYPE __m128 +#else +#define SABER_X86_TYPE float +#endif + +//#define SABER_X86_TYPE __m128 + +namespace anakin { +namespace saber { + +template +class SaberLstmp : + public ImplBase < + X86, OpDtype, LstmParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + // typedef Tensor OpTensor; + SaberLstmp() {} + + ~SaberLstmp() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx); + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param, + Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + LstmParam& param); + +private: + LstmParam _lstm_param; + Tensor _lstm_weights; + Tensor _gemm_weights; + Tensor _inner_output; + Tensor _inner_gemm_output; + SaberLstm _saber_lstm; + std::vector*> _inner_ouput_tensor_vec; + int _output_hidden_size; + int _inner_hidden_size; + + MklDnnGemm _wx_gemm_fp32; + MklDnnGemm _wh_gemm_fp32; + MklDnnGemm _wp_gemm_fp32; + + Tensor _wx_tensor; + Tensor _temp_hidden_tensor; + Tensor _temp_cell_tensor; + int _output_hidden_dim; + int _inner_hidden_dim; + + Tensor _inner_x_int8; + Tensor _inner_h_int8; + Tensor _inner_wh_int32; + Tensor _inner_project_scale; + Tensor _int8_weights_wx; + Tensor _int8_weights_wh; + Tensor _int8_weights_project; + + std::vector _inner_scale_wx; + std::vector _inner_scale_wh; + std::vector _inner_scale_project; + + MklDnnGemm _wx_gemm; + MklDnnGemm _wh_gemm; + MklDnnGemm _wp_gemm; + + PackedFC _wx_gemm_me; + PackedFC _wh_gemm_me; + PackedFC _project_gemm_me; + +}; + +} +} +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_LSTM_H diff --git a/saber/funcs/impl/x86/saber_match_matrix.cpp b/saber/funcs/impl/x86/saber_match_matrix.cpp index 7bb27a965..a5eada5c1 100644 --- a/saber/funcs/impl/x86/saber_match_matrix.cpp +++ b/saber/funcs/impl/x86/saber_match_matrix.cpp @@ -60,7 +60,7 @@ void padding_out(const dtype* src, std::vector& offset_r, int dim_t, int le int tl = dim_t * len_l; for (int i = 0; i < seq_num; i++) { dtype* dst_tmp = dst + i * tl * max_len_r; - dtype* src_tmp = src + offset_r[i] * tl; + const dtype* src_tmp = src + offset_r[i] * tl; int cur_len = offset_r[i+1] - offset_r[i]; for (int j = 0; j < cur_len; j++) { for (int k = 0; k < tl; k++) { @@ -84,6 +84,7 @@ SaberStatus SaberMatchMatrix::dispatch( auto offset_r = inputs[1]->get_seq_offset()[0]; int len_l = offset_l[1] - offset_l[0]; int len_r = offset_r[offset_r.size() - 1]; + int batch = offset_l.size() - 1; const OpDataType* weight_data = (const OpDataType*) 
param.weight()->data(); const OpDataType* input_l = (const OpDataType*)inputs[0]->data(); const OpDataType* input_r = (const OpDataType*)inputs[1]->data(); @@ -92,13 +93,26 @@ SaberStatus SaberMatchMatrix::dispatch( OpDataType* output_tmp = (OpDataType*)_output_tmp.mutable_data(); OpDataType* output_data = (OpDataType*) outputs[0]->mutable_data(); _gemm_l_transform.init(true, true, dim_t * dim_in, len_l, dim_in, *(this->_ctx)); - _gemm_l_transform.dispatch(1.0f, 0.f, weight_data, input_l, input_l_transform); - for (int i = 0; i < dim_t; i++) { - int offset = i * dim_in * len_l; - transpose(input_l_transform + offset, dim_in, len_l, input_l_transform_reorganize + offset); + if (param.is_l_same) { + _gemm_l_transform.dispatch(1.0f, 0.f, weight_data, input_l, input_l_transform); + for (int i = 0; i < dim_t; i++) { + int offset = i * dim_in * len_l; + transpose(input_l_transform + offset, dim_in, len_l, input_l_transform_reorganize + offset); + } + _gemm_r_transform.init(false, true, len_r, dim_t*len_l, dim_in, *(this->_ctx)); + _gemm_r_transform.dispatch(1.0f, 0.f, input_r, input_l_transform_reorganize, output_tmp); + } else { + for (int i = 0; i < batch; i++) { + _gemm_l_transform.dispatch(1.0f, 0.f, weight_data, input_l + i * len_l * dim_in, input_l_transform); + for (int j = 0; j < dim_t; j++) { + int offset = j * dim_in * len_l; + transpose(input_l_transform + offset, dim_in, len_l, input_l_transform_reorganize + offset); + } + _gemm_r_transform.init(false, true, offset_r[i+1] - offset_r[i], dim_t * len_l, dim_in, *(this->_ctx)); + _gemm_r_transform.dispatch(1.0f, 0.f, input_r + offset_r[i] * dim_in, input_l_transform_reorganize, output_tmp + offset_r[i] * dim_t * len_l); + + } } - _gemm_r_transform.init(false, true, len_r, dim_t*len_l, dim_in, *(this->_ctx)); - _gemm_r_transform.dispatch(1.0f, 0.f, input_r, input_l_transform_reorganize, output_tmp); padding_out(output_tmp, offset_r, dim_t, len_l, output_data); outputs[0]->set_seq_offset(inputs[1]->get_seq_offset()); diff --git a/saber/funcs/impl/x86/saber_mean.cpp b/saber/funcs/impl/x86/saber_mean.cpp new file mode 100644 index 000000000..11b6094c4 --- /dev/null +++ b/saber/funcs/impl/x86/saber_mean.cpp @@ -0,0 +1,31 @@ +#include "saber/funcs/impl/x86/saber_mean.h" + +namespace anakin { +namespace saber { + +template +SaberStatus SaberMean::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + MeanParam& param) { + + const OpDataType* input_ptr = (const OpDataType*)inputs[0]->data(); + OpDataType* output_ptr = (OpDataType*)outputs[0]->mutable_data(); + int n = inputs[0]->valid_size(); + OpDataType s = (OpDataType)0.0; + +# pragma omp parallel for reduction(+:s) + for (int i = 0; i < n; i++) { + s += input_ptr[i]; + } + s /= n; + output_ptr[0] = s; + + return SaberSuccess; +} + +template class SaberMean; +DEFINE_OP_TEMPLATE(SaberMean, MeanParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberMean, MeanParam, X86, AK_INT8); + +} // namespace saber. +} // namespace anakin. \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_mean.h b/saber/funcs/impl/x86/saber_mean.h new file mode 100644 index 000000000..f94b4975b --- /dev/null +++ b/saber/funcs/impl/x86/saber_mean.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_MEAN_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_MEAN_H + +#include "saber/funcs/impl/impl_mean.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberMean : + public ImplBase< + X86, OpDtype, + MeanParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberMean() {} + ~SaberMean() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + MeanParam& param, Context& ctx) { + + this->_ctx = &ctx; + create(inputs, outputs, param, ctx); + + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + MeanParam& param, Context &ctx) { + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + MeanParam& param); + +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_MATCH_MATRIX_H diff --git a/saber/funcs/impl/x86/saber_normal_activation.h b/saber/funcs/impl/x86/saber_normal_activation.h index 9bcaf2b1f..ed4411bc5 100644 --- a/saber/funcs/impl/x86/saber_normal_activation.h +++ b/saber/funcs/impl/x86/saber_normal_activation.h @@ -2,10 +2,9 @@ #ifndef ANAKIN_SABER_NORMAL_ACTIVATION_H #define ANAKIN_SABER_NORMAL_ACTIVATION_H +#include "anakin_config.h" #include "saber_types.h" #include - - #include "saber_avx512_math.h" #include "saber_avx2_math.h" #include "saber_sse_math.h" @@ -17,7 +16,7 @@ namespace saber { template inline Dtype InValidAct(Dtype a) { - CHECK_EQ(0, 1) << "InValidAct"; + return 0; } template @@ -42,8 +41,14 @@ inline Dtype Identity(const Dtype a) { return a; } + #if defined(__SSE4_2__) and defined(__FMA__) +template<> +inline __m128 InValidAct<__m128>(const __m128 a) { + return _mm_set1_ps(0.0f); +} + template<> inline __m128 Relu<__m128>(const __m128 a) { @@ -80,6 +85,10 @@ inline __m128 Tanh<__m128>(const __m128 a) { #if defined(__AVX2__) and defined(__FMA__) +template<> +inline __m256 InValidAct<__m256>(const __m256 a) { + return _mm256_set1_ps(0.0f); +} template<> inline __m256 Relu<__m256>(const __m256 a) { @@ -112,6 +121,10 @@ inline __m256 Tanh<__m256>(const __m256 a) { #if defined(__AVX512F__) +template<> +inline __m512 InValidAct<__m512>(const __m512 a) { + return _mm512_set1_ps(0.0f); +} template<> inline __m512 Relu<__m512>(const __m512 a) { diff --git a/saber/funcs/impl/x86/saber_normalize.cpp b/saber/funcs/impl/x86/saber_normalize.cpp index 1fbb08280..865a8b162 100644 --- a/saber/funcs/impl/x86/saber_normalize.cpp +++ b/saber/funcs/impl/x86/saber_normalize.cpp @@ -4,6 +4,55 @@ namespace anakin{ namespace saber{ template class SaberNormalize; + +template +void group_normlize(const dtype* in_data, const dtype* scale, const dtype* bias, + int n, int c, int h, int w, float eps, int group, + dtype* out_data, dtype* out_mean, dtype* out_var){ + int group_size = (c - 1) / group + 1; + int im_size = h * w; + for (int n_index = 0; n_index < n; ++n_index){ + for (int g_index = 0; g_index < group; ++g_index){ + dtype t_mean = 0; + dtype t_var = 0; + int real_channels = c - g_index * group_size >= group_size ? 
+ group_size : c - g_index * group_size; + int compute_size = im_size * real_channels; + for (int im_index = 0; im_index < compute_size; ++im_index){ + t_mean += in_data[im_index]; + t_var += in_data[im_index] * in_data[im_index]; + } + t_mean /= compute_size; + t_var /= compute_size; + t_var -= t_mean * t_mean; + dtype t_var_inv = 1 / sqrt(t_var + eps); + if (out_mean){ + out_mean[n * group + g_index] = t_mean; + } + if (out_var){ + out_var[n * group + g_index] = t_var; + } + + int scale_bias_start_index = g_index * group_size; + for (int c_index = 0; c_index < real_channels; ++c_index){ + int c_start = c_index * im_size; + for (int im_index = 0; im_index < im_size; ++im_index){ + dtype dest_val = (in_data[c_start + im_index] - t_mean) * t_var_inv; + if (scale){ + dest_val *= scale[scale_bias_start_index + c_index]; + } + if (bias){ + dest_val += bias[scale_bias_start_index + c_index]; + } + out_data[c_start + im_index] = dest_val; + } + + } + out_data += compute_size; + in_data += compute_size; + } + } +} template <> SaberStatus SaberNormalize::\ @@ -13,6 +62,7 @@ SaberStatus SaberNormalize::\ int p = param.p; bool across_spatial = param.across_spatial; bool has_scale = param.has_scale; + bool has_bias = param.has_bias; bool channel_shared = param.channel_shared; float eps = param.eps; int n = inputs[0]->num(); @@ -20,14 +70,37 @@ SaberStatus SaberNormalize::\ int h = inputs[0]->height(); int w = inputs[0]->width(); Tensor th_scale; + Tensor th_bias; const float* scale = nullptr; + const float* bias = nullptr; + float* out_mean = nullptr; + float* out_var = nullptr; if(has_scale){ th_scale.re_alloc(param.scale->shape(), AK_FLOAT); th_scale.copy_from(*param.scale); scale = static_cast(th_scale.data()); } + if (has_bias){ + th_bias.re_alloc(param.bias->shape(), AK_FLOAT); + th_bias.copy_from(*param.bias); + bias = static_cast(th_bias.data()); + } + const float* src_ptr = static_cast(inputs[0]->data()); float* dst_ptr = static_cast(outputs[0]->mutable_data()); + + if (param.group > 0){ + //group>1, do group normal + if (outputs.size() > 1){ + out_mean = static_cast(outputs[1]->mutable_data()); + } + if (outputs.size() > 2){ + out_var = static_cast(outputs[2]->mutable_data()); + } + group_normlize(src_ptr, scale, bias, n, c, h, w, eps, param.group, + dst_ptr, out_mean, out_var); + return SaberSuccess; + } if (across_spatial) { int compute_size = h * w * c; diff --git a/saber/funcs/impl/x86/saber_one_hot.cpp b/saber/funcs/impl/x86/saber_one_hot.cpp new file mode 100644 index 000000000..0d3b5075e --- /dev/null +++ b/saber/funcs/impl/x86/saber_one_hot.cpp @@ -0,0 +1,48 @@ + +#include "saber/funcs/impl/x86/saber_one_hot.h" + +namespace anakin { + +namespace saber { + +template <> +SaberStatus SaberOneHot::create( + const std::vector *>& inputs, + std::vector *>& outputs, + OneHotParam& param, Context& ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberOneHot::init( + const std::vector *>& inputs, + std::vector *>& outputs, + OneHotParam& param, Context& ctx) { + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberOneHot::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + OneHotParam& param) { + memset(outputs[0]->mutable_data(), 0, outputs[0]->valid_size() * outputs[0]->get_dtype_size()); + + int depth = param.depth; + const float* in_ptr = (const float*)inputs[0]->data(); + float* out_ptr = (float*)outputs[0]->mutable_data(); + int dims = inputs[0]->valid_size(); + for (int i = 0; i < dims; ++i) { + out_ptr[i * depth + 
(int)in_ptr[i]] = 1.0;
+    }
+    return SaberSuccess;
+}
+
+template class SaberOneHot<X86, AK_FLOAT>;
+DEFINE_OP_TEMPLATE(SaberOneHot, OneHotParam, X86, AK_HALF);
+DEFINE_OP_TEMPLATE(SaberOneHot, OneHotParam, X86, AK_INT8);
+
+}
+}
\ No newline at end of file
diff --git a/saber/funcs/impl/x86/saber_one_hot.h b/saber/funcs/impl/x86/saber_one_hot.h
new file mode 100644
index 000000000..ee44c907e
--- /dev/null
+++ b/saber/funcs/impl/x86/saber_one_hot.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ONE_HOT_H
+#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ONE_HOT_H
+
+#include "saber/funcs/impl/impl_one_hot.h"
+#include "saber/core/data_traits.h"
+
+namespace anakin {
+
+namespace saber {
+
+template <DataType OpDtype>
+class SaberOneHot: \
+    public ImplBase <
+    X86, OpDtype,
+    OneHotParam<X86>> {
+
+public:
+    typedef typename DataTrait<X86, OpDtype>::Dtype dtype;
+
+    SaberOneHot() = default;
+
+    ~SaberOneHot() = default;
+
+    SaberStatus init(const std::vector<Tensor<X86>*>& inputs,
+                     std::vector<Tensor<X86>*>& outputs,
+                     OneHotParam<X86>& param,
+                     Context<X86>& ctx) override;
+
+    SaberStatus create(const std::vector<Tensor<X86>*>& inputs,
+                       std::vector<Tensor<X86>*>& outputs,
+                       OneHotParam<X86>& param,
+                       Context<X86>& ctx) override;
+
+    SaberStatus dispatch(const std::vector<Tensor<X86>*>& inputs,
+                         std::vector<Tensor<X86>*>& outputs,
+                         OneHotParam<X86>& param) override;
+};
+
+} //namespace saber
+
+} //namespace anakin
+
+#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ONE_HOT_H
diff --git a/saber/funcs/impl/x86/saber_pad.cpp b/saber/funcs/impl/x86/saber_pad.cpp
new file mode 100644
index 000000000..edf766cab
--- /dev/null
+++ b/saber/funcs/impl/x86/saber_pad.cpp
@@ -0,0 +1,66 @@
+#include "saber/funcs/impl/x86/saber_pad.h"
+namespace anakin {
+
+namespace saber {
+
+template <DataType OpDtype>
+SaberStatus SaberPad<X86, OpDtype>::dispatch(\
+    const std::vector<Tensor<X86>*>& inputs, \
+    std::vector<Tensor<X86>*>& outputs, \
+    PadParam<X86>& param) {
+
+    const dtype* in_data = static_cast<const dtype*>(inputs[0]->data());
+    dtype* out_data = static_cast<dtype*>(outputs[0]->mutable_data());
+    Shape out_shape = outputs[0]->valid_shape();
+    Shape in_shape = inputs[0]->valid_shape();
+    int out_n = out_shape.num();
+    int out_c = out_shape.channel();
+    int out_h = out_shape.height();
+    int out_w = out_shape.width();
+    int pad_h_top = param.pad_h[0];
+    int pad_h_bottom = param.pad_h[1];
+    int pad_w_left = param.pad_w[0];
+    int pad_w_right = param.pad_w[1];
+    int pad_c_0 = param.pad_c[0];
+    int pad_c_1 = param.pad_c[1];
+
+    int ceil_in_c = in_shape.channel();
+    int ceil_in_h = in_shape.height();
+    int ceil_in_w = in_shape.width();
+
+    for (size_t n_index = 0; n_index < out_n; n_index++) {
+        for (size_t c_index = 0; c_index < out_c; c_index++) {
+            int c_in_index = c_index - pad_c_0;
+            bool is_pad_c = c_in_index < 0 || c_in_index >= ceil_in_c;
+            for (size_t h_index = 0; h_index < out_h; h_index++) {
+                int h_in_index = h_index - pad_h_top;
+                bool is_pad_h = h_in_index < 0 || h_in_index >= ceil_in_h;
+                for (size_t w_index = 0; w_index < out_w; w_index++) {
+                    int w_in_index = w_index - pad_w_left;
+                    bool is_pad_w = 
w_in_index < 0 || w_in_index >= ceil_in_w; + bool is_pad = is_pad_c||is_pad_h||is_pad_w; + int in_index = n_index * _in_n_stride + c_in_index * _in_c_stride + h_in_index * _in_h_stride + + w_in_index * _in_w_stride; + int out_index = n_index * _out_n_stride + c_index * _out_c_stride + h_index * _out_h_stride + + w_index * _out_w_stride; +// LOG(INFO)<= ceil_in_c)<<","<<(h_in_index < 0 || h_in_index >= ceil_in_h)<<","<<(w_in_index < 0 || w_in_index >= ceil_in_w); + if (is_pad) { + out_data[out_index] = 0; + } else { + out_data[out_index] = in_data[in_index]; + } + } + } + } + } + + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(SaberPad, PadParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberPad, PadParam, X86, AK_INT8); + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_pad.h b/saber/funcs/impl/x86/saber_pad.h new file mode 100644 index 000000000..993120b0e --- /dev/null +++ b/saber/funcs/impl/x86/saber_pad.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PAD_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PAD_H + +#include "saber/funcs/impl/impl_pad.h" +#include "saber/core/data_traits.h" + +namespace anakin { + +namespace saber { + +template +class SaberPad: \ + public ImplBase < + X86, OpDtype, + PadParam> { + +public: + typedef typename DataTrait :: Dtype dtype; + + SaberPad() {} + + ~SaberPad() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PadParam& param, + Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PadParam& param, + Context& ctx) { + CHECK_EQ(2, param.pad_c.size()); + CHECK_EQ(2, param.pad_h.size()); + CHECK_EQ(2, param.pad_w.size()); + Shape out_stride = outputs[0]->get_stride(); + Shape in_stride = inputs[0]->get_stride(); + int in_n_index = inputs[0]->num_index(); + int in_c_index = inputs[0]->channel_index(); + int in_h_index = inputs[0]->height_index(); + int in_w_index = inputs[0]->width_index(); + int out_n_index = outputs[0]->num_index(); + int out_c_index = outputs[0]->channel_index(); + int out_h_index = outputs[0]->height_index(); + int out_w_index = outputs[0]->width_index(); + _out_n_stride = out_stride[out_n_index]; + _out_c_stride = out_stride[out_c_index]; + _out_h_stride = out_stride[out_h_index]; + _out_w_stride = out_stride[out_w_index]; + _in_n_stride = in_stride[in_n_index]; + _in_c_stride = in_stride[in_c_index]; + _in_h_stride = in_stride[in_h_index]; + _in_w_stride = in_stride[in_w_index]; + + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, PadParam& param); +private: + + int _in_n_stride; + int _in_c_stride; + int _in_h_stride; + int _in_w_stride; + int _out_n_stride; + int _out_c_stride; + int _out_h_stride; + int _out_w_stride; +}; + +template class SaberPad; + +} //namespace saber + +} 
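// Reviewer note (illustrative, not part of the original patch): SaberPad::create() caches
// NCHW strides so dispatch() can map every output coordinate back to the input with
//     in_index = n * _in_n_stride + (c - pad_c[0]) * _in_c_stride
//              + (h - pad_h[0]) * _in_h_stride + (w - pad_w[0]) * _in_w_stride,
// writing zero whenever the shifted coordinate falls outside the input extent. For example,
// padding a 1x1x2x2 input with pad_h = {1,1}, pad_w = {1,1}, pad_c = {0,0} yields a 1x1x4x4
// output whose border is zero and whose interior positions (h,w) in [1,2]x[1,2] copy
// input (h-1, w-1).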
//namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PAD_H diff --git a/saber/funcs/impl/x86/saber_pixel_shuffle.cpp b/saber/funcs/impl/x86/saber_pixel_shuffle.cpp new file mode 100644 index 000000000..fa698ed04 --- /dev/null +++ b/saber/funcs/impl/x86/saber_pixel_shuffle.cpp @@ -0,0 +1,54 @@ +#include "saber/funcs/impl/x86/saber_pixel_shuffle.h" + +namespace anakin{ +namespace saber{ +template class SaberPixelShuffle; + +template <> +SaberStatus SaberPixelShuffle::\ +dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m){ + + const float* src_ptr = static_cast(inputs[0]->data()); + float* dst_ptr = static_cast(outputs[0]->mutable_data()); + + int out_size = outputs[0]->valid_size(); + + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()){ + for (int j = 0; j < out_size; ++j){ + int in_idx = 0; + int id = j; + for (int i = 0; i < _num_axes; ++i) { + int order = _order[i]; + int new_step = _out_steps[i]; + int old_step = _in_steps[order]; + int offset = (id / new_step) * old_step; + in_idx += offset; + id %= new_step; + } + dst_ptr[j] = src_ptr[in_idx]; + } + } else { + for (int j=0; j= 0; --i) { + int order = _order[i]; + int new_step = _out_steps[i]; + int old_step = _in_steps[order]; + int id = (j / new_valid_stride) % _out_new_sh[i]; + in_idx += id * old_step; + out_idx += id * new_step; + new_valid_stride *= _out_new_sh[i]; + } + dst_ptr[out_idx] = src_ptr[in_idx]; + } + } + return SaberSuccess; +} + + +} +} diff --git a/saber/funcs/impl/x86/saber_pixel_shuffle.h b/saber/funcs/impl/x86/saber_pixel_shuffle.h new file mode 100644 index 000000000..5ec84b8a0 --- /dev/null +++ b/saber/funcs/impl/x86/saber_pixel_shuffle.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PIXEL_SHUFFLE_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PIXEL_SHUFFLE_H + +#include "saber/funcs/impl/impl_pixel_shuffle.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberPixelShuffle:\ + public ImplBase< + X86, + OpDtype, + PixelShuffleParam> { + +public: + + SaberPixelShuffle() {} + ~SaberPixelShuffle() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m, + Context &ctx){ + return create(inputs, outputs, param, ctx); + } + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m, + Context &ctx){ + this -> _ctx = &ctx; + + _num_axes = inputs[0]->valid_shape().size() + 2; + Shape in_sh = inputs[0]->valid_shape(); + int new_c = in_sh.channel()/(param.rw * param.rh); + Shape in_new_sh; + Shape out_new_sh; + in_new_sh.push_back(in_sh.num()); + out_new_sh.push_back(in_sh.num()); + if (param.channel_first){ + in_new_sh.push_back(new_c); + in_new_sh.push_back(param.rh); + in_new_sh.push_back(param.rw); + in_new_sh.push_back(in_sh.height()); + in_new_sh.push_back(in_sh.width()); + _order = std::vector({0, 1, 4, 2, 5, 3}); + out_new_sh.push_back(new_c); + out_new_sh.push_back(in_sh.height()); + out_new_sh.push_back(param.rh); + out_new_sh.push_back(in_sh.width()); + out_new_sh.push_back(param.rw); + + + } else { + in_new_sh.push_back(in_sh.height()); + in_new_sh.push_back(in_sh.width()); + in_new_sh.push_back(param.rh); + in_new_sh.push_back(param.rw); + in_new_sh.push_back(new_c); + _order = std::vector({0, 1, 3, 2, 4, 5}); + out_new_sh.push_back(in_sh.height()); + out_new_sh.push_back(param.rh); + out_new_sh.push_back(in_sh.width()); + out_new_sh.push_back(param.rw); + out_new_sh.push_back(new_c); + } + _in_steps = in_new_sh.get_stride(); + _out_steps = out_new_sh.get_stride(); + + + return SaberSuccess; + } + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PixelShuffleParam ¶m); + +private: + int _num_axes; + std::vector _order; + Shape _in_steps; + Shape _out_steps; + Shape _out_new_sh; +}; + +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PixelShuffle_H diff --git a/saber/funcs/impl/x86/saber_pooling.cpp b/saber/funcs/impl/x86/saber_pooling.cpp index edcb541d7..bd4ee656f 100644 --- a/saber/funcs/impl/x86/saber_pooling.cpp +++ b/saber/funcs/impl/x86/saber_pooling.cpp @@ -1,107 +1,291 @@ #include "saber/funcs/impl/x86/saber_pooling.h" #include "saber/funcs/impl/x86/kernel/jit_uni_pool_kernel_f32.h" - +#include "debug.h" namespace anakin { namespace saber { using namespace jit; template <> -SaberStatus SaberPooling::init_conf( - jit_pool_conf_t& jpp, const std::vector*>& inputs, - std::vector*>& outputs, - PoolingParam& param) { - //**/this function only use for avx512 - using namespace utils; +SaberStatus SaberPooling::create(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam& param, + Context& ctx) { Shape src_shape(inputs[0]->shape()); Shape dst_shape(outputs[0]->shape()); - const int simd_w = 16; - const int ndims = 4; + LayoutType in_laytype = inputs[0]->get_layout(); + bool layout_c16 = (in_laytype == Layout_NCHW_C16R || in_laytype == Layout_NCHW_C16); + + bool layout_c8 = (in_laytype == Layout_NCHW_C8R || in_laytype == Layout_NCHW_C8); + + if (!utils::one_of(param.pooling_type, + Pooling_max, + Pooling_average_include_padding, + Pooling_average_exclude_padding)) { + LOG(FATAL) << "not support " << param.pooling_type; 
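// Reviewer note on the create() code that follows (descriptive only, not part of the
// original patch): the block below fills jit_pool_conf_t for the blocked layouts. For
// Layout_NCHW_C8R / Layout_NCHW_C16R the channel count is rounded up to the block size,
// e.g. a 20-channel tensor in NCHW_C8R is treated as jpp.c = round_up(20, 8) = 24, and the
// AVX2 (C8) or AVX-512 (C16) jit pooling kernel is instantiated accordingly; plain NCHW
// leaves _kernel unset and falls through to the scalar OpenMP loop in dispatch().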
+ return SaberUnImplError; + } + jit_pool_conf_t jpp; + jpp.src_fmt = inputs[0]->get_layout(); + const int ndims = 4; jpp.ndims = ndims; jpp.mb = src_shape[0]; - jpp.c = src_shape[1] * 16; + jpp.c = inputs[0]->channel(); + + if (in_laytype == Layout_NCHW_C8R || in_laytype == Layout_NCHW_C16R) { + jpp.c = utils::round_up(src_shape.channel(), inputs[0]->valid_shape().get_layout_aligned_length()); + } + jpp.id = (ndims == 5) ? src_shape[2] : 1; jpp.ih = src_shape[ndims - 2]; jpp.iw = src_shape[ndims - 1]; jpp.od = (ndims == 5) ? dst_shape[2] : 1; jpp.oh = dst_shape[ndims - 2]; jpp.ow = dst_shape[ndims - 1]; - jpp.stride_d = 1; jpp.stride_h = param.stride_h; jpp.stride_w = param.stride_w; jpp.kd = 1; jpp.kh = param.window_h; jpp.kw = param.window_w; - jpp.f_pad = 0; jpp.t_pad = param.pad_h; jpp.l_pad = param.pad_w; - jpp.alg = param.pooling_type; - jpp.ind_dt = AK_FLOAT; - jpp.simple_alg = false; + if (_kernel != nullptr) { + delete _kernel; + } - jpp.c_block = simd_w; + if (layout_c16) { + CHECK(mayiuse(avx512_common)) << "jit pooling init failed"; + CHECK(jit_pool_kernel_f32::init_conf(jpp)) << "jit pooling init failed"; + _kernel = new jit_pool_kernel_f32(jpp); + } else if (layout_c8) { + CHECK(mayiuse(avx2)) << "jit pooling init failed"; + CHECK(jit_pool_kernel_f32::init_conf(jpp)) << "jit pooling init failed"; + _kernel = new jit_pool_kernel_f32(jpp); + } - jpp.nb_c = jpp.c / jpp.c_block; + return SaberSuccess; +} - if (jpp.alg == Pooling_max) { - jpp.ur_w = 16; - } else { - jpp.ur_w = 24; - } +template <> +SaberStatus SaberPooling::init( + const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam& param, Context& ctx) { - if (jpp.ow < jpp.ur_w) { - jpp.ur_w = jpp.ow; - } + this->_ctx = &ctx; - if (jpp.l_pad > jpp.ur_w) { - return SaberUnImplError; - } + return create(inputs, outputs, param, ctx); +} - jpp.ur_w_tail = jpp.ow % jpp.ur_w; +void pooling_avx2_nchwc8(const float* src, float* dst, int in_n, int in_c, int in_h, int in_w, + int out_h, + int out_w, int stride_h, int stride_w, int window_h, int window_w, int pad_h, int pad_w, + PoolingType pooling_type) { + int size_in_n = in_c * in_h * in_w * 8; + int size_in_c = in_h * in_w * 8; + int size_out_n = in_c * out_h * out_w * 8; + int size_out_c = out_h * out_w * 8; - if (jit_uni_pool_kernel_f32::init_conf(jpp)) { - return SaberSuccess; - } else { - return SaberUnImplError; + for (int ind_n = 0; ind_n < in_n; ++ind_n) { + for (int ind_c = 0; ind_c < in_c; ++ind_c) { + for (int ind_h = 0; ind_h < out_h; ++ind_h) { + int sh = ind_h * stride_h; + int eh = sh + window_h; + + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; + + + for (int ind_w = 0; ind_w < out_w; ++ind_w) { + int sw = ind_w * stride_w; + int ew = sw + window_w; + + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + + float result[8] = {0.f}; + + int dst_ind = ind_n * size_out_n + ind_c * size_out_c + ind_h * out_w * 8 + ind_w * 8; + + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + int src_ind = + ind_n * size_in_n + ind_c * size_in_c + kh * in_w * 8 + kw * 8 + inner_c_id; + + if (kh == sh && kw == sw) { + result[inner_c_id] = src[src_ind]; + } else { + if (pooling_type == Pooling_max) { + result[inner_c_id] = + result[inner_c_id] >= src[src_ind] ? result[inner_c_id] : src[src_ind]; + // LOG(INFO)<<"find it "<= in_w + pad_w ? 
in_w + pad_w : sw + window_w; + bw -= sw; + } + + if (eh == in_h) { + bh = sh + window_h >= in_h + pad_h ? in_h + pad_h : sh + window_h; + bh -= sh; + } + + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + result[inner_c_id] /= bh * bw; + } + } + + if (pooling_type == Pooling_average_exclude_padding) { + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + result[inner_c_id] /= (ew - sw) * (eh - sh); + } + } + + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + + dst[dst_ind + inner_c_id] = result[inner_c_id]; + // LOG(INFO)<<"finnal it "< -SaberStatus SaberPooling::create( - const std::vector*>& inputs, - std::vector*>& outputs, - PoolingParam& param, - Context& ctx) { - if (mayiuse(avx512_common)) { - jit_pool_conf_t jpp_; +void pooling_avx2_nchwc8_nchw(const float* src, float* dst, int in_n, int in_c, int in_h, int in_w, + int out_h, + int out_w, int stride_h, int stride_w, int window_h, int window_w, int pad_h, int pad_w, + PoolingType pooling_type, int real_c) { + int size_in_n = in_c * in_h * in_w * 8; + int size_in_c = in_h * in_w * 8; + int size_out_n = in_c * out_h * out_w * 8; + int size_out_c = out_h * out_w * 8; + int size_out_real_n = real_c * out_h * out_w; + int size_out_real_c = out_h * out_w; + #pragma omp parallel for collapse(3) schedule(static) - if (init_conf(jpp_, inputs, outputs, param) != SaberSuccess) { - return SaberUnImplError; - } + for (int ind_n = 0; ind_n < in_n; ++ind_n) { + for (int ind_c = 0; ind_c < in_c; ++ind_c) { + for (int ind_h = 0; ind_h < out_h; ++ind_h) { + int sh = ind_h * stride_h; + int eh = sh + window_h; - _kernel = new jit_uni_pool_kernel_f32(jpp_); - } else {} + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; - return SaberSuccess; -} -template <> -SaberStatus SaberPooling::init( - const std::vector*>& inputs, - std::vector*>& outputs, - PoolingParam& param, Context& ctx) { + for (int ind_w = 0; ind_w < out_w; ++ind_w) { + int sw = ind_w * stride_w; + int ew = sw + window_w; - this->_ctx = &ctx; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + + float result[8] = {0.f}; + + + + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + int src_ind = + ind_n * size_in_n + ind_c * size_in_c + kh * in_w * 8 + kw * 8 + inner_c_id; + + if (kh == sh && kw == sw) { + result[inner_c_id] = src[src_ind]; + } else { + if (pooling_type == Pooling_max) { + result[inner_c_id] = + result[inner_c_id] >= src[src_ind] ? result[inner_c_id] : src[src_ind]; + // LOG(INFO)<<"find it "<= in_w + pad_w ? in_w + pad_w : sw + window_w; + bw -= sw; + } + + if (eh == in_h) { + bh = sh + window_h >= in_h + pad_h ? 
in_h + pad_h : sh + window_h; + bh -= sh; + } + + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + result[inner_c_id] /= bh * bw; + } + } + + if (pooling_type == Pooling_average_exclude_padding) { + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + result[inner_c_id] /= (ew - sw) * (eh - sh); + } + } + + for (int inner_c_id = 0; inner_c_id < 8; inner_c_id++) { + int dst_ind = ind_n * size_out_real_n + (ind_c * 8 + inner_c_id) * size_out_real_c + ind_h * out_w + + ind_w; + dst[dst_ind] = result[inner_c_id]; + // LOG(INFO)<<"finnal it "< @@ -113,14 +297,18 @@ ::dispatch(const std::vector*>& inputs, const float* src = static_cast(inputs[0]->data()); float* dst = static_cast(outputs[0]->mutable_data()); - //if (mayiuse(avx512_common)) { - if (false) { - //avx512 use jit - const auto& jpp = _kernel->jpp; + DLOG(INFO) << "input layout " << inputs[0]->get_layout() << " , output layout " << + outputs[0]->get_layout(); + + if (_kernel != nullptr && (inputs[0]->get_layout() == Layout_NCHW_C8 + || inputs[0]->get_layout() == Layout_NCHW_C8R) && (outputs[0]->get_layout() == Layout_NCHW_C8 + || outputs[0]->get_layout() == Layout_NCHW_C8R)) { + const float* src = (const float*)inputs[0]->data(); + float* dst = (float*)outputs[0]->mutable_data(); + const auto& jpp = _kernel->jpp; auto ker = [&](int n, int b_c, int oh) { jit_pool_call_t arg; - const int ij = oh * jpp.stride_h; const int i_t_overflow = std::max(0, jpp.t_pad - ij); const int i_b_overflow = std::max(jpp.ih, ij + jpp.kh - jpp.t_pad) - jpp.ih; @@ -153,6 +341,42 @@ ::dispatch(const std::vector*>& inputs, } } } + } else if (inputs[0]->get_layout() == Layout_NCHW_C8 + || inputs[0]->get_layout() == Layout_NCHW_C8R) { + if (outputs[0]->get_layout() == Layout_NCHW_C8 || outputs[0]->get_layout() == Layout_NCHW_C8R) { + int in_n = inputs[0]->num(); + int in_c = inputs[0]->channel() / 8; + + if (inputs[0]->get_layout() == Layout_NCHW_C8R) { + in_c = utils::div_up(inputs[0]->channel(), 8); + // LOG(INFO)<<"input inputs[0]->channel() c= "<channel()<<","<height(); + int in_w = inputs[0]->width(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + + pooling_avx2_nchwc8(src, dst, in_n, in_c, in_h, in_w, out_h, out_w, + param.stride_h, param.stride_w, param.window_h, param.window_w, param.pad_h, param.pad_w, + param.pooling_type); + // write_tensorfile(*inputs[0],"input_pooling"); + // write_tensorfile(*outputs[0],"output_pooling"); + // exit(0); + } else { + // DLOG(FATAL)<<"pooling nchw_c8 to nchw_c8r"; + int in_n = inputs[0]->num(); + int in_c = utils::div_up(inputs[0]->channel(), 8); + int real_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + pooling_avx2_nchwc8_nchw(src, dst, in_n, in_c, in_h, in_w, out_h, out_w, + param.stride_h, param.stride_w, param.window_h, param.window_w, param.pad_h, param.pad_w, + param.pooling_type, real_c); + DLOG(INFO) << "pooling nchw_c8 to nchw_c8r"; + } } else { //x86 common code int in_n = inputs[0]->num(); @@ -166,6 +390,7 @@ ::dispatch(const std::vector*>& inputs, int out_w = outputs[0]->width(); int size_out_n = in_c * out_h * out_w; int size_out_c = out_h * out_w; + #pragma omp parallel for collapse(3) schedule(static) for (int ind_n = 0; ind_n < in_n; ++ind_n) { for (int ind_c = 0; ind_c < in_c; ++ind_c) { @@ -185,7 +410,7 @@ ::dispatch(const std::vector*>& inputs, ew = (ew - param.pad_w) > in_w ? 
in_w : ew - param.pad_w; - float result; + float result = static_cast(0); int dst_ind = ind_n * size_out_n + ind_c * size_out_c + ind_h * out_w + ind_w; @@ -213,19 +438,20 @@ ::dispatch(const std::vector*>& inputs, } if (param.pooling_type == Pooling_average_include_padding) { - + int bh = param.window_h; int bw = param.window_w; - if (ew == in_w) - { + + if (ew == in_w) { bw = sw + param.window_w >= in_w + param.pad_w ? in_w + param.pad_w : sw + param.window_w; - bw -=sw; + bw -= sw; } - if (eh == in_h) - { - bh = sh + param.window_h >= in_h + param.pad_h ? in_h + param.pad_h: sh + param.window_h; + + if (eh == in_h) { + bh = sh + param.window_h >= in_h + param.pad_h ? in_h + param.pad_h : sh + param.window_h; bh -= sh; } + result /= bh * bw; } @@ -246,8 +472,36 @@ ::dispatch(const std::vector*>& inputs, return SaberSuccess; } + + +template <> +SaberStatus SaberPooling::create(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam& param, + Context& ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberPooling::init(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam& param, Context& ctx) { + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberPooling::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PoolingParam& param) { + + return SaberSuccess; +} + template class SaberPooling; +template class SaberPooling; DEFINE_OP_TEMPLATE(SaberPooling, PoolingParam, X86, AK_HALF); -DEFINE_OP_TEMPLATE(SaberPooling, PoolingParam, X86, AK_INT8); + } } // namespace anakin \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_pooling.h b/saber/funcs/impl/x86/saber_pooling.h index 94393e9b0..0a05d8439 100644 --- a/saber/funcs/impl/x86/saber_pooling.h +++ b/saber/funcs/impl/x86/saber_pooling.h @@ -59,13 +59,9 @@ class SaberPooling : public ImplBase< virtual SaberStatus dispatch(const std::vector& inputs, std::vector& outputs, PoolingParam ¶m) override; - - virtual SaberStatus init_conf(jit_pool_conf_t &jpp, - const std::vector& inputs, - std::vector& outputs, - PoolingParam& param); private: - jit_uni_pool_kernel_f32* _kernel; + jit_uni_pool_kernel_f32* _kernel; + Tensor_input_scale; }; diff --git a/saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.cpp b/saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.cpp new file mode 100644 index 000000000..34548e11a --- /dev/null +++ b/saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.cpp @@ -0,0 +1,245 @@ +#include "anakin_thread.h" +#include "saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.h" +#include "mkl.h" +#if defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif +#include + +namespace anakin{ +namespace saber { +bool decode_4d12b( const unsigned char *in, + unsigned int ilen, + unsigned int *out, + unsigned int olen) { + if (ilen % 3 != 0) { + LOG(INFO) << "error, ilen mod 3 != 0"; + return false; + } + if (ilen * 2 != olen * 3) { + LOG(INFO) << "error, ilen * 2 != olen * 3"; + return false; + } + memset(out, 0, olen * sizeof(unsigned int)); + for (unsigned int i = 0; i < ilen / 3; i++) { + unsigned char *raw_ptr = (unsigned char *)(out + i * 2); + auto tmp_in = in + 3 * i; + raw_ptr[0] = tmp_in[0]; + raw_ptr[1] = tmp_in[1] & 0x0f; + raw_ptr[4] = tmp_in[2]; + raw_ptr[5] = tmp_in[1] >> 4; + } + return true; +} + +void get_cur_idx(int word_idx, const int* word_offset, const int* real_offset, int offset_len, int* real_idx, int* case_idx) { + CHECK_EQ(offset_len, 9); 
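    // Editorial note: word_offset[0..8] partitions the global word-id space into
    // nine consecutive buckets -- three n-gram kinds (unigram, bigram, collocation),
    // each split into three quantization levels (top/sec/thd).  The nested
    // comparisons below are an unrolled binary search over those nine boundaries:
    // `index` is the bucket, `case_idx = index % 3` picks the quantization scheme
    // (0: log quant, 1: 2d8b, 2: 4d12b product quant), and `real_idx` rebases
    // word_idx into that scheme's own table through real_offset.
    // Illustrative example with made-up counts: if word_offset = {10, 20, 30, ...},
    // then word_idx = 25 lands in bucket 2 (thd unigram), so case_idx = 2 and
    // real_idx = 25 - word_offset[1] + real_offset[2] = 5.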
+ int index = 0; + if (word_idx < word_offset[4]) { + if (word_idx < word_offset[2]) { + if (word_idx < word_offset[1]) { + if (word_idx < word_offset[0]) { + index = 0; + } else { + index = 1; + } + } else { + index = 2; + } + } else { + if (word_idx < word_offset[3]) { + index = 3; + } else { + index = 4; + } + } + } else { + if (word_idx < word_offset[6]) { + if (word_idx < word_offset[5]) { + index = 5; + } else { + index = 6; + } + } else { + if (word_idx < word_offset[7]) { + index = 7; + } else { + index = 8; + } + } + } + *case_idx = index % 3; + if (index > 0) { + *real_idx = word_idx - word_offset[index - 1] + real_offset[index]; + } else { + *real_idx = word_idx; + } +} + +template +SaberStatus SaberProductQuantEmbeddingWithVsum::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ProductQuantEmbeddingWithVsumParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + _voc_size = param.word_voc; + _emb_size = param.word_emb; + _max_seq_len = param.max_seq_len; + + _unigram_num[0] = param.top_unigram; + _unigram_num[1] = param.sec_unigram; + _unigram_num[2] = param.thd_unigram; + + _bigram_num[0] = param.top_bigram; + _bigram_num[1] = param.sec_bigram; + _bigram_num[2] = param.thd_bigram; + + _collocation_num[0] = param.top_collocation; + _collocation_num[1] = param.sec_collocation; + _collocation_num[2] = param.thd_collocation; + int _level_num = 3; + for (unsigned int i = 0; i < _level_num; i++) { + _word_num[i] = _unigram_num[i] + _bigram_num[i] + _collocation_num[i]; + _quant_dict[i] = NULL; + } + + _chnl_num[0] = 1; // log quant + _chnl_num[1] = _emb_size / 2; // 2d8b product quant + _chnl_num[2] = _emb_size / 4; // 4d12b product quant + + _word_len[0] = _emb_size; + _word_len[1] = _chnl_num[1]; + _word_len[2] = _chnl_num[2] / 2 * 3; + + _dict_size[0] = 256; + _dict_size[1] = 2 * 256; + _dict_size[2] = 4 * 4096; + _word_offset[0] = _unigram_num[0]; + _word_offset[1] = _word_offset[0] + _unigram_num[1]; + _word_offset[2] = _word_offset[1] + _unigram_num[2]; + + _word_offset[3] = _word_offset[2] + _bigram_num[0]; + _word_offset[4] = _word_offset[3] + _bigram_num[1]; + _word_offset[5] = _word_offset[4] + _bigram_num[2]; + + _word_offset[6] = _word_offset[5] + _collocation_num[0]; + _word_offset[7] = _word_offset[6] + _collocation_num[1]; + _word_offset[8] = _word_offset[7] + _collocation_num[2]; + + _real_offset[0] = 0; + _real_offset[1] = 0; + _real_offset[2] = 0; + + _real_offset[3] = _unigram_num[0]; + _real_offset[4] = _unigram_num[1]; + _real_offset[5] = _unigram_num[2]; + + _real_offset[6] = _unigram_num[0] + _bigram_num[0]; + _real_offset[7] = _unigram_num[1] + _bigram_num[1]; + _real_offset[8] = _unigram_num[2] + _bigram_num[2]; + + _buf = new unsigned int[anakin_get_num_procs() * _chnl_num[2]]; + + _weights[0] = (const unsigned char*)param.embedding_0->data(); + _weights[1] = (const unsigned char*)param.embedding_1->data(); + _weights[2] = (const unsigned char*)param.embedding_2->data(); + + _quant_dict[0] = (const float*)param.quant_dict_0->data(); + _quant_dict[1] = (const float*)param.quant_dict_1->data(); + _quant_dict[2] = (const float*)param.quant_dict_2->data(); + + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberProductQuantEmbeddingWithVsum::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ProductQuantEmbeddingWithVsumParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberProductQuantEmbeddingWithVsum::dispatch( + const std::vector*>& inputs, + std::vector*>& 
outputs, + ProductQuantEmbeddingWithVsumParam ¶m) { + + auto offset = inputs[0]->get_seq_offset()[0]; + int seq_num = offset.size() - 1; + + outputs[0]->reshape(Shape({seq_num, _emb_size, 1, 1}, Layout_NCHW)); + + const OpDataType *input_data = (const OpDataType*)inputs[0]->data(); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + memset(output_data, 0, sizeof(OpDataType) * outputs[0]->valid_size()); + std::vector>> real_index; + real_index.resize(seq_num); + #pragma omp parallel for schedule(static) + for (int seq_id = 0; seq_id < seq_num; seq_id++) { + real_index[seq_id].resize(3); + int cur_len = offset[seq_id+1] - offset[seq_id]; + int len = _max_seq_len == -1 ? cur_len : std::min(cur_len, _max_seq_len); + for (int i = 0; i < len; i++) { + int word_idx = static_cast(input_data[offset[seq_id] + i]); + int real_idx = 0; + int case_idx = 0; + get_cur_idx(word_idx, _word_offset, _real_offset, 9, &real_idx, &case_idx); + real_index[seq_id][case_idx].push_back(real_idx); + } + } + #pragma omp parallel for schedule(static) + for (int seq_id = 0; seq_id < seq_num; seq_id++) { + auto tmp_buf = _buf + anakin_get_thread_num() * _chnl_num[2]; + auto tmp_out_data = output_data + seq_id * _emb_size; + + memset(tmp_out_data, 0, sizeof(OpDataType)*_emb_size); + //case 0: + for (int i = 0; i < real_index[seq_id][0].size(); i++) { + const unsigned char* word_pos = _weights[0] + real_index[seq_id][0][i] * _word_len[0]; + for (int j = 0; j < _word_len[0]; j++) { + tmp_out_data[j] += _quant_dict[0][word_pos[j]]; + } + } + //case 1: + for (int i = 0; i < real_index[seq_id][1].size(); i++) { + const unsigned char* word_pos = _weights[1] + real_index[seq_id][1][i] * _word_len[1]; + for (int j = 0; j < _chnl_num[1]; j++) { + const float * curr_dict = _quant_dict[1] + j * _dict_size[1] + word_pos[j] * 2; + auto tmp_out = tmp_out_data + j * 2; + tmp_out[0] += curr_dict[0]; + tmp_out[1] += curr_dict[1]; + } + } + //case 2: + for (int i = 0; i < real_index[seq_id][2].size(); i++) { + const unsigned char* word_pos = _weights[2] + real_index[seq_id][2][i] * _word_len[2]; + decode_4d12b(word_pos, _word_len[2], tmp_buf, _chnl_num[2]); + for (int j = 0; j < _chnl_num[2]; j++) { + const float * curr_dict = _quant_dict[2] + j * _dict_size[2] + tmp_buf[j] * 4; + auto tmp_out = tmp_out_data + j * 4; + tmp_out[0] += curr_dict[0]; + tmp_out[1] += curr_dict[1]; + tmp_out[2] += curr_dict[2]; + tmp_out[3] += curr_dict[3]; + } + } + } + + std::vector out_offset; + for (int i = 0; i < seq_num; i++) { + out_offset.push_back(i); + } + out_offset.push_back(seq_num); + outputs[0]->set_seq_offset(std::vector>{out_offset}); + return SaberSuccess; +} + +template class SaberProductQuantEmbeddingWithVsum; +DEFINE_OP_TEMPLATE(SaberProductQuantEmbeddingWithVsum, ProductQuantEmbeddingWithVsumParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberProductQuantEmbeddingWithVsum, ProductQuantEmbeddingWithVsumParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.h b/saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.h new file mode 100644 index 000000000..d1052d84c --- /dev/null +++ b/saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H + +#include "saber/funcs/impl/impl_product_quant_embedding_with_vsum.h" + +namespace anakin { +namespace saber { + +template +class SaberProductQuantEmbeddingWithVsum : + public ImplBase< + X86, OpDtype, + ProductQuantEmbeddingWithVsumParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberProductQuantEmbeddingWithVsum() {} + + ~SaberProductQuantEmbeddingWithVsum() { + delete [] _buf; + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + ProductQuantEmbeddingWithVsumParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + ProductQuantEmbeddingWithVsumParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ProductQuantEmbeddingWithVsumParam ¶m) override; + +private: + int _voc_size; + int _emb_size; + int _max_seq_len; + int _unigram_num[3]; + int _bigram_num[3]; + int _collocation_num[3]; + int _chnl_num[3]; + int _word_len[3]; + int _word_num[3]; + int _dict_size[3]; + int _word_offset[9]; + int _real_offset[9]; + const unsigned char* _weights[3]; + const float* _quant_dict[3]; + + unsigned int* _buf; +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_ps_roi_pooling.cpp b/saber/funcs/impl/x86/saber_ps_roi_pooling.cpp new file mode 100644 index 000000000..7645784a8 --- /dev/null +++ b/saber/funcs/impl/x86/saber_ps_roi_pooling.cpp @@ -0,0 +1,289 @@ +#include "saber/funcs/impl/x86/saber_ps_roi_pooling.h" +#include +#include + +namespace anakin { + +namespace saber { + +/* + * crop rois and resize to [crop_height, crop_width] from in_data + * in_data shape: [pooled_h * pooled_w * c, im_h, im_w] + * rois shape: [num_rois, 4] + * out_data: [pooled_h * pooled_w * c, num_rois, crop_height, crop_width] + */ +template +void crop_and_resize_kernel( + const Dtype* in_data, + const Dtype* rois, + Dtype* out_data, + int num_rois, + int im_h, int im_w, + int crop_height, int crop_width, + int count, + int method, + float extra_value){ + + for (int index = 0;index < count; ++index){ + int temp_ind = index; + int cur_w = temp_ind % crop_width; + temp_ind /= crop_width; + int cur_h = temp_ind % crop_height; + temp_ind /= crop_height; + int cur_n = temp_ind % num_rois; + int cur_c = temp_ind / num_rois; + + const Dtype* rois_data = rois + cur_n * 4; + + float y1 = rois_data[0] * (im_h - 1); + float x1 = rois_data[1] * (im_w - 1); + float y2 = rois_data[2] * (im_h - 1); + float x2 = rois_data[3] * (im_w - 1); + + float height_scale = crop_height > 1 ? (y2 - y1)/(crop_height - 1) : 0; + float width_scale = crop_width > 1 ? (x2 - x1)/(crop_width - 1) : 0; + + float in_y = crop_height > 1 ? y1 + cur_h * height_scale : (y1 + y2)/2; + + if (in_y < 0 || in_y > im_h - 1){ + out_data[index] = extra_value; + continue; + } + + float in_x = crop_width > 1 ? 
x1 + cur_w * width_scale : (x1 + x2)/2; + if (in_x < 0 || in_x > im_w - 1){ + out_data[index] = extra_value; + continue; + } + + const Dtype* im_data = in_data + cur_c * im_h * im_w; + + //resize method 0 means bilinear + if (method == 0){ + int top_y = floor(in_y); + int bot_y = ceil(in_y); + float y_lerp = in_y - top_y; + + int left_x = floor(in_x); + int right_x = ceil(in_x); + float x_lerp = in_x - left_x; + + Dtype top_left = im_data[top_y*im_w + left_x]; + Dtype top_right = im_data[top_y*im_w + right_x]; + Dtype bot_left = im_data[bot_y*im_w + left_x]; + Dtype bot_right = im_data[bot_y*im_w + right_x]; + float top = top_left + (top_right - top_left) * y_lerp; + float bot = bot_left + (bot_right - bot_left) * y_lerp; + out_data[index] = top + (bot - top) * x_lerp; + } else { + //else method means nearest + int closest_x = round(in_x); + int closest_y = round(in_y); + out_data[index] = im_data[closest_y*im_w + closest_x]; + } + } + +} + +template +void crop_global_pooling_kernel(const Dtype* in_data, Dtype* out_data, + int pooled_size, int channel, int num_rois, int crop_height, int crop_width, + int count){ + for (int index = 0; index < count; ++index){ + int cur_n = index / channel; + int cur_c = index % channel; + int crop_size = crop_height * crop_width; + Dtype sum = 0; + for (int i = 0; i < crop_size; ++i){ + Dtype tmp_sum = 0; + for (int j = 0; j < pooled_size; ++j){ + tmp_sum += in_data[(j * num_rois + cur_n) * crop_size + i]; + } + sum += tmp_sum / pooled_size; + } + out_data[index] = sum /crop_size; + } +} + +template +void crop_no_global_pooling_kernel(const Dtype* in_data, Dtype* out_data, + int pooled_height, int pooled_width, int channel, int num_rois, int crop_height, int crop_width, + int count){ + for (int index = 0; index < count; ++index){ + int cur_pw = index % pooled_width; + index /= pooled_width; + int cur_cw = index % crop_width; + index /= crop_width; + int cur_ph = index % pooled_height; + index /= pooled_height; + int cur_ch = index % crop_height; + index /= crop_height; + int cur_c = index % channel; + int cur_n = index / channel; + + int in_index = ((((cur_ph * pooled_width + cur_pw) * channel + + cur_c) * num_rois + cur_n) * crop_height + cur_ch) * crop_width + cur_cw; + out_data[index] = in_data[in_index]; + } +} + + +//for tf, it has no batch_ind +template +void psroi_pool_no_batchind(const Dtype* in_data, const Dtype* rois, Dtype* out_data, + int in_n, int in_c, int in_h, int in_w, int o_c, int o_h, int o_w, + int pooled_h, int pooled_w, float spatial_scale, int count){ + + for (int index = 0; index < count; ++index){ + int temp_ind = index; + int cur_w = temp_ind % o_w; + temp_ind /= o_w; + int cur_h = temp_ind % o_h; + temp_ind /= o_h; + int cur_c = temp_ind % o_c; + int cur_n = temp_ind / o_c; + + const Dtype* rois_data = rois + cur_n * 4; + + int roi_x0 = fminf(fmaxf(rois_data[0] * spatial_scale, 0), in_w-1); + int roi_y0 = fminf(fmaxf(rois_data[1] * spatial_scale, 0), in_h-1); + int roi_x1 = fminf(fmaxf(rois_data[2] * spatial_scale, 0), in_w-1); + int roi_y1 = fminf(fmaxf(rois_data[3] * spatial_scale, 0), in_h-1); + + int roi_h = roi_y1 - roi_y0 + 1; + int roi_w = roi_x1 - roi_x0 + 1; + + Dtype bin_w = static_cast(roi_w) / pooled_w; + Dtype bin_h = static_cast(roi_h) / pooled_h; + + int ws = roi_x0 + bin_w * cur_w; + int we = ceil(roi_x0 + bin_w * (cur_w + 1)); + int ys = roi_y0 + bin_h * cur_h; + int ye = ceil(roi_y0 + bin_h * (cur_h + 1)); + + int c_index = (cur_h * pooled_w + cur_w) * o_c + cur_c; + + const Dtype* offset_in_data = in_data + 
c_index * in_w * in_h; + + Dtype sum = 0; + + for (int y = ys; y < ye; ++y){ + for (int w = ws; w < we; ++w){ + sum += offset_in_data[y * in_w + w]; + } + } + sum /= (ye - ys) * (we - ws); + + //tf is set to `hwc` format, here we set `chw` format + out_data[index] = sum; + + } + +} + +//for caffe, it has batchind +template +void psroi_pool_with_batchind(const Dtype* in_data, const Dtype* rois, Dtype* out_data, + int in_n, int in_c, int in_h, int in_w, int o_c, int o_h, int o_w, + int pooled_h, int pooled_w, float spatial_scale, int count){ + + for (int index = 0; index < count; ++index){ + int temp_ind = index; + int cur_w = temp_ind % o_w; + temp_ind /= o_w; + int cur_h = temp_ind % o_h; + temp_ind /= o_h; + int cur_c = temp_ind % o_c; + int cur_n = temp_ind / o_c; + + const Dtype* rois_data = rois + cur_n * 5; + + int batch = rois_data[0]; + Dtype roi_x0 = rois_data[1] * spatial_scale; + Dtype roi_y0 = rois_data[2] * spatial_scale; + Dtype roi_x1 = (rois_data[3] + 1) * spatial_scale; + Dtype roi_y1 = (rois_data[4] + 1) * spatial_scale; + + Dtype roi_h = roi_y1 - roi_y0; + Dtype roi_w = roi_x1 - roi_x0; + + Dtype bin_w = roi_w / pooled_w; + Dtype bin_h = roi_h / pooled_h; + + int ws = roi_x0 + bin_w * cur_w; + int we = ceil(roi_x0 + bin_w * (cur_w + 1)); + int ys = roi_y0 + bin_h * cur_h; + int ye = ceil(roi_y0 + bin_h * (cur_h + 1)); + + ws = fminf(fmaxf(ws, 0), in_w); + we = fminf(fmaxf(we, 0), in_w); + ys = fminf(fmaxf(ys, 0), in_h); + ye = fminf(fmaxf(ye, 0), in_h); + + int c_index = (cur_h * pooled_w + cur_w) * o_c + cur_c; + + const Dtype* offset_in_data = in_data + (batch * in_c + c_index) * in_w * in_h; + + Dtype sum = 0.f; + + for (int y = ys; y < ye; ++y){ + for (int w = ws; w < we; ++w){ + sum += offset_in_data[y * in_w + w]; + } + } + sum /= (ye - ys) * (we - ws); + + out_data[index] = sum; + + } + +} + +template +SaberStatus SaberPsRoiPool::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + PsRoiPoolParam& param) { + + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + const OpDataType* in_rois = (const OpDataType*)inputs[1]->data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); + OpDataType* inter_data = (OpDataType*)_crop_data.mutable_data(); + + int num_rois = inputs[1] -> num(); + int out_n = outputs[0]->num(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int in_n = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + + int crop_width = param.crop_width / param.pooled_width; + int crop_height = param.crop_height / param.pooled_height; + + int crop_count = _crop_data.valid_size(); + int pool_count = outputs[0]->valid_size(); + int pooled_size = param.pooled_height * param.pooled_width; + + crop_and_resize_kernel(\ + in_data, in_rois, inter_data, num_rois, in_h, in_w, + crop_height, crop_width, crop_count, param.method, + param.extra_value); + if (param.global_pooling){ + crop_global_pooling_kernel(\ + inter_data, out_data, pooled_size, out_c, + num_rois, crop_height, crop_width, pool_count); + } else { + crop_no_global_pooling_kernel(\ + inter_data, out_data, param.pooled_height, param.pooled_width, + out_c, num_rois, crop_height, crop_width, pool_count); + } + + return SaberSuccess; + +} + +} +} diff --git a/saber/funcs/impl/x86/saber_ps_roi_pooling.h b/saber/funcs/impl/x86/saber_ps_roi_pooling.h new file mode 100644 index 000000000..eb0c760d6 --- /dev/null +++ 
b/saber/funcs/impl/x86/saber_ps_roi_pooling.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PS_ROI_POOLING_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PS_ROI_POOLING_H + +#include "saber/funcs/impl/impl_ps_roi_pooling.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberPsRoiPool: + public ImplBase> { + +public: + + typedef typename DataTrait::Dtype OpDataType; + + SaberPsRoiPool() + {} + + ~SaberPsRoiPool() { + + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PsRoiPoolParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PsRoiPoolParam ¶m, + Context &ctx) { + Shape inter_shape = inputs[0]->shape(); + int oc = outputs[0]->channel(); + int num = outputs[0]->num(); + int crop_width = param.crop_width / param.pooled_width; + int crop_height = param.crop_height / param.pooled_height; + + inter_shape.set_num(param.pooled_height * param.pooled_width * oc); + inter_shape.set_channel(num); + inter_shape.set_width(crop_width); + inter_shape.set_height(crop_height); + _crop_data.re_alloc(inter_shape, OpDtype); + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PsRoiPoolParam ¶m); + +private: + Tensor _crop_data; + +}; +template class SaberPsRoiPool; +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ROI_POOL_H diff --git a/saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.cpp b/saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.cpp new file mode 100644 index 000000000..a1e9d5adb --- /dev/null +++ b/saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.cpp @@ -0,0 +1,137 @@ + +#include "saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.h" +#include "mkl.h" +#if defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif +#include +extern "C"{ + #include "xxHash/xxhash.h" + #include "bloomfilter/bloomfilter.h" +} + +namespace anakin{ +namespace saber { + +bool should_use_term( + const float* term, + bloomfilter* white_filter_ptr, + bloomfilter* black_filter_ptr, + size_t len){ + return + (!white_filter_ptr || 1 == bloomfilter_get(white_filter_ptr, + term, + len * sizeof(float))) && + (!black_filter_ptr || 0 == bloomfilter_get(black_filter_ptr, + term, + len * sizeof(float))); +} + +template +SaberStatus SaberPyramidHashQuantEmbeddingWithVsum::hash_embedding_forward(const OpDataType* buffer, + int len, + const OpDataType* quant_dict, + const unsigned char* weights, + OpDataType* out) { + for (unsigned int j = 0; j < _emb_size; j += _rand_len) { + unsigned int pos = XXH32(buffer, len * sizeof(OpDataType), j) % _space_size; + //LOG(INFO)<< "pos:" < +SaberStatus SaberPyramidHashQuantEmbeddingWithVsum::init( + const std::vector*>& inputs, 
+ std::vector*>& outputs, + PyramidHashQuantEmbeddingParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + _space_size = param.space_size; + _emb_size = param.emb_size; + _pyramid_layer = param.pyramid_layer; + _rand_len = param.rand_len; + _white_filter_size = param.white_list_len; + _black_filter_size = param.black_list_len; + _dropout_percent = param.dropout_percent; + _quant_bit = 8; + _dict_size = 1 << _quant_bit; + CHECK_EQ(param.quant_dict->valid_size(), _dict_size); + CHECK_EQ(param.hash_space->valid_size(), _space_size + _rand_len); + if (param.white_filter != NULL) { + CHECK_EQ(param.white_filter->valid_size(), _white_filter_size); + } + if (param.black_filter != NULL) { + CHECK_EQ(param.black_filter->valid_size(), _black_filter_size); + } + + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberPyramidHashQuantEmbeddingWithVsum::create( + const std::vector*>& inputs, + std::vector*>& outputs, + PyramidHashQuantEmbeddingParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberPyramidHashQuantEmbeddingWithVsum::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + PyramidHashQuantEmbeddingParam ¶m) { + CHECK_EQ(inputs.size(), 1) << "PyramidHashQuantEmbedding input num need be 1, but is" << inputs.size(); + CHECK_EQ(outputs.size(), 1) << "PyramidHashQuantEmbedding input num need be 1, but is" << outputs.size(); + size_t count = inputs[0]->valid_size(); + + const OpDataType *input_data = (const OpDataType*)inputs[0]->data(); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + const unsigned char* weights = (const unsigned char*) param.hash_space->data(); + const float* quant_dict = (const float*)param.quant_dict->data(); + CHECK(weights != NULL) << "embedding matrix weights is NULL"; + + bloomfilter* white_filter_ptr = NULL; + bloomfilter* black_filter_ptr = NULL; + if (_white_filter_size) { + white_filter_ptr = (bloomfilter*)param.white_filter->mutable_data(); + } + if (_black_filter_size) { + black_filter_ptr = (bloomfilter*)param.black_filter->mutable_data(); + } + + auto in_seq_offset = inputs[0]->get_seq_offset()[0]; + memset(output_data, 0, sizeof(OpDataType)*outputs[0]->valid_size()); + #pragma omp parallel for schedule(static) + for (int i = 0; i < in_seq_offset.size() - 1; i++) { + int cur_len = in_seq_offset[i+1] - in_seq_offset[i]; + auto tmp_out_data = output_data + i * _emb_size; + auto in_tmp = input_data + in_seq_offset[i]; + + if (cur_len < 2) { + memset(tmp_out_data, 0, sizeof(OpDataType) * _emb_size); + } else { + for (int j = 1; j < param.pyramid_layer && j < cur_len; j++) { + for (int k = 0; k < cur_len - j; k++) { + if (should_use_term(&in_tmp[k], white_filter_ptr, black_filter_ptr, j + 1)) { + hash_embedding_forward(&in_tmp[k], j + 1, quant_dict, weights, + tmp_out_data); + } + } + } + } + } + return SaberSuccess; +} +template class SaberPyramidHashQuantEmbeddingWithVsum; +DEFINE_OP_TEMPLATE(SaberPyramidHashQuantEmbeddingWithVsum, PyramidHashQuantEmbeddingParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberPyramidHashQuantEmbeddingWithVsum, PyramidHashQuantEmbeddingParam, X86, AK_INT8); +} +} + diff --git a/saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.h b/saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.h new file mode 100644 index 000000000..0879c4f60 --- /dev/null +++ b/saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H + +#include "saber/funcs/impl/impl_pyramid_hash_quant_embedding_with_vsum.h" + +namespace anakin { +namespace saber { + +template +class SaberPyramidHashQuantEmbeddingWithVsum : + public ImplBase< + X86, OpDtype, + PyramidHashQuantEmbeddingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberPyramidHashQuantEmbeddingWithVsum() {} + + ~SaberPyramidHashQuantEmbeddingWithVsum() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + PyramidHashQuantEmbeddingParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + PyramidHashQuantEmbeddingParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + PyramidHashQuantEmbeddingParam ¶m) override; + virtual SaberStatus hash_embedding_forward(const OpDataType* buffer, + int len, + const OpDataType* quant_dict, + const unsigned char* weights, + OpDataType* out); + +private: + int _space_size; + int _emb_size; + int _pyramid_layer; + int _rand_len; + int _white_filter_size; + int _black_filter_size; + float _dropout_percent; + int _quant_bit; + int _dict_size; +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_rcnn_proposal.h b/saber/funcs/impl/x86/saber_rcnn_proposal.h index dffe74ff0..fdd385bab 100644 --- a/saber/funcs/impl/x86/saber_rcnn_proposal.h +++ b/saber/funcs/impl/x86/saber_rcnn_proposal.h @@ -25,15 +25,7 @@ class SaberRCNNProposal : public ImplROIOutputSSD < X86, OpDtype > { public: - SaberRCNNProposal() - : _img_info_data_host_tensor(NULL) - , _probs_st_host_tensor(NULL) - , _cords_st_host_tensor(NULL) - , _rois_st_host_tensor(NULL) - , _outputs_boxes_scores_host_tensor(NULL) - , has_img_info_(false) - , rois_dim_(0) - {} + SaberRCNNProposal() = default; ~SaberRCNNProposal() { if (_img_info_data_host_tensor != NULL) { @@ -71,13 +63,13 @@ class SaberRCNNProposal : public ImplROIOutputSSD < std::vector*> &outputs, ProposalParam& param); private: - bool has_img_info_; - int rois_dim_; - Tensor* _img_info_data_host_tensor; - Tensor* _probs_st_host_tensor; - Tensor* _cords_st_host_tensor; - Tensor* _rois_st_host_tensor; - Tensor* _outputs_boxes_scores_host_tensor; + bool has_img_info_{false}; + int rois_dim_{0}; + Tensor* _img_info_data_host_tensor{nullptr}; + Tensor* _probs_st_host_tensor{nullptr}; + Tensor* _cords_st_host_tensor{nullptr}; + Tensor* _rois_st_host_tensor{nullptr}; + Tensor* _outputs_boxes_scores_host_tensor{nullptr}; }; } } diff --git a/saber/funcs/impl/x86/saber_reduce.cpp b/saber/funcs/impl/x86/saber_reduce.cpp new file mode 100644 index 000000000..026735053 --- /dev/null +++ b/saber/funcs/impl/x86/saber_reduce.cpp @@ -0,0 +1,406 @@ + +#include "saber/funcs/impl/x86/saber_reduce.h" + +namespace anakin { +namespace saber { 
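// Editorial note on the kernels that follow: the reduction is decomposed into three
// compile-time pieces.  ReOp<type> supplies the binary operator (max/min/sum/avg/prod),
// IndexCompute<nDim> maps a linear output index back to the offset of the first
// contributing input element via the input/output strides, and ReduceCompute<rDim, type>
// walks the reduced dimensions with rDim nested loops.  init() then registers one
// concrete kernel per (reduce type, tensor rank, number of reduced dims) combination,
// so dispatch() becomes a single table lookup.  The function below is only an
// editorial sketch of the same idea in plain scalar code (it is not part of this
// patch): summing away axis 1 (the channel axis) of an NCHW float tensor.
static void sketch_reduce_channel_sum(const float* in, float* out,
                                      int n, int c, int h, int w) {
    for (int i = 0; i < n; ++i) {
        for (int y = 0; y < h; ++y) {
            for (int x = 0; x < w; ++x) {
                float acc = 0.f;  // running sum over the reduced (channel) axis
                for (int j = 0; j < c; ++j) {
                    acc += in[((i * c + j) * h + y) * w + x];  // NCHW linear index
                }
                out[(i * h + y) * w + x] = acc;  // output laid out as N x 1 x H x W
            }
        }
    }
}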
+namespace { + +template +class ReOp{ +public: + static float compute(float a, float b) { + return -1.f; + } +}; + +template <> +float ReOp::compute(float a, float b) { + LOG(FATAL) << "reduce type is not init yet!!!!"; + return 0; +} + +template <> +float ReOp::compute(float a, float b) { + return ((a > b) ? a : b); +} + +template <> +float ReOp::compute(float a, float b) { + return ((a > b) ? b : a); +} + +template <> +float ReOp::compute(float a, float b) { + return a + b; +} + +template <> +float ReOp::compute(float a, float b) { + return a + b; +} + +template <> +float ReOp::compute(float a, float b) { + return a * b; +} + +template +class IndexCompute { +public: + static int input_idx(const int* dims, + const int* odims, + int out_idx); +}; + +template <> +int IndexCompute<4>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int i1 = (out_idx % out_stride[0]) / out_stride[1]; + int i2 = (out_idx % out_stride[1]) / out_stride[2]; + int i3 = (out_idx % out_stride[2]) / out_stride[3]; + int idx = i0 * in_stride[0] + + i1 * in_stride[1] + + i2 * in_stride[2] + + i3 * in_stride[3]; + return idx; +} + +template <> +int IndexCompute<3>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int i1 = (out_idx % out_stride[0]) / out_stride[1]; + int i2 = (out_idx % out_stride[1]) / out_stride[2]; + int idx = i0 * in_stride[0] + + i1 * in_stride[1] + + i2 * in_stride[2]; + return idx; +} + +template <> +int IndexCompute<2>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int i1 = (out_idx % out_stride[0]) / out_stride[1]; + int idx = i0 * in_stride[0] + + i1 * in_stride[1]; + return idx; +} + +template <> +int IndexCompute<1>::input_idx( + const int* in_stride, + const int* out_stride, + int out_idx) { + + int i0 = out_idx / out_stride[0]; + int idx = i0 * in_stride[0]; + return idx; +} + +template +class ReduceCompute{ +public: + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float* in_data, int in_idx) { + return 0; + } +}; + +template +class ReduceCompute<1, type> { +public: + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + + float res = in_data[in_idx]; + int idx = in_idx + in_stride[rdims[0]]; +#pragma ivdep + for (int i = 1; i < dims[rdims[0]]; ++i) { + res = ReOp::compute(res, in_data[idx]); + idx += in_stride[rdims[0]]; + } + return res; + } +}; + +template +class ReduceCompute<2, type> { +public: + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + + float res0 = 0.f; + int idx0 = in_idx; + for (int i = 0; i < dims[rdims[0]]; ++i) { + float res1 = in_data[idx0]; + int idx1 = idx0 + in_stride[rdims[1]]; +#pragma ivdep + for (int j = 1; j < dims[rdims[1]]; ++j) { + res1 = ReOp::compute(res1, in_data[idx1]); + idx1 += in_stride[rdims[1]]; + } + idx0 += in_stride[rdims[0]]; + if (i == 0) { + res0 = res1; + } else { + res0 = ReOp::compute(res0, res1); + } + } + return res0; + } +}; + +template +class ReduceCompute<3, type> { +public: + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + + float res0 = 0.f; + int idx0 = in_idx; + for (int i = 0; i < dims[rdims[0]]; ++i) { + float res1 = 0.f; + int idx1 = idx0; + for (int j = 0; j 
< dims[rdims[1]]; ++j) { + float res2 = in_data[idx1]; + int idx2 = idx1 + in_stride[rdims[2]]; +#pragma ivdep + for (int k = 1; k < dims[rdims[2]]; ++k) { + res2 = ReOp::compute(res2, in_data[idx2]); + idx2 += in_stride[rdims[2]]; + } + if (j == 0) { + res1 = res2; + } else { + res1 = ReOp::compute(res1, res2); + } + idx1 += in_stride[rdims[1]]; + } + if (i == 0) { + res0 = res1; + } else { + res0 = ReOp::compute(res0, res1); + } + idx0 += in_stride[rdims[0]]; + } + return res0; + } +}; + +template +class ReduceCompute<4, type> { +public: + static float compute( + const int* dims, + const int* rdims, + const int* in_stride, + const float *in_data, int in_idx) { + + float res0 = 0.f; + int idx0 = in_idx; + for (int i = 0; i < dims[rdims[0]]; ++i) { + float res1 = 0.f; + int idx1 = idx0; + for (int j = 0; j < dims[rdims[1]]; ++j) { + float res2 = 0.f; + int idx2 = idx1; + for (int k = 0; k < dims[rdims[2]]; ++k) { + float res3 = in_data[idx2]; + int idx3 = idx2 + in_stride[rdims[3]]; +#pragma ivdep + for (int u = 0; u < dims[rdims[3]]; ++u) { + res3 = ReOp::compute(res3, in_data[idx3]); + idx3 += in_stride[rdims[3]]; + } + if (k == 0) { + res2 = res3; + } else { + res2 = ReOp::compute(res2, res3); + } + idx2 += in_stride[rdims[2]]; + } + if (j == 0) { + res1 = res2; + } else { + res1 = ReOp::compute(res1, res2); + } + idx1 += in_stride[rdims[1]]; + } + if (i == 0) { + res0 = res1; + } else { + res0 = ReOp::compute(res0, res1); + } + idx0 += in_stride[rdims[0]]; + } + return res0; + } +}; + +template +void reduce( + const dtype* src, + dtype* dst, + const int* rdim, + const int* dims, + const int* i_stride, + const int* o_stride, int out_size) { + + int reduce_size = 1; + for (int i = 0; i < rDim; ++i) { + reduce_size *= dims[rdim[i]]; + } + float reduce_size_1 = 1.f / ((float)reduce_size); +#pragma omp parallel for + for (int x = 0; x < out_size; ++x) { + int out_idx = x; + //init; + int in_idx = IndexCompute::input_idx(i_stride, o_stride, out_idx); + float res = ReduceCompute::compute( + dims, rdim, i_stride, src, in_idx); + dst[out_idx] = res; + if (Reduce_avg == type) { + dst[out_idx] *= reduce_size_1; + } + } +} + +void reduce_unknow( + const float* src, + float* dst, + const int* rdim, + const int* dims, + const int* i_stride, + const int* o_stride, int out_size) { + LOG(FATAL) << "reduce type unkonw!!!"; +} + +template +void reduce_all( + const dtype* src, + dtype* dst, + const int* rdim, + const int* dims, + const int* i_stride, + const int* o_stride, + int out_size) { + + int reduce_size = 1; + for (int i = 0; i < rDim; ++i) { + reduce_size *= dims[rdim[i]]; + } + float reduce_size_1 = 1.f / ((float)reduce_size); + //init; + float res = src[0]; +#pragma ivdep + for (int i = 1; i < reduce_size; ++i) { + res = ReOp::compute(res, src[i]); + } + dst[0] = res; + if (Reduce_avg == type) { + dst[0] *= reduce_size_1; + } +} +} + +#define REG_REDUCE_TYPE_KERNEL(REDUCE_TYPE) \ + _kernel_direct_map[REDUCE_TYPE] = { \ + {reduce_unknow}, \ + {reduce_unknow, \ + reduce_all}, \ + {reduce_unknow, \ + reduce, \ + reduce_all}, \ + {reduce_unknow, \ + reduce, \ + reduce, \ + reduce_all}, \ + {reduce_unknow, \ + reduce, \ + reduce, \ + reduce, \ + reduce_all}} + +template <> +SaberStatus SaberReduce::create( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param, Context& ctx) { + return SaberSuccess; +} + +template <> +SaberStatus SaberReduce::init( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param, Context& ctx) { + + 
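    // Editorial note: each REG_REDUCE_TYPE_KERNEL(type) call below fills
    // _kernel_direct_map[type] with a jagged table indexed first by the input
    // tensor rank (0..4) and then by the number of reduced dimensions, with
    // reduce_all on the diagonal (every axis reduced) and reduce_unknow guarding
    // the impossible zero-axis entry.  dispatch() later selects the kernel with
    // _kernel_direct_map[param.reduce_type][inputs[0]->dims()][param.reduce_dim.size()];
    // for example, a 4-D input reduced over two axes with Reduce_max resolves to
    // the reduce<> instantiation with nDim = 4 and rDim = 2.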
REG_REDUCE_TYPE_KERNEL(Reduce_avg); + REG_REDUCE_TYPE_KERNEL(Reduce_min); + REG_REDUCE_TYPE_KERNEL(Reduce_max); + REG_REDUCE_TYPE_KERNEL(Reduce_sum); + REG_REDUCE_TYPE_KERNEL(Reduce_prod); + + this->_ctx = &ctx; + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberReduce::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param) { + + auto i_stride = inputs[0]->get_stride(); + auto o_stride = outputs[0]->get_stride(); + std::vector ndim; + + for (auto i : inputs[0]->valid_shape()) { + ndim.push_back(i); + } + _kernel_direct_map[param.reduce_type][inputs[0]->dims()][param.reduce_dim.size()]( + (const float*)inputs[0]->data(), + (float*)outputs[0]->mutable_data(), + param.reduce_dim.data(), ndim.data(), + i_stride.data(), o_stride.data(), + outputs[0]->valid_size()); + + return SaberSuccess; +} + +template class SaberReduce; +DEFINE_OP_TEMPLATE(SaberReduce, ReduceParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberReduce, ReduceParam, X86, AK_INT8); + +} // namespace saber. +} // namespace anakin. diff --git a/saber/funcs/impl/x86/saber_reduce.h b/saber/funcs/impl/x86/saber_reduce.h new file mode 100644 index 000000000..e37b9caef --- /dev/null +++ b/saber/funcs/impl/x86/saber_reduce.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REDUCE_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REDUCE_H + +#include "saber/funcs/impl/impl_reduce.h" +#include +#include + +namespace anakin{ + +namespace saber{ + +template +class SaberReduce : + public ImplBase< + X86, OpDtype, + ReduceParam > { +public: + SaberReduce() = default; + ~SaberReduce() = default; + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceParam& param, Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ReduceParam& param); + +private: + + typedef std::function reduce_kernel; + + std::map>> _kernel_direct_map; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REDUCE_H diff --git a/saber/funcs/impl/x86/saber_reduce_min.cpp b/saber/funcs/impl/x86/saber_reduce_min.cpp new file mode 100644 index 000000000..ab6f9f7a8 --- /dev/null +++ b/saber/funcs/impl/x86/saber_reduce_min.cpp @@ -0,0 +1,195 @@ +#include "saber/funcs/impl/x86/saber_reduce_min.h" + +namespace anakin { +namespace saber { + +template +void reduce_n(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index, src_index, src_index0; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = src[data_index]; + for (int n = 1; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] = dst[data_index] < src[src_index]? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_c(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index, src_index0, src_index; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = src[src_index0]; + for (int c = 1; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] = dst[data_index] < src[src_index]? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_h(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index, src_index, src_index0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = src[src_index0]; + for (int h = 1; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] = dst[data_index] < src[src_index]? 
dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_w(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index, src_index0, src_index; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = src[src_index0]; + for (int w = 1; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] = dst[data_index] < src[src_index] ? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_all(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + dtype min = src[0]; + int src_index; + int n_id, c_id; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + min = src[src_index] < min? src[src_index] : min; + } + } + } + } + dst[0] = min; +} + +template +void reduce_nc(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + //reduce n first. + Shape shape_tmp({1, channel_in, height_in, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +template +void reduce_ch(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + //reduce c first + Shape shape_tmp({num_in, 1, height_in, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +template +void reduce_hw(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + //reduce h first + Shape shape_tmp({num_in, channel_in, 1, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +template +SaberStatus SaberReduceMin::dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ReduceMinParam& param) { + + const OpDataType* input_ptr = (const OpDataType*)inputs[0]->data(); + OpDataType* output_ptr = (OpDataType*)outputs[0]->mutable_data(); + + if (_reduce_dim.empty()) { + //reduce all. 
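        // An empty reduce_dim collapses every axis into a single scalar minimum.
        // The branches below cover the remaining supported cases: reducing exactly
        // one of N/C/H/W, or one of the adjacent pairs NC, CH, HW (implemented as
        // two single-axis passes through a temporary tensor); any other
        // combination is rejected with a fatal log.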
+ reduce_all(input_ptr, output_ptr, _n, _c, _h, _w); + }else { + if (_reduce_dim.size() == 1) { + switch (_reduce_dim[0]) { + case 0: reduce_n(input_ptr, output_ptr, _n, _c, _h, _w); break; + case 1: reduce_c(input_ptr, output_ptr, _n, _c, _h, _w); break; + case 2: reduce_h(input_ptr, output_ptr, _n, _c, _h, _w); break; + case 3: reduce_w(input_ptr, output_ptr, _n, _c, _h, _w); break; + default: LOG(FATAL) << "error!!!"; + } + }else if (_reduce_dim.size() == 2) { + if (_reduce_dim[0] == 0 && _reduce_dim[1] == 1) { + reduce_nc(input_ptr, output_ptr, _n, _c, _h, _w); + }else if (_reduce_dim[0] == 1 && _reduce_dim[1] == 2) { + reduce_ch(input_ptr, output_ptr, _n, _c, _h, _w); + }else if (_reduce_dim[0] == 2 && _reduce_dim[1] == 3) { + reduce_hw(input_ptr, output_ptr, _n, _c, _h, _w); + }else { + LOG(FATAL) <<"invalid reduce_dim!!"; + } + } else { + LOG(FATAL) << "reduce_dim's size over than 2, which is not supported now!!"; + } + } + + + return SaberSuccess; +} + +template class SaberReduceMin; +DEFINE_OP_TEMPLATE(SaberReduceMin, ReduceMinParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberReduceMin, ReduceMinParam, X86, AK_INT8); + +} // namespace saber. +} // namespace anakin. \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_reduce_min.h b/saber/funcs/impl/x86/saber_reduce_min.h new file mode 100644 index 000000000..5d3306834 --- /dev/null +++ b/saber/funcs/impl/x86/saber_reduce_min.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REDUCE_MIN_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_REDUCE_MIN_H + +#include "saber/funcs/impl/impl_reduce_min.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberReduceMin : + public ImplBase< + X86, OpDtype, + ReduceMinParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + SaberReduceMin() {} + ~SaberReduceMin() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceMinParam& param, Context& ctx) { + + this->_ctx = &ctx; + create(inputs, outputs, param, ctx); + + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ReduceMinParam& param, Context &ctx) { + + _n = inputs[0]->num(); + _c = inputs[0]->channel(); + _h = inputs[0]->height(); + _w = inputs[0]->width(); + // int count = input[0]->valid_size(); + _rank = inputs[0]->valid_shape().size(); + + _reduce_dim = param.reduce_dim; + if (!_reduce_dim.empty()) { + //not empty + for (int i = 0; i < _reduce_dim.size(); ++i) { + if (_reduce_dim[i] < 0) { + _reduce_dim[i] += _rank; + } + } + } + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ReduceMinParam& param); + +private: + int _n; + int _c; + int _h; + int _w; + int _rank; //The dimentions of a tensor. 
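    // Axes to reduce, copied from the param in create(); negative indices have
    // already been normalized there by adding _rank, so entries lie in [0, _rank).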
+ std::vector _reduce_dim; +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_CUDA_SABER_MATCH_MATRIX_H diff --git a/saber/funcs/impl/x86/saber_resize.cpp b/saber/funcs/impl/x86/saber_resize.cpp index 812e0a0c6..81e5bd613 100644 --- a/saber/funcs/impl/x86/saber_resize.cpp +++ b/saber/funcs/impl/x86/saber_resize.cpp @@ -1,14 +1,200 @@ #include "saber/funcs/impl/x86/saber_resize.h" -namespace anakin{ +namespace anakin { namespace saber { +template +void resize_bilinear_custom_kernel(const int w_out, const int h_out, + const int n_in, const int c_in, + const int dst_stride_w, + const int dst_stride_h, + const int dst_stride_channel, + const int dst_stride_batch, + const int w_in, const int h_in, + const int src_stride_w, + const int src_stride_h, + const int src_stride_channel, + const int src_stride_batch, + const float scale_w, const float scale_h, + const dtype* src, dtype* dst){ + +#pragma omp parallel for collapse(2) schedule(static) + for (int h = 0; h < h_out; ++h) { + for (int w = 0; w < w_out; ++w) { + dtype fw = w * scale_w; + dtype fh = h * scale_h; + int w_start = (int)fw; + int w_end = (int)fw + 1; + int h_start = (int)fh; + int h_end = (int)fh + 1; + fw -= w_start; + fh -= h_start; + const dtype w00 = (1.0 - fh) * (1.0 - fw); + const dtype w01 = fw * (1.0 - fh); + const dtype w10 = fh * (1.0 - fw); + const dtype w11 = fw * fh; + + for (int n = 0; n < n_in; ++n) { + for (int c = 0; c < c_in; ++c) { + int src_index = n * src_stride_batch + c * src_stride_channel; + dtype tl = src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + dtype tr = w_end >= w_in ? 0 : src[src_index + w_end * src_stride_w + h_start * src_stride_h]; + dtype bl = h_end >= h_in ? 0 : src[src_index + w_start * src_stride_w + h_end * src_stride_h]; + dtype br = (w_end >= w_in) + || (h_end >= h_in) ? 0 : src[src_index + w_end * src_stride_w + h_end * src_stride_h]; + int dst_index = n * dst_stride_batch + c * dst_stride_channel + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = static_cast(w00 * tl + w01 * tr + w10 * bl + w11 * br); + } + } + } + } +} + +template +void resize_bilinear_align_kernel(const int w_out, const int h_out, + const int n_in,const int c_in, + const int dst_stride_w, + const int dst_stride_h, + const int dst_stride_channel, + const int dst_stride_batch, + const int w_in, const int h_in, + const int src_stride_w, + const int src_stride_h, + const int src_stride_channel, + const int src_stride_batch, + const float scale_w, const float scale_h, + const dtype* src, dtype* dst){ + + float scale_w_new = (float)(w_in - 1) / (w_out - 1); + float scale_h_new = (float)(h_in - 1) / (h_out - 1); +#pragma omp parallel for collapse(2) schedule(static) + for (int h = 0; h < h_out; ++h) { + for (int w = 0; w < w_out; ++w) { + dtype fw = w * scale_w_new; + dtype fh = h * scale_h_new; + int w_start = (int)fw; + int w_id = w_start < w_in - 1 ? 1 : 0; + int w_end = (int)fw + w_id; + int h_start = (int)fh; + int h_id = h_start < h_in - 1 ? 
1 : 0; + int h_end = (int)fh + h_id; + fw -= w_start; + fh -= h_start; + const dtype w00 = (1.0 - fh) * (1.0 - fw); + const dtype w01 = fw * (1.0 - fh); + const dtype w10 = fh * (1.0 - fw); + const dtype w11 = fw * fh; + + for (int n = 0; n < n_in; ++n) { + for (int c = 0; c < c_in; ++c) { + int src_index = n * src_stride_batch + c * src_stride_channel; + dtype tl = src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + dtype tr = src[src_index + w_end * src_stride_w + h_start * src_stride_h]; + dtype bl = src[src_index + w_start * src_stride_w + h_end * src_stride_h]; + dtype br = src[src_index + w_end * src_stride_w + h_end * src_stride_h]; + int dst_index = n * dst_stride_batch + c * dst_stride_channel + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = static_cast(w00 * tl + w01 * tr + w10 * bl + w11 * br); + } + } + } + } +} + +template +void resize_bilinear_no_align_kernel(const int w_out, const int h_out, + const int n_in,const int c_in, + const int dst_stride_w, + const int dst_stride_h, + const int dst_stride_channel, + const int dst_stride_batch, + const int w_in, const int h_in, + const int src_stride_w, + const int src_stride_h, + const int src_stride_channel, + const int src_stride_batch, + const float scale_w, const float scale_h, + const dtype* src, dtype* dst){ + + float scale_w_new = (float)w_in / w_out; + float scale_h_new = (float)h_in / h_out; +#pragma omp parallel for collapse(2) schedule(static) + for (int h = 0; h < h_out; ++h) { + for (int w = 0; w < w_out; ++w) { + dtype fw = scale_w_new * (w + 0.5f) - 0.5f; + dtype fh = scale_h_new * (h + 0.5f) - 0.5f; + fw = fw < 0 ? 0 : fw; + fh = fh < 0 ? 0 : fh; + int w_start = (int)fw; + int w_id = w_start < w_in - 1 ? 1 : 0; + int w_end = (int)fw + w_id; + int h_start = (int)fh; + int h_id = h_start < h_in - 1 ? 1 : 0; + int h_end = (int)fh + h_id; + fw -= w_start; + fh -= h_start; + const dtype w00 = (1.0 - fh) * (1.0 - fw); + const dtype w01 = fw * (1.0 - fh); + const dtype w10 = fh * (1.0 - fw); + const dtype w11 = fw * fh; + + for (int n = 0; n < n_in; ++n) { + for (int c = 0; c < c_in; ++c) { + int src_index = n * src_stride_batch + c * src_stride_channel; + dtype tl = src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + dtype tr = src[src_index + w_end * src_stride_w + h_start * src_stride_h]; + dtype bl = src[src_index + w_start * src_stride_w + h_end * src_stride_h]; + dtype br = src[src_index + w_end * src_stride_w + h_end * src_stride_h]; + int dst_index = n * dst_stride_batch + c * dst_stride_channel + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = static_cast(w00 * tl + w01 * tr + w10 * bl + w11 * br); + } + } + } + } +} +template +void resize_nearest_kernel(const int w_out, const int h_out, + const int n_in,const int c_in, + const int dst_stride_w, + const int dst_stride_h, + const int dst_stride_channel, + const int dst_stride_batch, + const int w_in, const int h_in, + const int src_stride_w, + const int src_stride_h, + const int src_stride_channel, + const int src_stride_batch, + const float scale_w, const float scale_h, + const dtype* src, dtype* dst){ + + float scale_w_new = (float)(w_in - 1) / (w_out - 1); + float scale_h_new = (float)(h_in - 1) / (h_out - 1); + + #pragma omp parallel for collapse(2) schedule(static) + for (int h = 0; h < h_out; ++h) { + for (int w = 0; w < w_out; ++w) { + + int near_x = static_cast(scale_w_new * w + 0.5); + int near_y = static_cast(scale_h_new * h + 0.5); + near_x = near_x < 0 ? 0 : near_x; + near_y = near_y < 0 ? 
0 : near_y; + + + for (int n = 0; n < n_in; ++n) { + for (int c = 0; c < c_in; ++c) { + int src_index = n * src_stride_batch + c * src_stride_channel; + int dst_index = n * dst_stride_batch + c * dst_stride_channel + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = src[src_index + near_y * src_stride_h + near_x * src_stride_w]; + } + } + } + } +} + template SaberStatus SaberResize::dispatch( - const std::vector& inputs, - std::vector& outputs, - ResizeParam ¶m) -{ + const std::vector& inputs, + std::vector& outputs, + ResizeParam& param) { typedef typename DataTrait::Dtype InDataType; typedef typename DataTrait::Dtype OutDataType; typedef typename DataTrait::Dtype dtype; @@ -18,6 +204,13 @@ SaberStatus SaberResize::dispatch( int c_out = outputs[0]->channel(); int n_out = outputs[0]->num(); + if (inputs.size() > 1){ + int* out_size_data = static_cast(inputs[1]->data()); + h_out = out_size_data[0]; + w_out = out_size_data[1]; + outputs[0]->reshape(Shape({n_out, c_out, h_out, w_out})); + } + int w_in = inputs[0]->width(); int h_in = inputs[0]->height(); int c_in = inputs[0]->channel(); @@ -38,17 +231,28 @@ SaberStatus SaberResize::dispatch( OutDataType* dst = (OutDataType*)outputs[0]->mutable_data(); Shape src_real_shape; Shape dst_real_shape; + if (inputs[0]->is_continue_mem()) { src_real_shape = inputs[0]->valid_shape(); } else { src_real_shape = inputs[0]->shape(); } + if (outputs[0]->is_continue_mem()) { dst_real_shape = outputs[0]->valid_shape(); } else { dst_real_shape = outputs[0]->shape(); } + float scale_w = 0.f; + float scale_h = 0.f; + if (param.out_width != -1 && param.out_height != -1){ + scale_w = (float)param.out_width / w_in; + scale_h = (float)param.out_height / h_in; + } else { + scale_w = param.width_scale; + scale_h = param.height_scale; + } int src_stride_w = src_real_shape.count(width_idx + 1); int src_stride_h = src_real_shape.count(height_idx + 1); int src_stride_channel = src_real_shape.count(channel_idx + 1); @@ -57,35 +261,33 @@ SaberStatus SaberResize::dispatch( int dst_stride_h = dst_real_shape.count(height_idx + 1); int dst_stride_channel = dst_real_shape.count(channel_idx + 1); int dst_stride_batch = dst_real_shape.count(num_idx + 1); - float scale_w = 1. / param.width_scale; - float scale_h = 1. / param.height_scale; - for(int n = 0; n < n_in; ++n){ - for(int c = 0; c < c_in; ++c){ - int src_index = n * src_stride_batch + c * src_stride_channel; - for(int h = 0; h < h_out; ++h){ - for(int w = 0; w < w_out; ++w){ - dtype fw = w * scale_w; - dtype fh = h * scale_h; - int w_start = (int)fw; - int w_end = (int)fw + 1; - int h_start = (int)fh; - int h_end = (int)fh + 1; - fw -= w_start; - fh -= h_start; - const dtype w00 = (1.0 - fh) * (1.0 - fw); - const dtype w01 = fw * (1.0 - fh); - const dtype w10 = fh * (1.0 - fw); - const dtype w11 = fw * fh; - dtype tl = src[src_index + w_start * src_stride_w + h_start * src_stride_h]; - dtype tr = w_end >= w_in ? 0 : src[src_index + w_end * src_stride_w + h_start * src_stride_h]; - dtype bl = h_end >= h_in ? 0 : src[src_index + w_start * src_stride_w + h_end * src_stride_h]; - dtype br = (w_end >= w_in) || (h_end >= h_in) ? 
0 : src[src_index + w_end * src_stride_w + h_end * src_stride_h]; - int dst_index = n * dst_stride_batch + c * dst_stride_channel + h * dst_stride_h + w * dst_stride_w; - dst[dst_index] = static_cast(w00 * tl + w01 * tr + w10 * bl + w11 * br); - } - } - } + + switch (param.resize_type){ + case BILINEAR_ALIGN: + resize_bilinear_align_kernel(w_out, h_out, n_in, c_in, dst_stride_w, dst_stride_h, \ + dst_stride_channel, dst_stride_batch, w_in, h_in, src_stride_w, src_stride_h, \ + src_stride_channel, src_stride_batch, 1.f / scale_w, 1.f / scale_h, src, dst); + break; + case BILINEAR_NO_ALIGN: + resize_bilinear_no_align_kernel(w_out, h_out, n_in, c_in, dst_stride_w, dst_stride_h, \ + dst_stride_channel, dst_stride_batch, w_in, h_in, src_stride_w, src_stride_h, \ + src_stride_channel, src_stride_batch, 1.f / scale_w, 1.f / scale_h, src, dst); + break; + case RESIZE_CUSTOM: + resize_bilinear_custom_kernel(w_out, h_out, n_in, c_in, dst_stride_w, dst_stride_h, \ + dst_stride_channel, dst_stride_batch, w_in, h_in, src_stride_w, src_stride_h, \ + src_stride_channel, src_stride_batch, 1.f / scale_w, 1.f / scale_h, src, dst); + break; + case NEAREST_ALIGN: + resize_nearest_kernel(w_out, h_out, n_in, c_in, dst_stride_w, dst_stride_h, \ + dst_stride_channel, dst_stride_batch, w_in, h_in, src_stride_w, src_stride_h, \ + src_stride_channel, src_stride_batch, 1.f / scale_w, 1.f / scale_h, src, dst); + break; + default: + LOG(FATAL) << "Unsupport resize type: " << (int)param.resize_type; } + + return SaberSuccess; } diff --git a/saber/funcs/impl/x86/saber_roi_align.cpp b/saber/funcs/impl/x86/saber_roi_align.cpp new file mode 100644 index 000000000..8772104d5 --- /dev/null +++ b/saber/funcs/impl/x86/saber_roi_align.cpp @@ -0,0 +1,155 @@ +#include "saber/funcs/impl/x86/saber_roi_align.h" +#include +#include +namespace anakin { + +namespace saber { + +// we calculate the src coordinary and weights previsiously. +template +void bilinear_interpolate( + const int height, const int width, + const int pooled_height, const int pooled_width, const int iy_upper, + const int ix_upper, dtype roi_ymin, dtype roi_xmin, dtype bin_size_h, dtype bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, const int kROISize, + const int prePosROISize, Tensor* pre_pos, Tensor* pre_w) { + int pre_calc_index = 0; + int* pre_pos_data = (int*)pre_pos->mutable_data(); + dtype* pre_w_data = (dtype*)pre_w->mutable_data(); + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + // calculate y of sample points + dtype y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + // calculate x of samle points + for (int ix = 0; ix < ix_upper; ix++) { + dtype x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + // deal with elements out of map + if (y < -1.0 || y > height || x < -1.0 || x > width) { + for (int i = 0; i < prePosROISize; ++i) { + pre_pos_data[i + pre_calc_index * prePosROISize] = 0; + pre_w_data[i + pre_calc_index * prePosROISize] = 0; + } + pre_calc_index += 1; + continue; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 
0 : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + dtype ly = y - y_low, lx = x - x_low; + dtype hy = 1. - ly, hx = 1. - lx; + pre_pos_data[pre_calc_index * prePosROISize] = y_low * width + x_low; + pre_pos_data[pre_calc_index * prePosROISize + 1] = y_low * width + x_high; + pre_pos_data[pre_calc_index * prePosROISize + 2] = y_high * width + x_low; + pre_pos_data[pre_calc_index * prePosROISize + 3] = y_high * width + x_high; + pre_w_data[pre_calc_index * prePosROISize] = hy * hx; + pre_w_data[pre_calc_index * prePosROISize + 1] = hy * lx; + pre_w_data[pre_calc_index * prePosROISize + 2] = ly * hx; + pre_w_data[pre_calc_index * prePosROISize + 3] = ly * lx; + pre_calc_index += 1; + } + } + } + } +} + +template +SaberStatus SaberRoiAlign::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + RoiAlignParam& param) { + + const OpDataType* input_data = (const OpDataType*)inputs[0]->data(); + const OpDataType* rois = (const OpDataType*)inputs[1]->data(); + OpDataType* output_data = (OpDataType*)outputs[0]->mutable_data(); + + int batch_size = inputs[0]->num(); + int channels = inputs[0]->channel(); + int height = inputs[0]->height(); + int width = inputs[0]->width(); + int rois_num = inputs[1]->num(); + // int count = input[0]->valid_size(); + + if (inputs[0]->is_continue_mem() && outputs[0]->is_continue_mem()) { + // For each ROIs, do fix-sized align. + for (int n = 0; n < rois_num; ++n) { + const OpDataType* cur_rois = rois + n * _kROISize; + int rois_id = cur_rois[0]; + OpDataType roi_xmin = cur_rois[1] * param.spatial_scale; + OpDataType roi_ymin = cur_rois[2] * param.spatial_scale; + OpDataType roi_xmax = cur_rois[3] * param.spatial_scale; + OpDataType roi_ymax = cur_rois[4] * param.spatial_scale; + + OpDataType roi_width = std::max(roi_xmax - roi_xmin, static_cast(1.)); + OpDataType roi_height = std::max(roi_ymax - roi_ymin, static_cast(1.)); + OpDataType bin_size_h = static_cast(roi_height) / static_cast(param.pooled_height); + OpDataType bin_size_w = static_cast(roi_width) / static_cast(param.pooled_width); + const OpDataType* batch_data = input_data + rois_id * _in_n_stride; + int roi_bin_grid_h = (param.sampling_ratio > 0)? param.sampling_ratio : ceil(roi_height / param.pooled_height); + int roi_bin_grid_w = (param.sampling_ratio > 0)? param.sampling_ratio : ceil(roi_width / param.pooled_width); + int count = roi_bin_grid_h * roi_bin_grid_w; + int pre_size = count * _out_c_stride; + _pre_pos.reshape(Shape({pre_size, _prePosROISize, 1, 1})); //pre ROI + _pre_w.reshape(Shape({pre_size, _prePosROISize, 1, 1})); // pre ROI weights. 
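+        // ROI Align samples roi_bin_grid_h x roi_bin_grid_w points per output bin, at
+        // y = roi_ymin + ph * bin_size_h + (iy + 0.5) * bin_size_h / roi_bin_grid_h
+        // (and the analogous x). bilinear_interpolate() caches, for each sample point,
+        // its four neighbour offsets in _pre_pos and their bilinear weights in _pre_w,
+        // so the per-channel loop below only gathers, weights and averages over `count` samples.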
+ + bilinear_interpolate(height, width, + param.pooled_height, param.pooled_width, + roi_bin_grid_h,roi_bin_grid_w, + roi_ymin, roi_xmin, + bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, + _kROISize, _prePosROISize, + &_pre_pos, &_pre_w); + const int* pre_pos_data = (const int*)_pre_pos.data(); + const OpDataType* pre_w_data = (const OpDataType*)_pre_w.data(); + for (int c = 0; c < channels; c++) { + int pre_calc_index = 0; + for (int ph = 0; ph < param.pooled_height; ph++) { + for (int pw = 0; pw < param.pooled_width; pw++) { + const int pool_index = ph * param.pooled_width + pw; + OpDataType output_val = 0; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + for (int i = 0; i < _prePosROISize; i++) { + int pos = pre_pos_data[pre_calc_index * _prePosROISize + i]; + OpDataType w = pre_w_data[pre_calc_index * _prePosROISize + i]; + output_val += w * batch_data[pos]; + } + pre_calc_index += 1; + } + } + output_val /= count; + output_data[pool_index] = output_val; + } + } + batch_data += _in_c_stride; + output_data += _out_c_stride; + } + } + } + return SaberSuccess; +} +template class SaberRoiAlign; +DEFINE_OP_TEMPLATE(SaberRoiAlign, RoiAlignParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberRoiAlign, RoiAlignParam, X86, AK_INT8); +} //namespace saber. +} //namespace anakin. diff --git a/saber/funcs/impl/x86/saber_roi_align.h b/saber/funcs/impl/x86/saber_roi_align.h new file mode 100644 index 000000000..774509dae --- /dev/null +++ b/saber/funcs/impl/x86/saber_roi_align.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ROI_ALIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ROI_ALIGN_H + +#include "saber/funcs/impl/impl_roi_align.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberRoiAlign: + public ImplBase> { + +public: + + typedef typename DataTrait::Dtype OpDataType; + + SaberRoiAlign() + {} + + ~SaberRoiAlign() { + + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + RoiAlignParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + Shape out_stride = outputs[0]->get_stride(); + Shape in_stride = inputs[0]->get_stride(); + int in_n_index = inputs[0]->num_index(); + int in_c_index = inputs[0]->channel_index(); + int in_h_index = inputs[0]->height_index(); + int in_w_index = inputs[0]->width_index(); + int out_n_index = outputs[0]->num_index(); + int out_c_index = outputs[0]->channel_index(); + int out_h_index = outputs[0]->height_index(); + int out_w_index = outputs[0]->width_index(); + _in_n_stride = in_stride[in_n_index]; + _in_c_stride = in_stride[in_c_index]; + _in_h_stride = in_stride[in_h_index]; + _in_w_stride = in_stride[in_w_index]; + _out_n_stride = out_stride[out_n_index]; + _out_c_stride = out_stride[out_c_index]; + _out_h_stride = out_stride[out_h_index]; + _out_w_stride = out_stride[out_w_index]; + + return SaberSuccess; + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + RoiAlignParam ¶m, + Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + RoiAlignParam ¶m); + +private: + int _in_n_stride; + int _in_c_stride; + int _in_h_stride; + int _in_w_stride; + int _out_n_stride; + int _out_c_stride; + int _out_h_stride; + int _out_w_stride; + const int _prePosROISize = 4; + const int _kROISize = 5; + Tensor _pre_pos; + Tensor _pre_w; +}; + +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ROI_POOL_H \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_rpn_proposal_ssd.h b/saber/funcs/impl/x86/saber_rpn_proposal_ssd.h index b94523232..843de4c60 100644 --- a/saber/funcs/impl/x86/saber_rpn_proposal_ssd.h +++ b/saber/funcs/impl/x86/saber_rpn_proposal_ssd.h @@ -33,15 +33,7 @@ class SaberRPNProposalSSD : public ImplROIOutputSSD < public: - SaberRPNProposalSSD() - : _img_info_data_host_tensor(NULL) - , _prob_data_host_tensor(NULL) - , _tgt_data_host_tensor(NULL) - , _outputs_boxes_scores_host_tensor(NULL) - , box_dev_nms_(NULL) - , boxes_dev_len(0) - , mask_dev_nms_(NULL) - {} + SaberRPNProposalSSD() = default; ~SaberRPNProposalSSD() { if (_img_info_data_host_tensor != NULL) { @@ -71,21 +63,21 @@ class SaberRPNProposalSSD : public ImplROIOutputSSD < ProposalParam ¶m) override; private: - int num_rpns_; - int num_anchors_; - bool has_img_info_; - int rois_dim_; + int num_rpns_{0}; + int num_anchors_{0}; + bool has_img_info_{false}; + int rois_dim_{0}; // ADD CPU TENSORS - Tensor *_img_info_data_host_tensor; - Tensor *_prob_data_host_tensor; - Tensor *_tgt_data_host_tensor; - Tensor *_outputs_boxes_scores_host_tensor; + Tensor *_img_info_data_host_tensor{nullptr}; + Tensor *_prob_data_host_tensor{nullptr}; + Tensor *_tgt_data_host_tensor{nullptr}; + Tensor *_outputs_boxes_scores_host_tensor{nullptr}; //caffe pyramid_layers.hpp:615 - float* box_dev_nms_; - unsigned long long* mask_dev_nms_; - int boxes_dev_len; + float* box_dev_nms_{nullptr}; + unsigned long long* mask_dev_nms_{nullptr}; + int boxes_dev_len{0}; //caffe pyramid_layers.hpp:618 }; diff --git 
a/saber/funcs/impl/x86/saber_scale.cpp b/saber/funcs/impl/x86/saber_scale.cpp index 43c5750db..1bfba2e2c 100644 --- a/saber/funcs/impl/x86/saber_scale.cpp +++ b/saber/funcs/impl/x86/saber_scale.cpp @@ -1,5 +1,8 @@ #include "saber/funcs/impl/x86/saber_scale.h" +#include +#include "saber/funcs/impl/x86/saber_avx2_expand.h" +#include "saber/funcs/timer.h" namespace anakin{ namespace saber { @@ -29,6 +32,100 @@ SaberStatus SaberScale::create( return SaberSuccess; } +/* +inline avx2_scale_inner_dim_1(float* data_in_ptr,float* data_out_ptr,int batch,int length,float* scale_ptr,float* bias_ptr){ + int round_dim=length/8*8; + int remainder=length%8; + if(bias_ptr!= nullptr) { + for (int batch_id = 0; batch_id < batch; batch_id++) { + const float* data_in=data_in+batch_id*length; + float* data_out=data_out_ptr+batch_id*length; + for (int i = 0; i < round_dim; i += 8) { + __m256 x = _mm256_loadu_ps(&data_in[i]); + __m256 bias = _mm256_loadu_ps(&bias_ptr[i]); + __m256 scale = _mm256_loadu_ps(&scale_ptr[i]); + __m256 ans = _mm256_fmadd_ps(scale, x, bias); + _mm256_storeu_ps(&data_out[i], ans); + } + if (remainder > 0) { + __m256i _vec_mask = _m256_continue_mask_m256i(remainder); + __m256 x = _mm256_maskload_ps(&data_in[round_dim], _vec_mask); + __m256 bias = _mm256_maskload_ps(&bias_ptr[round_dim], _vec_mask); + __m256 scale = _mm256_maskload_ps(&scale_ptr[round_dim], _vec_mask); + __m256 ans = _mm256_fmadd_ps(scale, x, bias); + _mm256_maskstore_ps(&data_out[round_dim], _vec_mask, ans); + } + } + }else{ + for (int batch_id = 0; batch_id < batch; batch_id++) { + const float* data_in=data_in+batch_id*length; + float* data_out=data_out_ptr+batch_id*length; + for (int i = 0; i < round_dim; i += 8) { + __m256 x = _mm256_loadu_ps(&data_in[i]); + __m256 scale = _mm256_loadu_ps(&scale_ptr[i]); + __m256 ans = _mm256_mul_ps(scale, x); + _mm256_storeu_ps(&data_out[i], ans); + } + if (remainder > 0) { + __m256i _vec_mask = _m256_continue_mask_m256i(remainder); + __m256 x = _mm256_maskload_ps(&data_in[round_dim], _vec_mask); + __m256 bias = _mm256_maskload_ps(&bias_ptr[round_dim], _vec_mask); + __m256 scale = _mm256_maskload_ps(&scale_ptr[round_dim], _vec_mask); + __m256 ans = _mm256_mul_ps(scale, x); + _mm256_maskstore_ps(&data_out[round_dim], _vec_mask, ans); + } + } + } + +} + +inline avx2_scale_inner_dim_1(float* data_in_ptr,float* data_out_ptr,int batch,int length,float* scale_ptr,float* bias_ptr){ + int round_dim=length/8*8; + int remainder=length%8; + if(bias_ptr!= nullptr) { + for (int batch_id = 0; batch_id < batch; batch_id++) { + const float* data_in=data_in+batch_id*length; + float* data_out=data_out_ptr+batch_id*length; + for (int i = 0; i < round_dim; i += 8) { + __m256 x = _mm256_loadu_ps(&data_in[i]); + __m256 bias = _mm256_loadu_ps(&bias_ptr[i]); + __m256 scale = _mm256_loadu_ps(&scale_ptr[i]); + __m256 ans = _mm256_fmadd_ps(scale, x, bias); + _mm256_storeu_ps(&data_out[i], ans); + } + if (remainder > 0) { + __m256i _vec_mask = _m256_continue_mask_m256i(remainder); + __m256 x = _mm256_maskload_ps(&data_in[round_dim], _vec_mask); + __m256 bias = _mm256_maskload_ps(&bias_ptr[round_dim], _vec_mask); + __m256 scale = _mm256_maskload_ps(&scale_ptr[round_dim], _vec_mask); + __m256 ans = _mm256_fmadd_ps(scale, x, bias); + _mm256_maskstore_ps(&data_out[round_dim], _vec_mask, ans); + } + } + }else{ + for (int batch_id = 0; batch_id < batch; batch_id++) { + const float* data_in=data_in+batch_id*length; + float* data_out=data_out_ptr+batch_id*length; + for (int i = 0; i < round_dim; i += 8) { + __m256 x = 
_mm256_loadu_ps(&data_in[i]); + __m256 scale = _mm256_loadu_ps(&scale_ptr[i]); + __m256 ans = _mm256_mul_ps(scale, x); + _mm256_storeu_ps(&data_out[i], ans); + } + if (remainder > 0) { + __m256i _vec_mask = _m256_continue_mask_m256i(remainder); + __m256 x = _mm256_maskload_ps(&data_in[round_dim], _vec_mask); + __m256 bias = _mm256_maskload_ps(&bias_ptr[round_dim], _vec_mask); + __m256 scale = _mm256_maskload_ps(&scale_ptr[round_dim], _vec_mask); + __m256 ans = _mm256_mul_ps(scale, x); + _mm256_maskstore_ps(&data_out[round_dim], _vec_mask, ans); + } + } + } + +} +*/ + template SaberStatus SaberScale::dispatch( const std::vector& inputs, @@ -56,8 +153,36 @@ SaberStatus SaberScale::dispatch( } else { CHECK_EQ(scale_dim, param.scale_w.size()) << "scale dim not valid"; } - + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); +// LOG(INFO)<<"outer_dim "< 0) { +// __m256i _vec_mask = _m256_continue_mask_m256i(remainder); +// __m256 x = _mm256_maskload_ps(&data_in_ptr[round_dim], _vec_mask); +// __m256 bias = _mm256_maskload_ps(&bias_ptr[round_dim], _vec_mask); +// __m256 scale = _mm256_maskload_ps(&scale_ptr[round_dim], _vec_mask); +// __m256 ans = _mm256_mul_ps(scale, x); +// _mm256_maskstore_ps(&data_out_ptr[round_dim], _vec_mask, ans); +// } +// +// } +// } +// } // TODO !! need add other types of scale + for (int outer_id = 0; outer_id < outer_dim; outer_id++) { for (int scale_id = 0; scale_id < scale_dim; scale_id++) { auto scale = scale_data[scale_id]; @@ -69,6 +194,9 @@ SaberStatus SaberScale::dispatch( } } } + + + return SaberSuccess; } diff --git a/saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.cpp b/saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.cpp new file mode 100644 index 000000000..4e19c30f1 --- /dev/null +++ b/saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.cpp @@ -0,0 +1,101 @@ +#include "anakin_thread.h" +#include "saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.h" +#include "saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.h" +#include "mkl.h" +#if defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberSeqConcatSeqPoolSoftSign::init( + const std::vector*>& inputs, + std::vector*>& outputs, + SeqConcatSeqPoolSoftSignParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + _emb_size = inputs[0]->valid_size() / inputs[0]->num(); + int seq_len = inputs[0]->get_seq_offset()[0].size() - 1; + for (int i = 1; i < inputs.size(); i++) { + int cur_emb_size = inputs[i]->valid_size() / inputs[i]->num(); + int cur_seq_len = inputs[i]->get_seq_offset()[0].size() - 1 ; + CHECK_EQ(_emb_size, cur_emb_size) << "emb size must be the same"; + CHECK_EQ(seq_len, cur_seq_len) << "seq len must be the same"; + } + _buf = new OpDataType[anakin_get_num_procs() * _emb_size]; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberSeqConcatSeqPoolSoftSign::create( + const std::vector*>& inputs, + std::vector*>& outputs, + SeqConcatSeqPoolSoftSignParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberSeqConcatSeqPoolSoftSign::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + SeqConcatSeqPoolSoftSignParam ¶m) { + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + int emb_size = inputs[0]->valid_size() / inputs[0]->num(); + for (int i = 1; i < inputs.size(); i++) { + int cur_emb_size = inputs[i]->valid_size() / inputs[i]->num(); + int 
cur_seq_num = inputs[i]->get_seq_offset()[0].size() - 1 ; + CHECK_EQ(emb_size, cur_emb_size) << "emb size must be the same"; + CHECK_EQ(seq_num, cur_seq_num) << "seq len must be the same"; + } + + outputs[0]->reshape(Shape({seq_num, emb_size, 1, 1}, Layout_NCHW)); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + std::vector> offset_vecs; + for (int i = 0; i < inputs.size(); i++) { + offset_vecs.push_back(inputs[i]->get_seq_offset()[0]); + } +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < seq_num; i++) { + auto tmp_out = output_data + i * emb_size; + auto tmp_buf = _buf + anakin_get_thread_num() * emb_size; + memset(tmp_buf, 0, sizeof(OpDataType) * emb_size); + for (int j = 0; j < inputs.size(); j++) { + const OpDataType *in_data = (const OpDataType*)inputs[j]->data(); + for (int k = offset_vecs[j][i]; k < offset_vecs[j][i + 1]; k++) { + auto tmp_in = in_data + k * emb_size; +//#if defined(__AVX2__) and defined(__FMA__) +// avx2_vector_sum(tmp_in, emb_size, tmp_buf); +//#else +//#pragma omp parallel for schedule(static) + for (int m = 0; m < emb_size; m++) { + tmp_buf[m] += tmp_in[m]; + } +//#endif + } + } + +//#if defined(__AVX2__) and defined(__FMA__) +// avx2_vector_soft_sign(tmp_buf, emb_size, tmp_out); +//#else +//#pragma omp parallel for schedule(static) + for (int m = 0; m < emb_size; m++) { + auto data = tmp_buf[m]; + auto tmp = data > 0 ? data : -data; + tmp_out[m] = data / (1 + tmp); + } +//#endif + } + + return SaberSuccess; +} + +template class SaberSeqConcatSeqPoolSoftSign; +DEFINE_OP_TEMPLATE(SaberSeqConcatSeqPoolSoftSign, SeqConcatSeqPoolSoftSignParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSeqConcatSeqPoolSoftSign, SeqConcatSeqPoolSoftSignParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.h b/saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.h new file mode 100644 index 000000000..965b9d1c4 --- /dev/null +++ b/saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H + +#include "saber/funcs/impl/impl_seq_concat_seq_pool_soft_sign.h" + +namespace anakin { +namespace saber { + +template +class SaberSeqConcatSeqPoolSoftSign : + public ImplBase< + X86, OpDtype, + SeqConcatSeqPoolSoftSignParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberSeqConcatSeqPoolSoftSign() {} + + ~SaberSeqConcatSeqPoolSoftSign() { + if (_buf) { + delete _buf; + } + } + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SeqConcatSeqPoolSoftSignParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SeqConcatSeqPoolSoftSignParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SeqConcatSeqPoolSoftSignParam ¶m) override; + +private: + OpDataType* _buf; + int _emb_size; + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_sequence_concat.cpp b/saber/funcs/impl/x86/saber_sequence_concat.cpp new file mode 100644 index 000000000..b0d605c0d --- /dev/null +++ b/saber/funcs/impl/x86/saber_sequence_concat.cpp @@ -0,0 +1,65 @@ + +#include "saber/funcs/impl/x86/saber_sequence_concat.h" +#include "saber/funcs/impl/x86/saber_sequence_concat.h" +#include "mkl.h" +#if defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberSequenceConcat::init( + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberSequenceConcat::create( + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberSequenceConcat::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam ¶m) { + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + int emb_size = inputs[0]->valid_size() / inputs[0]->num(); + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + for (int i = 1; i < inputs.size(); i++) { + int cur_emb_size = inputs[i]->valid_size() / inputs[i]->num(); + int cur_seq_num = inputs[i]->get_seq_offset()[0].size() - 1; + CHECK_EQ(emb_size, cur_emb_size) << "sequence concat emb size must be the same"; + CHECK_EQ(seq_num, cur_seq_num) << "sequence concat seq num must be the same"; + } + + for (int i = 0; i < seq_num; i++) { + for (int j = 0; j < inputs.size(); j++) { + size_t cur_len = inputs[j]->get_seq_offset()[0][i+1] - inputs[j]->get_seq_offset()[0][i]; + + const OpDataType *input_data = (const OpDataType*)inputs[j]->data() + inputs[j]->get_seq_offset()[0][i] * emb_size; + memcpy(output_data, input_data, sizeof(OpDataType) * cur_len * emb_size); + output_data += cur_len * emb_size; + } + } + + return SaberSuccess; +} + +template class SaberSequenceConcat; +DEFINE_OP_TEMPLATE(SaberSequenceConcat, SequenceConcatParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequenceConcat, SequenceConcatParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_sequence_concat.h b/saber/funcs/impl/x86/saber_sequence_concat.h new file mode 100644 index 000000000..ddba2b6fd --- /dev/null +++ 
b/saber/funcs/impl/x86/saber_sequence_concat.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_CONCAT_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_CONCAT_H + +#include "saber/funcs/impl/impl_sequence_concat.h" + +namespace anakin { +namespace saber { + +template +class SaberSequenceConcat : + public ImplBase< + X86, OpDtype, + SequenceConcatParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberSequenceConcat() {} + + ~SaberSequenceConcat() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam ¶m) override; + +private: + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_sequence_conv.cpp b/saber/funcs/impl/x86/saber_sequence_conv.cpp index 723a83633..58774ebe5 100644 --- a/saber/funcs/impl/x86/saber_sequence_conv.cpp +++ b/saber/funcs/impl/x86/saber_sequence_conv.cpp @@ -57,8 +57,8 @@ SaberStatus SaberSequenceConv::dispatch( _hidden_size); } - gemm(false, false, word_num, _feature_size, _hidden_kernel_size, 1.f, _temp_im2col_tensor.data(), - param.filter_tensor->data(), 0.f, out_data->mutable_data()); + gemm(false, false, word_num, _feature_size, _hidden_kernel_size, 1.f, static_cast(_temp_im2col_tensor.data()), + static_cast(param.filter_tensor->data()), 0.f, static_cast(out_data->mutable_data())); std::vector> voffset; voffset.push_back(offset); out_data->set_seq_offset(voffset); diff --git a/saber/funcs/impl/x86/saber_sequence_depadding.cpp b/saber/funcs/impl/x86/saber_sequence_depadding.cpp new file mode 100644 index 000000000..b96aa1eed --- /dev/null +++ b/saber/funcs/impl/x86/saber_sequence_depadding.cpp @@ -0,0 +1,55 @@ + +#include "saber/funcs/impl/x86/saber_sequence_depadding.h" +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberSequenceDePadding::init( + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberSequenceDePadding::create( + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberSequenceDePadding::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam ¶m) { + typedef typename DataTrait::Dtype OpDataType; + OpDataType *input_data = (OpDataType*)inputs[0]->mutable_data(); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + auto pad_offset = inputs[0]->get_seq_offset()[0]; + auto src_offset = inputs[1]->get_seq_offset()[0]; + int seq_num = 
src_offset.size() - 1; + int emb_size = inputs[0]->count_valid(1, inputs[0]->dims()); + + for (size_t i = 0; i < seq_num; i++) { + int src_len_i = src_offset[i+1] - src_offset[i]; + int pad_len_i = pad_offset[i+1] - pad_offset[i]; + CHECK_LE(src_len_i, pad_len_i) << "pad sequence length is bigger than source sequence length"; + memcpy(output_data + src_offset[i] * emb_size, input_data + i * pad_len_i * emb_size, src_len_i * emb_size * sizeof(OpDataType)); + } + + return SaberSuccess; +} + +template class SaberSequenceDePadding; +DEFINE_OP_TEMPLATE(SaberSequenceDePadding, SequenceDePaddingParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequenceDePadding, SequenceDePaddingParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_sequence_depadding.h b/saber/funcs/impl/x86/saber_sequence_depadding.h new file mode 100644 index 000000000..59ce3edcf --- /dev/null +++ b/saber/funcs/impl/x86/saber_sequence_depadding.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_DEPADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_DEPADDING_H + +#include "saber/funcs/impl/impl_sequence_depadding.h" + +namespace anakin { +namespace saber { + +template +class SaberSequenceDePadding : + public ImplBase< + X86, OpDtype, + SequenceDePaddingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberSequenceDePadding() {} + + ~SaberSequenceDePadding() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam ¶m) override; + +private: + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_sequence_expand.cpp b/saber/funcs/impl/x86/saber_sequence_expand.cpp index c554d1143..4b3afcd0d 100644 --- a/saber/funcs/impl/x86/saber_sequence_expand.cpp +++ b/saber/funcs/impl/x86/saber_sequence_expand.cpp @@ -37,7 +37,7 @@ SequenceExpandParam& param) { auto ref_offset = inputs[1]->get_seq_offset()[0]; size_t len = inputs[0]->valid_size(); - OpDataType* input_data = static_cast(inputs[0]->data()); + const OpDataType* input_data = static_cast(inputs[0]->data()); OpDataType* output_data = static_cast(outputs[0]->mutable_data()); int dim = inputs[0]->valid_size() / inputs[0]->num(); diff --git a/saber/funcs/impl/x86/saber_sequence_padding.cpp b/saber/funcs/impl/x86/saber_sequence_padding.cpp new file mode 100644 index 000000000..a8de3a5cb --- /dev/null +++ b/saber/funcs/impl/x86/saber_sequence_padding.cpp @@ -0,0 +1,72 @@ + +#include "saber/funcs/impl/x86/saber_sequence_padding.h" +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberSequencePadding::init( + const std::vector*>& inputs, + 
std::vector*>& outputs, + SequencePaddingParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberSequencePadding::create( + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberSequencePadding::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam ¶m) { + + size_t len = inputs[0]->valid_size(); + OpDataType *input_data = (OpDataType*)inputs[0]->mutable_data(); + OpDataType *output_data = (OpDataType*)outputs[0]->mutable_data(); + int max_len = 0; + auto seq_offset = inputs[0]->get_seq_offset()[0]; + int seq_num = seq_offset.size() - 1; + int emb_size = inputs[0]->count_valid(1, inputs[0]->dims()); + for (int i = 0; i < seq_num; i++) { + int cur_len = seq_offset[i+1] - seq_offset[i]; + max_len = cur_len > max_len ? cur_len : max_len; + } + Shape out_shape = inputs[0]->valid_shape(); + out_shape[0] = seq_num * max_len; + outputs[0]->reshape(out_shape); + for (size_t i = 0; i < seq_num; i++) { + int start = i * max_len * emb_size; + int cur_len = seq_offset[i+1] - seq_offset[i]; + int pad_start = start + cur_len * emb_size; + int pad_num = max_len - cur_len; + memcpy(output_data + start, input_data + seq_offset[i] * emb_size, cur_len * emb_size * sizeof(OpDataType)); + if (pad_num > 0) { + memset(output_data + pad_start, 0, pad_num * emb_size * sizeof(OpDataType)); + } + } + + std::vector out_offset; + for (int i = 0; i < seq_num + 1; i++) { + out_offset.push_back(i * max_len); + } + outputs[0]->set_seq_offset({out_offset}); + + return SaberSuccess; +} + +template class SaberSequencePadding; +DEFINE_OP_TEMPLATE(SaberSequencePadding, SequencePaddingParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequencePadding, SequencePaddingParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_sequence_padding.h b/saber/funcs/impl/x86/saber_sequence_padding.h new file mode 100644 index 000000000..e3a8f9da9 --- /dev/null +++ b/saber/funcs/impl/x86/saber_sequence_padding.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_PADDING_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_PADDING_H + +#include "saber/funcs/impl/impl_sequence_padding.h" + +namespace anakin { +namespace saber { + +template +class SaberSequencePadding : + public ImplBase< + X86, OpDtype, + SequencePaddingParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberSequencePadding() {} + + ~SaberSequencePadding() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam ¶m) override; + +private: + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_sequence_pool.cpp b/saber/funcs/impl/x86/saber_sequence_pool.cpp index 6eb646826..b64993d4f 100644 --- a/saber/funcs/impl/x86/saber_sequence_pool.cpp +++ b/saber/funcs/impl/x86/saber_sequence_pool.cpp @@ -10,13 +10,9 @@ namespace saber { template void seq_pool_average(dtype* dst, const dtype* src_in, const int slice_num, const int slice_size) { - dtype sum = 0.f; #pragma omp parallel for for (int i = 0; i < slice_size; ++i) { - sum = src_in[i]; -#pragma vector aligned -#pragma simd reduction(+:sum) -#pragma unroll(8) + dtype sum = src_in[i]; for (int s = 1; s < slice_num; ++s) { dtype src_in_read = src_in[s * slice_size +i]; sum += src_in_read; @@ -28,13 +24,10 @@ void seq_pool_average(dtype* dst, const dtype* src_in, template void seq_pool_sum(dtype* dst, const dtype* src_in, const int slice_num, const int slice_size) { - dtype sum = 0.f; #pragma omp parallel for for (int i = 0; i < slice_size; ++i) { - sum = src_in[i]; -#pragma vector aligned -#pragma simd reduction(+:sum) -#pragma unroll(8) + dtype sum = src_in[i]; + //dtype sum = 0.f; for (int s = 1; s < slice_num; ++s) { dtype src_in_read = src_in[s * slice_size +i]; sum += src_in_read; @@ -47,13 +40,9 @@ template void seq_pool_sqrt(dtype* dst, const dtype* src_in, const int slice_num, const int slice_size) { dtype sqrt_len = sqrtf(slice_num); - dtype sum = 0.f; #pragma omp parallel for for (int i = 0; i < slice_size; ++i) { - sum = src_in[i]; -#pragma vector aligned -#pragma simd reduction(+:sum) -#pragma unroll(4) + dtype sum = src_in[i]; for (int s = 1; s < slice_num; ++s) { dtype src_in_read = src_in[s * slice_size +i]; sum += src_in_read; @@ -65,10 +54,9 @@ void seq_pool_sqrt(dtype* dst, const dtype* src_in, template void seq_pool_max(dtype* dst, const dtype* src_in, const int slice_num, const int slice_size) { - dtype max = 0.f; #pragma omp parallel for for (int i = 0; i < slice_size; ++i) { - max = src_in[i]; + dtype max = src_in[i]; for (int s = 1; s < slice_num; ++s) { dtype src_in_read = src_in[s * slice_size +i]; if (max < src_in_read) { @@ -139,11 +127,12 @@ SaberStatus SaberSequencePool::dispatch( int slice_size = outputs[0]->channel() * outputs[0]->height() * outputs[0]->width(); - + DataType_in* dst_ptr = (DataType_in*)outputs[0]->mutable_data(); const DataType_out* src_ptr = (const DataType_out*)inputs[0]->data(); for (int i = 0; i < seq_offset.size()-1; ++i) { int slice_num = seq_offset[i+1] - seq_offset[i]; + //LOG(INFO)<<"sequence pool slice size " << slice_size << "slice_num" << slice_num; kernel_direct_map[param.sequence_pool_type]( dst_ptr, src_ptr, slice_num, slice_size); @@ -166,4 +155,4 @@ template class 
SaberSequencePool; DEFINE_OP_TEMPLATE(SaberSequencePool, SequencePoolParam, X86, AK_HALF); DEFINE_OP_TEMPLATE(SaberSequencePool, SequencePoolParam, X86, AK_INT8); } -} // namespace anakin \ No newline at end of file +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_sequence_pool_concat.cpp b/saber/funcs/impl/x86/saber_sequence_pool_concat.cpp new file mode 100644 index 000000000..ae8da0f63 --- /dev/null +++ b/saber/funcs/impl/x86/saber_sequence_pool_concat.cpp @@ -0,0 +1,151 @@ +#include "saber/funcs/impl/x86/saber_sequence_pool_concat.h" +#include "saber/funcs/impl/x86/saber_avx2_expand.h" +#include "saber/funcs/impl/x86/saber_avx512_expand.h" +namespace anakin { +namespace saber { + + +template <> +SaberStatus SaberSequencePoolConcat::create(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, + Context& ctx) { + return SaberSuccess; +}; + +template <> +SaberStatus SaberSequencePoolConcat::init(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, + Context& ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +}; + +#if defined(__AVX2__) +static void avx2_sequence_pool_sum_concat(const float* data, std::vector& seq_offset, + int dim, + float* out) { + int round_dim = dim / 8 * 8; + int remainder = dim % 8; + __m256i mask_m256i = _m256_continue_mask_m256i(remainder); + +#pragma omp parallel for + for (int i = 0; i < seq_offset.size() - 1; i++) { + for (int k = 0; k < round_dim; k += 8) { + __m256 temp_out = _mm256_setzero_ps(); + + for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) { + const float* tmp_data = data + j * dim; + __m256 temp_in = _mm256_loadu_ps(&tmp_data[k]); + temp_out += temp_in; + } + + _mm256_storeu_ps(out + i * dim + k, temp_out); + } + + if (remainder > 0) { + __m256 temp_out = _mm256_setzero_ps(); + + for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) { + const float* tmp_data = data + j * dim; + __m256 temp_in = _mm256_maskload_ps(&tmp_data[round_dim], mask_m256i); + temp_out += temp_in; + } + + _mm256_maskstore_ps(out + i * dim + round_dim, mask_m256i, temp_out); + } + } +} +#endif + +#if defined(__AVX512F__) +static void avx512_sequence_pool_sum_concat(const float* data, std::vector& seq_offset, + int dim, + float* out) { + int round_dim = dim / 16 * 16; + int remainder = dim % 16; + __mmask16 remain_mask = __mm512_get_mask(remainder); + const int seq_number = seq_offset.size() - 1; + + if (round_dim == 0) { + +#pragma omp parallel for + for (int i = 0; i < seq_number; i++) { + __m512 temp_out = _mm512_setzero_ps(); + + for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) { + const float* tmp_data = data + j * dim; + temp_out = _mm512_add_ps(temp_out, _mm512_mask_loadu_ps(temp_out, remain_mask, tmp_data)); + } + + _mm512_mask_storeu_ps(out + i * dim, remain_mask, temp_out); + } + + } else { +#pragma omp parallel for + for (int i = 0; i < seq_number; i++) { + for (int k = 0; k < round_dim; k += 16) { + __m512 temp_out = _mm512_setzero_ps(); + + for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) { + const float* tmp_data = data + j * dim; + __m512 temp_in = _mm512_loadu_ps(&tmp_data[k]); + temp_out += temp_in; + } + + _mm512_storeu_ps(out + i * dim + k, temp_out); + } + + if (remainder > 0) { + __m512 temp_out = _mm512_setzero_ps(); + + for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) { + const float* tmp_data = data + j * dim; + temp_out = _mm512_add_ps(temp_out, _mm512_mask_loadu_ps(temp_out, remain_mask, + &tmp_data[round_dim])); + } 
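+            // write back only the valid tail lanes (dim % 16) of the accumulated sum for sequence i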
+ + _mm512_mask_storeu_ps(out + i * dim + round_dim, remain_mask, temp_out); + + } + } + } +} +#endif + +template <> +SaberStatus SaberSequencePoolConcat::dispatch(const std::vector*>& + inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param) { + CHECK_GE(inputs[0]->get_seq_offset().size(), 1); + SequencePoolParam seq_param = param.sequence_pool_param; + auto seq_vec = inputs[0]->get_seq_offset()[0]; + int seq_num = seq_vec.back(); + float* input_ptr = static_cast(inputs[0]->data()); + float* output_ptr = static_cast(outputs[0]->data()); + + int out_channel = inputs[0]->valid_size() / seq_num; + + if (seq_param.sequence_pool_type == Sequence_pool_sum) { + +#if defined(__AVX512F__) + avx512_sequence_pool_sum_concat(input_ptr, seq_vec, out_channel, output_ptr); +#elif defined(__AVX2__) + avx2_sequence_pool_sum_concat(input_ptr, seq_vec, out_channel, output_ptr); +#else + LOG(FATAL) << "not support for not open avx2"; +#endif + } else { + LOG(FATAL) << "not support " << seq_param.sequence_pool_type; + } + + return SaberSuccess; +}; + +DEFINE_OP_TEMPLATE(SaberSequencePoolConcat, SequencePoolConcatParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSequencePoolConcat, SequencePoolConcatParam, X86, AK_INT8); + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_sequence_pool_concat.h b/saber/funcs/impl/x86/saber_sequence_pool_concat.h new file mode 100644 index 000000000..45fe9431c --- /dev/null +++ b/saber/funcs/impl/x86/saber_sequence_pool_concat.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_POOL_CONCAT_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SEQUENCE_POOL_CONCAT_H + +#include "saber/funcs/impl/impl_sequence_pool_concat.h" +#include "saber/saber_funcs_param.h" +#include +#include + +namespace anakin { +namespace saber { + +template +class SaberSequencePoolConcat : + public ImplBase < X86, OpDtype, SequencePoolConcatParam > { + +public: + + SaberSequencePoolConcat() = default; + + ~SaberSequencePoolConcat() {} + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, + Context& ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param, + Context& ctx) override; + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePoolConcatParam& param) override; + +private: + +}; + +} +} + +#endif diff --git a/saber/funcs/impl/x86/saber_shuffle_channel.cpp b/saber/funcs/impl/x86/saber_shuffle_channel.cpp new file mode 100644 index 000000000..905f290b1 --- /dev/null +++ b/saber/funcs/impl/x86/saber_shuffle_channel.cpp @@ -0,0 +1,96 @@ +#include "saber/funcs/impl/x86/saber_shuffle_channel.h" + +namespace anakin{ + +namespace saber{ + +template +void shuffle_kernel(Dtype* output, const Dtype* input, int group_row, int group_col, int len) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_col; ++j) { + const Dtype* p_i = input + (i * group_col + j) * len; + Dtype* p_o = output + (j * group_row + i) * len; + memcpy(p_o, p_i, len * sizeof(Dtype)); + } + } +} + +template <> +SaberStatus SaberShuffleChannel::dispatch(\ + const std::vector *>& inputs, + std::vector *>& outputs, + ShuffleChannelParam ¶m) { + +#ifdef ENABLE_OP_TIMER + this->_timer.clear(); + this->_timer.start(); +#endif + int num = inputs[0]->num(); + int channel = inputs[0]->channel(); + int height = inputs[0]->height(); + int width = inputs[0]->width(); + int fea_size = channel * height * width; + int spatial_size = height * width; + + int group_row = param.group; + int group_col = channel / param.group; + const float* din = static_cast(inputs[0]->data()); + float* dout = static_cast(outputs[0]->data()); + for (int i = 0; i < num; ++i) { + shuffle_kernel(dout + i * fea_size, din + i * fea_size, group_row, group_col, spatial_size); + } +#ifdef ENABLE_OP_TIMER + this->_timer.end(); + float ts = this->_timer.get_average_ms(); + LOGI("ShuffleChannel : %s: time: %f\n", this->_op_name.c_str(), ts); + GOPS ops; + //fixme + ops.ops = 0; + ops.ts = ts; + OpTimer::add_timer("ShuffleChannel", ops); + OpTimer::add_timer("total", ops); +#endif + return SaberSuccess; +} +template <> +SaberStatus SaberShuffleChannel::dispatch(\ + const std::vector *>& inputs, + std::vector *>& outputs, + ShuffleChannelParam ¶m) { + +#ifdef ENABLE_OP_TIMER + this->_timer.clear(); + this->_timer.start(); +#endif + int num = inputs[0]->num(); + int channel = inputs[0]->channel(); + int height = inputs[0]->height(); + int width = inputs[0]->width(); + int fea_size = channel * height * width; + int spatial_size = height * width; + + int group_row = param.group; + int group_col = channel / param.group; + const char* din = static_cast(inputs[0]->data()); + char* dout = static_cast(outputs[0]->data()); + for (int i = 0; i < num; ++i) { + shuffle_kernel(dout + i * fea_size, din + i * fea_size, group_row, group_col, spatial_size); + } +#ifdef ENABLE_OP_TIMER + this->_timer.end(); + float ts = this->_timer.get_average_ms(); + 
LOGI("ShuffleChannel : %s: time: %f\n", this->_op_name.c_str(), ts); + GOPS ops; + //fixme + ops.ops = 0; + ops.ts = ts; + OpTimer::add_timer("ShuffleChannel", ops); + OpTimer::add_timer("total", ops); +#endif + return SaberSuccess; +} +DEFINE_OP_TEMPLATE(SaberShuffleChannel, ShuffleChannelParam, X86, AK_HALF); + +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_shuffle_channel.h b/saber/funcs/impl/x86/saber_shuffle_channel.h new file mode 100644 index 000000000..21d8468df --- /dev/null +++ b/saber/funcs/impl/x86/saber_shuffle_channel.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SHUFFLE_CHANNEL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SHUFFLE_CHANNEL_H + +#include "saber/funcs/impl/impl_shuffle_channel.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberShuffleChannel : \ + public ImplBase< + X86, + OpDtype, + ShuffleChannelParam > +{ +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberShuffleChannel() + {} + + ~SaberShuffleChannel() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ShuffleChannelParam& param, Context& ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ShuffleChannelParam& param, Context &ctx) { + return SaberSuccess; + } + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ShuffleChannelParam& param); +private: +}; + +} + +} +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_ShuffleChannel_H diff --git a/saber/funcs/impl/x86/saber_slice.cpp b/saber/funcs/impl/x86/saber_slice.cpp index 8b2235400..9ceb00e86 100644 --- a/saber/funcs/impl/x86/saber_slice.cpp +++ b/saber/funcs/impl/x86/saber_slice.cpp @@ -29,6 +29,7 @@ SaberStatus SaberSlice::dispatch(\ const int out_slice_axis_size = outputs[i]->valid_shape()[param.axis]; const int out_slice_size = out_slice_axis_size * _slice_size; const int slice_count = out_slice_size * _slice_num; +#pragma omp parallel for schedule(static) for(int j = 0; j < slice_count; ++j){ const int _num_slice = j / out_slice_size; const int _slice_index = j % out_slice_size; diff --git a/saber/funcs/impl/x86/saber_slice_v2.cpp b/saber/funcs/impl/x86/saber_slice_v2.cpp new file mode 100644 index 000000000..a5fd8edaa --- /dev/null +++ b/saber/funcs/impl/x86/saber_slice_v2.cpp @@ -0,0 +1,77 @@ +#include "saber/funcs/impl/x86/saber_slice_v2.h" + +namespace anakin{ + +namespace saber{ + +template +SaberStatus SaberSliceV2::create(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m, + Context &ctx) { + auto starts = param.starts; + auto ends = param.ends; + auto axes = param.axes; + CHECK_EQ(axes.size(), starts.size()) << "the size of axes and starts are not equal "; + CHECK_EQ(ends.size(), starts.size()) << "the size of starts and ends are not valid"; + 
_starts.resize(starts.size()); + _ends.resize(ends.size()); + Shape output_shape = inputs[0]->valid_shape(); + Shape input_shape = inputs[0]->valid_shape(); + for (int i = 0; i < starts.size(); i++) { + int dim_value = input_shape[axes[i]]; + int start = starts[i] < 0 ? starts[i] + dim_value : starts[i]; + int end = ends[i] < 0 ? ends[i] + dim_value : ends[i]; + start = std::max(start, 0); + start = std::min(start, dim_value); + end = std::max(end, 0); + end = std::min(end, dim_value); + output_shape[axes[i]] = end - start; + _starts[i] = start; + _ends[i] = end; + } + return SaberSuccess; +} + + +template +SaberStatus SaberSliceV2::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + SliceV2Param& param) { + + //! inputs only has one tensor + Shape shape_in = inputs[0]->valid_shape(); + auto axes = param.axes; + CHECK_EQ(outputs.size(), 1) << "SliceV2 only support one output"; + const OpDataType* in_data = (const OpDataType*)inputs[0]->data(); + OpDataType* out_data = (OpDataType*)outputs[0]->mutable_data(); + auto out_stride = outputs[0]->get_stride(); + auto in_stride = inputs[0]->get_stride(); + int inner = inputs[0]->count_valid(param.axes.back() + 1, inputs[0]->dims()); + int out_outer_stride = outputs[0]->count_valid(param.axes[0], inputs[0]->dims()); + int in_outer_stride = inputs[0]->count_valid(param.axes[0], inputs[0]->dims()); + int count = outputs[0]->valid_size(); + auto out_shape = outputs[0]->valid_shape(); + + for (int i = 0; i < count; i++) { + int out_id = i / out_outer_stride; + int inner_id = i % inner; + int new_i = i / inner; + int in_offset = inner_id + out_id * in_outer_stride; + for (int k = _starts.size() - 1; k >= 0; k--) { + int cur_id = new_i % out_shape[axes[k]]; + in_offset += (cur_id + _starts[k]) * in_stride[axes[k]]; + new_i /= out_shape[axes[k]]; + } + out_data[i] = in_data[in_offset]; + } + + return SaberSuccess; + +} +DEFINE_OP_TEMPLATE(SaberSliceV2, SliceV2Param, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSliceV2, SliceV2Param, X86, AK_INT8); +} //namespace anakin + +} //namespace anakin diff --git a/saber/funcs/impl/x86/saber_slice_v2.h b/saber/funcs/impl/x86/saber_slice_v2.h new file mode 100644 index 000000000..c4ea7341a --- /dev/null +++ b/saber/funcs/impl/x86/saber_slice_v2.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_X86_SABER_SLICE_V2_H +#define ANAKIN_SABER_FUNCS_X86_SABER_SLICE_V2_H + +#include "saber/funcs/impl/impl_slice_v2.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSliceV2: + public ImplBase> { + +public: + + typedef typename DataTrait::Dtype OpDataType; + + SaberSliceV2() = default; + ~SaberSliceV2() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m, + Context &ctx) { + // get context + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + } + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m, + Context &ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SliceV2Param ¶m); + +private: + std::vector _starts; + std::vector _ends; + std::vector _axes; + +}; +template class SaberSliceV2; +} //namespace saber + +} //namespace anakin + +#endif //ANAKIN_SABER_FUNCS_X86_SABER_SLICE_V2_H diff --git a/saber/funcs/impl/x86/saber_soft_sign.cpp b/saber/funcs/impl/x86/saber_soft_sign.cpp new file mode 100644 index 000000000..9506083aa --- /dev/null +++ b/saber/funcs/impl/x86/saber_soft_sign.cpp @@ -0,0 +1,59 @@ +#include "saber/funcs/impl/x86/saber_soft_sign.h" +#include "mkl.h" +#if defined(__AVX2__) and defined(__FMA__) +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" +#endif +#include + +namespace anakin{ +namespace saber { + +template +SaberStatus SaberSoftSign::init( + const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template +SaberStatus SaberSoftSign::create( + const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam ¶m, + Context &ctx) { + this->_ctx = &ctx; + return SaberSuccess; +} + +template +SaberStatus SaberSoftSign::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam ¶m) { + // y= x / (1.0 + fabs(x)) + for (size_t vc = 0; vc < inputs.size(); vc++) { + size_t len = inputs[vc]->valid_size(); + OpDataType *input_data = (OpDataType*)inputs[vc]->mutable_data(); + OpDataType *output_data = (OpDataType*)outputs[vc]->mutable_data(); +//#if defined(__AVX2__) and defined(__FMA__) +// avx2_vector_soft_sign(input_data, len, output_data); +//#else +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < len; i++) { + OpDataType tmp = input_data[i] > 0 ? input_data[i] : -input_data[i]; + output_data[i] = input_data[i] / (1 + tmp); + } +//#endif + } + + return SaberSuccess; +} + +template class SaberSoftSign; +DEFINE_OP_TEMPLATE(SaberSoftSign, SoftSignParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSoftSign, SoftSignParam, X86, AK_INT8); +} +} // namespace anakin diff --git a/saber/funcs/impl/x86/saber_soft_sign.h b/saber/funcs/impl/x86/saber_soft_sign.h new file mode 100644 index 000000000..c29427170 --- /dev/null +++ b/saber/funcs/impl/x86/saber_soft_sign.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 Anakin Authors All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SOFT_SIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SOFT_SIGN_H + +#include "saber/funcs/impl/impl_soft_sign.h" + +namespace anakin { +namespace saber { + +template +class SaberSoftSign : + public ImplBase< + X86, OpDtype, + SoftSignParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberSoftSign() {} + + ~SaberSoftSign() {} + + virtual SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam ¶m, + Context &ctx) override; + + virtual SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam ¶m, + Context &ctx) override; + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam ¶m) override; + +private: + +}; + +} +} +#endif diff --git a/saber/funcs/impl/x86/saber_softmax.cpp b/saber/funcs/impl/x86/saber_softmax.cpp index e9cce0044..c91ff173d 100644 --- a/saber/funcs/impl/x86/saber_softmax.cpp +++ b/saber/funcs/impl/x86/saber_softmax.cpp @@ -1,7 +1,9 @@ #include "saber/funcs/impl/x86/saber_softmax.h" #include +#include "saber/funcs/impl/x86/saber_avx2_funcs.h" #include "mkl_cblas.h" #include "mkl_vml_functions.h" +#include "saber/funcs/impl/x86/kernel/jit_generator.h" namespace anakin { namespace saber { @@ -11,6 +13,9 @@ SaberStatus SaberSoftmax::init( std::vector& outputs, SoftmaxParam& param, Context& ctx) { this->_ctx = &ctx; + if (inputs[0]->get_dtype() != AK_FLOAT) { + _input_scale.re_alloc(inputs[0]->valid_shape(), AK_FLOAT); + } return create(inputs, outputs, param, ctx); } @@ -32,53 +37,67 @@ SaberStatus SaberSoftmax::create( _output_stride.reshape(sh); memcpy(_input_stride.mutable_data(), (inputs[0]->get_stride()).data(), sizeof(int) * _dims); memcpy(_output_stride.mutable_data(), (outputs[0]->get_stride()).data(), sizeof(int) * _dims); + + if (inputs[0]->get_dtype() != AK_FLOAT) { + utils::try_expand_tensor(_input_scale, inputs[0]->valid_shape()); + } return SaberSuccess; } + template -void _max(int n, const dtype *x, dtype *max_data) { - max_data[0] = x[0]; +void _max(int n, const dtype* x, dtype* output_max_data) { +// print_vec(x,n,"max"); + dtype max_data = x[0]; for (int c = 1; c < n; ++c) { - max_data[0] = max_data[0] > x[c] ? max_data[0] : x[c]; + max_data = max_data > x[c] ? 
max_data : x[c]; } + + output_max_data[0] = max_data; } template -void _sub(int n, dtype alpha, const dtype *x, dtype *y) { +void _sub(int n, dtype alpha, const dtype* x, dtype* y) { for (int c = 0; c < n; ++c) { y[c] = x[c] - alpha; } } template -void _exp(int n, const dtype *a, dtype *r) { +void _exp(int n, const dtype* a, dtype* r) { #if 1 vsExp(n, a, r); #else #pragma omp parallel for + for (int c = 0; c < n; ++c) { r[c] = expf(a[c]); } + #endif } template -void _sum(int n, const dtype *x, dtype *sum_data) { - sum_data[0] = 0; +void _sum(int n, const dtype* x, dtype* sum_data) { + dtype sum = 0; for (int c = 0; c < n; ++c) { - sum_data[0] += x[c]; + sum += x[c]; } + + sum_data[0] = sum; } template -void _scal (int n, dtype alpha, dtype *x) { +void _scal(int n, dtype alpha, dtype* x) { #if 0 cblas_sscal(n, alpha, x, 1); #else -#pragma omp parallel for for (int c = 0; c < n; ++c) { x[c] *= alpha; } + #endif } + + template SaberStatus SaberSoftmax::dispatch( const std::vector& inputs, @@ -89,78 +108,87 @@ SaberStatus SaberSoftmax::dispatch( int axis = param.axis; Shape sh_in = inputs[0]->valid_shape(); Shape sh_out = outputs[0]->valid_shape(); - bool use_avx2 = true; - use_avx2 = use_avx2 && (sh_in.count(axis + 1) == 1); -#if defined(__AVX2__) and defined(__FMA__) - if (use_avx2) { - int num = sh_in.count(0, axis); - int channel = sh_in.count(axis); - - const float *src_ptr = (const float *) inputs[0]->data(); - float *dst_ptr = (float *) outputs[0]->mutable_data(); - outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); - -#pragma omp parallel for schedule(static) - for (int ou = 0; ou < num; ou++) { - const float *src_data = src_ptr + ou * channel; - float *dst_data = dst_ptr + ou * channel; - float scalar = 0; - - _max(channel, src_data, &scalar); - _sub(channel, scalar, src_data, dst_data); - _exp(channel, dst_data, dst_data); - _sum(channel, dst_data, &scalar); - _scal(channel, float(1.f) / scalar, dst_data); - } - return SaberSuccess; - } -#endif - const OpDataType* data_in = (const OpDataType*)inputs[0]->data(); - OpDataType* data_out = (OpDataType*)outputs[0]->mutable_data(); - OpDataType* max_data = (OpDataType*)this->_max_data.mutable_data(); - const int* input_stride = (const int*)_input_stride.data(); - const int* output_stride = (const int*)_output_stride.data(); - int total_num = _inner_num * _outer_num; - - #pragma omp parallel for schedule(static) - - for (int num = 0; num < total_num; ++num) { - int num_tmp = num; - int in_index = 0, out_index = 0; + if (sh_in.get_layout() == Layout_NHWC) { + sh_in = Shape({sh_in.num(), sh_in.channel(), sh_in.height(), sh_in.width()}); + } - for (int i = _dims - 1; i >= 0; --i) { - if (i == axis) { - continue; + int axis_size = sh_in[axis]; + int outer_dim = sh_in.count(0, param.axis); + int inner_dim = sh_in.count(param.axis + 1, inputs[0]->dims()); + int batch_size = outer_dim * inner_dim; + const float* src_ptr = nullptr; + float* dst_ptr = (float*) outputs[0]->mutable_data(); + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); + + if (inputs[0]->get_dtype() == AK_FLOAT) { + src_ptr = static_cast(inputs[0]->data()); + } else if (inputs[0]->get_dtype() == AK_UINT8) { + DLOG(INFO) << "dispatch convert uint8 fp32"; + utils::ScaleUtils::scale_uint8_fp32(_input_scale, *inputs[0]); + src_ptr = static_cast(_input_scale.data()); + }else{ + LOG(INFO)<<"not support input "<get_dtype(); + } + if (avx2_can_used()){ +#if defined(__AVX2__) and defined(__FMA__) +#pragma omp parallel for schedule(static) if(outer_dim>1) + for(int outer_id=0; 
outer_iddata(); + OpDataType *data_out = (OpDataType *) outputs[0]->mutable_data(); + OpDataType *max_data = (OpDataType *) this->_max_data.mutable_data(); + const int *input_stride = (const int *) _input_stride.data(); + const int *output_stride = (const int *) _output_stride.data(); + int total_num = _inner_num * _outer_num; + + for (int num = 0; num < total_num; ++num) { + int num_tmp = num; + int in_index = 0, out_index = 0; + + for (int i = _dims - 1; i >= 0; --i) { + if (i == axis) { + continue; + } + + int pos = num_tmp % sh_in[i]; + in_index += pos * input_stride[i]; + out_index += pos * output_stride[i]; + num_tmp /= sh_in[i]; + } - OpDataType max = std::numeric_limits::lowest(); + OpDataType max = std::numeric_limits::lowest(); - for (int i = 0; i < _axis_size; ++i) { - max = data_in[in_index] > max ? data_in[in_index] : max; - in_index += input_stride[axis]; - } + for (int i = 0; i < _axis_size; ++i) { + max = data_in[in_index] > max ? data_in[in_index] : max; + in_index += input_stride[axis]; + } - OpDataType sum = (OpDataType)0; + OpDataType sum = (OpDataType) 0; - for (int i = 0; i < _axis_size; ++i) { - in_index -= input_stride[axis]; - max_data[_axis_size - i - 1] = expf(data_in[in_index] - max); - sum += max_data[_axis_size - i - 1]; - } + for (int i = 0; i < _axis_size; ++i) { + in_index -= input_stride[axis]; + max_data[_axis_size - i - 1] = expf(data_in[in_index] - max); + sum += max_data[_axis_size - i - 1]; + } - for (int i = 0; i < _axis_size; ++i) { - data_out[out_index] = max_data[i] / sum; - out_index += output_stride[axis]; + for (int i = 0; i < _axis_size; ++i) { + data_out[out_index] = max_data[i] / sum; + out_index += output_stride[axis]; + } } } + return SaberSuccess; } template class SaberSoftmax; diff --git a/saber/funcs/impl/x86/saber_softmax.h b/saber/funcs/impl/x86/saber_softmax.h index 2c502585b..76fe6aba4 100644 --- a/saber/funcs/impl/x86/saber_softmax.h +++ b/saber/funcs/impl/x86/saber_softmax.h @@ -57,6 +57,7 @@ class SaberSoftmax : Tensor _input_stride; Tensor _output_stride; Tensor _max_data; + Tensor _input_scale; }; } diff --git a/saber/funcs/impl/x86/saber_sproposal.cpp b/saber/funcs/impl/x86/saber_sproposal.cpp new file mode 100644 index 000000000..4341ad3dc --- /dev/null +++ b/saber/funcs/impl/x86/saber_sproposal.cpp @@ -0,0 +1,372 @@ + +#include "saber/funcs/impl/x86/saber_sproposal.h" +#include "mkl.h" +#include +#include + +namespace anakin { +namespace saber { + +struct abox{ + float batch_ind; + float x1; + float y1; + float x2; + float y2; + float score; + bool operator < (const abox&tmp) const { + return score < tmp.score; + } +}; + +template<> +std::vector SaberSProposal::mkanchor(float w, float h, float x_ctr, float y_ctr){ + std::vector tmp; + tmp.push_back(x_ctr - 0.5 * (w - 1)); + tmp.push_back(y_ctr - 0.5 * (h - 1)); + tmp.push_back(x_ctr + 0.5 * (w - 1)); + tmp.push_back(y_ctr + 0.5 * (h - 1)); + return tmp; +} + +template<> +std::vector SaberSProposal::whctrs(std::vector anchor){ + std::vector result; + result.push_back(anchor[2] - anchor[0] + 1); //w + result.push_back(anchor[3] - anchor[1] + 1); //h + result.push_back((anchor[2] + anchor[0]) / 2); //ctrx + result.push_back((anchor[3] + anchor[1]) / 2); //ctry + return result; +} + +template<> +std::vector > SaberSProposal::scale_enum(std::vector anchor){ + std::vector > result; + std::vector reform_anchor = whctrs(anchor); + float x_ctr = reform_anchor[2]; + float y_ctr = reform_anchor[3]; + float w = reform_anchor[0]; + float h = reform_anchor[1]; + for (int i = 0; i < 
_anchor_scales.size(); ++i) { + float ws = w * _anchor_scales[i]; + float hs = h * _anchor_scales[i]; + std::vector tmp = mkanchor(ws, hs, x_ctr, y_ctr); + result.push_back(tmp); + } + return result; +} + +template<> +std::vector > SaberSProposal::ratio_enum(std::vector anchor){ + std::vector > result; + std::vector reform_anchor = whctrs(anchor); + float x_ctr = reform_anchor[2]; + float y_ctr = reform_anchor[3]; + float size = reform_anchor[0] * reform_anchor[1]; + for (int i = 0; i < _ratios.size(); ++i) { + float size_ratios = size / _ratios[i]; + float ws = round(std::sqrt(size_ratios)); + float hs = round(ws * _ratios[i]); + std::vector tmp = mkanchor(ws, hs, x_ctr, y_ctr); + result.push_back(tmp); + } + return result; +} + +template<> +void SaberSProposal::generate_anchors(){ + //generate base anchor + std::vector base_anchor; + base_anchor.push_back(0); + base_anchor.push_back(0); + base_anchor.push_back(_base_size - 1); + base_anchor.push_back(_base_size - 1); + //enum ratio anchors + std::vector >ratio_anchors = ratio_enum(base_anchor); + for (int i = 0; i < ratio_anchors.size(); ++i) { + std::vector > tmp = scale_enum(ratio_anchors[i]); + _gen_anchors.insert(_gen_anchors.end(), tmp.begin(), tmp.end()); + } +} + +void nms(std::vector &input_boxes, float nms_thresh) { + std::vector vArea(input_boxes.size()); + for (int i = 0; i < input_boxes.size(); ++i) { + vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1) + * (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1); + } + for (int i = 0; i < input_boxes.size(); ++i) { + for (int j = i + 1; j < input_boxes.size();) { + float xx1 = std::max(input_boxes[i].x1, input_boxes[j].x1); + float yy1 = std::max(input_boxes[i].y1, input_boxes[j].y1); + float xx2 = std::min(input_boxes[i].x2, input_boxes[j].x2); + float yy2 = std::min(input_boxes[i].y2, input_boxes[j].y2); + float w = std::max(float(0), xx2 - xx1 + 1); + float h = std::max(float(0), yy2 - yy1 + 1); + float inter = w * h; + float ovr = inter / (vArea[i] + vArea[j] - inter); + if (ovr >= nms_thresh) { + input_boxes.erase(input_boxes.begin() + j); + vArea.erase(vArea.begin() + j); + } else { + j++; + } + } + } +} + +template<> +SaberStatus SaberSProposal::create( + const std::vector *>& inputs, + std::vector *>& outputs, + SProposalParam ¶m, + Context &ctx) { + + _map_width = inputs[1]->width(); //feat_width + _map_height = inputs[1]->height(); //feat_height + int length = std::max(_map_width, _map_height); + int step = _map_width * _map_height; + Shape local_anchors_shape({1, _anchors_nums * 4, _map_height, _map_width}, Layout_NCHW); + Shape map_m_shape({length}, Layout_W); + Shape step_shape({step}, Layout_W); + _local_anchors.reshape(local_anchors_shape); + _map_m_tensor.reshape(map_m_shape); + _shift_x_tensor.reshape(step_shape); + _shift_y_tensor.reshape(step_shape); + return SaberSuccess; +} + +template<> +SaberStatus SaberSProposal::init( + const std::vector *>& inputs, + std::vector *>& outputs, + SProposalParam ¶m, + Context &ctx) { + + this->_ctx = &ctx; + _anchor_scales.clear(); + _ratios.clear(); + _feat_stride = param.feat_stride; + _base_size = param.basesize; + _min_size = param.boxminsize; + _pre_nms_topN = param.pre_nms_topn; + _post_nms_topN = param.post_nms_topn; + _nms_thresh = param.nms_thresh; + int scales_num = param.scale.size(); + for (int i = 0; i < scales_num; ++i) { + _anchor_scales.push_back(param.scale[i]); + } + int ratios_num = param.ratio.size(); + for (int i = 0; i < ratios_num; ++i) { + _ratios.push_back(param.ratio[i]); + } + + 
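+    // Note (added for clarity): generate_anchors() builds the base anchor
+    // [0, 0, _base_size - 1, _base_size - 1], enumerates it over _ratios via
+    // ratio_enum() and then over _anchor_scales via scale_enum(), so
+    // _anchors_nums ends up as _ratios.size() * _anchor_scales.size()
+    // (e.g. 3 ratios x 3 scales -> 9 anchors per feature-map location).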
generate_anchors(); + + _anchors_nums = _gen_anchors.size(); + Shape anchors_shape({_anchors_nums * 4}, Layout_W); + _anchors_tensor.re_alloc(anchors_shape, AK_FLOAT); + _anchors = (int*)_anchors_tensor.mutable_data(); + + for (int i = 0; i<_gen_anchors.size(); ++i) { + for (int j = 0; j < _gen_anchors[i].size(); ++j) { + _anchors[i * 4 + j] = _gen_anchors[i][j]; + } + } + _map_width = inputs[1]->width(); //feat_width + _map_height = inputs[1]->height(); //feat_height + int length = std::max(_map_width, _map_height); + int step = _map_width * _map_height; + Shape local_anchors_shape({1, _anchors_nums * 4, _map_height, _map_width}, Layout_NCHW); + Shape map_m_shape({length}, Layout_W); + Shape step_shape({step}, Layout_W); + _local_anchors.re_alloc(local_anchors_shape, AK_FLOAT); + _map_m_tensor.re_alloc(map_m_shape, AK_FLOAT); + _shift_x_tensor.re_alloc(step_shape, AK_FLOAT); + _shift_y_tensor.re_alloc(step_shape, AK_FLOAT); + return create(inputs, outputs, param, ctx); +} + +template<> +SaberStatus SaberSProposal::dispatch( + const std::vector *>& inputs, + std::vector *>& outputs, + SProposalParam ¶m) { + + _map_width = inputs[1]->width(); //feat_width + _map_height = inputs[1]->height(); //feat_height + //int channel = inputs[1]->channel(); + + //get boxs_delta,向右。 + auto m_box_ = inputs[1]; + //get sores 向右,前面_anchors_nums个位bg的得分,后面_anchors_nums为fg得分,我们需要的是后面的。 + auto m_score_ = inputs[0]; + //get im_info + const float* img_info = (const float*)inputs[2]->data(); + int img_info_h = inputs[2]->height(); + int img_info_w = inputs[2]->width(); + _src_height = img_info[0]; + _src_width = img_info[1 * img_info_h * img_info_w]; + _src_scale = img_info[2 * img_info_h * img_info_w]; + + //gen local anchors 向右 + int length = std::max(_map_width, _map_height); + int step = _map_width * _map_height; + int *_map_m = (int*)_map_m_tensor.mutable_data(); + for (int i = 0; i < length; ++i) { + _map_m[i] = i * _feat_stride; + } + float *_shift_x = (float*)_shift_x_tensor.mutable_data(); + float *_shift_y = (float*)_shift_y_tensor.mutable_data(); + for (int i = 0; i < _map_height; ++i) { + for (int j = 0; j < _map_width; ++j) { + _shift_x[i * _map_width + j] = _map_m[j]; + _shift_y[i * _map_width + j] = _map_m[i]; + } + } + + float *local_anchors_ptr = (float*)_local_anchors.mutable_data(); + for (int i = 0; i < _anchors_nums; ++i) { + for (int j = 0; j < step; ++j) { + (local_anchors_ptr + (i * 4 + 0) * step)[j] = float(_anchors[i * 4 + 0]); + } + for (int j = 0; j < step; ++j) { + (local_anchors_ptr + (i * 4 + 1) * step)[j] = float(_anchors[i * 4 + 1]); + } + for (int j = 0; j < step; ++j) { + (local_anchors_ptr + (i * 4 + 2) * step)[j] = float(_anchors[i * 4 + 2]); + } + for (int j = 0; j < step; ++j) { + (local_anchors_ptr + (i * 4 + 3) * step)[j] = float(_anchors[i * 4 + 3]); + } + cblas_saxpy(step, float(1), _shift_x, 1, local_anchors_ptr + (i * 4 + 0) * step, 1); + cblas_saxpy(step, float(1), _shift_x, 1, local_anchors_ptr + (i * 4 + 2) * step, 1); + cblas_saxpy(step, float(1), _shift_y, 1, local_anchors_ptr + (i * 4 + 1) * step, 1); + cblas_saxpy(step, float(1), _shift_y, 1, local_anchors_ptr + (i * 4 + 3) * step, 1); + } + + //Convert anchors into proposals via bbox transformations + + int channel = m_box_->channel(); + int height = m_box_->height(); + int width = m_box_->width(); + int m_box_step = height * width; + float* m_box_ptr = (float*)m_box_->mutable_data(); // bbox_deltas + + for (int i = 0; i < channel / 4; ++i) { + +// // [xmin, ymin, xmax, ymax] -> [width, height, ctr_x, ctr_y] + 
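+        // Note (added for clarity): the BLAS/VML calls below first rewrite the
+        // anchor from corner form [x1, y1, x2, y2] into centre/size form, then
+        // apply the usual box decode with the deltas held in m_box_ptr:
+        //   pred_ctr = anchor_ctr + delta_xy * anchor_wh
+        //   pred_wh  = anchor_wh * exp(delta_wh)
+        // The decoded centre/size boxes stay in m_box_ptr and are converted to
+        // clipped corners further down when `aboxes` is filled.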
cblas_saxpy(2 * m_box_step, float(-1), + local_anchors_ptr + (i * 4 + 0) * m_box_step, 1, + local_anchors_ptr + (i * 4 + 2) * m_box_step, 1); + for (int i = 0; i < 2 * m_box_step; ++i) { + (local_anchors_ptr + (i * 4 + 2) * m_box_step)[i] += float(1); + } + cblas_saxpy(2 * m_box_step, float(0.5), + local_anchors_ptr + (i * 4 + 2) * m_box_step, 1, + local_anchors_ptr + (i * 4 + 0) * m_box_step, 1); + + // add offset: ctr_x = ctr_x + tx * width_delta, ctr_y = ctr_y + ty * height_delta + vsMul(2 * m_box_step, + local_anchors_ptr + (i * 4 + 2) * m_box_step, + m_box_ptr + (i * 4 + 0) * m_box_step, + m_box_ptr + (i * 4 + 0) * m_box_step); + + vsAdd(2 * m_box_step, + local_anchors_ptr + (i * 4 + 0) * m_box_step, + m_box_ptr + (i * 4 + 0) * m_box_step, + m_box_ptr + (i * 4 + 0) * m_box_step); + + // add offset: width = width * exp(width_delta), height = height * exp(height_delta) + vsExp(2 * m_box_step, + m_box_ptr + (i * 4 + 2) * m_box_step, + m_box_ptr + (i * 4 + 2) * m_box_step); + + vsMul(2 * m_box_step, + local_anchors_ptr + (i * 4 + 2) * m_box_step, + m_box_ptr + (i * 4 + 2) * m_box_step, + m_box_ptr + (i * 4 + 2) * m_box_step); +// +// // do not reverse the quantities +// // leaving [width, height, ctr_x, ctr_y] -> [xmin, ymin, xmax, ymax] undone. + } + + std::vector aboxes; + + int map_width = m_box_->width(); + int map_height = m_box_->height(); + int map_channel = m_box_->channel(); + const float *box = (const float*)m_box_->data(); // bbox_deltas + const float *score = (const float*)m_score_->data(); // scores + + int offset_step = 4 * map_height * map_width; + int one_step = map_height * map_width; + int offset_w, offset_h, offset_x, offset_y, offset_s; + + for (int h = 0; h < map_height; ++h) { + for (int w = 0; w < map_width; ++w) { + offset_x = h * map_width + w; + offset_y = offset_x + one_step; + offset_w = offset_y + one_step; + offset_h = offset_w + one_step; + offset_s = one_step * _anchors_nums + h * map_width + w; + for (int c = 0; c < map_channel / 4; ++c) { + float width = box[offset_w], height = box[offset_h]; + abox tmp; + tmp.batch_ind = 0; + tmp.x1 = box[offset_x] - 0.5 * width; + tmp.y1 = box[offset_y] - 0.5 * height; + tmp.x2 = box[offset_x] + 0.5 * width; + tmp.y2 = box[offset_y] + 0.5 * height; + tmp.x1 = std::min(std::max(tmp.x1, 0.f), _src_width - 1.f); + tmp.y1 = std::min(std::max(tmp.y1, 0.f), _src_height - 1.f); + tmp.x2 = std::min(std::max(tmp.x2, 0.f), _src_width - 1.f); + tmp.y2 = std::min(std::max(tmp.y2, 0.f), _src_height - 1.f); + tmp.score = score[offset_s]; + aboxes.push_back(tmp); + offset_x += offset_step; + offset_y += offset_step; + offset_w += offset_step; + offset_h += offset_step; + offset_s += one_step; + } + } + } + + std::sort(aboxes.rbegin(), aboxes.rend()); //降序 + + if (_pre_nms_topN > 0 && _pre_nms_topN < aboxes.size()) { + int tmp = std::min((size_t)_pre_nms_topN, aboxes.size()); + aboxes.erase(aboxes.begin() + tmp, aboxes.end()); + } + + nms(aboxes,_nms_thresh); + + if (_post_nms_topN > 0) { + int tmp = std::min((size_t)_post_nms_topN, aboxes.size()); + aboxes.erase(aboxes.begin() + tmp, aboxes.end()); + } + Shape output_shape({1, aboxes.size(), 5, 1}, Layout_NCHW); + outputs[0]->reshape(output_shape); + float *top0 = (float*)outputs[0]->mutable_data(); + int output_offset = outputs[0]->height() * outputs[0]->width(); + for (int i = 0; i < aboxes.size(); ++i) { + //caffe_copy(aboxes.size() * 5, (float*)aboxes.data(), top0); + top0[0] = aboxes[i].batch_ind; + top0[1] = aboxes[i].x1; + top0[2] = aboxes[i].y1; + top0[3] = aboxes[i].x2; + 
top0[4] = aboxes[i].y2; +// top0 += outputs[0]->offset(0, 1); + top0 += output_offset; + } + + return SaberSuccess; +} + +template class SaberSProposal; +DEFINE_OP_TEMPLATE(SaberSProposal, SProposalParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSProposal, SProposalParam, X86, AK_INT8); + +} //namespace saber. +} //namespace anakin. diff --git a/saber/funcs/impl/x86/saber_sproposal.h b/saber/funcs/impl/x86/saber_sproposal.h new file mode 100644 index 000000000..f132796f2 --- /dev/null +++ b/saber/funcs/impl/x86/saber_sproposal.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SPROPOSAL_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SPROPOSAL_H + +#include "saber/funcs/impl/impl_sproposal.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSProposal: + public ImplBase> { + +public: + + SaberSProposal() = default; + + ~SaberSProposal() = default; + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SProposalParam ¶m, + Context &ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SProposalParam ¶m, + Context &ctx) override; + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SProposalParam ¶m) override; + +private: + void generate_anchors(); + std::vector > ratio_enum(std::vector); + std::vector whctrs(std::vector); + std::vector mkanchor(float w,float h,float x_ctr,float y_ctr); + std::vector > scale_enum(std::vector); + + int _feat_stride{0}; + int _base_size{0}; + int _min_size{0}; + int _pre_nms_topN{0}; + int _post_nms_topN{0}; + float _nms_thresh{0}; + std::vector _anchor_scales; + std::vector _ratios; + + std::vector > _gen_anchors; + int *_anchors{nullptr}; + int _anchors_nums{0}; + int _src_height{0}; + int _src_width{0}; + float _src_scale{0}; + int _map_width{0}; + int _map_height{0}; + + Tensor _local_anchors; + Tensor _shift_x_tensor; + Tensor _shift_y_tensor; + Tensor _map_m_tensor; + Tensor _anchors_tensor; +}; + +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SPROPOSAL_H diff --git a/saber/funcs/impl/x86/saber_sroi_align.cpp b/saber/funcs/impl/x86/saber_sroi_align.cpp new file mode 100644 index 000000000..2266f6141 --- /dev/null +++ b/saber/funcs/impl/x86/saber_sroi_align.cpp @@ -0,0 +1,131 @@ + +#include "saber/funcs/impl/x86/saber_sroi_align.h" +#include +#include + +namespace anakin { + +namespace saber { + +template <> +SaberStatus SaberSRoiAlign::create(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + SRoiAlignParam& param, + Context &ctx) { + return SaberSuccess; +} + +template <> +SaberStatus SaberSRoiAlign::init(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + SRoiAlignParam& param, + Context &ctx) { + + this->_ctx = &ctx; + + CHECK_GT(param.pooled_h, 0) + << "pooled_h must be > 0"; + CHECK_GT(param.pooled_w, 0) + << "pooled_w must be > 0"; + _pooled_height = param.pooled_h; + _pooled_width = param.pooled_w; + 
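+    // Note (added for clarity): spatial_scale maps ROI coordinates given in
+    // input-image pixels onto this feature map (e.g. 1/16 for a stride-16
+    // backbone); dispatch() multiplies the raw ROI corners by it before pooling.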
_spatial_scale = param.spatial_scale; + LOG(INFO) << "Spatial scale: " << _spatial_scale; + _channels = inputs[0]->channel(); + _height = inputs[0]->height(); + _width = inputs[0]->width(); + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberSRoiAlign::dispatch(\ + const std::vector *>& inputs, \ + std::vector *>& outputs, \ + SRoiAlignParam& param) { + + const float* bottom_data = (const float*)inputs[0]->data(); + const float* bottom_rois = (const float*)inputs[1]->data(); + // Number of ROIs + int num_rois = inputs[1]->num(); + int batch_size = inputs[0]->num(); + float* top_data = (float*)outputs[0]->mutable_data(); + + int in_0_c = inputs[0]->channel(); + int in_0_h = inputs[0]->height(); + int in_0_w = inputs[0]->width(); + int in_1_c = inputs[1]->channel(); + int in_1_h = inputs[1]->height(); + int in_1_w = inputs[1]->width(); + int out_0_h = outputs[0]->height(); + int out_0_w = outputs[0]->width(); + // For each ROI R = [batch_index x1 y1 x2 y2]: roi align over R + for (int n = 0; n < num_rois; ++n) { + int roi_batch_ind = (int)bottom_rois[0]; + float roi_start_w = bottom_rois[1] * _spatial_scale; + float roi_start_h = bottom_rois[2] * _spatial_scale; + float roi_end_w = bottom_rois[3] * _spatial_scale; + float roi_end_h = bottom_rois[4] * _spatial_scale; + CHECK_GE(roi_batch_ind, 0); + CHECK_LT(roi_batch_ind, batch_size); + + float roi_height = std::max(roi_end_h - roi_start_h + 1, static_cast(0.)); + float roi_width = std::max(roi_end_w - roi_start_w + 1, static_cast(0.)); + const float bin_size_h = static_cast(roi_height) + / static_cast(_pooled_height - 1.); + const float bin_size_w = static_cast(roi_width) + / static_cast(_pooled_width - 1.); + + int offset_roi_batch_ind = roi_batch_ind * in_0_c * in_0_h * in_0_w; + const float* batch_data = bottom_data + offset_roi_batch_ind; + + for (int c = 0; c < _channels; ++c) { + for (int ph = 0; ph < _pooled_height; ++ph) { + for (int pw = 0; pw < _pooled_width; ++pw) { + float h = static_cast(ph) * bin_size_h + roi_start_h; + float w = static_cast(pw) * bin_size_w + roi_start_w; + + int hstart = std::min(static_cast(floor(h)), _height - 2); + int wstart = std::min(static_cast(floor(w)), _width - 2); + + bool is_empty(h < 0 || h >= _height || w < 0 || w >= _width); + const int pool_index = ph * _pooled_width + pw; + if (is_empty) { + top_data[pool_index] = 0; + } + else { + float h_ratio = h - static_cast(hstart); + float w_ratio = w - static_cast(wstart); + int upleft = hstart * _width + wstart; + int upright = upleft + 1; + int downleft = upleft + _width; + int downright = downleft + 1; + + top_data[pool_index] = batch_data[upleft] * (1.f - h_ratio) * (1.f - w_ratio) + + batch_data[upright] * (1.f - h_ratio) * w_ratio + + batch_data[downleft] * h_ratio * (1.f - w_ratio) + + batch_data[downright] * h_ratio * w_ratio; + } + } + } + // Increment all data pointers by one channel +// batch_data += inputs[0]->offset(0, 1); +// top_data += outputs[0]->offset(0, 1); + batch_data += in_0_h * in_0_w; + top_data += out_0_h * out_0_w; + } + // Increment ROI data pointer +// bottom_rois += inputs[1]->offset(1); + bottom_rois += in_1_c * in_1_h * in_1_w; + } + + return SaberSuccess; +} + +template class SaberSRoiAlign; +DEFINE_OP_TEMPLATE(SaberSRoiAlign, SRoiAlignParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberSRoiAlign, SRoiAlignParam, X86, AK_INT8); + +} //namespace saber. +} //namespace anakin. 
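+// Note (added for clarity): the pooling above samples, for each output cell
+// (ph, pw), the point h = ph * bin_size_h + roi_start_h,
+// w = pw * bin_size_w + roi_start_w, and bilinearly interpolates the four
+// neighbouring feature-map values with weights (1-h_ratio)*(1-w_ratio),
+// (1-h_ratio)*w_ratio, h_ratio*(1-w_ratio) and h_ratio*w_ratio; sample points
+// that fall outside the feature map produce zeros.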
diff --git a/saber/funcs/impl/x86/saber_sroi_align.h b/saber/funcs/impl/x86/saber_sroi_align.h new file mode 100644 index 000000000..f411a1c0a --- /dev/null +++ b/saber/funcs/impl/x86/saber_sroi_align.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SROI_ALIGN_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SROI_ALIGN_H + +#include "saber/funcs/impl/impl_sroi_align.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberSRoiAlign: + public ImplBase> { + +public: + + SaberSRoiAlign() = default; + + ~SaberSRoiAlign() = default; + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + SRoiAlignParam ¶m, + Context &ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + SRoiAlignParam ¶m, + Context &ctx) override; + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + SRoiAlignParam ¶m) override; + +private: + int _channels; + int _height; + int _width; + int _pooled_height; + int _pooled_width; + float _spatial_scale; +}; + +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_SROI_ALIGN_H \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_topk_avg_pooling.cpp b/saber/funcs/impl/x86/saber_topk_avg_pooling.cpp index 7653500aa..ca40b1bee 100644 --- a/saber/funcs/impl/x86/saber_topk_avg_pooling.cpp +++ b/saber/funcs/impl/x86/saber_topk_avg_pooling.cpp @@ -43,6 +43,7 @@ SaberStatus SaberTopKAvgPooling::get_topk(std::vector& for (int k = real_k; k < top_k; k++) { dst[k] = (OpDataType) 0.f; } + return SaberSuccess; } diff --git a/saber/funcs/impl/x86/saber_topk_pooling.cpp b/saber/funcs/impl/x86/saber_topk_pooling.cpp index a52ff68c8..4eee9dff6 100644 --- a/saber/funcs/impl/x86/saber_topk_pooling.cpp +++ b/saber/funcs/impl/x86/saber_topk_pooling.cpp @@ -43,6 +43,7 @@ SaberStatus SaberTopKPooling::get_topk(std::vector& sr for (int k = real_k; k < top_k; k++) { dst[k] = (OpDataType) 0.f; } + return SaberSuccess; } template @@ -76,7 +77,7 @@ SaberStatus SaberTopKPooling::dispatch( int feat_map_size = height_stride * width_stride; for (int c = 0; c < channel; c++) { OpDataType* tmp_out_data = output_data + (i * channel + c) * top_k; - OpDataType* tmp_in_data = input_data + (i * channel + c) * feat_map_size; + const OpDataType* tmp_in_data = input_data + (i * channel + c) * feat_map_size; std::vector vec; for (int h = 0; h < height; h++) { diff --git a/saber/funcs/impl/x86/saber_yolo_box.cpp b/saber/funcs/impl/x86/saber_yolo_box.cpp new file mode 100644 index 000000000..8cdc5dc77 --- /dev/null +++ b/saber/funcs/impl/x86/saber_yolo_box.cpp @@ -0,0 +1,158 @@ + +#include "saber/funcs/impl/x86/saber_yolo_box.h" +#include +namespace anakin { +namespace saber { + +namespace { + +inline float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +inline void get_yolo_box(float* box, const float* x, const int* anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int 
index, int stride, + int img_height, int img_width) { + + box[0] = (i + sigmoid(x[index])) * img_width / grid_size; + box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size; +} + +inline int get_entry_index(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +inline void calc_detection_box(float* boxes, float* box, const int box_idx, + const int img_height, + const int img_width) { + + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : static_cast(img_height - 1); +} + +inline void calc_label_score(float* scores, const float* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} +} + +template <> +SaberStatus SaberYoloBox::create( + const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam& param, Context& ctx) { + + return SaberSuccess; +} + +template <> +SaberStatus SaberYoloBox::init( + const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam& param, Context& ctx) { + + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberYoloBox::dispatch( + const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam& param) { + + auto* input = inputs[0]; + auto* imgsize = inputs[1]; + auto* boxes = outputs[0]; + auto* scores = outputs[1]; + auto anchors = param.anchors; + int class_num = param.class_num; + float conf_thresh = param.conf_thresh; + int downsample_ratio = param.downsample_ratio; + + const int n = input->num(); + const int h = input->height(); + const int w = input->width(); + const int box_num = boxes->valid_shape()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + auto anchors_data = anchors.data(); + + const float* input_data = (const float*)input->data(); + const float* imgsize_data = (const float*)imgsize->data(); + + float* boxes_data = (float*)boxes->mutable_data(); +// memset(boxes_data, 0, boxes->numel() * sizeof(float)); + + float* scores_data = (float*)scores->mutable_data(); +// memset(scores_data, 0, scores->numel() * sizeof(float)); + + float box[4]; + for (int i = 0; i < n; i++) { + int img_height = imgsize_data[2 * i]; + int img_width = imgsize_data[2 * i + 1]; + + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + int obj_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 4); + float conf = sigmoid(input_data[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 0); 
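+          // Note (added for clarity): get_yolo_box() applies the YOLO-style decode
+          //   bx = (grid_x + sigmoid(tx)) * img_w / grid_size
+          //   by = (grid_y + sigmoid(ty)) * img_h / grid_size
+          //   bw = exp(tw) * anchor_w * img_w / input_size   (bh analogously)
+          // and calc_detection_box() turns the centre/size box into corners
+          // clipped to the image; conf here is sigmoid(objectness).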
+ get_yolo_box(box, input_data, anchors_data, l, k, j, h, input_size, + box_idx, stride, img_height, img_width); + box_idx = (i * box_num + j * stride + k * w + l) * 4; + calc_detection_box(boxes_data, box, box_idx, img_height, + img_width); + + int label_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 5); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + calc_label_score(scores_data, input_data, label_idx, score_idx, + class_num, conf, stride); + } + } + } + } + + return SaberSuccess; +} + +template class SaberYoloBox; +DEFINE_OP_TEMPLATE(SaberYoloBox, YoloBoxParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(SaberYoloBox, YoloBoxParam, X86, AK_INT8); + +} // namespace saber. +} // namespace anakin. \ No newline at end of file diff --git a/saber/funcs/impl/x86/saber_yolo_box.h b/saber/funcs/impl/x86/saber_yolo_box.h new file mode 100644 index 000000000..865ba9ccc --- /dev/null +++ b/saber/funcs/impl/x86/saber_yolo_box.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_SABER_YOLO_BOX_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_SABER_YOLO_BOX_H + +#include "saber/funcs/impl/impl_yolo_box.h" + +namespace anakin{ + +namespace saber{ + +template +class SaberYoloBox : + public ImplBase> { + +public: + + SaberYoloBox() = default; + ~SaberYoloBox() = default; + + SaberStatus init(const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam ¶m, + Context &ctx) override; + + SaberStatus create(const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam ¶m, + Context &ctx) override; + + SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + YoloBoxParam ¶m) override; + +private: +}; +} + +} + +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_SABER_YOLO_BOX_H diff --git a/saber/funcs/impl/x86/sequence2batch.cpp b/saber/funcs/impl/x86/sequence2batch.cpp index 8210ca964..fe21db525 100644 --- a/saber/funcs/impl/x86/sequence2batch.cpp +++ b/saber/funcs/impl/x86/sequence2batch.cpp @@ -29,13 +29,14 @@ void CopyMatrixRowsFunctor::operator()( LOG(ERROR) << "hidden size should be divided with no remainder by fragment_num."; exit(-1); } + typedef typename DataTrait::PtrDtype Data_ptr; auto height = dst_shape[0]; auto dst_width = dst_shape[1] / fragment_num; auto src_width = src_shape[1] / fragment_num; auto real_width = (width != 0) ? width : (dst_width > src_width ? 
src_width : dst_width); - auto* src_data = src->data(); - auto* dst_data = dst->mutable_data(); + Data_ptr src_data = static_cast(src->data()); + Data_ptr dst_data = static_cast(dst->mutable_data()); if (is_src_index) { #pragma omp parallel for collapse(2) diff --git a/saber/funcs/impl/x86/sequence2batch.h b/saber/funcs/impl/x86/sequence2batch.h index 1bf1b5b9c..ca2b58e4f 100644 --- a/saber/funcs/impl/x86/sequence2batch.h +++ b/saber/funcs/impl/x86/sequence2batch.h @@ -6,6 +6,7 @@ #include "saber/core/tensor.h" #include "saber/funcs/impl/x86/x86_utils.h" +#include "saber/funcs/impl/x86/anakin_thread.h" namespace anakin { namespace saber { @@ -361,8 +362,8 @@ class SequenceToBatch { std::vector seqStartAndLength_; std::vector batchStartPositions_; std::vector seq2BatchIdx_; - size_t numBatch_; - int thread_num = omp_get_max_threads(); + size_t numBatch_{0}; + int thread_num = anakin_get_max_threads(); }; } // namespace math } // namespace saber diff --git a/saber/funcs/impl/x86/vender_conv.cpp b/saber/funcs/impl/x86/vender_conv.cpp index e69de29bb..65591623d 100644 --- a/saber/funcs/impl/x86/vender_conv.cpp +++ b/saber/funcs/impl/x86/vender_conv.cpp @@ -0,0 +1,259 @@ +#include "anakin_config.h" +#ifndef USE_SGX +#include "saber/funcs/impl/x86/vender_conv.h" + +namespace anakin { +namespace saber { + +template +SaberStatus VenderConv2D::init_conv_prv_any(const std::vector *>& inputs, + std::vector *>& outputs, ConvParam& param){ + + _engine = std::make_shared(mkldnn::engine::cpu, 0); + _alg = mkldnn::algorithm::convolution_direct; + _stream = std::make_shared(mkldnn::stream::kind::eager); + + Shape in_sh = inputs[0]->valid_shape(); + Shape out_sh = outputs[0]->valid_shape(); + std::vector b_sh = {out_sh.channel()}; + std::vector w_sh = param.weight()->valid_shape(); + + auto in_md = create_mkldnn_memory_desc(in_sh); + auto bias_md = create_mkldnn_memory_desc(b_sh); + auto weights_md = create_mkldnn_memory_desc(w_sh); + auto out_md = create_mkldnn_memory_desc(out_sh); + + mkldnn_mem_dim strides = {param.stride_h, param.stride_w}; + mkldnn_mem_dim dilation = {param.dilation_h, param.dilation_w}; + mkldnn_mem_dim padding = {param.pad_h, param.pad_w}; + + bool with_bias = param.bias() && param.bias() -> valid_size() > 0 ? true : false; + bool with_dilation = (param.dilation_w == 1 && param.dilation_h == 1)? 
false : true; + + //TODO:here we ignored group + std::shared_ptr > conv_desc; + if (with_bias && with_dilation){ + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, bias_md, out_md, strides, dilation, padding, padding, + mkldnn::padding_kind::zero); + } else if (with_bias){ + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + } else if (with_dilation){ + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, out_md, strides, dilation, padding, padding, + mkldnn::padding_kind::zero); + } else { + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + } + + pdesc conv_prv_desc = pdesc(*conv_desc, *_engine); + + //above: make convolution_primitive_description + //below: make memorys + + //make input_memory and weights_memory for user + _in_mem = create_mkldnn_memory_no_data(inputs[0], *_engine); + _w_mem = create_mkldnn_memory(param.mutable_weight(), w_sh, + mkldnn_mem_format::oihw, mkldnn_mem_dtype::f32, *_engine); + + //set input_memory and weights_memory for conv + _conv_in_mem = _in_mem; + if (pdesc(conv_prv_desc.src_primitive_desc()) != _in_mem->get_primitive_desc()){ + _conv_in_mem.reset(new mkldnn_mem(conv_prv_desc.src_primitive_desc())); + _prvs.push_back(mkldnn::reorder(*_in_mem, *_conv_in_mem)); + } + //std::vector weights_trans; + _conv_w_mem = _w_mem; + if (pdesc(conv_prv_desc.weights_primitive_desc()) != _w_mem->get_primitive_desc()){ + _conv_w_mem.reset(new mkldnn_mem(conv_prv_desc.weights_primitive_desc())); + + //weights_trans.push_back(mkldnn::reorder(w_mem, conv_w_mem)); + _prvs.push_back(mkldnn::reorder(*_w_mem, *_conv_w_mem)); + } + + //set output_memory for user and conv + _out_mem = create_mkldnn_memory_no_data(outputs[0], *_engine); + _conv_out_mem = _out_mem; + if (pdesc(conv_prv_desc.dst_primitive_desc()) != _out_mem->get_primitive_desc()){ + _conv_out_mem.reset(new mkldnn_mem(conv_prv_desc.dst_primitive_desc())); + } + + //set bias_memory for user and conv + //make convolution primitive + if (with_bias){ + _bias_mem = create_mkldnn_memory(param.mutable_bias(), b_sh, + mkldnn_mem_format::x, mkldnn_mem_dtype::f32, *_engine); + _conv_bias_mem = _bias_mem; + if (pdesc(conv_prv_desc.bias_primitive_desc()) != _bias_mem->get_primitive_desc()){ + _conv_bias_mem.reset(new mkldnn_mem(conv_prv_desc.bias_primitive_desc())); + _prvs.push_back(mkldnn::reorder(*_bias_mem, *_conv_bias_mem)); + } + + _prvs.push_back(mkldnn_conv(conv_prv_desc, *_conv_in_mem, *_conv_w_mem, *_conv_bias_mem, *_conv_out_mem)); + } else { + _prvs.push_back(mkldnn_conv(conv_prv_desc, *_conv_in_mem, *_conv_w_mem, *_conv_out_mem)); + } + + bool with_relu = param.activation_param.has_active && + param.activation_param.active == Active_relu; + float n_slope = param.activation_param.negative_slope; + if (with_relu){ + desc relu_desc = desc(mkldnn::prop_kind::forward_inference, + mkldnn::algorithm::eltwise_relu, conv_prv_desc.dst_primitive_desc().desc(), n_slope); + pdesc relu_pdesc = pdesc(relu_desc, *_engine); + _prvs.push_back(mkldnn_relu(relu_pdesc, *_conv_out_mem, *_conv_out_mem)); + } + + //check output_memory need reorder + if (_conv_out_mem->get_primitive_desc() != _out_mem->get_primitive_desc()){ + _prvs.push_back(mkldnn::reorder(*_conv_out_mem, *_out_mem)); + } + + //trans weights + 
//mkldnn::stream(mkldnn::stream::kind::eager).submit(weights_trans).wait(); + return SaberSuccess; + +} + +template +SaberStatus VenderConv2D::init_conv_prv_specify(const std::vector *>& inputs, + std::vector *>& outputs, ConvParam& param){ + + _engine = std::make_shared(mkldnn::engine::cpu, 0); + _alg = mkldnn::algorithm::convolution_direct; + + Shape in_sh = inputs[0]->valid_shape(); + Shape out_sh = outputs[0]->valid_shape(); + std::vector b_sh = {out_sh.channel()}; + std::vector w_sh = param.weight()->valid_shape(); + + auto in_md = create_mkldnn_memory_desc(in_sh, + get_mkldnn_dtype(inputs[0]->get_dtype()), get_mkldnn_format(inputs[0]->get_layout())); + auto bias_md = create_mkldnn_memory_desc(b_sh, + get_mkldnn_dtype(inputs[0]->get_dtype()), mkldnn_mem_format::x); + auto weights_md = create_mkldnn_memory_desc(w_sh); + auto out_md = create_mkldnn_memory_desc(out_sh, + get_mkldnn_dtype(outputs[0]->get_dtype()), get_mkldnn_format(outputs[0]->get_layout())); + + mkldnn_mem_dim strides = {param.stride_h, param.stride_w}; + mkldnn_mem_dim dilation = {param.dilation_h, param.dilation_w}; + mkldnn_mem_dim padding = {param.pad_h, param.pad_w}; + + bool with_bias = param.bias() && param.bias() -> valid_size() > 0 ? true : false; + bool with_dilation = (param.dilation_w == 1 && param.dilation_h == 1)? false : true; + + //TODO:here we ignored group + std::shared_ptr > conv_desc; + if (with_bias && with_dilation){ + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, bias_md, out_md, strides, dilation, padding, padding, + mkldnn::padding_kind::zero); + } else if (with_bias){ + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + } else if (with_dilation){ + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, out_md, strides, dilation, padding, padding, + mkldnn::padding_kind::zero); + } else { + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + } + + pdesc conv_prv_desc = pdesc(*conv_desc, *_engine); + //above: make convolution_primitive_description + //below: make memorys + + //make input_memory and weights_memory for user + _in_mem = create_mkldnn_memory_no_data(inputs[0], *_engine); + _w_mem = create_mkldnn_memory(param.mutable_weight(), w_sh, + get_mkldnn_format(param.weight()->get_layout()), + get_mkldnn_dtype(param.weight()->get_dtype()), *_engine); + + //set output_memory for user and conv + _out_mem = create_mkldnn_memory_no_data(outputs[0], *_engine); + + //set bias_memory for user and conv + //make convolution primitive + _conv_w_mem = _w_mem; + if (pdesc(conv_prv_desc.weights_primitive_desc()) != _w_mem->get_primitive_desc()){ + _conv_w_mem.reset(new mkldnn_mem(conv_prv_desc.weights_primitive_desc())); + + //weights_trans.push_back(mkldnn::reorder(w_mem, conv_w_mem)); + _pre_prvs.push_back(mkldnn::reorder(*_w_mem, *_conv_w_mem)); + } + + if (with_bias){ + _bias_mem = create_mkldnn_memory(param.mutable_bias(), b_sh, + mkldnn_mem_format::x, get_mkldnn_dtype(param.bias()->get_dtype()), *_engine); + + _prvs.push_back(mkldnn_conv(conv_prv_desc, *_in_mem, *_conv_w_mem, *_bias_mem, *_out_mem)); + } else { + _prvs.push_back(mkldnn_conv(conv_prv_desc, *_in_mem, *_conv_w_mem, *_out_mem)); + } + + bool with_relu = param.activation_param.has_active && + 
param.activation_param.active == Active_relu; + float n_slope = param.activation_param.negative_slope; + if (with_relu){ + desc relu_desc = desc(mkldnn::prop_kind::forward_inference, + mkldnn::algorithm::eltwise_relu, conv_prv_desc.dst_primitive_desc().desc(), n_slope); + pdesc relu_pdesc = pdesc(relu_desc, *_engine); + _prvs.push_back(mkldnn_relu(relu_pdesc, *_out_mem, *_out_mem)); + } + + //trans weights + mkldnn::stream(mkldnn::stream::kind::eager).submit(_pre_prvs).wait(); + return SaberSuccess; + +} + +template <> +SaberStatus VenderConv2D::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + + //init_conv_prv_any(inputs, outputs, param); + return init_conv_prv_specify(inputs, outputs, param); +} + +template <> +SaberStatus VenderConv2D::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + this->_ctx = &ctx; + if(param.group>1){ + return SaberUnImplError; + } + return create(inputs, outputs, param, ctx); + +} + +template <> +SaberStatus VenderConv2D::\ +dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + if(param.group>1){ + return SaberUnImplError; + } + //bind data + _in_mem->set_data_handle(inputs[0]->data()); + _out_mem->set_data_handle(outputs[0]->mutable_data()); + //submit stream + mkldnn::stream(mkldnn::stream::kind::eager).submit(_prvs).wait(); + return SaberSuccess; +} + +DEFINE_OP_TEMPLATE(VenderConv2D, ConvParam, X86, AK_HALF); +DEFINE_OP_TEMPLATE(VenderConv2D, ConvParam, X86, AK_INT8); + +} +} +#endif diff --git a/saber/funcs/impl/x86/vender_conv.h b/saber/funcs/impl/x86/vender_conv.h index e69de29bb..c84a930a5 100644 --- a/saber/funcs/impl/x86/vender_conv.h +++ b/saber/funcs/impl/x86/vender_conv.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_CONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_CONV_H + +#include "saber/funcs/impl/impl_conv.h" +#include "saber/funcs/impl/x86/mkldnn_helper.h" + +namespace anakin { +namespace saber { + +template +class VenderConv2D : public ImplBase< + X86, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + VenderConv2D(){} + + ~VenderConv2D() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context &ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param); + + SaberStatus trans_weights(Tensor &target_weights, + Tensor &target_bias, int pad_h, int pad_w, + int dilation_h, int dilation_w, int stride_h, + int stride_w, int group) { + return SaberUnImplError; + } + +private: + SaberStatus init_conv_prv_any(const std::vector*>& inputs, + std::vector*>& outputs, ConvParam& param); + SaberStatus init_conv_prv_specify(const std::vector*>& inputs, + std::vector*>& outputs, ConvParam& param); + + +private: + std::shared_ptr _engine; + mkldnn::algorithm _alg; + std::vector _prvs; + std::vector _pre_prvs; + std::shared_ptr _stream; + + mkldnn_mem_ptr _conv_in_mem; + mkldnn_mem_ptr _conv_w_mem; + mkldnn_mem_ptr _conv_bias_mem; + mkldnn_mem_ptr _conv_out_mem; + + mkldnn_mem_ptr _in_mem; + mkldnn_mem_ptr _w_mem; + mkldnn_mem_ptr _bias_mem; + mkldnn_mem_ptr _out_mem; + + int _in_order; + int _out_order; + + + + +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_CONV_H diff --git a/saber/funcs/impl/x86/vender_deconv.cpp b/saber/funcs/impl/x86/vender_deconv.cpp new file mode 100644 index 000000000..b1332cf65 --- /dev/null +++ b/saber/funcs/impl/x86/vender_deconv.cpp @@ -0,0 +1,171 @@ +#include "anakin_config.h" +#ifndef USE_SGX +#include "saber/funcs/impl/x86/vender_deconv.h" +#include "saber/funcs/impl/x86/mkldnn_helper.h" + +namespace anakin { +namespace saber { + +template +SaberStatus VenderDeconv2D::init_conv_prv(const std::vector *>& inputs, + std::vector *>& outputs, ConvParam& param) { + + _engine = std::make_shared(mkldnn::engine::cpu, 0); + _alg = mkldnn::algorithm::deconvolution_direct; + _stream = std::make_shared(mkldnn::stream::kind::eager); + + Shape in_sh = inputs[0]->valid_shape(); + Shape out_sh = outputs[0]->valid_shape(); + std::vector b_sh = {out_sh.channel()}; + std::vector w_sh = param.weight()->valid_shape(); + + auto in_md = create_mkldnn_memory_desc(in_sh); + auto bias_md = create_mkldnn_memory_desc(b_sh); + auto weights_md = create_mkldnn_memory_desc(w_sh); + auto out_md = create_mkldnn_memory_desc(out_sh); + + mkldnn_mem_dim strides = {param.stride_h, param.stride_w}; + mkldnn_mem_dim dilation = {param.dilation_h, param.dilation_w}; + mkldnn_mem_dim padding = {param.pad_h, param.pad_w}; + + bool with_bias = param.bias() && param.bias() -> valid_size() > 0 ? true : false; + bool with_dilation = (param.dilation_w == 1 && param.dilation_h == 1) ? 
false : true; + + //TODO:here we ignored group + std::shared_ptr > conv_desc; + + if (with_bias && with_dilation) { + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, bias_md, out_md, strides, dilation, padding, padding, + mkldnn::padding_kind::zero); + } else if (with_bias) { + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + } else if (with_dilation) { + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, out_md, strides, dilation, padding, padding, + mkldnn::padding_kind::zero); + } else { + conv_desc = std::make_shared >(mkldnn::prop_kind::forward_inference, _alg, + in_md, weights_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + LOG(INFO)<<"it is me"; + } + + pdesc conv_prv_desc = pdesc(*conv_desc, *_engine); + //above: make convolution_primitive_description + //below: make memorys + + //make input_memory and weights_memory for user + _in_mem = create_mkldnn_memory_no_data(inputs[0], *_engine); + _w_mem = create_mkldnn_memory(param.mutable_weight(), w_sh, + mkldnn_mem_format::oihw, mkldnn_mem_dtype::f32, *_engine); + + //set input_memory and weights_memory for conv + _conv_in_mem = _in_mem; + + if (pdesc(conv_prv_desc.src_primitive_desc()) != _in_mem->get_primitive_desc()) { + _conv_in_mem.reset(new mkldnn_mem(conv_prv_desc.src_primitive_desc())); + _prvs.push_back(mkldnn::reorder(*_in_mem, *_conv_in_mem)); + } + + //std::vector weights_trans; + _conv_w_mem = _w_mem; +// LOG(INFO)<<"conv weight mem "<get_primitive_desc().desc().data.format; +// if (pdesc(conv_prv_desc.weights_primitive_desc()) != _w_mem->get_primitive_desc()) { +// _conv_w_mem.reset(new mkldnn_mem(conv_prv_desc.weights_primitive_desc())); +// //weights_trans.push_back(mkldnn::reorder(w_mem, conv_w_mem)); +// _prvs_weights_trans.push_back(mkldnn::reorder(*_w_mem, *_conv_w_mem)); +// mkldnn::stream(mkldnn::stream::kind::eager).submit(_prvs_weights_trans).wait(); +// +// LOG(INFO)<<"change weights"; +// } + + //set output_memory for user and conv + _out_mem = create_mkldnn_memory_no_data(outputs[0], *_engine); + _conv_out_mem = _out_mem; + + if (pdesc(conv_prv_desc.dst_primitive_desc()) != _out_mem->get_primitive_desc()) { + _conv_out_mem.reset(new mkldnn_mem(conv_prv_desc.dst_primitive_desc())); + } + + //set bias_memory for user and conv + //make convolution primitive + if (with_bias) { + _bias_mem = create_mkldnn_memory(param.mutable_bias(), b_sh, + mkldnn_mem_format::x, mkldnn_mem_dtype::f32, *_engine); + _conv_bias_mem = _bias_mem; + + if (pdesc(conv_prv_desc.bias_primitive_desc()) != _bias_mem->get_primitive_desc()) { + _conv_bias_mem.reset(new mkldnn_mem(conv_prv_desc.bias_primitive_desc())); + _prvs_weights_trans.push_back(mkldnn::reorder(*_bias_mem, *_conv_bias_mem)); + } + + _prvs.push_back(mkldnn_deconv(conv_prv_desc, *_conv_in_mem, *_conv_w_mem, *_conv_bias_mem, + *_conv_out_mem)); + } else { + LOG(INFO)<<"no bias"; + _prvs.push_back(mkldnn_deconv(conv_prv_desc, *_conv_in_mem, *_conv_w_mem, *_conv_out_mem)); + } + + bool with_relu = param.activation_param.has_active && + param.activation_param.active == Active_relu; + float n_slope = param.activation_param.negative_slope; + + if (with_relu) { + desc relu_desc = desc(mkldnn::prop_kind::forward_inference, + mkldnn::algorithm::eltwise_relu, conv_prv_desc.dst_primitive_desc().desc(), n_slope); + pdesc relu_pdesc = pdesc(relu_desc, 
*_engine); + _prvs.push_back(mkldnn_relu(relu_pdesc, *_conv_out_mem, *_conv_out_mem)); + } + LOG(INFO)<<"conv out mem "<<_conv_out_mem->get_primitive_desc().desc().data.format; + LOG(INFO)<<"out mem "<<_out_mem->get_primitive_desc().desc().data.format; + //check output_memory need reorder + if (_conv_out_mem->get_primitive_desc() != _out_mem->get_primitive_desc()) { + + _prvs.push_back(mkldnn::reorder(*_conv_out_mem, *_out_mem)); + } + + mkldnn::stream(mkldnn::stream::kind::eager).submit(_prvs_weights_trans).wait(); + //trans weights + //mkldnn::stream(mkldnn::stream::kind::eager).submit(weights_trans).wait(); + return SaberSuccess; +} + +template <> +SaberStatus VenderDeconv2D::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + + return init_conv_prv(inputs, outputs, param); +} + +template <> +SaberStatus VenderDeconv2D::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + +} + +template <> +SaberStatus VenderDeconv2D::\ +dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param) { + //bind data + _in_mem->set_data_handle(inputs[0]->data()); + _out_mem->set_data_handle(outputs[0]->mutable_data()); + //submit stream + //LOG(ERROR)<<"submitting _stream prvs"; + //_stream->submit(_prvs).wait(); + mkldnn::stream(mkldnn::stream::kind::eager).submit(_prvs).wait(); + return SaberSuccess; +} + +} +} +#endif diff --git a/saber/funcs/impl/x86/vender_deconv.h b/saber/funcs/impl/x86/vender_deconv.h new file mode 100644 index 000000000..ade293c13 --- /dev/null +++ b/saber/funcs/impl/x86/vender_deconv.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_DECONV_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_DECONV_H + +#include "anakin_config.h" +#include "saber/funcs/impl/impl_deconv.h" + +#ifndef USE_SGX +#include "saber/funcs/impl/x86/mkldnn_helper.h" +#endif + +namespace anakin { +namespace saber { + +template +class VenderDeconv2D : public ImplBase < + X86, OpDtype, ConvParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + VenderDeconv2D() {} + + ~VenderDeconv2D() {} + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector*>& inputs, + std::vector*>& outputs, + ConvParam& param); + + SaberStatus trans_weights(Tensor& target_weights, + Tensor& target_bias, int pad_h, int pad_w, + int dilation_h, int dilation_w, int stride_h, + int stride_w, int group) { + return SaberUnImplError; + } + +private: + SaberStatus init_conv_prv(const std::vector*>& inputs, + std::vector*>& outputs, ConvParam& param); + +private: + std::shared_ptr _engine; + mkldnn::algorithm _alg; + std::vector _prvs; + std::vector _prvs_weights_trans; + std::shared_ptr _stream; + + mkldnn_mem_ptr _conv_in_mem; + mkldnn_mem_ptr _conv_w_mem; + mkldnn_mem_ptr _conv_bias_mem; + mkldnn_mem_ptr _conv_out_mem; + + mkldnn_mem_ptr _in_mem; + mkldnn_mem_ptr _w_mem; + mkldnn_mem_ptr _bias_mem; + mkldnn_mem_ptr _out_mem; + + int _in_order; + int _out_order; + + +}; + +} // namespace saber +} // namespace anakin + +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_DECONV_H diff --git a/saber/funcs/impl/x86/vender_fc.cpp b/saber/funcs/impl/x86/vender_fc.cpp index 92474632e..ab7d762f4 100644 --- a/saber/funcs/impl/x86/vender_fc.cpp +++ b/saber/funcs/impl/x86/vender_fc.cpp @@ -2,33 +2,38 @@ #include "saber/funcs/impl/x86/x86_utils.h" #include "mkl_cblas.h" #include "mkl_vml_functions.h" +#include "tensor_op.h" namespace anakin { namespace saber { typedef MKL_INT cblas_int; -template class VenderFc; +template <> +void VenderFc::clean() { + if (bias_sum) { + free(bias_sum); + bias_sum = nullptr; + } -template -SaberStatus VenderFc - ::init(const std::vector *>& inputs, - std::vector *>& outputs, - FcParam ¶m, Context &ctx) { - this->_ctx = &ctx; + for (int i = packed_weights.size() - 1; i >= 0; i--) { + float* pw = packed_weights[i]; + cblas_sgemm_free(pw); + pw = nullptr; + packed_weights.pop_back(); + } - return create(inputs, outputs, param, ctx); + std::vector().swap(packed_weights); } -template -SaberStatus VenderFc - ::create(const std::vector *>& inputs, - std::vector *>& outputs, - FcParam ¶m, Context &ctx) { - //check - CHECK_EQ(OpDtype, AK_FLOAT) << "vender fc only supports FP32 currently"; - + +template <> +SaberStatus VenderFc +::create(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, Context& ctx) { + this->_ctx = &ctx; this->_param = ¶m; @@ -37,12 +42,19 @@ SaberStatus VenderFc // weights for (int i = packed_weights.size() - 1; i >= 0; i--) { - cblas_sgemm_free(packed_weights[i]); + cblas_sgemm_free(packed_weights[i]); } + std::vector ().swap(packed_weights); - const float *weights = (const float*)param.weights->data(); + const float* weights = (const float*)param.weights->data(); + + if (_need_weights_trans) { + weights = static_cast(_weights_trans.data()); + } + int total_IC = 0; + for (int i = 0; i < inputs.size(); i++) { cblas_int IC = 
inputs[i]->count_valid(param.axis, inputs[i]->dims()); packed_weights.push_back(cblas_sgemm_alloc(CblasAMatrix, OC, MB, IC)); @@ -58,19 +70,96 @@ SaberStatus VenderFc // LOG(INFO) << "anakin input[" << i << "] pack passed"; } + CHECK_EQ(inputs.size(), 1); + + if (inputs[0]->get_dtype() != AK_FLOAT) { + utils::try_expand_tensor(_input_scale, inputs[0]->valid_shape()); + } + return SaberSuccess; } -template -SaberStatus VenderFc - ::dispatch(const std::vector *>& inputs, - std::vector *>& outputs, - FcParam ¶m) { - - //check - CHECK_EQ(OpDtype, AK_FLOAT) << "vender fc only supports FP32 currently"; +template <> +SaberStatus VenderFc +::init(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, Context& ctx) { + this->_ctx = &ctx; + LayoutType in_layout = inputs[0]->get_layout(); + LayoutType out_layout = outputs[0]->get_layout(); + + if (in_layout == Layout_NCHW_C8R && out_layout == Layout_NCHW) { + CHECK(inputs[0]->channel() % 8 == 0) << "only support channel div 8 == 0"; + _need_weights_trans = true; + _weights_trans.re_alloc(param.weights->valid_shape()); + int oc_value = param.weights->height(); + int oc_stride = param.weights->width(); + int ic_value = inputs[0]->channel(); + int c_value_div_8 = ic_value / 8; + int hw_value = inputs[0]->height() * inputs[0]->width(); + float* out_weights = static_cast(_weights_trans.mutable_data()); + const float* in_weights = static_cast(param.weights->data()); + + for (int oc = 0; oc < oc_value; oc++) { + for (int ic_div_8 = 0; ic_div_8 < c_value_div_8; ic_div_8++) { + for (int hw = 0; hw < hw_value; hw++) { + for (int inner_c = 0; inner_c < 8; inner_c++) { + int out_index = oc * oc_stride + ic_div_8 * hw_value * 8 + hw * 8 + inner_c; + int in_index = oc * oc_stride + (ic_div_8 * 8 + inner_c) * hw_value + hw; + out_weights[out_index] = in_weights[in_index]; + } + } + } + } + + DLOG(INFO) << "ak trans weights nchw to c8r"; + } else if (in_layout == Layout_NHWC && out_layout == Layout_NCHW) { + _need_weights_trans = true; + _weights_trans.re_alloc(param.weights->valid_shape()); + int oc_value = param.weights->height(); + int oc_stride = param.weights->width(); + int ic_value = inputs[0]->channel(); + int hw_value = inputs[0]->height() * inputs[0]->width(); + float* out_weights = static_cast(_weights_trans.mutable_data()); + const float* in_weights = static_cast(param.weights->data()); + + for (int oc = 0; oc < oc_value; oc++) { + for (int hw = 0; hw < hw_value; hw++) { + for (int ic = 0; ic < ic_value; ic++) { + int out_index = oc * oc_stride + hw * ic_value + ic; + int in_index = oc * oc_stride + ic * hw_value + hw; + out_weights[out_index] = in_weights[in_index]; + } + } + } + + DLOG(INFO) << "ak trans weights nchw to nchwc"; + } else if ((in_layout == Layout_NCHW || in_layout == Layout_NC || in_layout == Layout_NHW + || in_layout == Layout_HW) + && out_layout == Layout_NCHW) { + _need_weights_trans = false; + } else { + LOG(FATAL) << "not support input layout in = " << inputs[0]->get_layout() << " , out = " << + outputs[0]->get_layout(); + } + + CHECK_EQ(inputs.size(), 1); + + if (inputs[0]->get_dtype() != AK_FLOAT) { + _input_scale.re_alloc(inputs[0]->valid_shape(), AK_FLOAT); + } - float* dst = (float *)outputs[0]->mutable_data(); + return create(inputs, outputs, param, ctx); +} + + +template <> +SaberStatus VenderFc +::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param) { + + float* dst = (float*)outputs[0]->mutable_data(); const float* bias = NULL; if (param.bias) { @@ -78,9 +167,21 @@ 
SaberStatus VenderFc } for (int i = 0; i < inputs.size(); i++) { - const float* src = static_cast(inputs[i]->data()); + + const float* src = nullptr; + + if (inputs[i]->get_dtype() == AK_FLOAT) { + src = static_cast(inputs[i]->data()); + } else if (inputs[i]->get_dtype() == AK_UINT8) { + DLOG(INFO) << "dispatch convert uint8 fp32"; + utils::ScaleUtils::scale_uint8_fp32(_input_scale, *inputs[i]); + src = static_cast(_input_scale.data()); + } + + cblas_int IC = inputs[i]->count_valid(param.axis, inputs[i]->dims()); - if(i == 0) { + + if (i == 0) { // C := alpha * op(A) * op(B) + beta * C cblas_sgemm_compute(CblasColMajor, // Layout CblasPacked, // a @@ -100,6 +201,7 @@ SaberStatus VenderFc 1.0, // beta dst, OC); // c, ldc } + //LOG(INFO) << "anakin compute[" << i << "] passed"; // LOG(INFO) << "inputs[]:dims: " << inputs[0]->dims(); @@ -111,6 +213,7 @@ SaberStatus VenderFc if (bias) { #pragma omp parallel for schedule(static) + for (cblas_int mb = 0; mb < MB; mb++) { cblas_saxpy(OC, 1.0, bias, 1.0, dst + mb * OC, 1); } @@ -118,7 +221,231 @@ SaberStatus VenderFc return SaberSuccess; } +template class VenderFc; + + +template <> +void VenderFc::clean() { + if (ws_) { + zfree(ws_); + ws_ = nullptr; + } +} + +template <> +SaberStatus VenderFc::create(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, + Context& ctx) { + if (inputs[0]->get_dtype() == AK_INT8 || inputs[0]->get_dtype() == AK_FLOAT) { + return SaberSuccess; + } + + if (ws_) { + zfree(ws_); + ws_ = nullptr; + } + + // LOG(INFO)<<"batch size = "<<_batch_size<<","<<_output_channel; + ws_ = zmalloc(_batch_size * _output_channel * sizeof(int), 256); + + if (ws_ == nullptr) { + LOG(FATAL) << "OutOfMem"; + return SaberOutOfMem; + } + + if (inputs[0]->get_dtype() == AK_FLOAT) { + utils::try_expand_tensor(_input_scale, inputs[0]->valid_shape()); + } + + return SaberSuccess; +} + +template <> +SaberStatus VenderFc::init(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param, + Context& ctx) { + if (inputs[0]->get_dtype() == AK_INT8 || inputs[0]->get_dtype() == AK_FLOAT) { + int m = inputs[0]->count_valid(0, param.axis); + int n = outputs[0]->channel(); + int k = inputs[0]->count_valid(param.axis, inputs[0]->dims()); + CHECK(inputs[0]->get_scale().size() > 0); + + _packed_int8_gemm.init(false, true, m, n, k, *param.weights, inputs[0]->get_scale()[0]); + return SaberSuccess; + } + + this->_ctx = &ctx; + this->_param = ¶m; + + CHECK(inputs[0]->get_dtype() == AK_FLOAT + || inputs[0]->get_dtype() == AK_UINT8) << "not support input type " << inputs[0]->get_dtype(); + CHECK_GT(inputs[0]->get_scale().size(), 0) << "input scale must >0"; + CHECK_GT(outputs[0]->get_scale().size(), 0) << "output scale must >0"; + + _output_channel = outputs[0]->channel(); + _batch_size = inputs[0]->count_valid(0, param.axis); + + if (param.weights->get_dtype() == AK_FLOAT) { + _need_weights_trans = true; + _weights_trans.re_alloc(param.weights->valid_shape(), AK_INT8); + utils::ScaleUtils::scale_fc_weights_to_nchw_host(_weights_trans, *param.weights); + // LOG(INFO)<<"input shape "<valid_shape()<<" , weights shape "<valid_shape(); + } + + if (_need_weights_trans) { + for (int i = 0; i < _output_channel; i ++) { + _scale.push_back((inputs[0]->get_scale()[0] * _weights_trans.get_scale()[i]) / + outputs[0]->get_scale()[0]); + } + } else { + for (int i = 0; i < _output_channel; i ++) { + _scale.push_back((inputs[0]->get_scale()[0] * param.weights->get_scale()[i]) / + outputs[0]->get_scale()[0]); + } + } + + if (param.bias != nullptr 
&& param.bias->valid_size() > 0 && param.bias->get_dtype() == AK_FLOAT) { + _bias_scale.re_alloc(param.bias->valid_shape(), AK_INT32); + _bias_scale.set_scale(_scale); + utils::ScaleUtils::scale_bias_fp32_int32(_bias_scale, *param.bias); + } + + _is_transpose_weights = param.is_transpose_weights ? + CblasNoTrans : + CblasTrans; + + if (inputs[0]->get_dtype() == AK_FLOAT) { + _input_scale.re_alloc(inputs[0]->valid_shape(), AK_UINT8); + } + + return create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus VenderFc::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + FcParam& param) { + + if (inputs[0]->get_dtype() == AK_INT8 || inputs[0]->get_dtype() == AK_FLOAT) { + int m = inputs[0]->count_valid(0, param.axis); + _packed_int8_gemm.dispatch(1.f, 0.f, m, *inputs[0], *outputs[0], param.bias); + return SaberSuccess; + } + +#define __FC_PARALLEL_FUNC [&](int mb, int oc) { \ + int dst_index = mb * _output_channel + oc; \ + if (bias) { \ + dst[dst_index] = (_scale[oc] == 1.f) ? \ + static_cast(ws_)[dst_index] + bias[oc] : \ + _scale[oc] * (static_cast(ws_)[dst_index] + bias[oc]); \ + } else { \ + dst[dst_index] = (_scale[oc] == 1.f) ? \ + dst[dst_index] = static_cast(ws_)[dst_index] : \ + _scale[oc] * static_cast(ws_)[dst_index]; \ + } \ +} + + int c_offset = 0; + int total_ic = 0; + + auto bias = param.bias != nullptr && param.bias->valid_size() > 0 ? + (param.bias->get_dtype() == AK_INT32 ? static_cast(param.bias->data()) : + static_cast(_bias_scale.data())) + : nullptr; + + for (int i = 0; i < inputs.size(); i++) { + int IC = inputs[i]->count_valid(param.axis, inputs[i]->dims()); + + auto src = static_cast(inputs[i]->data()); + + if (inputs[i]->get_dtype() == AK_FLOAT) { + utils::ScaleUtils::scale_fp32_uint8(_input_scale, *inputs[0]); + src = static_cast(_input_scale.data()); + // print_tensor(_input_scale); + } + + auto weight = static_cast(param.weights->data()) + total_ic * _output_channel; + + if (_need_weights_trans) { + // LOG(INFO)<<"weights trans"; + weight = static_cast(_weights_trans.data()) + total_ic * _output_channel; + // print_tensor(_weights_trans); + } + + // for(auto a:_scale){ + // LOG(INFO)<<"scale = "<(ws_), // c + _output_channel, // ldc + &c_offset); + } else { + cblas_gemm_s8u8s32(CblasColMajor, + _is_transpose_weights, + CblasNoTrans, + CblasFixOffset, + _output_channel, + _batch_size, + IC, + 1.0, + weight, + IC, + 0, + src, + IC, + 0, + 1.0, + static_cast(ws_), + _output_channel, + &c_offset); + } + + total_ic += IC; + } + + auto dst_dtype = outputs[0]->get_dtype(); + + if (dst_dtype == AK_FLOAT) { + auto dst = static_cast(outputs[0]->mutable_data()); + parallel_nd(_batch_size, _output_channel, __FC_PARALLEL_FUNC); + } else if (dst_dtype == AK_INT32) { + auto dst = static_cast(outputs[0]->mutable_data()); + parallel_nd(_batch_size, _output_channel, __FC_PARALLEL_FUNC); + } else if (dst_dtype == AK_INT8) { + auto dst = static_cast(outputs[0]->mutable_data()); + parallel_nd(_batch_size, _output_channel, __FC_PARALLEL_FUNC); + } else { + LOG(FATAL) << "not support this type " << dst_dtype; + return SaberUnImplError; + } + + return SaberSuccess; +} + +template class VenderFc; + DEFINE_OP_TEMPLATE(VenderFc, FcParam, X86, AK_HALF); -DEFINE_OP_TEMPLATE(VenderFc, FcParam, X86, AK_INT8); + } // namespace saber } // namespace anakin diff --git a/saber/funcs/impl/x86/vender_fc.h b/saber/funcs/impl/x86/vender_fc.h index d6d0e34fb..a794f5fbd 100644 --- a/saber/funcs/impl/x86/vender_fc.h +++ b/saber/funcs/impl/x86/vender_fc.h @@ -20,6 +20,7 @@ #include 
"mkl_cblas.h" #include "saber/funcs/impl/impl_fc.h" +#include "saber/funcs/impl/x86/mkl_packed_int8_gemm.h" namespace anakin { namespace saber { @@ -29,22 +30,12 @@ class VenderFc : public ImplBase > { public: typedef typename DataTrait::Dtype OpDataType; - VenderFc() : bias_sum(nullptr) + VenderFc() : bias_sum(nullptr),_need_weights_trans(false),ws_(nullptr),MB(0),OC(0), + _batch_size(0),_output_channel(0),_is_transpose_weights(CblasNoTrans) {} ~VenderFc() { - if (bias_sum) { - free(bias_sum); - bias_sum = nullptr; - } - - for (int i = packed_weights.size() - 1; i >= 0; i--) { - OpDataType *pw = packed_weights[i]; - cblas_sgemm_free(pw); - pw = nullptr; - packed_weights.pop_back(); - } - std::vector ().swap(packed_weights); + clean(); } virtual SaberStatus init(const std::vector *>& inputs, @@ -60,12 +51,24 @@ class VenderFc : public ImplBase > { virtual SaberStatus dispatch(const std::vector *>& inputs, std::vector *>& outputs, FcParam ¶m) override; + virtual void clean(); private: OpDataType *bias_sum; int MB; int OC; - std::vector packed_weights; + Tensor _weights_trans; + bool _need_weights_trans; + std::vector packed_weights; + void *ws_; + int _batch_size; + int _output_channel; + std::vector _scale; + CBLAS_TRANSPOSE _is_transpose_weights;//trans in mklml + Tensor _input_scale; + Tensor _bias_scale; + + PackedMKLInt8Gemm _packed_int8_gemm; }; diff --git a/saber/funcs/impl/x86/vender_gru.cpp b/saber/funcs/impl/x86/vender_gru.cpp index 78f3e65f9..2a73a7c40 100644 --- a/saber/funcs/impl/x86/vender_gru.cpp +++ b/saber/funcs/impl/x86/vender_gru.cpp @@ -17,7 +17,7 @@ SaberStatus VenderGru::init( std::vector& outputs, GruParam& param, Context& ctx) { this->_ctx = &ctx; - this->max_thread_num_ = omp_get_max_threads(); + this->max_thread_num_ = anakin_get_max_threads(); hidden_size_ = outputs[0]->channel(); word_size_ = inputs[0]->channel(); diff --git a/saber/funcs/impl/x86/vender_lstm.cpp b/saber/funcs/impl/x86/vender_lstm.cpp index ae8b3a056..b5e6d047c 100644 --- a/saber/funcs/impl/x86/vender_lstm.cpp +++ b/saber/funcs/impl/x86/vender_lstm.cpp @@ -29,9 +29,13 @@ SaberStatus VenderLstm::init( const std::vector& inputs, std::vector& outputs, LstmParam& param, Context& ctx) { +#ifdef USE_SGX + const char *ret = "1"; +#else const char* ret = std::getenv("OMP_NUM_THREADS"); +#endif this->_ctx = &ctx; - this->max_thread_num_ = ret ? atoi(ret) : omp_get_max_threads(); + this->max_thread_num_ = ret ? atoi(ret) : anakin_get_max_threads(); int layer_num_ = param.num_layers; int direc_num_ = param.num_direction; hidden_size_ = outputs[0]->channel() / direc_num_; @@ -630,7 +634,7 @@ SaberStatus VenderLstm::dispatch( int i_offset = 1; int c_offset = 2; int o_offset = 3; - omp_set_nested(1); + anakin_set_nested(1); mkl_set_dynamic(0); if (batch_size_ == 1) { diff --git a/saber/funcs/impl/x86/vender_lstm.h b/saber/funcs/impl/x86/vender_lstm.h index ffe5f08ff..b9a607d70 100644 --- a/saber/funcs/impl/x86/vender_lstm.h +++ b/saber/funcs/impl/x86/vender_lstm.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "saber/funcs/impl/impl_lstm.h" #include "saber_funcs_param.h" #include "saber/funcs/impl/x86/x86_utils.h" -#include #include "mkl_cblas.h" #include "mkl_vml_functions.h" #include "mkl_service.h" @@ -94,4 +93,4 @@ class VenderLstm: public ImplBase < } // namespace saber } // namespace anakin -#endif // ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_LSTM_H \ No newline at end of file +#endif // ANAKIN_SABER_FUNCS_IMPL_X86_VENDER_LSTM_H diff --git a/saber/funcs/impl/x86/vender_mat_mul.h b/saber/funcs/impl/x86/vender_mat_mul.h index faf2c9242..ae90f00d6 100644 --- a/saber/funcs/impl/x86/vender_mat_mul.h +++ b/saber/funcs/impl/x86/vender_mat_mul.h @@ -37,6 +37,7 @@ class SaberMatMul: public ImplBase std::vector *>& outputs, MatMulParam ¶m, Context &ctx) { + alpha = param._scale; this->_ctx = &ctx; return create(inputs, outputs, param, ctx); @@ -123,4 +124,4 @@ class SaberMatMul: public ImplBase } //namespace anakin -#endif //ANAKIN_SABER_FUNCS_IMPL_X86_MAT_MUL_H \ No newline at end of file +#endif //ANAKIN_SABER_FUNCS_IMPL_X86_MAT_MUL_H diff --git a/saber/funcs/impl/x86/winograd.cpp b/saber/funcs/impl/x86/winograd.cpp new file mode 100644 index 000000000..53555ba66 --- /dev/null +++ b/saber/funcs/impl/x86/winograd.cpp @@ -0,0 +1,50 @@ +#include "saber/funcs/impl/x86/winograd.h" +#include "saber/funcs/impl/x86/winograd_float.h" +#include "saber/funcs/impl/x86/winograd_avx2.h" +//#include "saber/funcs/impl/x86/winograd_avx.h" +//#include "saber/funcs/impl/x86/winograd_avx2_nchwc8.h" +namespace anakin { +namespace saber { + +template <> +SaberStatus SaberConvWinograd::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + + return _impl->create(inputs, outputs, param, ctx); +} + +template <> +SaberStatus SaberConvWinograd::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + LayoutType input_layout = inputs[0]->get_layout(); + LayoutType out_layout = outputs[0]->get_layout(); + + // if(input_layout==Layout_NCHW_C8R&&out_layout==Layout_NCHW_C8R){ + // this->_impl = new SaberConvWinogradAvx2Nchwc8; + // }else + if (input_layout == Layout_NCHW && out_layout == Layout_NCHW) { +#if defined(__AVX2__) and defined(__FMA__) + this->_impl = new SaberConvWinogradAvx2; +#else + this->_impl = new SaberConvWinogradFloat; +#endif + } else { + LOG(FATAL) << "winograd conv not support this layout"; + } + + return _impl->init(inputs, outputs, param, ctx); + +} + +template <> +SaberStatus SaberConvWinograd::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param) { + return _impl->dispatch(inputs, outputs, param); + +} + +} +} \ No newline at end of file diff --git a/saber/funcs/impl/x86/winograd.h b/saber/funcs/impl/x86/winograd.h new file mode 100644 index 000000000..1571e7b06 --- /dev/null +++ b/saber/funcs/impl/x86/winograd.h @@ -0,0 +1,42 @@ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_WINOGRAD_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_WINOGRAD_H +#include "saber/funcs/impl/impl_conv.h" +#include "saber/core/tensor.h" + +namespace anakin { +namespace saber { +template +class SaberConvWinograd : public ImplBase < + X86, OpDtype, ConvEltwiseParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + typedef ImplBase > Impl_t; + + SaberConvWinograd() {} + + ~SaberConvWinograd() { + if (_impl!= nullptr){ + delete _impl; + } + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual 
SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param); + +private: + Impl_t *_impl; + +}; +} +} +#endif //ANAKIN_WINOGRAD_H diff --git a/saber/funcs/impl/x86/winograd_avx2.cpp b/saber/funcs/impl/x86/winograd_avx2.cpp new file mode 100644 index 000000000..8453756d8 --- /dev/null +++ b/saber/funcs/impl/x86/winograd_avx2.cpp @@ -0,0 +1,795 @@ +#include "saber/funcs/impl/x86/winograd_avx2.h" +#include "mkl_cblas.h" +#include "mkl_trans.h" +#include "tensor_op.h" +#include "saber/funcs/impl/x86/saber_avx2_expand.h" +namespace anakin { +namespace saber { + +#if defined(__AVX2__) and defined(__FMA__) + +/** + * \brief transpose with arm neon optimization + * @param data_out + * @param data_in + * @param w_in + * @param h_in + */ +static void transpose(float* data_out, const float* data_in, int w_in, int h_in) { + for (int j = 0; j < h_in; ++j) { + for (int i = 0; i < w_in; ++i) { + data_out[i * h_in + j] = data_in[j * w_in + i]; + } + } +} + +/** +* \brief winograd transform conv3x3 weights, f63 +* this is done in op initialization or creation, only do once +* dout = G * g * GT, where G is the transform coeff, g is the input weights +* @param dout +* @param din +* @param ch_out +* @param ch_in +* @param work_space +*/ +static void winograd_transform_weights(float* dout, const float* din, int ch_out, \ + int ch_in, float* work_space) { + const float coeff[8][3] = { + { 1.0f, 0.0f, 0.0f}, + { -2.0f / 9, -2.0f / 9, -2.0f / 9}, + { -2.0f / 9, 2.0f / 9, -2.0f / 9}, + { 1.0f / 90, 1.0f / 45, 2.0f / 45}, + { 1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + { 0.0f, 0.0f, 1.0f} + }; + + float* ptr_out = (float*)work_space; + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = static_cast(din) + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[8][3]; + + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + tmpp[1] * coeff[i][1] + \ + tmpp[2] * coeff[i][2]; + } + } + } + } + + transpose(static_cast(dout), ptr_out, 64, ch_out * ch_in); +} + + +inline void transpose8_ps(__m256& row0, __m256& row1, __m256& row2, __m256& row3, __m256& row4, + __m256& row5, __m256& row6, __m256& row7) { + __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; + __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; + __t0 = _mm256_unpacklo_ps(row0, row1); + __t1 = _mm256_unpackhi_ps(row0, row1); + __t2 = _mm256_unpacklo_ps(row2, row3); + __t3 = _mm256_unpackhi_ps(row2, row3); + __t4 = _mm256_unpacklo_ps(row4, row5); + __t5 = _mm256_unpackhi_ps(row4, row5); + __t6 = _mm256_unpacklo_ps(row6, row7); + __t7 = _mm256_unpackhi_ps(row6, row7); + __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); + __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); + __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); + __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); + __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); + __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); + __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); + __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); + row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); + row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); + row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); + row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); + row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); + row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); + row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); + row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); +} + +static inline void winograd_f6k3_output_inplace_avx2( + __m256& m0, + __m256& m1, + __m256& m2, + __m256& m3, + __m256& m4, + __m256& m5, + __m256& m6, + __m256& m7, const float& bias, const bool& with_relu) { + + + + const __m256 m_32p0 = _mm256_set1_ps(32.f); + const __m256 m_16p0 = _mm256_set1_ps(16.f); + const __m256 m_8p0 = _mm256_set1_ps(8.f); + const __m256 m_4p0 = _mm256_set1_ps(4.f); + const __m256 m_2p0 = _mm256_set1_ps(2.f); + + const __m256 m_0p5 = _mm256_set1_ps(0.5f); + const __m256 m_0p25 = _mm256_set1_ps(0.25f); + const __m256 m_0p125 = _mm256_set1_ps(0.125f); + const __m256 m_0p0625 = _mm256_set1_ps(0.0625f); + const __m256 m_0p03125 = _mm256_set1_ps(0.03125f); + + __m256 m1_add_m2 = m1 + m2; + __m256 m1_sub_m2 = m1 - m2; + __m256 m3_add_m4 = m3 + m4; + __m256 m3_sub_m4 = m3 - m4; + __m256 m5_add_m6 = m5 + m6; + __m256 m5_sub_m6 = m5 - m6; + + // Finised with M[0-6] as **inputs** here. 
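The six rows computed next are the F(6x6, 3x3) inverse output transform A^T applied along one axis; after transpose8_ps the same combinations run along the other axis, where the bias and the optional ReLU are folded in. A scalar sketch of one 8-tap row, for reference only (not part of the patch; the helper name is hypothetical):

static inline void f6k3_output_row_ref(const float m[8], float out[6]) {
    // pairwise sums/differences reused by every output, as in the vector code below
    float add12 = m[1] + m[2], sub12 = m[1] - m[2];
    float add34 = m[3] + m[4], sub34 = m[3] - m[4];
    float add56 = m[5] + m[6], sub56 = m[5] - m[6];
    out[0] = m[0] + add12 + add34 + add56;
    out[1] = sub12 + 2.f * sub34 + 0.5f * sub56;
    out[2] = add12 + 4.f * add34 + 0.25f * add56;
    out[3] = sub12 + 8.f * sub34 + 0.125f * sub56;
    out[4] = add12 + 16.f * add34 + 0.0625f * add56;
    out[5] = m[7] + sub12 + 32.f * sub34 + 0.03125f * sub56;
}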
+ m0 = m0 + m1_add_m2 + m3_add_m4 + m5_add_m6; + m2 = m1_add_m2 + m_4p0 * m3_add_m4 + m_0p25 * m5_add_m6; + m4 = m1_add_m2 + m3_add_m4 * m_16p0 + m5_add_m6 * m_0p0625; + m1 = m1_sub_m2 + m3_sub_m4 * m_2p0 + m5_sub_m6 * m_0p5; + m3 = m1_sub_m2 + m3_sub_m4 * m_8p0 + m5_sub_m6 * m_0p125; + m5 = m7 + m1_sub_m2 + m3_sub_m4 * m_32p0 + m5_sub_m6 * m_0p03125; + m6 = _mm256_setzero_ps(); + m7 = _mm256_setzero_ps(); + + transpose8_ps(m0, m1, m2, m3, m4, m5, m6, m7); + + m1_add_m2 = m1 + m2; + m1_sub_m2 = m1 - m2; + m3_add_m4 = m3 + m4; + m3_sub_m4 = m3 - m4; + m5_add_m6 = m5 + m6; + m5_sub_m6 = m5 - m6; + + const __m256 bias_value = _mm256_set1_ps(bias); + const __m256 m_0p0 = _mm256_setzero_ps(); + + if (with_relu) { + m0 = _mm256_max_ps(bias_value + m0 + m1_add_m2 + m3_add_m4 + m5_add_m6, m_0p0); + m2 = _mm256_max_ps(bias_value + m1_add_m2 + m_4p0 * m3_add_m4 + m_0p25 * m5_add_m6, m_0p0); + m4 = _mm256_max_ps(bias_value + m1_add_m2 + m3_add_m4 * m_16p0 + m5_add_m6 * m_0p0625, m_0p0); + m1 = _mm256_max_ps(bias_value + m1_sub_m2 + m3_sub_m4 * m_2p0 + m5_sub_m6 * m_0p5, m_0p0); + m3 = _mm256_max_ps(bias_value + m1_sub_m2 + m3_sub_m4 * m_8p0 + m5_sub_m6 * m_0p125, m_0p0); + m5 = _mm256_max_ps(bias_value + m7 + m1_sub_m2 + m3_sub_m4 * m_32p0 + m5_sub_m6 * m_0p03125, m_0p0); + } else { + m0 = bias_value + m0 + m1_add_m2 + m3_add_m4 + m5_add_m6; + m2 = bias_value + m1_add_m2 + m_4p0 * m3_add_m4 + m_0p25 * m5_add_m6; + m4 = bias_value + m1_add_m2 + m3_add_m4 * m_16p0 + m5_add_m6 * m_0p0625; + m1 = bias_value + m1_sub_m2 + m3_sub_m4 * m_2p0 + m5_sub_m6 * m_0p5; + m3 = bias_value + m1_sub_m2 + m3_sub_m4 * m_8p0 + m5_sub_m6 * m_0p125; + m5 = bias_value + m7 + m1_sub_m2 + m3_sub_m4 * m_32p0 + m5_sub_m6 * m_0p03125; + } + + +} + +static inline void winograd_f6k3_output_inplace_avx2_float_in( + __m256& m0, + __m256& m1, + __m256& m2, + __m256& m3, + __m256& m4, + __m256& m5, + __m256& m6, + __m256& m7, float* din, const float& bias, const bool& with_relu) { + + + + const __m256 m_32p0 = _mm256_set1_ps(32.f); + const __m256 m_16p0 = _mm256_set1_ps(16.f); + const __m256 m_8p0 = _mm256_set1_ps(8.f); + const __m256 m_4p0 = _mm256_set1_ps(4.f); + const __m256 m_2p0 = _mm256_set1_ps(2.f); + + const __m256 m_0p5 = _mm256_set1_ps(0.5f); + const __m256 m_0p25 = _mm256_set1_ps(0.25f); + const __m256 m_0p125 = _mm256_set1_ps(0.125f); + const __m256 m_0p0625 = _mm256_set1_ps(0.0625f); + const __m256 m_0p03125 = _mm256_set1_ps(0.03125f); + + m0 = _mm256_loadu_ps(&din[0 * 8]); + m1 = _mm256_loadu_ps(&din[1 * 8]); + m2 = _mm256_loadu_ps(&din[2 * 8]); + m3 = _mm256_loadu_ps(&din[3 * 8]); + m4 = _mm256_loadu_ps(&din[4 * 8]); + m5 = _mm256_loadu_ps(&din[5 * 8]); + m6 = _mm256_loadu_ps(&din[6 * 8]); + m7 = _mm256_loadu_ps(&din[7 * 8]); + + __m256 m1_add_m2 = m1 + m2; + __m256 m1_sub_m2 = m1 - m2; + __m256 m3_add_m4 = m3 + m4; + __m256 m3_sub_m4 = m3 - m4; + __m256 m5_add_m6 = m5 + m6; + __m256 m5_sub_m6 = m5 - m6; + + // Finised with M[0-6] as **inputs** here. 
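This variant matches winograd_f6k3_output_inplace_avx2 above except that it first loads the eight 8-float rows from din (the per-tile 8x8 GEMM output); the arithmetic that follows is identical. A minimal equivalence check, illustrative only and not part of the patch (it would have to sit in this translation unit because both helpers are static, and needs <immintrin.h>, <cmath> and <algorithm>):

// feed one 8x8 tile to both output-transform variants and compare the 6x6 valid region
float tile[64];
for (int i = 0; i < 64; ++i) tile[i] = 0.01f * i;   // arbitrary test data

__m256 a[8], b[8];
for (int i = 0; i < 8; ++i) a[i] = _mm256_loadu_ps(tile + 8 * i);
winograd_f6k3_output_inplace_avx2(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
                                  0.f /*bias*/, false /*with_relu*/);
winograd_f6k3_output_inplace_avx2_float_in(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
                                           tile, 0.f /*bias*/, false /*with_relu*/);

float max_diff = 0.f;
for (int i = 0; i < 6; ++i) {
    float ra[8], rb[8];
    _mm256_storeu_ps(ra, a[i]);
    _mm256_storeu_ps(rb, b[i]);
    // both paths execute the same instruction sequence, so max_diff should stay 0.f
    for (int j = 0; j < 6; ++j) max_diff = std::max(max_diff, std::fabs(ra[j] - rb[j]));
}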
+ m0 = m0 + m1_add_m2 + m3_add_m4 + m5_add_m6; + m2 = m1_add_m2 + m_4p0 * m3_add_m4 + m_0p25 * m5_add_m6; + m4 = m1_add_m2 + m3_add_m4 * m_16p0 + m5_add_m6 * m_0p0625; + m1 = m1_sub_m2 + m3_sub_m4 * m_2p0 + m5_sub_m6 * m_0p5; + m3 = m1_sub_m2 + m3_sub_m4 * m_8p0 + m5_sub_m6 * m_0p125; + m5 = m7 + m1_sub_m2 + m3_sub_m4 * m_32p0 + m5_sub_m6 * m_0p03125; + m6 = _mm256_setzero_ps(); + m7 = _mm256_setzero_ps(); + + transpose8_ps(m0, m1, m2, m3, m4, m5, m6, m7); + + m1_add_m2 = m1 + m2; + m1_sub_m2 = m1 - m2; + m3_add_m4 = m3 + m4; + m3_sub_m4 = m3 - m4; + m5_add_m6 = m5 + m6; + m5_sub_m6 = m5 - m6; + + const __m256 bias_value = _mm256_set1_ps(bias); + const __m256 m_0p0 = _mm256_setzero_ps(); + + if (with_relu) { + m0 = _mm256_max_ps(bias_value + m0 + m1_add_m2 + m3_add_m4 + m5_add_m6, m_0p0); + m2 = _mm256_max_ps(bias_value + m1_add_m2 + m_4p0 * m3_add_m4 + m_0p25 * m5_add_m6, m_0p0); + m4 = _mm256_max_ps(bias_value + m1_add_m2 + m3_add_m4 * m_16p0 + m5_add_m6 * m_0p0625, m_0p0); + m1 = _mm256_max_ps(bias_value + m1_sub_m2 + m3_sub_m4 * m_2p0 + m5_sub_m6 * m_0p5, m_0p0); + m3 = _mm256_max_ps(bias_value + m1_sub_m2 + m3_sub_m4 * m_8p0 + m5_sub_m6 * m_0p125, m_0p0); + m5 = _mm256_max_ps(bias_value + m7 + m1_sub_m2 + m3_sub_m4 * m_32p0 + m5_sub_m6 * m_0p03125, m_0p0); + } else { + m0 = bias_value + m0 + m1_add_m2 + m3_add_m4 + m5_add_m6; + m2 = bias_value + m1_add_m2 + m_4p0 * m3_add_m4 + m_0p25 * m5_add_m6; + m4 = bias_value + m1_add_m2 + m3_add_m4 * m_16p0 + m5_add_m6 * m_0p0625; + m1 = bias_value + m1_sub_m2 + m3_sub_m4 * m_2p0 + m5_sub_m6 * m_0p5; + m3 = bias_value + m1_sub_m2 + m3_sub_m4 * m_8p0 + m5_sub_m6 * m_0p125; + m5 = bias_value + m7 + m1_sub_m2 + m3_sub_m4 * m_32p0 + m5_sub_m6 * m_0p03125; + } + +} + +static inline void winograd_f6k3_input_inplace_avx2( + __m256& m0, + __m256& m1, + __m256& m2, + __m256& m3, + __m256& m4, + __m256& m5, + __m256& m6, + __m256& m7) { + const __m256 m_5p25 = _mm256_set1_ps(5.25f); + const __m256 m_4p25 = _mm256_set1_ps(4.25f); + const __m256 m_4p0 = _mm256_set1_ps(4.f); + const __m256 m_2p5 = _mm256_set1_ps(2.5f); + const __m256 m_2p0 = _mm256_set1_ps(2.f); + const __m256 m_1p25 = _mm256_set1_ps(1.25f); + const __m256 m_0p5 = _mm256_set1_ps(0.5f); + const __m256 m_0p25 = _mm256_set1_ps(0.25f); + m0 = m0 - m6 + (m4 - m2) * m_5p25; + m7 = m7 - m1 + (m3 - m5) * m_5p25; + + __m256 t1 = m2 + m6 - m4 * m_4p25; + __m256 t2 = m1 + m5 - m3 * m_4p25; + + __m256 s1 = m4 * m_1p25; + __m256 s2 = m3 * m_2p5; + + __m256 p1 = m6 + (m2 * m_0p25 - s1); + __m256 p2 = m1 * m_0p5 - s2 + m5 * m_2p0; + + m3 = p1 + p2; + m4 = p1 - p2; + + + p1 = m6 + (m2 - s1) * m_4p0; + p2 = m1 * m_2p0 - s2 + m5 * m_0p5; + + m5 = p1 + p2; + m6 = p1 - p2; + + m1 = _mm256_add_ps(t1, t2); + m2 = _mm256_sub_ps(t1, t2); + + transpose8_ps(m0, m1, m2, m3, m4, m5, m6, m7); + + m0 = m0 - m6 + (m4 - m2) * m_5p25; + m7 = m7 - m1 + (m3 - m5) * m_5p25; + + t1 = m2 + m6 - m4 * m_4p25; + t2 = m1 + m5 - m3 * m_4p25; + + s1 = m4 * m_1p25; + s2 = m3 * m_2p5; + + p1 = m6 + (m2 * m_0p25 - s1); + p2 = m1 * m_0p5 - s2 + m5 * m_2p0; + + m3 = p1 + p2; + m4 = p1 - p2; + + + p1 = m6 + (m2 - s1) * m_4p0; + p2 = m1 * m_2p0 - s2 + m5 * m_0p5; + + m5 = p1 + p2; + m6 = p1 - p2; + + m1 = _mm256_add_ps(t1, t2); + m2 = _mm256_sub_ps(t1, t2); +} + +static inline void winograd_f6k3_input_inplace_avx2( + __m256& m0, + __m256& m1, + __m256& m2, + __m256& m3, + __m256& m4, + __m256& m5, + __m256& m6, + __m256& m7, float* out) { + const __m256 m_5p25 = _mm256_set1_ps(5.25f); + const __m256 m_4p25 = _mm256_set1_ps(4.25f); + const 
__m256 m_4p0 = _mm256_set1_ps(4.f); + const __m256 m_2p5 = _mm256_set1_ps(2.5f); + const __m256 m_2p0 = _mm256_set1_ps(2.f); + const __m256 m_1p25 = _mm256_set1_ps(1.25f); + const __m256 m_0p5 = _mm256_set1_ps(0.5f); + const __m256 m_0p25 = _mm256_set1_ps(0.25f); + m0 = m0 - m6 + (m4 - m2) * m_5p25; + m7 = m7 - m1 + (m3 - m5) * m_5p25; + + __m256 t1 = m2 + m6 - m4 * m_4p25; + __m256 t2 = m1 + m5 - m3 * m_4p25; + + __m256 s1 = m4 * m_1p25; + __m256 s2 = m3 * m_2p5; + + __m256 p1 = m6 + (m2 * m_0p25 - s1); + __m256 p2 = m1 * m_0p5 - s2 + m5 * m_2p0; + + m3 = p1 + p2; + m4 = p1 - p2; + + + p1 = m6 + (m2 - s1) * m_4p0; + p2 = m1 * m_2p0 - s2 + m5 * m_0p5; + + m5 = p1 + p2; + m6 = p1 - p2; + + m1 = _mm256_add_ps(t1, t2); + m2 = _mm256_sub_ps(t1, t2); + + transpose8_ps(m0, m1, m2, m3, m4, m5, m6, m7); + + m0 = m0 - m6 + (m4 - m2) * m_5p25; + m7 = m7 - m1 + (m3 - m5) * m_5p25; + _mm256_storeu_ps(out + 0 * 8, m0); + _mm256_storeu_ps(out + 7 * 8, m7); + + t1 = m2 + m6 - m4 * m_4p25; + t2 = m1 + m5 - m3 * m_4p25; + + s1 = m4 * m_1p25; + s2 = m3 * m_2p5; + + p1 = m6 + (m2 * m_0p25 - s1); + p2 = m1 * m_0p5 - s2 + m5 * m_2p0; + + m3 = p1 + p2; + m4 = p1 - p2; + _mm256_storeu_ps(out + 3 * 8, m3); + _mm256_storeu_ps(out + 4 * 8, m4); + + p1 = m6 + (m2 - s1) * m_4p0; + p2 = m1 * m_2p0 - s2 + m5 * m_0p5; + + m5 = p1 + p2; + m6 = p1 - p2; + _mm256_storeu_ps(out + 5 * 8, m5); + _mm256_storeu_ps(out + 6 * 8, m6); + + m1 = _mm256_add_ps(t1, t2); + m2 = _mm256_sub_ps(t1, t2); + _mm256_storeu_ps(out + 1 * 8, m1); + _mm256_storeu_ps(out + 2 * 8, m2); +} + +static void winograd_all_in_one(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, float* tmp_work_space) { + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + //! transform input + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = chin > chout ? chin : chout; + + for (int oc = 0; oc < chout; oc++) { + + for (int h = 0; h < tile_h; h++) { + + for (int w = 0; w < tile_w; w++) { + __m256 result[8] = {_mm256_setzero_ps()}; + + for (int ic = 0; ic < chin; ++ic) { + //! prepare data 8x8 + //! row 8 + __m256 data_in_tmp[8] = {_mm256_setzero_ps()}; + const float* din_channel = din + ic * size_in_channel; + + //memset(data_in_tmp[0], 0, sizeof(float) * 64); + for (int j = 0; j < 8; ++j) { + int start_row = h * 6 + j - pad_h; + + if (start_row >= 0 && start_row < hin) { + for (int k = 0; k < 8; ++k) { + int start_col = w * 6 + k - pad_w; + + if (start_col >= 0 && start_col < win) { + data_in_tmp[j][k] = din_channel[start_row * win + start_col]; + } + } + } + } + + winograd_f6k3_input_inplace_avx2(data_in_tmp[0], data_in_tmp[1], data_in_tmp[2], data_in_tmp[3], + data_in_tmp[4], + data_in_tmp[5], data_in_tmp[6], data_in_tmp[7]); + + + // exit(0); + ///////////////////////////////////// + for (int i = 0; i < 8; i++) { + int weights_index = oc * chin * 64 + ic * 64; + result[i] += data_in_tmp[i] * _mm256_loadu_ps(&weights[weights_index + i * 8]); + } + } + + float bias_value = flag_bias ? 
bias[oc] : 0.f; + //output + winograd_f6k3_output_inplace_avx2(result[0], result[1], result[2], result[3], result[4], + result[5], result[6], result[7], bias_value, flag_relu); + + float* dout_channel = dout + oc * hout * wout; + + for (int j = 0; j < 6; ++j) { + int end_row = h * 6 + j; + + if (end_row < hout) { + for (int k = 0; k < 6; ++k) { + int end_col = w * 6 + k; + + if (end_col < wout) { + dout_channel[end_row * wout + end_col] = result[j][k]; + } + } + } + } + } + } + } + +} + + +static void conv_x86_winograd3x3_avx2_opt(const float* din, float* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const float* weights, const float* bias, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, float* tmp_work_space) { + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + //! transform input + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = chin > chout ? chin : chout; + + int m = chout; + int n = size_tile; + int k = chin; + + + //! tmp data buffer for input transform + float* tmp_data1 = tmp_work_space; + //! tmp data buffer for dot mul + float* tmp_data2 = tmp_data1 + size_trans_channel * max_ch; + + //SaberTimer t1; + //Context ctx1; + + + for (int i = 0; i < num; ++i) { + + const float* din_batch = static_cast(din) + i * chin * size_in_channel; + float* dout_batch = static_cast(dout) + i * chout * size_out_channel; + + //t1.start(ctx1); + //! transform input Bt * data * B +#if 1 + #pragma omp parallel for schedule(static) + + for (int j = 0; j < chin; ++j) { + + const float* din_channel = din_batch + j * size_in_channel; + float* data_trans_channel = tmp_data1 + j * size_trans_channel; + + for (int h = 0; h < tile_h; h++) { + + for (int w = 0; w < tile_w; w ++) { + //! prepare data 8x8 + //! row 8 + __m256 data_in_tmp[8] = {_mm256_setzero_ps()}; + + //memset(data_in_tmp[0], 0, sizeof(float) * 64); + for (int j = 0; j < 8; ++j) { + int start_row = h * 6 + j - pad_h; + + if (start_row >= 0 && start_row < hin) { + int start_col = w * 6 - pad_w; + + if (start_col >= 0) { + if (win - start_col >= 8) { + data_in_tmp[j] = _mm256_loadu_ps(&din_channel[start_row * win + start_col]); + } else { + int remainder = win - start_col; + data_in_tmp[j] = _mm256_maskload_ps(&din_channel[start_row * win + start_col], + _m256_continue_mask_m256i(remainder)); + } + } else { + for (int k = 0; k < 8; ++k) { + int start_col = w * 6 + k - pad_w; + + if (start_col >= 0 && start_col < win) { + data_in_tmp[j][k] = din_channel[start_row * win + start_col]; + } + } + } + + } + } + + winograd_f6k3_input_inplace_avx2(data_in_tmp[0], data_in_tmp[1], data_in_tmp[2], data_in_tmp[3], + data_in_tmp[4], + data_in_tmp[5], data_in_tmp[6], data_in_tmp[7], data_trans_channel); + + data_trans_channel += 64; + } + } + } + +#endif + //! end of transform input + +#if 1 + //////////////////////////////////////////////////////////////////////////////// + //! dot mul + //! transpose input, convert from ch_in * tile_h * tile_w * 64 to + //! 
64 * ch_in * tile_h * tile_w + int hblock = 16; + int m_round = hblock * ((chout + hblock - 1) / hblock); + int stride_a = m_round * chin; + int stride_b = chin * size_tile; + int stride_c = chout * size_tile; +#if 1 + MKL_Somatcopy('R', 'T', stride_b, 64, 1.f, tmp_data1, 64, tmp_data2, stride_b); +#endif + + + CBLAS_TRANSPOSE trans[1] = {CblasNoTrans}; + int m_array[1] = {chout}; + int n_array[1] = {size_tile}; + int k_array[1] = {chin}; + int lda_array[1] = {chin}; + int ldb_array[1] = {size_tile}; + int ldc_array[1] = {size_tile}; + float alpha_array[1] = {1.f}; + float beta_array[1] = {0.f}; + const float* ptr_a_array[64]; + const float* ptr_b_array[64]; + float* ptr_c_array[64]; + int group_size[1] = {64}; + + for (int l = 0; l < 64; ++l) { + ptr_a_array[l] = static_cast(weights) + l * chout * chin; + ptr_b_array[l] = tmp_data2 + l * stride_b; + ptr_c_array[l] = tmp_data1 + l * stride_c; + + } + + + cblas_sgemm_batch(CblasRowMajor, trans, trans, m_array, n_array, k_array, alpha_array, ptr_a_array, + lda_array, ptr_b_array, ldb_array, beta_array, ptr_c_array, ldc_array, 1, group_size); + + //! transpose output, convert from 64 * ch_out * tile_h * tile_w to + //! ch_out * tile_h * tile_w * 64 +#if 1 + MKL_Somatcopy('R', 'T', 64, stride_c, 1.f, tmp_data1, stride_c, tmp_data2, 64); +#endif + //! end of dot mul +#endif + +#if 1 + /////////////////////////////////////////////////////////////////////////////// + //! transform output + #pragma omp parallel for schedule(static) + + for (int i = 0; i < chout; ++i) { + + float bias_value = flag_bias ? static_cast(bias)[i] : 0.f; + float* dout_tmp = tmp_data2 + i * size_trans_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + for (int h = 0; h < tile_h; ++h) { + for (int w = 0; w < tile_w; ++w) { + + __m256 out_tmp[8]; + + winograd_f6k3_output_inplace_avx2_float_in(out_tmp[0], out_tmp[1], out_tmp[2], out_tmp[3], + out_tmp[4], out_tmp[5], out_tmp[6], out_tmp[7], dout_tmp, bias_value, flag_relu); + dout_tmp += 64; + + for (int j = 0; j < 6; ++j) { + int end_row = h * 6 + j; + + if (end_row < hout) { + int end_col = w * 6 ; + + int remainder = std::min(wout - end_col, 6); + _mm256_maskstore_ps(&dout_channel[end_row * wout + end_col], _m256_continue_mask_m256i(remainder), + out_tmp[j]); + } + } + } + } + } + + //! 
end of transform output +#endif + } +} + +template <> +SaberStatus SaberConvWinogradAvx2::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + ConvParam* conv_param = ¶m.conv_param; + int batch_size = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int kernel_h = conv_param->weight()->height(); + int kernel_w = conv_param->weight()->width(); + int in_stride = in_h * in_w; + int out_stride = out_h * out_w; + int group = conv_param->group; + const float* weights_d = (const float*)conv_param->weight()->data(); + _winor_weights.re_alloc(Shape({8, 8, out_c, in_c})); + Tensor trans_temp(Shape({8, 8, out_c, in_c})); + float* trans_tmp_ptr = static_cast(trans_temp.mutable_data()); + + winograd_transform_weights(static_cast(_winor_weights.mutable_data()), + static_cast(conv_param->weight()->data()), out_c, in_c, + trans_tmp_ptr); + + + int tile_w = (out_w + 5) / 6; + int tile_h = (out_h + 5) / 6; + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = in_c > out_c ? in_c : out_c; + _winor_temp.re_alloc(Shape({1, 2, max_ch, size_trans_channel})); + + + return SaberSuccess; +} + +template <> +SaberStatus SaberConvWinogradAvx2::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + LOG(INFO) << "SaberConvWinogradAvx2 init"; + return create(inputs, outputs, param, ctx); + +} + + +template <> +SaberStatus SaberConvWinogradAvx2::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param) { + ConvParam* conv_param = ¶m.conv_param; + int batch_size = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int kernel_h = conv_param->weight()->height(); + int kernel_w = conv_param->weight()->width(); + int in_stride = in_h * in_w; + int out_stride = out_h * out_w; + int group = conv_param->group; + int weight_size_per_group = (out_c / group) * (in_c / group) * kernel_h * kernel_w; + const float* bias_ptr = nullptr; + + if (conv_param->bias() != nullptr && conv_param->bias()->valid_size() > 0) { + bias_ptr = static_cast(conv_param->bias()->data()); + } + + bool with_relu = conv_param->activation_param.active == Active_relu; + + + const float* din = (const float*)inputs[0]->data(); + float* dout = (float*)outputs[0]->mutable_data(); + + conv_x86_winograd3x3_avx2_opt(din, dout, batch_size, out_c, out_h, out_w, in_c, in_h, in_w, + static_cast(_winor_weights.data()), + bias_ptr, conv_param->pad_w, conv_param->pad_h, bias_ptr != nullptr, with_relu, + static_cast(_winor_temp.mutable_data())); + + return SaberSuccess; +} + +#else +template <> +SaberStatus SaberConvWinogradAvx2::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + + + return SaberUnImplError; +} + +template <> +SaberStatus SaberConvWinogradAvx2::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + return create(inputs, outputs, param, ctx); + +} + + +template <> +SaberStatus SaberConvWinogradAvx2::dispatch(const std::vector *>& inputs, + std::vector *>& 
outputs, + ConvEltwiseParam& param) { + + + return SaberUnImplError; +} + +#endif + +} +} diff --git a/saber/funcs/impl/x86/winograd_avx2.h b/saber/funcs/impl/x86/winograd_avx2.h new file mode 100644 index 000000000..a53c6d9ed --- /dev/null +++ b/saber/funcs/impl/x86/winograd_avx2.h @@ -0,0 +1,39 @@ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_WINOGRAD_AVX2_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_WINOGRAD_AVX2_H +#include "saber/funcs/impl/impl_conv.h" +#include "saber/core/tensor.h" + +namespace anakin { +namespace saber { +template +class SaberConvWinogradAvx2 : public ImplBase < + X86, OpDtype, ConvEltwiseParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberConvWinogradAvx2() {} + + ~SaberConvWinogradAvx2() { + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param); + +private: + Tensor _winor_weights; + Tensor _winor_temp; + +}; +} +} +#endif //ANAKIN_WINOGRAD_H diff --git a/saber/funcs/impl/x86/winograd_float.cpp b/saber/funcs/impl/x86/winograd_float.cpp new file mode 100644 index 000000000..662deff0d --- /dev/null +++ b/saber/funcs/impl/x86/winograd_float.cpp @@ -0,0 +1,656 @@ +#include "saber/funcs/impl/x86/winograd_float.h" +#include "mkl_cblas.h" +#include "mkl_trans.h" + +namespace anakin { +namespace saber { + +/** + * \brief transpose with arm neon optimization + * @param data_out + * @param data_in + * @param w_in + * @param h_in + */ +static void transpose(float* data_out, const float* data_in, int w_in, int h_in) { + for (int j = 0; j < h_in; ++j) { + for (int i = 0; i < w_in; ++i) { + data_out[i * h_in + j] = data_in[j * w_in + i]; + } + } +} + + +/** + * \brief winograd transform conv3x3 weights, f63 + * this is done in op initialization or creation, only do once + * dout = G * g * GT, where G is the transform coeff, g is the input weights + * @param dout + * @param din + * @param ch_out + * @param ch_in + * @param work_space + */ +static void winograd_transform_weights(float* dout, const float* din, int ch_out, \ + int ch_in, float* work_space) { + const float coeff[8][3] = { + { 1.0f, 0.0f, 0.0f}, + { -2.0f / 9, -2.0f / 9, -2.0f / 9}, + { -2.0f / 9, 2.0f / 9, -2.0f / 9}, + { 1.0f / 90, 1.0f / 45, 2.0f / 45}, + { 1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + { 0.0f, 0.0f, 1.0f} + }; + + float* ptr_out = work_space; + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = din + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[8][3]; + + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + tmpp[1] * coeff[i][1] + \ + tmpp[2] * coeff[i][2]; + } + } + } + } + + transpose(static_cast(dout), ptr_out, 64, ch_out * ch_in); +} + +static void winograd_transform_weights_oc_ic_64(float* dout, const float* din, int ch_out, \ + int ch_in, float* work_space) { + const float coeff[8][3] = { + { 1.0f, 0.0f, 0.0f}, + { -2.0f / 9, -2.0f / 9, -2.0f / 9}, + { -2.0f / 9, 2.0f / 9, -2.0f / 9}, + { 1.0f / 90, 1.0f / 45, 2.0f / 45}, + { 1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + { 0.0f, 0.0f, 1.0f} + }; + + float* ptr_out = dout; + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = static_cast(din) + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[8][3]; + + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + tmpp[1] * coeff[i][1] + \ + tmpp[2] * coeff[i][2]; + } + } + } + } + +} +template <> +SaberStatus SaberConvWinogradFloat::create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + ConvParam* conv_param = ¶m.conv_param; + int batch_size = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int kernel_h = conv_param->weight()->height(); + int kernel_w = conv_param->weight()->width(); + int in_stride = in_h * in_w; + int out_stride = out_h * out_w; + int group = conv_param->group; + const float* weights_d = (const float*)conv_param->weight()->data(); + _winor_weights.re_alloc(Shape({8, 8, out_c, in_c})); + Tensor trans_temp(Shape({8, 8, out_c, in_c})); + float* trans_tmp_ptr = static_cast(trans_temp.mutable_data()); + + winograd_transform_weights(static_cast(_winor_weights.mutable_data()), static_cast(conv_param->weight()->data()), out_c, in_c, + trans_tmp_ptr); + + + int tile_w = (out_w + 5) / 6; + int tile_h = (out_h + 5) / 6; + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = in_c > out_c ? in_c : out_c; + _winor_temp.re_alloc(Shape({1, 2, max_ch, size_trans_channel})); + + return SaberSuccess; +} + +template <> +SaberStatus SaberConvWinogradFloat::init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx) { + this->_ctx = &ctx; + LOG(INFO)<<"SaberConvWinogradFloat init"; + return create(inputs, outputs, param, ctx); + +} + +static void gemm(const bool trans_a, const bool transb, int m, int n, int k, const float alpha, + const float* a, const float* b, const float beta, float* c) { + // cout << "(" << m << "," << n << "," << k << ")" << endl; + int lda = (!trans_a/* == CblasNoTrans*/) ? k : m; + int ldb = (!transb/* == CblasNoTrans*/) ? 
n : k; + CBLAS_TRANSPOSE cblas_transa = + (!trans_a/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE cblas_transb = + (!transb/* == CblasNoTrans*/) ? CblasNoTrans : CblasTrans; + cblas_sgemm(CblasRowMajor, cblas_transa, cblas_transb, m, n, k, alpha, a, k, b, n, beta, c, n); +}; + +static void print_hw(const float* in, int h, int w) { + printf("\n"); + + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j++) { + printf("%f \t", in[i * w + j]); + } + + printf("\n"); + } + +} + + +/** + * \brief winograd conv, transform input, f6x3 + * dout = BT * d * B, whrer B is the transform + * BT = 1 0 -21/4 0 21/4 0 -1 0 + * 0 1 1 -17/4 -17/4 1 1 0 + * 0 -1 1 17/4 -17/4 -1 1 0 + * 0 1/2 1/4 -5/2 -5/4 2 1 0 + * 0 -1/2 1/4 5/2 -5/4 -2 1 0 + * 0 2 4 -5/2 -5 1/2 1 0 + * 0 -2 4 5/2 -5 -1/2 1 0 + * 0 -1 0 21/4 0 -21/4 0 1 + * @param dout + * @param din + */ +inline void transform_input_f6x6(float* dout, const float* din) { + float tmp[8][8]; + + //! BT * d + for (int m = 0; m < 8; m++) { + tmp[0][m] = din[0] - din[6] + (din[4] - din[2]) * 5.25f; + tmp[7][m] = din[7] - din[1] + (din[3] - din[5]) * 5.25f; + + float tmp12a = din[2] + din[6] - din[4] * 4.25f; + float tmp12b = din[1] + din[5] - din[3] * 4.25f; + + tmp[1][m] = tmp12a + tmp12b; + tmp[2][m] = tmp12a - tmp12b; + + float tmp34a = din[6] + din[2] * 0.25f - din[4] * 1.25f; + float tmp34b = din[1] * 0.5f - din[3] * 2.5f + din[5] * 2.f; + + tmp[3][m] = tmp34a + tmp34b; + tmp[4][m] = tmp34a - tmp34b; + + float tmp56a = din[6] + (din[2] - din[4] * 1.25f) * 4.f; + float tmp56b = din[1] * 2.f - din[3] * 2.5f + din[5] * 0.5f; + + tmp[5][m] = tmp56a + tmp56b; + tmp[6][m] = tmp56a - tmp56b; + + din += 8; + } + + for (int m = 0; m < 8; m++) { + const float* tmp0 = tmp[m]; + + dout[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f; + dout[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f; + + float tmp12a = tmp0[2] + tmp0[6] - tmp0[4] * 4.25f; + float tmp12b = tmp0[1] + tmp0[5] - tmp0[3] * 4.25f; + + dout[1] = tmp12a + tmp12b; + dout[2] = tmp12a - tmp12b; + + float tmp34a = tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f; + float tmp34b = tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f; + + dout[3] = tmp34a + tmp34b; + dout[4] = tmp34a - tmp34b; + + float tmp56a = tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f; + float tmp56b = tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f; + + dout[5] = tmp56a + tmp56b; + dout[6] = tmp56a - tmp56b; + + dout += 8; + } +} + +/** + * \brief winograd conv, transform input, f6x3 + * dout = BT * d * B, whrer B is the transform + * BT = 1 0 -21/4 0 21/4 0 -1 0 + * 0 1 1 -17/4 -17/4 1 1 0 + * 0 -1 1 17/4 -17/4 -1 1 0 + * 0 1/2 1/4 -5/2 -5/4 2 1 0 + * 0 -1/2 1/4 5/2 -5/4 -2 1 0 + * 0 2 4 -5/2 -5 1/2 1 0 + * 0 -2 4 5/2 -5 -1/2 1 0 + * 0 -1 0 21/4 0 -21/4 0 1 + * @param dout + * @param din + */ +inline void transform_input_f6x6_c8(float* dout, const float* din) { + float tmp[8][8][8]; + + //! 
BT * d + for (int m = 0; m < 8; m++) { + for (int i = 0; i < 8; i++) { + tmp[0][m][i] = din[0] - din[6] + (din[4] - din[2]) * 5.25f; + tmp[7][m][i] = din[7] - din[1] + (din[3] - din[5]) * 5.25f; + + float tmp12a = din[2] + din[6] - din[4] * 4.25f; + float tmp12b = din[1] + din[5] - din[3] * 4.25f; + + tmp[1][m][i] = tmp12a + tmp12b; + tmp[2][m][i] = tmp12a - tmp12b; + + float tmp34a = din[6] + din[2] * 0.25f - din[4] * 1.25f; + float tmp34b = din[1] * 0.5f - din[3] * 2.5f + din[5] * 2.f; + + tmp[3][m][i] = tmp34a + tmp34b; + tmp[4][m][i] = tmp34a - tmp34b; + + float tmp56a = din[6] + (din[2] - din[4] * 1.25f) * 4.f; + float tmp56b = din[1] * 2.f - din[3] * 2.5f + din[5] * 0.5f; + + tmp[5][m][i] = tmp56a + tmp56b; + tmp[6][m][i] = tmp56a - tmp56b; + din += 8; + } + + } + + for (int m = 0; m < 8; m++) { + for (int i = 0; i < 8; i++) { + const float* tmp0 = tmp[m][i]; + + dout[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f; + dout[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f; + + float tmp12a = tmp0[2] + tmp0[6] - tmp0[4] * 4.25f; + float tmp12b = tmp0[1] + tmp0[5] - tmp0[3] * 4.25f; + + dout[1] = tmp12a + tmp12b; + dout[2] = tmp12a - tmp12b; + + float tmp34a = tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f; + float tmp34b = tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f; + + dout[3] = tmp34a + tmp34b; + dout[4] = tmp34a - tmp34b; + + float tmp56a = tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f; + float tmp56b = tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f; + + dout[5] = tmp56a + tmp56b; + dout[6] = tmp56a - tmp56b; + + dout += 8; + } + } +} + + +inline void transform_output_f6x6(float* output, const float* din, float bias) { + float tmp[6][8]; + + for (int m = 0; m < 8; m++) { + float tmp024a = din[1] + din[2]; + float tmp135a = din[1] - din[2]; + + float tmp024b = din[3] + din[4]; + float tmp135b = din[3] - din[4]; + + float tmp024c = din[5] + din[6]; + float tmp135c = din[5] - din[6]; + + tmp[0][m] = din[0] + tmp024a + tmp024b + tmp024c; + tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 0.25f; + tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c * 0.0625f; + + tmp[1][m] = tmp135a + tmp135b * 2 + tmp135c * 0.5f; + tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 0.125f; + tmp[5][m] = din[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; + + din += 8; + } + + for (int m = 0; m < 6; m++) { + const float* tmp0 = tmp[m]; + + float tmp024a = tmp0[1] + tmp0[2]; + float tmp135a = tmp0[1] - tmp0[2]; + + float tmp024b = tmp0[3] + tmp0[4]; + float tmp135b = tmp0[3] - tmp0[4]; + + float tmp024c = tmp0[5] + tmp0[6]; + float tmp135c = tmp0[5] - tmp0[6]; + + output[0] = bias + tmp0[0] + tmp024a + tmp024b + tmp024c; + output[2] = bias + tmp024a + tmp024b * 4 + tmp024c * 0.25f; + output[4] = bias + tmp024a + tmp024b * 16 + tmp024c * 0.0625f; + + output[1] = bias + tmp135a + tmp135b * 2 + tmp135c * 0.5f; + output[3] = bias + tmp135a + tmp135b * 8 + tmp135c * 0.125f; + output[5] = bias + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; + + output += 6; + } +} + +static void load_data_2_ic_th_tw_64_8(int pad_h, int pad_w, int tile_h, int tile_w, int chin, + int hin, + int win, const float* din_batch, float* dout) { + int size_in_channel = win * hin * 8; + int chin_div_up_8 = chin / 8; + + for (int ic = 0; ic < chin_div_up_8; ++ic) { + for (int h = 0; h < tile_h; h++) { + for (int w = 0; w < tile_w; w++) { + + const float* din_channel = din_batch + ic * size_in_channel; + float* data_trans_channel = dout + ic * tile_h * tile_w * 64 * 8 + h * tile_w * 64 * 8 + w * 64 * 8; + //! prepare data 8x8 + //! 
row 8 + float data_in_tmp[8][8][8] = {0.f}; + + //memset(data_in_tmp[0], 0, sizeof(float) * 64); + for (int j = 0; j < 8; ++j) { + int start_row = h * 6 + j - pad_h; + + if (start_row >= 0 && start_row < hin) { + for (int k = 0; k < 8; ++k) { + int start_col = w * 6 + k - pad_w; + + if (start_col >= 0 && start_col < win) { + for (int i = 0; i < 8; i++) { + data_in_tmp[j][k][i] = din_channel[start_row * win * 8 + start_col * 8 + i]; + } + } + } + } + } + + // print_hw(&data_in_tmp[0][0],8,8); + transform_input_f6x6(data_trans_channel, &data_in_tmp[0][0][0]); + + // print_hw(data_trans_channel,8,8); + // exit(0); + + + } + } + } +} + + + + + +static void conv_x86_winograd3x3(const void *din, void *dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const void *weights, const void *bias, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu, float *tmp_work_space) { + int size_in_channel = win * hin; + int size_out_channel = wout * hout; + //! transform input + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = chin > chout ? chin : chout; + + int m = chout; + int n = size_tile; + int k = chin; + + + //! tmp data buffer for input transform + float* tmp_data1 = tmp_work_space; + //! tmp data buffer for dot mul + float* tmp_data2 = tmp_data1 + size_trans_channel * max_ch; + + //SaberTimer t1; + //Context ctx1; + + + for (int i = 0; i < num; ++i) { + + const float* din_batch = static_cast(din) + i * chin * size_in_channel; + float* dout_batch = static_cast(dout) + i * chout * size_out_channel; + + //t1.start(ctx1); + //! transform input Bt * data * B +#if 1 + #pragma omp parallel for schedule(static) + + for (int j = 0; j < chin; ++j) { + + const float* din_channel = din_batch + j * size_in_channel; + float* data_trans_channel = tmp_data1 + j * size_trans_channel; + + for (int h = 0; h < tile_h; h++) { + + for (int w = 0; w < tile_w; w ++) { + //! prepare data 8x8 + //! row 8 + float data_in_tmp[8][8] = {0.f}; + + //memset(data_in_tmp[0], 0, sizeof(float) * 64); + for (int j = 0; j < 8; ++j) { + int start_row = h * 6 + j - pad_h; + + if (start_row >= 0 && start_row < hin) { + for (int k = 0; k < 8; ++k) { + int start_col = w * 6 + k - pad_w; + + if (start_col >= 0 && start_col < win) { + data_in_tmp[j][k] = din_channel[start_row * win + start_col]; + } + } + } + } + + transform_input_f6x6(data_trans_channel, &data_in_tmp[0][0]); + data_trans_channel += 64; + } + } + } + +#endif + + //! end of transform input + +#if 1 + //////////////////////////////////////////////////////////////////////////////// + //! dot mul + //! transpose input, convert from ch_in * tile_h * tile_w * 64 to + //! 
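+        //  Illustrative summary of the step below (commentary only): tmp_data1 holds,
+        //  per input channel, tile_h * tile_w blocks of 64 transformed values.
+        //  MKL_Somatcopy transposes that (chin * size_tile) x 64 matrix so that each of
+        //  the 64 Winograd positions t owns a contiguous (chin x size_tile) matrix D_t.
+        //  cblas_sgemm_batch then issues 64 independent products
+        //      C_t (chout x size_tile) = W_t (chout x chin) * D_t (chin x size_tile),
+        //  one per position, and the second Somatcopy transposes the results back to
+        //  (chout * size_tile) x 64 for the output transform.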
64 * ch_in * tile_h * tile_w + int hblock = 16; + int m_round = hblock * ((chout + hblock - 1) / hblock); + int stride_a = m_round * chin; + int stride_b = chin * size_tile; + int stride_c = chout * size_tile; +#if 1 + MKL_Somatcopy('R', 'T', stride_b, 64, 1.f, tmp_data1, 64, tmp_data2, stride_b); +#endif + // transpose(tmp_data2, tmp_data1, 64, stride_b); + + + CBLAS_TRANSPOSE trans[1] = {CblasNoTrans}; + int m_array[1] = {chout}; + int n_array[1] = {size_tile}; + int k_array[1] = {chin}; + int lda_array[1] = {chin}; + int ldb_array[1] = {size_tile}; + int ldc_array[1] = {size_tile}; + float alpha_array[1] = {1.f}; + float beta_array[1] = {0.f}; + const float* ptr_a_array[64]; + const float* ptr_b_array[64]; + float* ptr_c_array[64]; + int group_size[1] = {64}; + + for (int l = 0; l < 64; ++l) { + ptr_a_array[l] = static_cast(weights) + l * chout * chin; + ptr_b_array[l] = tmp_data2 + l * stride_b; + ptr_c_array[l] = tmp_data1 + l * stride_c; + } + + cblas_sgemm_batch(CblasRowMajor, trans, trans, m_array, n_array, k_array, alpha_array, ptr_a_array, + lda_array, ptr_b_array, ldb_array, beta_array, ptr_c_array, ldc_array, 1, group_size); + + //! transpose output, convert from 64 * ch_out * tile_h * tile_w to + //! ch_out * tile_h * tile_w * 64 + // transpose(tmp_data2, tmp_data1, stride_c, 64); +#if 1 + MKL_Somatcopy('R', 'T', 64, stride_c, 1.f, tmp_data1, stride_c, tmp_data2, 64); +#endif + //! end of dot mul +#endif + +#if 1 + /////////////////////////////////////////////////////////////////////////////// + //! transform output + #pragma omp parallel for schedule(static) + + for (int i = 0; i < chout; ++i) { + + float bias_value = flag_bias ? static_cast(bias)[i] : 0.f; + float* dout_tmp = tmp_data2 + i * size_trans_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + for (int h = 0; h < tile_h; ++h) { + for (int w = 0; w < tile_w; ++w) { + + float out_tmp[6][6]; + + transform_output_f6x6(out_tmp[0], dout_tmp, bias_value); + dout_tmp += 64; + + for (int j = 0; j < 6; ++j) { + int end_row = h * 6 + j; + + if (end_row < hout) { + for (int k = 0; k < 6; ++k) { + int end_col = w * 6 + k; + + if (end_col < wout) { + if (flag_relu) { + dout_channel[end_row * wout + end_col] = out_tmp[j][k] > 0.f ? out_tmp[j][k] : 0.f; + } else { + dout_channel[end_row * wout + end_col] = out_tmp[j][k]; + } + } + } + } + } + } + } + } + + //! 
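+    //  Illustrative summary (commentary only): the loop above is the inverse Winograd
+    //  transform out = A^T * M * A, shrinking each accumulated 8x8 block to a 6x6
+    //  output patch; border patches are clipped against hout/wout, the per-channel bias
+    //  is added inside transform_output_f6x6, and ReLU is fused when flag_relu is set.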
end of transform output +#endif + //t1.end(ctx1); + //LOG(INFO) << "winograd conv transform output time: " << t1.get_average_ms(); + } +} + + +template <> +SaberStatus SaberConvWinogradFloat::dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param) { + ConvParam* conv_param = ¶m.conv_param; + int batch_size = inputs[0]->num(); + int in_c = inputs[0]->channel(); + int in_h = inputs[0]->height(); + int in_w = inputs[0]->width(); + int out_c = outputs[0]->channel(); + int out_h = outputs[0]->height(); + int out_w = outputs[0]->width(); + int kernel_h = conv_param->weight()->height(); + int kernel_w = conv_param->weight()->width(); + int in_stride = in_h * in_w; + int out_stride = out_h * out_w; + int group = conv_param->group; + int weight_size_per_group = (out_c / group) * (in_c / group) * kernel_h * kernel_w; + const float* bias_ptr = nullptr; + + if (conv_param->bias() != nullptr && conv_param->bias()->valid_size() > 0) { + bias_ptr = static_cast(conv_param->bias()->data()); + } + + bool with_relu = conv_param->activation_param.active == Active_relu; + + + const float* din = (const float*)inputs[0]->data(); + float* dout = (float*)outputs[0]->mutable_data(); + + conv_x86_winograd3x3(din, dout, batch_size, out_c, out_h, out_w, in_c, in_h, in_w, + static_cast(_winor_weights.data()), + bias_ptr, conv_param->pad_w, conv_param->pad_h, bias_ptr != nullptr, with_relu, + static_cast(_winor_temp.mutable_data())); + return SaberSuccess; +} + +} +} diff --git a/saber/funcs/impl/x86/winograd_float.h b/saber/funcs/impl/x86/winograd_float.h new file mode 100644 index 000000000..22e26ac7f --- /dev/null +++ b/saber/funcs/impl/x86/winograd_float.h @@ -0,0 +1,39 @@ + +#ifndef ANAKIN_SABER_FUNCS_IMPL_X86_WINOGRAD_FLOAT_H +#define ANAKIN_SABER_FUNCS_IMPL_X86_WINOGRAD_FLOAT_H +#include "saber/funcs/impl/impl_conv.h" +#include "saber/core/tensor.h" + +namespace anakin { +namespace saber { +template +class SaberConvWinogradFloat : public ImplBase < + X86, OpDtype, ConvEltwiseParam > { +public: + typedef typename DataTrait::Dtype OpDataType; + + SaberConvWinogradFloat() {} + + ~SaberConvWinogradFloat() { + } + + virtual SaberStatus init(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus create(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param, Context& ctx); + + virtual SaberStatus dispatch(const std::vector *>& inputs, + std::vector *>& outputs, + ConvEltwiseParam& param); + +private: + Tensor _winor_weights; + Tensor _winor_temp; + +}; +} +} +#endif //ANAKIN_WINOGRAD_H diff --git a/saber/funcs/impl/x86/x86_utils.h b/saber/funcs/impl/x86/x86_utils.h index a101f86d1..18b69291f 100644 --- a/saber/funcs/impl/x86/x86_utils.h +++ b/saber/funcs/impl/x86/x86_utils.h @@ -20,12 +20,14 @@ #include #include #include -#include +#include +#include +#include "saber/funcs/impl/x86/anakin_thread.h" #include "saber/core/common.h" #include "saber/core/tensor.h" #include "saber/funcs/saber_util.h" -#include "omp.h" +#include "calibrate.h" namespace anakin { namespace saber { @@ -38,589 +40,395 @@ namespace saber { namespace utils { - -/* a bunch of std:: analogues to be compliant with any msvs version - * - * Rationale: msvs c++ (and even some c) headers contain special pragma that - * injects msvs-version check into object files in order to abi-mismatches - * during the static linking. This makes sense if e.g. 
std:: objects are passed - * through between application and library, which is not the case for mkl-dnn - * (since there is no any c++-rt dependent stuff, ideally...). */ - -/* SFINAE helper -- analogue to std::enable_if */ -class VectorPrint { -public: - template - static void print_float(Dtype* target) { - float* f = (float*)target; - printf("size = %d\n", sizeof(Dtype)); - - for (int i = 0; i < sizeof(Dtype) / sizeof(float); i++) { - printf(" %f ,", f[i]); - } - - printf("\n"); - } -}; template static inline void try_expand_clean_tensor(opTensor& tensor, anakin::saber::Shape shape) { - if (utils::try_expand_tensor(tensor, shape)) { + if (try_expand_tensor(tensor, shape)) { memset(tensor.mutable_data(), 0, tensor.valid_size()* type_length(tensor.get_dtype())); }; } -class AlignedUtils { +class ScaleUtils { public: - template - void aligned_last_dim(const Dtype* input, Dtype* output, int input_size, int ori_last_dim, - int aligned_dim) { - for (int row = 0; row < input_size / ori_last_dim; row++) { - for (int col = ori_last_dim; col < aligned_dim; col++) { - output[row * aligned_dim + col] = static_cast(0); - } - } + static void cvt_int32_fp32(int* data, float* scale, int m, int n) { + float* out_data = (float*)(data); - for (int i = 0; i < input_size; i++) { - int row = i / ori_last_dim; - int col = i % ori_last_dim; - output[row * aligned_dim + col] = input[i]; - } - } - template - void unaligned_last_dim(const Dtype* input, Dtype* output, int output_size, int ori_last_dim, - int aligned_dim) { - for (int i = 0; i < output_size; i++) { - int row = i / ori_last_dim; - int col = i % ori_last_dim; - output[i] = input[row * aligned_dim + col]; - } - } - -}; - -class SeqSortedseqTranseUtil { -public: - SeqSortedseqTranseUtil(bool is_reverse = false, bool is_bi = false) - : _is_reverse(is_reverse), - _is_bi(is_bi) {}; - void print_vec(int* in, int size, const char* perfix) { - for (int i = 0; i < size; i++) { - printf("[%s] %d = %d\n", perfix, i, in[i]); - } - } - template - void seq_2_sorted_seq(const Dtype* input, Dtype* output, int word_size) { - // _map_vec.resize(word_sum); - int word_sum = _map_vec.size(); - // std::cout << "word_sum = " << word_sum << std::endl; - - for (int ori_word_id = 0; ori_word_id < word_sum; ++ori_word_id) { - //can param - int word_start = ori_word_id * word_size; - int maped_id = _map_vec[ori_word_id]; - int maped_start = maped_id * word_size; - - for (int word_vec_offset = 0; word_vec_offset < word_size; ++word_vec_offset) { - // std::cout< "< "< "<& data_tensor,float scale){ + CHECK_EQ(data_tensor.get_dtype(), AK_FLOAT) << "input must be fp32"; + size_t length=data_tensor.valid_size(); + float* in_data = static_cast(data_tensor.data()); + for (size_t i = 0; i < length; i++){ + in_data[i] = in_data[i]*scale; } } - template - void sorted_seq_2_seq(const Dtype* input, Dtype* output, int hidden_size, - int alligned_hidden_size) { - int word_sum = _map_vec.size(); - - for (int ori_word_id = 0; ori_word_id < word_sum; ori_word_id++) { - //can param - int word_start = ori_word_id * hidden_size; - int maped_id = _map_vec[ori_word_id]; - int maped_start = maped_id * alligned_hidden_size; - - for (int word_vec_offset = 0; word_vec_offset < hidden_size; word_vec_offset++) { - // std::cout< "<& out_tensor, const Tensor& in_tensor){ + CHECK_EQ(in_tensor.get_dtype(), AK_UINT8) << "input must be fp32"; + CHECK_EQ(in_tensor.get_scale().size(),1); + CHECK_EQ(out_tensor.get_dtype(), AK_FLOAT) << "input must be fp32"; + size_t length = in_tensor.valid_size(); + uint8_t* 
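+        // Commentary on the conversion in this function (example values are hypothetical):
+        // unsigned 8-bit data is rescaled with scale * (127.f / 255.f), so with a stored
+        // scale of 0.02 a raw value of 255 maps back to 255 * 0.02 * 127 / 255 = 2.54.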
in_data = static_cast(in_tensor.data()); + float* out_data = static_cast(out_tensor.data()); + float scale=in_tensor.get_scale()[0]*(127.f/255.f); + for (size_t i = 0; i < length; i++){ + out_data[i] = (float)in_data[i] * scale; } } - /** - * return whether need to transform - * @param offset_vec - * @param emit_offset_vec - * @param emit_length - * @return - */ - bool get_sorted_map(std::vector& offset_vec, - std::vector& emit_offset_vec, int& emit_length) { - int batch_size = offset_vec.size() - 1; - int word_sum = offset_vec[offset_vec.size() - 1]; - std::vectorlength_vec(batch_size); - _length_index.resize(batch_size); - - if (batch_size == 1) { - emit_length = offset_vec[1] - offset_vec[0]; - emit_offset_vec.resize(emit_length + 1); - - for (int i = 0; i <= emit_length; i++) { - emit_offset_vec[i] = i; - } - return false; + static void scale_fp32_int8_without_scale(Tensor& out_tensor, const Tensor& in_tensor) { + CHECK_EQ(out_tensor.get_dtype(), AK_INT8) << "output must be int8"; + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be fp32"; + CHECK_EQ(in_tensor.get_scale().size(), 0) << "input no scale is perfer"; + size_t length = in_tensor.valid_shape().count(); + const float* in_data = static_cast(in_tensor.data()); + float* out_data = static_cast(out_tensor.data()); + float max = -1e10; + + for (size_t i = 0; i < length; i++) { + const float temp = fabsf(in_data[i]); + max = max > temp ? max : temp; } - int max_len = 0; + float scale_value = 127.f / max; - for (int i = 0; i < offset_vec.size() - 1; ++i) { - int len = offset_vec[i + 1] - offset_vec[i]; - max_len = max_len > len ? max_len : len; - length_vec[i] = len; - _length_index[i] = i; + for (size_t i = 0; i < length; i++) { + out_data[i] = static_cast(roundf((float)in_data[i] * scale_value)); } - emit_length = max_len; + out_tensor.set_scale({1.f / scale_value}); + } + + static void get_tensor_scale(const Tensor& tensor) { + CHECK_EQ(tensor.get_dtype(), AK_FLOAT); + size_t length = tensor.valid_shape().count(); + float* data = static_cast(tensor.data()); + float max = -1e10; - if (max_len == 1) { - emit_offset_vec.push_back(0); - emit_offset_vec.push_back(emit_length * batch_size); - return false; + for (size_t i = 0; i < length; i++) { + const float temp = fabsf(data[i]); + max = max > temp ? max : temp; } + LOG(FATAL) << "not impl"; + } + static float get_fp32_max(const float* input, size_t size) { + float max = -1e10; + for (size_t i = 0; i < size; i++) { + const float temp = fabsf(input[i]); + max = max > temp ? 
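+        // Worked example (hypothetical values) of the symmetric int8 scheme used by
+        // scale_fp32_int8_without_scale above: if max|x| = 6.35, the multiplier is
+        // 127 / 6.35 = 20, so x = 0.5 quantizes to round(0.5 * 20) = 10 and the tensor
+        // scale stored for dequantization is 1 / 20 = 0.05.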
max : temp; + } + return max; + } - std::sort(_length_index.begin(), _length_index.end(), [&length_vec](int i1, int i2) { - return length_vec[i1] > length_vec[i2]; - }); + static SaberStatus get_tensor_scale(std::vector& vector_scale, + const Tensor& tensor, const int axis, bool reverse = false) { - emit_offset_vec.resize(max_len + 1); - _map_vec.resize(word_sum); + int out_dims = tensor.valid_shape()[axis]; - int target_word_id = 0; - std::vector length_vec_cnt = length_vec; + long long inner_dim = tensor.count_valid(axis + 1, tensor.dims()); + const float* in_data = (const float*)(tensor.data()); + const float eps = 1e-5; - for (int word_id_in_seq = 0; word_id_in_seq < max_len; word_id_in_seq++) { - emit_offset_vec[word_id_in_seq] = target_word_id; + if (reverse == false) { + vector_scale.resize(out_dims); - for (int batch_id = 0; batch_id < batch_size; batch_id++) { - int old_batch_id = _length_index[batch_id]; + for (int c = 0; c < out_dims; ++c) { + float max_val = -1e20; - if (length_vec_cnt[old_batch_id] > 0) { - int inner_word_id_in_seq = word_id_in_seq; + for (int i = 0; i < inner_dim; ++i) { + float read_data = fabs(in_data[i]); + max_val = (read_data > max_val) ? read_data : max_val; + } - if (_is_reverse) { - inner_word_id_in_seq = length_vec[old_batch_id] - 1 - word_id_in_seq; - } + vector_scale[c] = (max_val) / 127.f; - int old_word_id = offset_vec[old_batch_id] + inner_word_id_in_seq; - _map_vec[old_word_id] = target_word_id; - // printf("map %d -> %d\n",old_word_id,target_word_id); - length_vec_cnt[old_batch_id]--; - target_word_id++; - } else { + in_data += inner_dim; + } + } else { + vector_scale.resize(inner_dim); - break; + for (int i = 0; i < inner_dim; ++i) { + float max_val = -1e20; + + for (int c = 0; c < out_dims; ++c) { + float read_data = fabs(in_data[c * inner_dim + i]); + max_val = (read_data > max_val) ? 
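+                // Commentary: get_tensor_scale produces one scale per slice of `axis`;
+                // with reverse == false each of the out_dims slices of length inner_dim
+                // gets scale[c] = max|x_c| / 127 (the per-output-channel convention used
+                // by helpers such as scale_fc_weights_to_nchw_host below), while
+                // reverse == true takes the maximum across slices for every inner index.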
read_data : max_val; } + + vector_scale[i] = max_val / 127.f; } } - - // print_vec(_map_vec.data(),word_sum,"map"); - emit_offset_vec[max_len] = word_sum; - return true; + return SaberSuccess; } + static SaberStatus get_tensor_scale_u8(std::vector& vector_scale, + const Tensor& tensor, const int axis, bool reverse = false) { + LOG(FATAL) << "not impl"; + return SaberSuccess; + } -private: - // std::vector _length_vec; - std::vector _length_index; - std::vector _map_vec; - bool _is_reverse; - bool _is_bi; - -}; - -inline int round_up(int k, int c) { - return ((k + c - 1) / c) * c; -} + static SaberStatus scale_fc_weights_to_nchw_host(Tensor& out_tensor, + const Tensor& in_tensor) { + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be ak_float"; + CHECK_EQ(out_tensor.get_dtype(), AK_INT8) << "output must be int 8"; + std::vector vector_weight_scale; + get_tensor_scale(vector_weight_scale, in_tensor, 2); + int oc = out_tensor.height(); + int other = out_tensor.width(); + const float* in_weight_data = (const float*)in_tensor.data(); + char* out_weight_data = (char*)out_tensor.mutable_data(); -inline int div_up(int k, int c) { - return (k + c - 1) / c; -} + for (int idx = 0; idx < oc * other; ++idx) { -template struct enable_if {}; -template struct enable_if { - typedef T type; -}; + int n = idx / other; -/* analogue std::conditional */ -template struct conditional {}; -template struct conditional { - typedef T type; -}; -template struct conditional { - typedef F type; -}; + out_weight_data[idx] = static_cast(in_weight_data[idx] / vector_weight_scale[n]); -template struct conditional3 {}; -template -struct conditional3 { - typedef T type; -}; -template -struct conditional3 { - typedef FT type; -}; -template -struct conditional3 { - typedef FF type; -}; - -template struct conditional_v {}; -template struct conditional_v { - static constexpr U value = t; -}; -template struct conditional_v { - static constexpr U value = f; -}; - -template struct remove_reference { - typedef T type; -}; -template struct remove_reference { - typedef T type; -}; -template struct remove_reference < T&& > { - typedef T type; -}; + } -template -inline const T& min(const T& a, const T& b) { - return a < b ? a : b; -} + out_tensor.set_scale(vector_weight_scale); -template -inline const T& max(const T& a, const T& b) { - return a > b ? a : b; -} + return SaberSuccess; + } -template -inline T&& forward(typename utils::remove_reference::type& t) { - return static_cast < T && >(t); -} -template -inline T&& forward(typename utils::remove_reference::type&& t) { - return static_cast < T && >(t); -} + static SaberStatus scale_fc_weights_to_nchw_host_u8(Tensor& out_tensor, + const Tensor& in_tensor) { + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be ak_float"; + CHECK_EQ(out_tensor.get_dtype(), AK_UINT8) << "output must be int 8"; + std::vector vector_weight_scale; + get_tensor_scale(vector_weight_scale, in_tensor, 2); + int oc = out_tensor.height(); + int other = out_tensor.width(); + const float* in_weight_data = (const float*)in_tensor.data(); + char* out_weight_data = (char*)out_tensor.mutable_data(); -template -inline typename remove_reference::type zero() { - auto zero = typename remove_reference::type(); - return zero; -} + for (int idx = 0; idx < oc * other; ++idx) { -template -inline bool everyone_is(T val, P item) { - return val == item; -} -template -inline bool everyone_is(T val, P item, Args... 
item_others) { - return val == item && everyone_is(val, item_others...); -} + int n = idx / other; -template -inline bool one_of(T val, P item) { - return val == item; -} -template -inline bool one_of(T val, P item, Args... item_others) { - return val == item || one_of(val, item_others...); -} + out_weight_data[idx] = static_cast(in_weight_data[idx] / vector_weight_scale[n]); -template -inline bool any_null(Args... ptrs) { - return one_of(nullptr, ptrs...); -} + } -inline bool implication(bool cause, bool effect) { - return !cause || effect; -} + out_tensor.set_scale(vector_weight_scale); -template -inline void array_copy(T* dst, const T* src, size_t size) { - for (size_t i = 0; i < size; ++i) { - dst[i] = src[i]; + return SaberSuccess; } -} -template -inline bool array_cmp(const T* a1, const T* a2, size_t size) { - for (size_t i = 0; i < size; ++i) if (a1[i] != a2[i]) { - return false; + static SaberStatus scale_gemm_xw_weights_to_nchw_host(Tensor& out_tensor, + const Tensor& in_tensor, bool is_ic_oc = true) { + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be ak_float"; + CHECK_EQ(out_tensor.get_dtype(), AK_INT8) << "output must be int 8"; + std::vector vector_weight_scale; + get_tensor_scale(vector_weight_scale, in_tensor, 2, is_ic_oc); + int other = in_tensor.width(); + int k = in_tensor.height(); + if (!is_ic_oc){ + k = in_tensor.width(); + other = in_tensor.height(); } + CHECK_EQ(vector_weight_scale.size(),other); - return true; -} + const float* in_weight_data = (const float*)in_tensor.data(); + char* out_weight_data = (char*)out_tensor.mutable_data(); -template -inline void array_set(T* arr, const U& val, size_t size) { - for (size_t i = 0; i < size; ++i) { - arr[i] = static_cast(val); - } -} + if (is_ic_oc) { + for (int idx = 0; idx < k * other; ++idx) { -namespace product_impl { + int n = idx % other; -template struct int2type {}; + out_weight_data[idx] = static_cast(in_weight_data[idx] / vector_weight_scale[n]); -template -constexpr int product_impl(const T* arr, int2type<0>) { - return arr[0]; -} + } + }else{ + for (int idx = 0; idx < k * other; ++idx) { -template -inline T product_impl(const T* arr, int2type) { - return arr[0] * product_impl(arr + 1, int2type < num - 1 > ()); -} -} + int n = idx / k; -template -inline T array_product(const T* arr) { - return product_impl::product_impl(arr, product_impl::int2type < num - 1 > ()); -} + out_weight_data[idx] = static_cast(in_weight_data[idx] / vector_weight_scale[n]); -template -inline R array_product(const T* arr, size_t size) { - R prod = 1; + } + } - for (size_t i = 0; i < size; ++i) { - prod *= arr[i]; - } + out_tensor.set_scale(vector_weight_scale); - return prod; -} + return SaberSuccess; + } -template -inline typename remove_reference::type div_up(const T a, const U b) { - assert(b); - return (a + b - 1) / b; -} + static SaberStatus scale_bias_fp32_int32(Tensor& out_tensor, + const Tensor& in_tensor) { + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be ak_float"; + CHECK_EQ(out_tensor.get_dtype(), AK_INT32) << "output must be int 8"; + CHECK_EQ(out_tensor.get_scale().size(), + out_tensor.valid_size()) << "bias scale size must equal bias size"; + std::vector vector_bias_scale = out_tensor.get_scale(); + const float* in_data = static_cast(in_tensor.data()); + int* out_data = static_cast(out_tensor.mutable_data()); + + for (int idx = 0; idx < in_tensor.valid_size(); ++idx) { + out_data[idx] = static_cast(in_data[idx] / vector_bias_scale[idx]); + } -template -inline typename remove_reference::type 
rnd_up(const T a, const U b) { - return div_up(a, b) * b; -} + return SaberSuccess; + } -template -inline typename remove_reference::type rnd_dn(const T a, const U b) { - return (a / b) * b; -} + static SaberStatus scale_conv_weights_to_nchw_host(Tensor& out_tensor, + const Tensor& in_tensor) { + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be ak_float"; + CHECK_EQ(out_tensor.get_dtype(), AK_INT8) << "output must be int 8"; + std::vector vector_weight_scale; + get_tensor_scale(vector_weight_scale, in_tensor, 0); + int o_num = out_tensor.num(); + int o_channel = out_tensor.channel(); + int o_height = out_tensor.height(); + int o_width = out_tensor.width(); -template -inline U this_block_size(const T offset, const U max, const V block_size) { - assert(offset < max); - // TODO (Roma): can't use nstl::max() due to circular dependency... we - // need to fix this - const T block_boundary = offset + block_size; + int out_n_stride = o_channel * o_height * o_width; + int out_c_stride = o_height * o_width; + int out_h_stride = o_width; - if (block_boundary > max) { - return max - offset; - } else { - return block_size; - } -} + Shape in_stride = in_tensor.get_stride(); + const float* in_weight_data = (const float*)in_tensor.data(); + char* out_weight_data = (char*)out_tensor.mutable_data(); + for (int idx = 0; idx < o_num * o_channel * o_height * o_width; ++idx) { + int n = (idx / (out_n_stride)) % o_num; -template -inline void balance211(T n, U team, U tid, T& n_start, T& n_end) { - T n_min = 1; - T& n_my = n_end; + out_weight_data[idx] = static_cast(in_weight_data[idx] / vector_weight_scale[n]); - if (team <= 1 || n == 0) { - n_start = 0; - n_my = n; - } else if (n_min == 1) { - // team = T1 + T2 - // n = T1*n1 + T2*n2 (n1 - n2 = 1) - T n1 = div_up(n, (T)team); - T n2 = n1 - 1; - T T1 = n - n2 * (T)team; - n_my = (T)tid < T1 ? n1 : n2; - n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2; - } + } - n_end += n_start; -} + out_tensor.set_scale(vector_weight_scale); -template -inline T nd_iterator_init(T start) { - return start; -} -template -inline T nd_iterator_init(T start, U& x, const W& X, Args&& ... tuple) { - start = nd_iterator_init(start, utils::forward(tuple)...); - x = start % X; - return start / X; -} + return SaberSuccess; + } -inline bool nd_iterator_step() { - return true; -} -template -inline bool nd_iterator_step(U& x, const W& X, Args&& ... tuple) { - if (nd_iterator_step(utils::forward(tuple)...)) { - x = (x + 1) % X; - return x == 0; + static inline char secur_cast2char(float value) { + float temp = roundf(value); + int temp_int = (int)temp; + temp_int = temp_int > 127 ? 127 : temp_int; + temp_int = temp_int < -128 ? 
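+        // Worked example (hypothetical inputs): secur_cast2char rounds and then
+        // saturates to the int8 range, so 200.3f -> 127, -150.7f -> -128, 3.4f -> 3.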
-128 : temp_int; + return (char)temp_int; } + static void scale_fp32_int8(Tensor& out_tensor, const Tensor& in_tensor) { + CHECK_EQ(out_tensor.get_dtype(), AK_INT8) << "output must be int8"; + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be fp32"; + auto scale_vec = in_tensor.get_scale(); + CHECK_EQ(scale_vec.size(), 1) << "scale must = 1"; + float scale_value = 1.f / in_tensor.get_scale_data()[0]; + int size = in_tensor.valid_size(); + char* out_ptr = static_cast(out_tensor.mutable_data()); + const float* in_ptr = static_cast(in_tensor.data()); - return false; -} + for (int i = 0; i < size; i++) { -template -inline void parallel_nd(const T0 D0, const T1 D1, F f) { - const size_t work_amount = (size_t)D0 * D1; + out_ptr[i] = secur_cast2char(in_ptr[i] * scale_value); + } - if (work_amount == 0) { - return; } - #pragma omp parallel - { - const int ithr = omp_get_thread_num(); - const int nthr = omp_get_num_threads(); - size_t start{0}, end{0}; - balance211(work_amount, nthr, ithr, start, end); - T0 d0{0}; - T1 d1{0}; - nd_iterator_init(start, d0, D0, d1, D1); - - for (size_t iwork = start; iwork < end; ++iwork) { - f(d0, d1); - nd_iterator_step(d0, D0, d1, D1); + static void scale_fp32_int8(Tensor& out_tensor , const float* input, size_t size){ + CHECK_EQ(out_tensor.get_dtype(), AK_INT8) << "output must be int8"; + float t_max=get_fp32_max(input,size); + float scale_value=127.f/t_max; + char* out_ptr = static_cast(out_tensor.mutable_data()); + for (int i = 0; i < size; i++) { + out_ptr[i] = secur_cast2char(input[i] * scale_value); } + out_tensor.set_scale({1.f/scale_value}); } -} - -template -inline void parallel_nd(const T0 D0, const T1 D1, const T2 D2, F f) { - const size_t work_amount = (size_t)D0 * D1 * D2; - if (work_amount == 0) { - return; - } + static void scale_fp32_uint8(Tensor& out_tensor, Tensor& in_tensor) { + CHECK_EQ(out_tensor.get_dtype(), AK_UINT8) << "output must be int8"; + CHECK_EQ(in_tensor.get_dtype(), AK_FLOAT) << "input must be fp32"; + auto scale_vec = in_tensor.get_scale(); + CHECK_EQ(scale_vec.size(), 1) << "scale must = 1"; + float scale_value = 1.f / (in_tensor.get_scale_data()[0]*(127.f/255.f)); + int size = in_tensor.valid_size(); + uint8_t * out_ptr = static_cast(out_tensor.mutable_data()); + const float* in_ptr = static_cast(in_tensor.data()); - #pragma omp parallel - { - const int ithr = omp_get_thread_num(); - const int nthr = omp_get_num_threads(); - size_t start{0}, end{0}; - balance211(work_amount, nthr, ithr, start, end); - T0 d0{0}; - T1 d1{0}; - T2 d2{0}; - nd_iterator_init(start, d0, D0, d1, D1, d2, D2); - - for (size_t iwork = start; iwork < end; ++iwork) { - f(d0, d1, d2); - nd_iterator_step(d0, D0, d1, D1, d2, D2); + for (int i = 0; i < size; i++) { + out_ptr[i] = static_cast(in_ptr[i] * scale_value); } } -} -template -inline bool nd_iterator_jump(U& cur, const U end, W& x, const Y& X) { - U max_jump = end - cur; - U dim_jump = X - x; - - if (dim_jump <= max_jump) { - x = 0; - cur += dim_jump; - return true; - } else { - cur += max_jump; - x += max_jump; - return false; - } -} +// static void scale_int8_fp32(Tensor& out_tensor, Tensor& in_tensor) { +// CHECK_EQ(out_tensor.get_dtype(), AK_FLOAT) << "output must be fp32"; +// CHECK_EQ(in_tensor.get_dtype(), AK_INT8) << "input must be int8"; +// float scale_value = 1.f / in_tensor.get_scale()[0]; +// int size = in_tensor.valid_size(); +// char* out_ptr = static_cast(out_tensor.mutable_data()); +// const float* in_ptr = static_cast(in_tensor.data()); +// +// for (int i = 0; i < size; i++) 
{ +// out_ptr[i] = static_cast(roundf(in_ptr[i] * scale_value)); +// } +// } +}; -template -inline bool nd_iterator_jump(U& cur, const U end, W& x, const Y& X, - Args&& ... tuple) { - if (nd_iterator_jump(cur, end, utils::forward(tuple)...)) { - x = (x + 1) % X; - return x == 0; - } +template +static void reorder_nchwc_nchw(Tensor& input, + Tensor& output) { - return false; -} + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; -template -struct array_offset_calculator { - template - array_offset_calculator(Telem* base, Targs... Fargs) : _dims{ Fargs... } { - _base_ptr = base; - } + Shape shape = output.valid_shape(); + int n_value = shape[0]; + int c_value = shape[1]; + int h_value = shape[2]; + int w_value = shape[3]; + Shape shape_input = input.valid_shape(); + int aligned_length = shape_input.get_layout_aligned_length(); + CHECK_GT(aligned_length, 0) << "input aligned should > 0"; + int c_round_divk = shape_input[1]; - template - inline Telem& operator()(Targs... Fargs) { - return *(_base_ptr + _offset(1, Fargs...)); - } + c_round_divk = (shape_input.channel() + aligned_length - 1) / aligned_length; -private: - template - inline size_t _offset(size_t const dimension, size_t element) { - return element; - } + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(4) schedule(static) - template - inline size_t _offset(size_t const dimension, size_t theta, size_t element) { - return element + (_dims[dimension] * theta); + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + //#pragma ivdep + for (int w = 0; w < w_value; ++w) { + int round_c = c / aligned_length; + int remainder_c = c % aligned_length; + int input_idx = n * c_round_divk * h_value * w_value * aligned_length + round_c * h_value * + w_value * aligned_length + + h * w_value * aligned_length + w * aligned_length + remainder_c; + int output_idx = n * c_value * h_value * w_value + c * h_value * w_value + + h * w_value + w ; + + *(output_ptr + output_idx) = input_ptr[input_idx]; + } + } + } } - template - inline size_t _offset(size_t const dimension, size_t theta, size_t element, - Targs... 
Fargs) { - size_t t_prime = element + (_dims[dimension] * theta); - return _offset(dimension + 1, t_prime, Fargs...); - } +} - Telem* _base_ptr; - const int _dims[Tdims]; -}; } // namespace utils @@ -646,32 +454,67 @@ inline void zfree(void* p) { #endif } -struct c_compatible { - enum { default_alignment = 4096 }; +//struct c_compatible { +// enum { default_alignment = 4096 }; +// +// static void* operator new (size_t sz) { +// return zmalloc(sz, default_alignment); +// } +// +// static void* operator new (size_t sz, void* p) { +// UNUSED(sz); +// return p; +// } +// +// static void* operator new[](size_t sz) { +// return zmalloc(sz, default_alignment); +// } +// +// static void operator delete (void* p) { +// zfree(p); +// } +// +// static void operator delete[](void* p) { +// zfree(p); +// } +//}; - static void* operator new (size_t sz) { - return zmalloc(sz, default_alignment); - } +inline void yield_thread() { } - static void* operator new (size_t sz, void* p) { - UNUSED(sz); - return p; - } +// reorder weight layout from NCHW(oc, ic, kh, kw) to OIhw16o16i +inline void weight_reorder_OIhw16o16i(Tensor& input, + Tensor& output) { + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = input.valid_shape(); + int oc_value = shape[0], ic_value = shape[1], kh_value = shape[2], kw_value = shape[3]; - static void* operator new[](size_t sz) { - return zmalloc(sz, default_alignment); - } + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(6) schedule(static) - static void operator delete (void* p) { - zfree(p); - } + for (int oc_idx = 0; oc_idx < oc_value / 16; ++oc_idx) { + for (int ic_idx = 0; ic_idx < ic_value / 16; ++ic_idx) { + for (int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + for (int oc = 0; oc < 16; ++oc) { + for (int ic = 0; ic < 16; ++ic) { + int input_idx = (oc_idx * 16 + oc) * ic_value * kh_value * kw_value + + (ic_idx * 16 + ic) * kh_value * kw_value + + kh * kw_value + kw; + int output_idx = oc_idx * ic_value / 16 * kh_value * kw_value * 16 * 16 + + ic_idx * kh_value * kw_value * 16 * 16 + + kh * kw_value * 16 * 16 + + kw * 16 * 16 + oc * 16 + ic; - static void operator delete[](void* p) { - zfree(p); + *(output_ptr + output_idx) = *(input_ptr + input_idx); + } + } + } + } + } } -}; - -inline void yield_thread() { } +} // reorder weight layout from NCHW(oc, ic, kh, kw) to OIhw16i16o inline void weight_reorder_OIhw16i16o(Tensor& input, @@ -708,6 +551,90 @@ inline void weight_reorder_OIhw16i16o(Tensor& input, } } +inline void weight_reorder_OIhw8o8i(Tensor& input, + Tensor& output) { + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = input.valid_shape(); + int oc_value = shape[0], ic_value = shape[1], kh_value = shape[2], kw_value = shape[3]; + + Shape new_shape({utils::round_up(oc_value, 8), utils::round_up(ic_value, 8), kh_value, kw_value}, + Layout_NCHW); + + if ((oc_value % 8 != 0) || (ic_value % 8 != 0)) { + output.re_alloc(new_shape, AK_FLOAT); + } + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(6) schedule(static) + + for (int oc_idx = 0; oc_idx < new_shape[0] / 8; ++oc_idx) { + for (int ic_idx = 0; ic_idx < new_shape[1] / 8; ++ic_idx) { + for 
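+            // Commentary with a hypothetical index: OIhw8o8i stores weights in 8x8
+            // (oc, ic) blocks ordered as (oc/8, ic/8, kh, kw, oc%8, ic%8); e.g. element
+            // (oc = 17, ic = 5, kh, kw) lands in block (2, 0) at intra-block position
+            // (1, 5). Out-of-range positions are zero-filled when oc or ic is not a
+            // multiple of 8.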
(int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + for (int oc = 0; oc < 8; ++oc) { + for (int ic = 0; ic < 8; ++ic) { + int input_idx = (oc_idx * 8 + oc) * ic_value * kh_value * kw_value + + (ic_idx * 8 + ic) * kh_value * kw_value + + kh * kw_value + kw; + int output_idx = oc_idx * new_shape[1] / 8 * kh_value * kw_value * 8 * 8 + + ic_idx * kh_value * kw_value * 8 * 8 + + kh * kw_value * 8 * 8 + + kw * 8 * 8 + oc * 8 + ic; + + *(output_ptr + output_idx) = ((oc_idx * 8 + oc) < oc_value && (ic_idx * 8 + ic) < ic_value) + ? *(input_ptr + input_idx) : 0; + } + } + } + } + } + } +} + +inline void weight_reorder_OIhw8o8i_ak(Tensor& input, + Tensor& output) { + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = input.valid_shape(); + int oc_value = shape[1], ic_value = shape[0], kh_value = shape[2], kw_value = shape[3]; + + Shape new_shape({utils::round_up(oc_value, 8), utils::round_up(ic_value, 8), kh_value, kw_value}, + Layout_NCHW); + + if ((oc_value % 8 != 0) || (ic_value % 8 != 0)) { + output.re_alloc(new_shape, AK_FLOAT); + } + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(6) schedule(static) + + for (int oc_idx = 0; oc_idx < new_shape[0] / 8; ++oc_idx) { + for (int ic_idx = 0; ic_idx < new_shape[1] / 8; ++ic_idx) { + for (int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + for (int oc = 0; oc < 8; ++oc) { + for (int ic = 0; ic < 8; ++ic) { + int input_idx = (ic_idx * 8 + ic) * ic_value * kh_value * kw_value + + (oc_idx * 8 + oc) * kh_value * kw_value + + kh * kw_value + kw; + int output_idx = oc_idx * new_shape[1] / 8 * kh_value * kw_value * 8 * 8 + + ic_idx * kh_value * kw_value * 8 * 8 + + kh * kw_value * 8 * 8 + + kw * 8 * 8 + oc * 8 + ic; + + *(output_ptr + output_idx) = ((oc_idx * 8 + oc) < oc_value && (ic_idx * 8 + ic) < ic_value) + ? 
*(input_ptr + input_idx) : 0; + } + } + } + } + } + } +} + // reorder weight layout from NCHW(oc, ic, kh, kw) to OIhw8i8o inline void weight_reorder_OIhw8i8o(Tensor& input, Tensor& output) { @@ -751,6 +678,49 @@ inline void weight_reorder_OIhw8i8o(Tensor& input, } } +// reorder weight layout from NCHW(oc, ic, kh, kw) to OIhw8i8o +inline void weight_reorder_nchw2nchw8o8i(Tensor& input, + Tensor& output) { + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = input.valid_shape(); + int oc_value = shape[0], ic_value = shape[1], kh_value = shape[2], kw_value = shape[3]; + + Shape new_shape({utils::round_up(oc_value, 8), utils::round_up(ic_value, 8), kh_value, kw_value}, + Layout_NCHW); + + if ((oc_value % 8 != 0) || (ic_value % 8 != 0)) { + output.re_alloc(new_shape, AK_FLOAT); + } + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + #pragma omp parallel for collapse(6) schedule(static) + + for (int oc_idx = 0; oc_idx < new_shape[0] / 8; ++oc_idx) { + for (int ic_idx = 0; ic_idx < new_shape[1] / 8; ++ic_idx) { + for (int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + for (int oc = 0; oc < 8; ++oc) { + for (int ic = 0; ic < 8; ++ic) { + int input_idx = (oc_idx * 8 + oc) * ic_value * kh_value * kw_value + + (ic_idx * 8 + ic) * kh_value * kw_value + + kh * kw_value + kw; + int output_idx = oc_idx * new_shape[1] / 8 * kh_value * kw_value * 8 * 8 + + ic_idx * kh_value * kw_value * 8 * 8 + + kh * kw_value * 8 * 8 + + kw * 8 * 8 + oc * 8 + ic; + + *(output_ptr + output_idx) = ((oc_idx * 8 + oc) < oc_value && (ic_idx * 8 + ic) < ic_value) + ? *(input_ptr + input_idx) : 0; + } + } + } + } + } + } +} + // reorder weight layout from NCHW(oc, ic, kh, kw) to OIhw4i16o4i inline void weight_reorder_OIhw4i16o4i(Tensor& input, Tensor& output, @@ -861,15 +831,12 @@ inline void weight_reorder_OIhwi16o(Tensor& input, } } - -// reorder weight layout from NCHW(oc, ic, kh, kw) to OIhwi8o inline void weight_reorder_OIhwi8o(Tensor& input, Tensor& output) { - Shape shape = input.shape(); - CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = input.shape(); int oc_value = shape[0], ic_value = shape[1], kh_value = shape[2], kw_value = shape[3]; Shape new_shape({utils::round_up(oc_value, 8) / 8, ic_value, kh_value, kw_value, 8}, @@ -881,22 +848,22 @@ inline void weight_reorder_OIhwi8o(Tensor& input, float* output_ptr = static_cast(output.mutable_data()); const float* input_ptr = static_cast(input.data()); - #pragma omp parallel for collapse(5) schedule(static) - for (int oc_idx = 0; oc_idx < shape[0] / 8; ++oc_idx) { - for (int kh = 0; kh < shape[2]; ++kh) { - for (int kw = 0; kw < shape[3]; ++kw) { - for (int ic = 0; ic < shape[1]; ++ic) { +#pragma omp parallel for collapse(5) schedule(static) + + for (int oc_idx = 0; oc_idx < new_shape[0]; ++oc_idx) { + for (int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + for (int ic = 0; ic < ic_value; ++ic) { for (int oc = 0; oc < 8; ++oc) { - int input_idx = (oc_idx * 8 + oc) * shape[1] * shape[2] * shape[3] + - ic * shape[2] * shape[3] + - kh * shape[3] + kw; - int output_idx = oc_idx * shape[2] * shape[3] * shape[1] * 8 + - kh * shape[3] * shape[1] * 8 + - kw * shape[1] * 8 + + int input_idx = (oc_idx * 8 + oc) * ic_value * kh_value * kw_value + + ic * kh_value * 
kw_value + + kh * kw_value + kw; + int output_idx = oc_idx * kh_value * kw_value * ic_value * 8 + + kh * kw_value * ic_value * 8 + + kw * ic_value * 8 + ic * 8 + oc; - - *(output_ptr + output_idx) = *(input_ptr + input_idx); + *(output_ptr + output_idx) = ((oc_idx * 8 + oc) < oc_value) ? *(input_ptr + input_idx) : 0; } } } @@ -943,7 +910,7 @@ static void weight_reorder_Goihw16g(Tensor& input, char* output_ptr = static_cast(output.mutable_data()); const char* input_ptr = static_cast(input.data()); - #pragma omp parallel for collapse(6) schedule(static) +#pragma omp parallel for collapse(6) schedule(static) for (int g_idx = 0; g_idx < g_value / 16; ++g_idx) { for (int oc_idx = 0; oc_idx < oc_value; ++oc_idx) { @@ -972,8 +939,76 @@ static void weight_reorder_Goihw16g(Tensor& input, } } +// reorder weight layout from NCHW to Goihw8g +static void weight_reorder_Goihw8g(Tensor& input, + Tensor& output) { + Shape shape = input.shape(); + int g_value = shape[0], oc_value = shape[1], ic_value = shape[1], kh_value = shape[2], + kw_value = shape[3]; + + if (input.get_dtype() == AK_FLOAT && output.get_dtype() == AK_FLOAT) { + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + +#pragma omp parallel for collapse(6) schedule(static) + + for (int g_idx = 0; g_idx < g_value / 8; ++g_idx) { + for (int oc_idx = 0; oc_idx < oc_value; ++oc_idx) { + for (int ic_idx = 0; ic_idx < ic_value; ++ic_idx) { + for (int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + for (int g = 0; g < 8; ++g) { + int input_idx = (g_idx * 8 + g) * oc_value * ic_value * kh_value * kw_value + + oc_idx * ic_value * kh_value * kw_value + + ic_idx * kh_value * kw_value + + kh * kw_value + kw; + int output_idx = g_idx * oc_value * ic_value * kh_value * kw_value * 8 + + oc_idx * ic_value * kh_value * kw_value * 8 + + ic_idx * kh_value * kw_value * 8 + + kh * kw_value * 8 + kw * 8 + g; + + *(output_ptr + output_idx) = *(input_ptr + input_idx); + } + } + } + } + } + } + } else if (input.get_dtype() == AK_INT8 && output.get_dtype() == AK_INT8) { + char* output_ptr = static_cast(output.mutable_data()); + const char* input_ptr = static_cast(input.data()); + +#pragma omp parallel for collapse(6) schedule(static) + + for (int g_idx = 0; g_idx < g_value / 8; ++g_idx) { + for (int oc_idx = 0; oc_idx < oc_value; ++oc_idx) { + for (int ic_idx = 0; ic_idx < ic_value; ++ic_idx) { + for (int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + for (int g = 0; g < 8; ++g) { + int input_idx = (g_idx * 8 + g) * oc_value * ic_value * kh_value * kw_value + + oc_idx * ic_value * kh_value * kw_value + + ic_idx * kh_value * kw_value + + kh * kw_value + kw; + int output_idx = g_idx * oc_value * ic_value * kh_value * kw_value * 8 + + oc_idx * ic_value * kh_value * kw_value * 8 + + ic_idx * kh_value * kw_value * 8 + + kh * kw_value * 8 + kw * 8 + g; + + *(output_ptr + output_idx) = *(input_ptr + input_idx); + } + } + } + } + } + } + } else { + ABORT_S() << "error: not supported reorder!"; + } +} + // reorder bias layout from NCHW to 1C11 -static void bias_reorder_nchw(Tensor& input, +static void bias_reorder_nchw(const Tensor& input, Tensor& output, const std::vector& scale) { Shape shape = input.shape(); @@ -983,7 +1018,7 @@ static void bias_reorder_nchw(Tensor& input, int* output_ptr = static_cast(output.mutable_data()); const float* input_ptr = static_cast(input.data()); - #pragma omp parallel for collapse(4) schedule(static) +#pragma omp parallel for 
collapse(4) schedule(static) for (int n_idx = 0; n_idx < n; ++n_idx) { for (int c_idx = 0; c_idx < c; ++c_idx) { @@ -1002,11 +1037,33 @@ static void bias_reorder_nchw(Tensor& input, } } } - } else if (input.get_dtype() == AK_INT32 && output.get_dtype() == AK_INT32) { + } else if (input.get_dtype() == AK_FLOAT && output.get_dtype() == AK_FLOAT) { + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); + CHECK(scale.size()>0); +#pragma omp parallel for collapse(4) schedule(static) + + for (int n_idx = 0; n_idx < n; ++n_idx) { + for (int c_idx = 0; c_idx < c; ++c_idx) { + for (int h_idx = 0; h_idx < h; ++h_idx) { + for (int w_idx = 0; w_idx < w; ++w_idx) { + int input_idx = n_idx * c * h * w + + c_idx * h * w + + h_idx * w + w_idx; + int output_idx = n_idx * c * h * w + + c_idx * h * w + + h_idx * w + w_idx; + float scale_v = scale[c_idx]; + *(output_ptr + output_idx) = (*(input_ptr + input_idx)) * scale_v; + } + } + } + } + }else if (input.get_dtype() == AK_INT32 && output.get_dtype() == AK_INT32) { int* output_ptr = static_cast(output.mutable_data()); const int* input_ptr = static_cast(input.data()); - #pragma omp parallel for collapse(4) schedule(static) +#pragma omp parallel for collapse(4) schedule(static) for (int n_idx = 0; n_idx < n; ++n_idx) { for (int c_idx = 0; c_idx < c; ++c_idx) { @@ -1036,17 +1093,13 @@ inline void input_reorder_nChwc8(Tensor& input, CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; Shape shape = input.valid_shape(); - int n_value = shape[0], c_value = shape[1], h_value = shape[2], w_value = shape[3]; + int n_value = shape.num(), c_value = shape.channel(), h_value = shape.height(), w_value = shape.width(); Shape new_shape({n_value, utils::round_up(c_value, 8) / 8, h_value, w_value, 8}, Layout_NCHW_C8); - if (c_value % 8 != 0) { - output.re_alloc(new_shape, AK_FLOAT); - } - float* output_ptr = static_cast(output.mutable_data()); const float* input_ptr = static_cast(input.data()); - #pragma omp parallel for collapse(5) schedule(static) +#pragma omp parallel for collapse(5) schedule(static) for (int n = 0; n < n_value; ++n) { for (int c_idx = 0; c_idx < new_shape[1]; ++c_idx) { @@ -1066,6 +1119,46 @@ inline void input_reorder_nChwc8(Tensor& input, } } +// reorder input layout from nchw_c8 to NCHW +inline void reorder_nchwc8_nchw(Tensor& input, + Tensor& output) { + + CHECK_EQ(input.get_dtype(), AK_FLOAT) << "only support float type"; + CHECK_EQ(output.get_dtype(), AK_FLOAT) << "only support float type"; + Shape shape = output.valid_shape(); + int n_value = shape[0]; + int c_value = shape[1]; + int h_value = shape[2]; + int w_value = shape[3]; + Shape shape_input = input.valid_shape(); + int c_round_div8 = shape_input[1]; + + if (input.get_layout() == Layout_NCHW_C8R) { + c_round_div8 = (shape_input.channel() + 7) / 8; + } + + float* output_ptr = static_cast(output.mutable_data()); + const float* input_ptr = static_cast(input.data()); +#pragma omp parallel for collapse(4) schedule(static) + for (int n = 0; n < n_value; ++n) { + for (int c = 0; c < c_value; ++c) { + for (int h = 0; h < h_value; ++h) { + for (int w = 0; w < w_value; ++w) { + int round_c = c / 8; + int remainder_c = c % 8; + int input_idx = n * c_round_div8 * h_value * w_value * 8 + round_c * h_value * w_value * 8 + + h * w_value * 8 + w * 8 + remainder_c; + int output_idx = n * c_value * h_value * w_value + c * h_value * w_value + + h * w_value + w ; + + 
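+                    // Worked example (hypothetical sizes, assuming the NCHW_C8R branch):
+                    // with c_value = 20 and h_value = w_value = 4 we get
+                    // c_round_div8 = (20 + 7) / 8 = 3, and channel c = 11 splits into
+                    // round_c = 1, remainder_c = 3, so the blocked source offset is
+                    // ((n * 3 + 1) * 16 + h * 4 + w) * 8 + 3 and the dense NCHW target
+                    // offset is ((n * 20 + 11) * 4 + h) * 4 + w.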
*(output_ptr + output_idx) = input_ptr[input_idx]; + } + } + } + } + +} + // reorder output layout from NCHW(oc, ic, kh, kw) to nChwc8 inline void output_reorder_nChwc8(Tensor& input, Tensor& output) { @@ -1073,34 +1166,47 @@ inline void output_reorder_nChwc8(Tensor& input, input_reorder_nChwc8(input, output); } -inline size_t datatype_size(DataType data_type) { - switch (data_type) { - case AK_FLOAT: - return sizeof(float); - - case AK_INT32: - return sizeof(int32_t); - case AK_HALF: - return sizeof(int16_t); +inline void weight_padding_nhwc(Tensor* input, Tensor* output) { + CHECK_EQ(input->get_dtype(),AK_INT8); + CHECK_EQ(output->get_dtype(),AK_INT8); + Shape shape = input->shape(); + Shape shape_padding = output->shape(); + int oc_value = shape[0], ic_value = shape[1], kh_value = shape[2], kw_value = shape[3]; + int oc_padding = shape_padding[0], ic_padding = shape_padding[1];; - case AK_INT8: - return sizeof(int8_t); + char* output_ptr = static_cast(output->mutable_data()); + const char* input_ptr = static_cast(input->data()); - case AK_UINT8: - return sizeof(uint8_t); +#pragma omp parallel for collapse(4) schedule(static) - case AK_INVALID: - default: - assert(!"unknown data_type"); + for (int oc = 0; oc < oc_padding; ++oc) { + for (int ic = 0; ic < ic_padding; ++ic) { + for (int kh = 0; kh < kh_value; ++kh) { + for (int kw = 0; kw < kw_value; ++kw) { + int input_idx = oc * ic_value * kh_value * kw_value + + ic * kh_value * kw_value + + kh * kw_value + kw; + int output_idx = oc * ic_padding * kh_value * kw_value + + ic * kh_value * kw_value + + kh * kw_value + kw; + + if (oc < oc_value && ic < ic_value) { + *(output_ptr + output_idx) = (*(input_ptr + input_idx)); + } else { + *(output_ptr + output_idx) = 0; + } + } + } + } } - - return 0; } + + } // namespace saber } // namespace anakin -#endif // X86_UTILS_H \ No newline at end of file +#endif // X86_UTILS_H diff --git a/saber/funcs/layer_norm.h b/saber/funcs/layer_norm.h index cddf30d8d..aa26e4007 100644 --- a/saber/funcs/layer_norm.h +++ b/saber/funcs/layer_norm.h @@ -19,6 +19,11 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_layer_norm.h" + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_layer_norm.h" +#endif + #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_layer_norm.h" #endif diff --git a/saber/funcs/lrn.h b/saber/funcs/lrn.h index 874d4a7e2..1dd31bbdd 100644 --- a/saber/funcs/lrn.h +++ b/saber/funcs/lrn.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_LRN_H @@ -28,8 +28,11 @@ #include "saber/funcs/impl/x86/saber_lrn.h" #endif #ifdef USE_ARM_PLACE -//todo -#include "saber/funcs/impl/impl_lrn.h" +#include "saber/funcs/impl/arm/saber_lrn.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_lrn.h" #endif namespace anakin { namespace saber { @@ -61,7 +64,7 @@ class Lrn : public BaseFunc< Param_t& param) override { SaberStatus status; CHECK_EQ(input.size(), 1); - + Shape output_shape = input[0]->valid_shape(); output[0]->set_shape(output_shape); diff --git a/saber/funcs/lstm.h b/saber/funcs/lstm.h index 79b2a82c2..50045d473 100644 --- a/saber/funcs/lstm.h +++ b/saber/funcs/lstm.h @@ -31,7 +31,7 @@ #endif #ifdef USE_ARM_PLACE -#include "saber/funcs/impl/impl_lstm.h" +// #include "saber/funcs/impl/impl_lstm.h" #endif namespace anakin { diff --git a/saber/funcs/lstmp.h b/saber/funcs/lstmp.h new file mode 100644 index 000000000..d3880d55a --- /dev/null +++ b/saber/funcs/lstmp.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_LSTMP_H +#define ANAKIN_SABER_FUNCS_LSTMP_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_lstmp.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_lstmp.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_lstmp.h" + +#endif + +#ifdef USE_ARM_PLACE + +#endif + +namespace anakin { +namespace saber { +template +class Lstmp : public BaseFunc < + TargetType, + OpDtype, + ImplBase, + LstmParam > { +public: + using BaseFunc < + TargetType, + OpDtype, + ImplBase, + LstmParam >::BaseFunc; + + Lstmp() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef LstmParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, + Output_v& output, Param_t& param) override { + + int seqLength = input[0]->num(); + + Shape output_shape = Shape({seqLength, param.project_dim, param.num_direction, 1}, input[0]->get_layout()); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + + if (output.size() >= 2) { + output[1]->set_seq_offset(input[0]->get_seq_offset()); + } + + return output[0]->set_shape_without_layout(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + //this->_impl.push_back(new VenderLstmp _impl.push_back(new VenderLstmp ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberLstmp ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + +} // namespace saber +} // namepace anakin + + +#endif // 
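+// Commentary on the class above: Lstmp follows the same BaseFunc pattern as the other
+// saber functors; compute_output_shape() sizes the output as
+// {seq_len, param.project_dim, param.num_direction, 1} and forwards the input's
+// sequence offsets, and init_impl() currently registers only the SABER_IMPL backend
+// (the vendor branch is commented out).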
ANAKIN_SABER_FUNCS_LSTM_H + diff --git a/saber/funcs/mat_mul.h b/saber/funcs/mat_mul.h index 6925b4219..b1948e66d 100644 --- a/saber/funcs/mat_mul.h +++ b/saber/funcs/mat_mul.h @@ -27,6 +27,10 @@ #include "saber/funcs/impl/x86/vender_mat_mul.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/vender_mat_mul.h" +#endif + namespace anakin{ namespace saber{ diff --git a/saber/funcs/mean.h b/saber/funcs/mean.h new file mode 100644 index 000000000..4638950e3 --- /dev/null +++ b/saber/funcs/mean.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_MEAN_H +#define ANAKIN_SABER_FUNCS_MEAN_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_mean.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_mean.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_mean.h" +#endif + +#ifdef USE_AMD +#endif + +#ifdef USE_ARM_PLACE +#endif + +#ifdef USE_BM +#endif + +namespace anakin { +namespace saber { + +template +class Mean : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + MeanParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + MeanParam>::BaseFunc; + + Mean() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef MeanParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + CHECK_GT(input[0]->valid_size(), 0) << "[Mean] input's valid_size must over than 0."; + Shape output_shape({1, 1, 1, 1}); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderMean ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberMean ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/mvn.h b/saber/funcs/mvn.h index b74cfe30a..314526552 100644 --- a/saber/funcs/mvn.h +++ b/saber/funcs/mvn.h @@ -22,6 +22,9 @@ #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_mvn.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_mvn.h" +#endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_mvn.h" diff --git a/saber/funcs/normalize.h b/saber/funcs/normalize.h index c56052e08..c200eef07 100644 --- a/saber/funcs/normalize.h +++ b/saber/funcs/normalize.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to 
in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_NORMALIZE_H @@ -28,6 +28,13 @@ #include "saber/funcs/impl/x86/saber_normalize.h" #endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_normalize.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_normalize.h" +#endif /* #ifdef AMD_GPU #include "saber/funcs/impl/impl_normalize.h" @@ -52,7 +59,7 @@ class Normalize : public BaseFunc< NormalizeParam>::BaseFunc; Normalize() = default; - + typedef Tensor InDataTensor; typedef Tensor OutDataTensor; typedef Tensor OpTensor; @@ -61,7 +68,7 @@ class Normalize : public BaseFunc< typedef std::vector Output_v; typedef std::vector Shape_v; - + virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ Param_t& param) override { diff --git a/saber/funcs/one_hot.h b/saber/funcs/one_hot.h new file mode 100644 index 000000000..0e7c7ba60 --- /dev/null +++ b/saber/funcs/one_hot.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_ONE_HOT_H +#define ANAKIN_SABER_FUNCS_ONE_HOT_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_one_hot.h" + +#ifdef USE_CUDA +#include "saber/funcs/impl/cuda/saber_one_hot.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_one_hot.h" +#endif + +namespace anakin { +namespace saber { + +template +class OneHot : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + OneHotParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + OneHotParam>::BaseFunc; + + OneHot() = default; + + virtual SaberStatus compute_output_shape( + const std::vector*> &input, + std::vector*> &output, + OneHotParam ¶m) override { + + CHECK_GE(input[0]->dims(), 2) << "Rank should greater than 1 "; + CHECK_EQ(input[0]->valid_shape()[input[0]->dims() - 1], 1) + << "last dim must be 1!!"; + + int depth = param.depth; + + CHECK_GT(depth, 0) << "depth should greater than 0"; + + Shape out_shape = input[0]->valid_shape(); + out_shape[out_shape.dims() - 1] = depth; + + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape_without_layout(out_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderOneHot ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberOneHot ); + return SaberSuccess; + default: + return SaberUnImplError; + } + } +private: + + virtual void pick_best_static() override { + if (true) // some condition? 
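+            // No selection heuristic is implemented yet; the first (and only)
+            // registered implementation is always chosen as the static best.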
+ this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; +} +} +#endif diff --git a/saber/funcs/pad.h b/saber/funcs/pad.h index 4257f33e0..c7a99f47a 100644 --- a/saber/funcs/pad.h +++ b/saber/funcs/pad.h @@ -24,6 +24,14 @@ #include "saber/funcs/impl/cuda/saber_pad.h" #endif +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_pad.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_pad.h" +#endif + namespace anakin { namespace saber { diff --git a/saber/funcs/pad2d.h b/saber/funcs/pad2d.h new file mode 100644 index 000000000..1dede3fd7 --- /dev/null +++ b/saber/funcs/pad2d.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#ifndef ANAKIN_SABER_FUNCS_PAD2D_H +#define ANAKIN_SABER_FUNCS_PAD2D_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_pad2d.h" +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_pad2d.h" +#endif + +#if defined USE_X86_PLACE || defined BUILD_LITE +#include "saber/funcs/impl/impl_pad2d.h" +#endif + +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_pad2d.h" +#endif + +namespace anakin { +namespace saber { + +template +class Pad2D : public BaseFunc { +public: + using BaseFunc::BaseFunc; + + Pad2D() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef Pad2DParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + int out_h = input[0]->height() + param._pad_h[0] + param._pad_h[1]; + int out_w = input[0]->width() + param._pad_w[0] + param._pad_w[1]; + Shape output_shape({input[0]->num(), input[0]->channel(), out_h, out_w}); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderPad2D ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberPad2D ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) // some condition? 
+ this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + + +#endif diff --git a/saber/funcs/permute.h b/saber/funcs/permute.h index 966f84f28..97ca16c02 100644 --- a/saber/funcs/permute.h +++ b/saber/funcs/permute.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_PERMUTE_H @@ -27,9 +27,15 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_permute.h" #endif + #ifdef USE_ARM_PLACE #include "saber/funcs/impl/arm/saber_permute.h" #endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_permute.h" +#endif + namespace anakin { namespace saber { diff --git a/saber/funcs/permute_power.h b/saber/funcs/permute_power.h index 0f5951b09..71eea7b01 100644 --- a/saber/funcs/permute_power.h +++ b/saber/funcs/permute_power.h @@ -24,6 +24,9 @@ #include "saber/funcs/impl/cuda/saber_permute_power.h" #include "saber/funcs/impl/cuda/vender_permute_power.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_permute_power.h" +#endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_permute_power.h" #endif diff --git a/saber/funcs/pixel_shuffle.h b/saber/funcs/pixel_shuffle.h new file mode 100644 index 000000000..244bf65dc --- /dev/null +++ b/saber/funcs/pixel_shuffle.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_PIXEL_SHUFFLE_H +#define ANAKIN_SABER_FUNCS_PIXEL_SHUFFLE_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_pixel_shuffle.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_pixel_shuffle.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_pixel_shuffle.h" +#endif + +namespace anakin { +namespace saber { + +template +class PixelShuffle : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + PixelShuffleParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + PixelShuffleParam>::BaseFunc; + + PixelShuffle() = default; + + typedef PixelShuffleParam Param_t; + typedef std::vector *> Input_v; + typedef std::vector *> Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, Output_v& output, \ + Param_t& param) override { + + int rh = param.rh; + int rw = param.rw; + + Shape in_shape = input[0]->valid_shape(); + Shape out_shape = in_shape; + int in_c = in_shape.channel(); + CHECK_EQ(in_c%(rw*rh), 0) << "input channel must mod rw*rh to 0"; + + int oc = in_c/(rw*rh); + int oh = in_shape.height() * rh; + int ow = in_shape.width() * rw; + + + if (param.channel_first){ + out_shape[1] = oc; + out_shape[2] = oh; + out_shape[3] = ow; + } else { + out_shape[1] = oh; + out_shape[2] = ow; + out_shape[3] = oc; + } + + return output[0] -> set_shape(out_shape); + + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderPixelShuffle ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberPixelShuffle ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) // some condition? + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} +} + +#endif //ANAKIN_SABER_FUNCS_PERMUTE_H diff --git a/saber/funcs/pooling.h b/saber/funcs/pooling.h index f004e7923..d70921376 100644 --- a/saber/funcs/pooling.h +++ b/saber/funcs/pooling.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_POOLING_H @@ -28,8 +28,14 @@ #include "saber/funcs/impl/x86/saber_pooling.h" #endif #ifdef USE_ARM_PLACE -#include "saber/funcs/impl/impl_pooling.h" +#include "saber/funcs/impl/arm/saber_pooling.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_pooling.h" +#include "saber/funcs/impl/amd/include/vender_pooling.h" #endif + namespace anakin { namespace saber { @@ -83,12 +89,20 @@ class Pooling : public BaseFunc< param.window_h = in_height; param.window_w = in_width; } else { + // printf("param.cmp_out_shape_floor_as_conv: %d \n", param.cmp_out_shape_floor_as_conv); if (param.cmp_out_shape_floor_as_conv) { out_height = static_cast((static_cast( in_height + 2 * pad_h - window_h) / stride_h)) + 1; out_width = static_cast((static_cast( in_width + 2 * pad_w - window_w) / stride_w)) + 1; + //onnx_pooling (pad_left + pad_right, pad_top + pad_bot) + if (out_height <= 0){ + out_height = 1; + } + if (out_width <= 0){ + out_width = 1; + } } else { out_height = static_cast(ceilf(static_cast( in_height + 2 * pad_h - window_h) / stride_h)) + 1; @@ -107,27 +121,24 @@ class Pooling : public BaseFunc< } } - int height_idx = input[0]->height_index(); - int width_idx = input[0]->width_index(); - - output_shape[height_idx] = out_height; - output_shape[width_idx] = out_width; + output_shape.set_height(out_height); + output_shape.set_width(out_width); - return output[0]->set_shape(output_shape); + return output[0]->set_shape_without_layout(output_shape); } virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: + switch (implenum) { + case VENDER_IMPL: this->_impl.push_back(new VenderPooling ); - return SaberSuccess; - case SABER_IMPL: - this->_impl.push_back(new SaberPooling ); - return SaberSuccess; - default: + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberPooling ); + return SaberSuccess; + default: return SaberUnImplError; - } + } } private: diff --git a/saber/funcs/pooling_with_index.h b/saber/funcs/pooling_with_index.h index b3315a0f7..cb0d3027c 100644 --- a/saber/funcs/pooling_with_index.h +++ b/saber/funcs/pooling_with_index.h @@ -20,6 +20,10 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_pooling_with_index.h" +#endif #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_pooling_with_index.h" #endif diff --git a/saber/funcs/power.h b/saber/funcs/power.h index cb7337569..22612abe6 100644 --- a/saber/funcs/power.h +++ b/saber/funcs/power.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_POWER_H @@ -22,10 +22,16 @@ #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_power.h" #endif - +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_power.h" +#endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_power.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_power.h" +#endif + namespace anakin { namespace saber { diff --git a/saber/funcs/priorbox.h b/saber/funcs/priorbox.h index 14b205bff..60cc2501d 100644 --- a/saber/funcs/priorbox.h +++ b/saber/funcs/priorbox.h @@ -65,7 +65,7 @@ class PriorBox : public BaseFunc< SaberStatus compute_priorbox_kernel(const Input_v& input, Output_v& output, Param_t& param) { - LOG(INFO) << "input tensor size: " << input.size(); + DLOG(INFO) << "input tensor size: " << input.size(); unsigned long long out_size = output[0]->valid_size(); if (_cpu_data == nullptr) { @@ -128,9 +128,9 @@ class PriorBox : public BaseFunc< for (int p = 0; p < density_; ++p) { for (int c = 0; c < density_; ++c) { // liu@20171207 changed to fix density bugs at anchor = 64 - float center_x_temp = center_x - step_average / 2 + \ + float center_x_temp = center_x - step_average / 2.0f + \ shift / 2.f + c * shift; - float center_y_temp = center_y - step_average / 2 + \ + float center_y_temp = center_y - step_average / 2.0f + \ shift / 2.f + p * shift; //float center_x_temp = center_x - fixed_size_ / 2 + shift/2. + c*shift; //float center_y_temp = center_y - fixed_size_ / 2 + shift/2. + r*shift; @@ -159,8 +159,8 @@ class PriorBox : public BaseFunc< for (int r = 0; r < density_; ++r) { for (int c = 0; c < density_; ++c) { - float center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift; - float center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift; + float center_x_temp = center_x - fixed_size_ / 2.f + shift / 2.f + c * shift; + float center_y_temp = center_y - fixed_size_ / 2.f + shift / 2.f + r * shift; // xmin _cpu_data[idx++] = (center_x_temp - box_width / 2.f) / img_width >= 0 ? \ (center_x_temp - box_width / 2.f) / img_width : 0 ; @@ -193,8 +193,8 @@ class PriorBox : public BaseFunc< for (int p = 0; p < density_; ++p) { for (int c = 0; c < density_; ++c) { - float center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift; - float center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + p * shift; + float center_x_temp = center_x - fixed_size_ / 2.f + shift / 2.f + c * shift; + float center_y_temp = center_y - fixed_size_ / 2.f + shift / 2.f + p * shift; // xmin _cpu_data[idx++] = (center_x_temp - box_width_ratio / 2.f) / img_width >= 0 ? \ (center_x_temp - box_width_ratio / 2.f) / img_width : 0 ; diff --git a/saber/funcs/product_quant_embedding_with_vsum.h b/saber/funcs/product_quant_embedding_with_vsum.h new file mode 100644 index 000000000..f5db20821 --- /dev/null +++ b/saber/funcs/product_quant_embedding_with_vsum.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_SABER_FUNCS_PRODUCT_QUANT_EMBEDDING_WITH_VSUM_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_product_quant_embedding_with_vsum.h" + +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_product_quant_embedding_with_vsum.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_product_quant_embedding_with_vsum.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_product_quant_embedding_with_vsum.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_product_quant_embedding_with_vsum.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_product_quant_embedding_with_vsum.h" +#endif + +namespace anakin { +namespace saber { + +template +class ProductQuantEmbeddingWithVsum : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + ProductQuantEmbeddingWithVsumParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + ProductQuantEmbeddingWithVsumParam>::BaseFunc; + + ProductQuantEmbeddingWithVsum() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ProductQuantEmbeddingWithVsumParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + auto offset = input[0]->get_seq_offset()[0]; + int seq_num = offset.size() - 1; + std::vector out_offset; + for (int i = 0; i < seq_num; i++) { + out_offset.push_back(i); + } + out_offset.push_back(seq_num); + std::vector> out_offsets = {out_offset}; + output[0]->set_seq_offset(out_offsets); + + Shape output_shape({seq_num, param.word_emb, 1, 1}, Layout_NCHW); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderProductQuantEmbeddingWithVsum ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberProductQuantEmbeddingWithVsum ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/ps_roi_pooling.h b/saber/funcs/ps_roi_pooling.h new file mode 100644 index 000000000..58f178fca --- /dev/null +++ b/saber/funcs/ps_roi_pooling.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_PS_ROI_POOLING_H +#define ANAKIN_SABER_FUNCS_PS_ROI_POOLING_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_ps_roi_pooling.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_ps_roi_pooling.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_ps_roi_pooling.h" +#endif +namespace anakin { +namespace saber { + +template +class PsRoiPool : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + PsRoiPoolParam> +{ +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + PsRoiPoolParam >::BaseFunc; + + PsRoiPool() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef PsRoiPoolParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, \ + Output_v &output, Param_t& param) override { + + CHECK_GE(input.size(), 2) << "psroipooling input must equal or greater than 2"; + + Shape in_sh = input[0]->valid_shape(); + int rois_num = input[1]->num(); + Shape out_sh = in_sh; + + int size = param.pooled_width * param.pooled_height; + CHECK_EQ(in_sh.channel()%size, 0); + + int new_c = in_sh.channel() / size; + + if (!param.global_pooling){ + out_sh.set_width(param.pooled_width); + out_sh.set_height(param.pooled_height); + } else { + out_sh.set_width(1); + out_sh.set_height(1); + } + out_sh.set_channel(new_c); + out_sh.set_num(rois_num); + + return output[0]->set_shape(out_sh); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderPsRoiPool ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberPsRoiPool ); + return SaberSuccess; + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + if (true) // some condition? + this->_best_impl = this->_impl[0]; + } + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} + +} + +#endif //ANAKIN_SABER_FUNCS_CROP_H diff --git a/saber/funcs/pyramid_hash_quant_embedding_with_vsum.h b/saber/funcs/pyramid_hash_quant_embedding_with_vsum.h new file mode 100644 index 000000000..001599110 --- /dev/null +++ b/saber/funcs/pyramid_hash_quant_embedding_with_vsum.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H +#define ANAKIN_SABER_FUNCS_PYRAMID_HASH_QUANT_EMBEDDING_WITH_VSUM_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_pyramid_hash_quant_embedding_with_vsum.h" + +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_pyramid_hash_quant_embedding_with_vsum.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_pyramid_hash_quant_embedding_with_vsum.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_pyramid_hash_quant_embedding_with_vsum.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_pyramid_hash_quant_embedding_with_vsum.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_pyramid_hash_quant_embedding_with_vsum.h" +#endif + + +namespace anakin { +namespace saber { + +template +class PyramidHashQuantEmbeddingWithVsum : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + PyramidHashQuantEmbeddingParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + PyramidHashQuantEmbeddingParam>::BaseFunc; + + PyramidHashQuantEmbeddingWithVsum() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef PyramidHashQuantEmbeddingParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + int seq_num = input[0]->get_seq_offset()[0].size() - 1; + Shape output_shape({seq_num, param.emb_size, 1, 1}, Layout_NCHW); + std::vector offset; + for (int i = 0; i < seq_num; i++) { + offset.push_back(i); + } + offset.push_back(seq_num); + std::vector> out_offset; + out_offset.push_back(offset); + output[0]->set_seq_offset(out_offset); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderPyramidHashQuantEmbeddingWithVsum ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberPyramidHashQuantEmbeddingWithVsum ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/reduce.h b/saber/funcs/reduce.h new file mode 100644 index 000000000..4b91548e4 --- /dev/null +++ b/saber/funcs/reduce.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_REDUCE_H +#define ANAKIN_SABER_FUNCS_REDUCE_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_reduce.h" + +#ifdef USE_CUDA +#include "saber/funcs/impl/cuda/saber_reduce.h" +#include "saber/funcs/impl/cuda/vender_reduce.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_reduce.h" +#endif + +namespace anakin { +namespace saber { + +template +class Reduce : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + ReduceParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + ReduceParam>::BaseFunc; + + Reduce() = default; + + virtual SaberStatus compute_output_shape( + const std::vector*>& input, + std::vector*> &output, + ReduceParam ¶m) override { + Shape input_shape = input[0]->valid_shape(); + int input_dim = input_shape.size(); +// LOG(INFO) <<"input.valid.size:"<valid_size(); + + int reduce_dim = param.reduce_dim.size(); + //The dim we want to reduce is not empty. + if (param.reduce_all) { + // CHECK IF reduce dim size is legal + // I hope parser has handle this for saber, + // if not, saber will re-write reduce_dim + if (param.reduce_dim.size() != input_dim) { + param.reduce_dim.clear(); + for (int i = 0; i < input_dim; ++i) { + param.reduce_dim.push_back(i); + } + } + // check keep dim ? + std::vector temp_shape(input_dim, 1); + Shape out_shape(temp_shape); + return output[0]->set_shape(out_shape); + } else { + //Check valid reduce dim. + Shape output_shape(input[0]->valid_shape()); + CHECK_LT(reduce_dim, input_dim) << "[reduce_min]reduce_dim's size must less than input's!!!"; + int tmp_dim; + for (int i = 0; i < reduce_dim; i++) { + if (param.reduce_dim[i] < 0) { + tmp_dim = param.reduce_dim[i] + input_dim; + CHECK_GE(tmp_dim, 0) << "[reduce_min] invalid reduce_dim!!!"; + CHECK_LT(tmp_dim, input_dim) << "[reduce_min]invalid reduce_dim!!!"; + output_shape[tmp_dim] = 1; //The dimention tmp_dim is to reduce dimention. + }else { + CHECK_LT(param.reduce_dim[i], input_dim) << "[reduce_min]invalid reduce_dim!!!"; + output_shape[param.reduce_dim[i]] = 1; + } + //output_shape[param.reduce_dim[i]] = 1; + } + return output[0]->set_shape(output_shape); + } + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderReduce ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberReduce ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/reduce_min.h b/saber/funcs/reduce_min.h new file mode 100644 index 000000000..36b56df8d --- /dev/null +++ b/saber/funcs/reduce_min.h @@ -0,0 +1,141 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_REDUCE_MIN_H +#define ANAKIN_SABER_FUNCS_REDUCE_MIN_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_reduce_min.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_reduce_min.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_reduce_min.h" +#endif + +#ifdef USE_AMD +#endif + +#ifdef USE_ARM_PLACE +#endif + +#ifdef USE_BM +#endif + +namespace anakin { +namespace saber { + +template +class ReduceMin : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + ReduceMinParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + ReduceMinParam>::BaseFunc; + + ReduceMin() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef ReduceMinParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + Shape input_shape = input[0]->valid_shape(); + int input_dim = input_shape.size(); + // int real_dim = 0; + // //Count with the real_dim that wanted to be reduced. + // for (int i = 0; i < input_dim; ++i) { + // if (input_shape[i] != 1) { + // ++real_dim; + // } + // } + LOG(INFO) <<"input.valid.size:"<valid_size(); + Shape output_shape(input[0]->valid_shape()); + int reduce_dim = param.reduce_dim.size(); + //The dim we want to reduce is not empty. + if (reduce_dim != 0) { + //Check valid reduce dim. + CHECK_LT(reduce_dim, input_dim) << "[reduce_min]reduce_dim's size must less than input's!!!"; + int tmp_dim; + for (int i = 0; i < reduce_dim; i++) { + if (param.reduce_dim[i] < 0) { + tmp_dim = param.reduce_dim[i] + input_dim; + CHECK_GE(tmp_dim, 0) << "[reduce_min] invalid reduce_dim!!!"; + CHECK_LT(tmp_dim, input_dim) << "[reduce_min]invalid reduce_dim!!!"; + output_shape[tmp_dim] = 1; //The dimention tmp_dim is to reduce dimention. + }else { + CHECK_LT(param.reduce_dim[i], input_dim) << "[reduce_min]invalid reduce_dim!!!"; + output_shape[param.reduce_dim[i]] = 1; + } + //output_shape[param.reduce_dim[i]] = 1; + } + }else { + //Default to reduce all dimensions to a single value. 
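+            // e.g. with reduce_dim = {1, 3} on a {2, 3, 4, 5} input, the branch above keeps
+            // {2, 1, 4, 1}, and the keep_dim == false path below flattens it to {8, 1, 1, 1};
+            // here, with no reduce_dim at all, everything collapses to a single element.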
+ output_shape = Shape({1, 1, 1, 1}); + } + if (!param.keep_dim) { + int size = output_shape.count(); + output_shape = Shape({size, 1, 1, 1}); + } + + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderReduceMin ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberReduceMin ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/reshape.h b/saber/funcs/reshape.h index f88d4eb17..21e25d5c2 100644 --- a/saber/funcs/reshape.h +++ b/saber/funcs/reshape.h @@ -75,7 +75,7 @@ class Reshape : public BaseFunc< if (infer_axis >= 0){ output_shape[infer_axis] = valid_size / count_axis; } - + output[0]->set_seq_offset(input[0]->get_seq_offset()); return output[0] -> set_shape(output_shape); } //Reshape ops do nothing diff --git a/saber/funcs/resize.h b/saber/funcs/resize.h old mode 100755 new mode 100644 index 26610c480..4321dfe71 --- a/saber/funcs/resize.h +++ b/saber/funcs/resize.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_RESIZE_H @@ -27,13 +27,12 @@ #include "saber/funcs/impl/x86/saber_resize.h" #endif -#ifdef AMD_GPU +#ifdef AMD_GPU #include "saber/funcs/impl/impl_resize.h" #endif #ifdef USE_ARM_PLACE -//todo -#include "saber/funcs/impl/impl_resize.h" +#include "saber/funcs/impl/arm/saber_resize.h" #endif namespace anakin{ @@ -76,6 +75,9 @@ class Resize : public BaseFunc< CHECK_GE(height_idx, 0) << "no height dim in tensor"; CHECK_GE(width_idx, 0) << "no width dim in tensor"; + bool has_out_wh = (param.out_width != -1) && (param.out_height != -1); + bool has_scale_wh = (param.width_scale > 0.f) && (param.height_scale > 0.f); + CHECK_EQ(has_out_wh || has_scale_wh, true) << "resize param must has either scale_w/scale_h or out_w/out_h"; if (num_idx > -1) { output_shape[num_idx] = input[0]->num(); // N } @@ -83,11 +85,21 @@ class Resize : public BaseFunc< output_shape[channel_idx] = input[0]->channel(); // C } if (height_idx > -1) { - int height = floor(input[0]->height() * param.height_scale); // H + int height = 0; + if (param.out_height != -1){ + height = param.out_height; + } else { + height = floor(input[0]->height() * param.height_scale); // H + } output_shape[height_idx] = height; } if (width_idx > -1) { - int width = floor(input[0]->width() * param.width_scale); //W + int width = 0; + if (param.out_width != -1){ + width = param.out_width; + } else { + width = floor(input[0]->width() * param.width_scale); //W + } output_shape[width_idx] = width; } @@ -95,19 +107,19 @@ class Resize : public BaseFunc< } virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - //return SaberUnImplError; + switch (implenum) { + case VENDER_IMPL: + //return SaberUnImplError; this->_impl.push_back(new VenderResize); return SaberSuccess; - case SABER_IMPL: + case SABER_IMPL: this->_impl.push_back(new SaberResize); return SaberSuccess; - default: - return SaberUnImplError; - } + default: + return SaberUnImplError; + } }; private: diff --git a/saber/funcs/roi_align.h b/saber/funcs/roi_align.h new file mode 100644 index 000000000..950e44b5a --- /dev/null +++ b/saber/funcs/roi_align.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_ROI_ALIGN_H +#define ANAKIN_SABER_FUNCS_ROI_ALIGN_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_roi_align.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_roi_align.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_roi_align.h" +#endif + +#ifdef USE_AMD +#endif + +#ifdef USE_ARM_PLACE +#endif + +#ifdef USE_BM +#endif + +namespace anakin { +namespace saber { + +template +class RoiAlign : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + RoiAlignParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + RoiAlignParam>::BaseFunc; + + RoiAlign() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef RoiAlignParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + //input[1] is roi. + Shape output_shape = input[0]->valid_shape(); + CHECK_EQ(input.size(), 2) << " input's size must be 2."; + int num_index = input[0]->num_index(); + int height_index = input[0]->height_index(); + int width_index = input[0]->width_index(); + + output_shape[num_index] = input[1]->num(); + output_shape[height_index] = param.pooled_height; + output_shape[width_index] = param.pooled_width; + + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderRoiAlign ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberRoiAlign ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/roi_pooling.h b/saber/funcs/roi_pooling.h index 63078202e..9940c9f4d 100644 --- a/saber/funcs/roi_pooling.h +++ b/saber/funcs/roi_pooling.h @@ -10,7 +10,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_ROI_POOL_H @@ -20,6 +20,9 @@ #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_roi_pooling.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_roi_pool.h" +#endif #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_roi_pool.h" #endif diff --git a/saber/funcs/saber_util.h b/saber/funcs/saber_util.h index ac2d22b9d..9d8aedff2 100644 --- a/saber/funcs/saber_util.h +++ b/saber/funcs/saber_util.h @@ -1,13 +1,14 @@ #ifndef ANAKIN_SABER_FUNCS_IMPL_SABER_UTIL_H #define ANAKIN_SABER_FUNCS_IMPL_SABER_UTIL_H +#include +#include "saber/core/common.h" +#include "saber/core/tensor.h" +#include "saber/core/shape.h" namespace anakin { namespace saber { namespace utils { -#include "saber/core/common.h" -#include "saber/core/tensor.h" -#include "saber/core/shape.h" template static inline bool try_expand_tensor(opTensor& x, anakin::saber::Shape shape) { @@ -15,6 +16,7 @@ static inline bool try_expand_tensor(opTensor& x, anakin::saber::Shape shape) { x.re_alloc(shape, x.get_dtype()); return true; } + return false; } @@ -24,19 +26,611 @@ static inline bool try_expand_tensor(opTensor& x, int size) { anakin::saber::Shape shape({1, 1, 1, size}, Layout_NCHW); return try_expand_tensor(x, shape); } + return false; } template -static inline void transpose(const DataType* in,int height,int width,DataType*out){ - for(int i=0;i + static void print_float(Dtype* target) { + float* f = (float*)target; + printf("size = %d\n", sizeof(Dtype)); + + for (int i = 0; i < sizeof(Dtype) / sizeof(float); i++) { + printf(" %f ,", f[i]); + } + + printf("\n"); + } +}; + +class AlignedUtils { +public: + template + void aligned_last_dim(const Dtype* input, Dtype* output, int input_size, int ori_last_dim, + int aligned_dim) { + for (int row = 0; row < input_size / ori_last_dim; row++) { + for (int col = ori_last_dim; col < aligned_dim; col++) { + output[row * aligned_dim + col] = static_cast(0); + } + } + + for (int i = 0; i < input_size; i++) { + int row = i / ori_last_dim; + int col = i % ori_last_dim; + output[row * aligned_dim + col] = input[i]; + } + } + template + void unaligned_last_dim(const Dtype* input, Dtype* output, int output_size, int ori_last_dim, + int aligned_dim) { + for (int i = 0; i < output_size; i++) { + int row = i / ori_last_dim; + int col = i % ori_last_dim; + output[i] = input[row * aligned_dim + col]; + } + } + +}; + +class SeqSortedseqTranseUtil { +public: + SeqSortedseqTranseUtil(bool is_reverse = false, bool is_bi = false) + : _is_reverse(is_reverse), + _is_bi(is_bi) {}; + void print_vec(int* in, int size, const char* perfix) { + for (int i = 0; i < size; i++) { + printf("[%s] %d = %d\n", perfix, i, in[i]); + } + } + template + void seq_2_sorted_seq(const Dtype* input, Dtype* output, int word_size) { + // _map_vec.resize(word_sum); + int word_sum = _map_vec.size(); + // std::cout << "word_sum = " << word_sum << std::endl; + + for (int ori_word_id = 0; ori_word_id < word_sum; ++ori_word_id) { + //can param + int word_start = ori_word_id * word_size; + int maped_id = _map_vec[ori_word_id]; + int maped_start = maped_id * word_size; + + for (int word_vec_offset = 0; word_vec_offset < word_size; ++word_vec_offset) { + // std::cout< "< "< "< + void sorted_seq_2_seq(const Dtype* input, Dtype* output, int hidden_size, + int alligned_hidden_size) { + int word_sum = _map_vec.size(); + + for (int ori_word_id = 0; ori_word_id < word_sum; ori_word_id++) { + //can param + int word_start = ori_word_id * hidden_size; + int maped_id = _map_vec[ori_word_id]; 
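+            // _map_vec translates the original word index into its row in the
+            // length-sorted buffer, whose rows are padded out to alligned_hidden_size.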
+ int maped_start = maped_id * alligned_hidden_size; + + for (int word_vec_offset = 0; word_vec_offset < hidden_size; word_vec_offset++) { + // std::cout< "<& offset_vec, + std::vector& emit_offset_vec, int& emit_length, int skip_num = 0) { + int batch_size = offset_vec.size() - 1; + int word_sum = offset_vec[offset_vec.size() - 1]; + std::vectorlength_vec(batch_size); + _length_index.resize(batch_size); + + if (skip_num > 1) { + CHECK_EQ(batch_size, 1) << "only support batch = 1 in skip_mode"; + CHECK_EQ(word_sum % skip_num, 0); + int real_batch_size = skip_num; + emit_length = word_sum / skip_num; + emit_offset_vec.resize(emit_length + 1); + emit_offset_vec[0] = 0; + + for (int i = 1; i <= emit_length; i++) { + emit_offset_vec[i] = emit_offset_vec[i - 1] + skip_num; + } + + return false; + } + + if (batch_size == 1) { + emit_length = offset_vec[1] - offset_vec[0]; + emit_offset_vec.resize(emit_length + 1); + + for (int i = 0; i <= emit_length; i++) { + emit_offset_vec[i] = i; + } + + return false; + } + + int max_len = 0; + + for (int i = 0; i < offset_vec.size() - 1; ++i) { + int len = offset_vec[i + 1] - offset_vec[i]; + max_len = max_len > len ? max_len : len; + length_vec[i] = len; + _length_index[i] = i; + } + + emit_length = max_len; + + if (max_len == 1) { + emit_offset_vec.push_back(0); + emit_offset_vec.push_back(emit_length * batch_size); + return false; + } + + std::sort(_length_index.begin(), _length_index.end(), [&length_vec](int i1, int i2) { + return length_vec[i1] > length_vec[i2]; + }); + + emit_offset_vec.resize(max_len + 1); + _map_vec.resize(word_sum); + + int target_word_id = 0; + std::vector length_vec_cnt = length_vec; + int last_batch_size = batch_size; + + for (int word_id_in_seq = 0; word_id_in_seq < max_len; word_id_in_seq++) { + emit_offset_vec[word_id_in_seq] = target_word_id; + + for (int batch_id = 0; batch_id < last_batch_size; batch_id++) { + int old_batch_id = _length_index[batch_id]; + + if (length_vec_cnt[old_batch_id] > 0) { + int inner_word_id_in_seq = word_id_in_seq; + + if (_is_reverse) { + inner_word_id_in_seq = length_vec[old_batch_id] - 1 - word_id_in_seq; + } + + int old_word_id = offset_vec[old_batch_id] + inner_word_id_in_seq; + _map_vec[old_word_id] = target_word_id; + // printf("map %d -> %d\n",old_word_id,target_word_id); + length_vec_cnt[old_batch_id]--; + target_word_id++; + } else { + last_batch_size--; + break; + } + } } + + // print_vec(_map_vec.data(),word_sum,"map"); + emit_offset_vec[max_len] = word_sum; + return true; } + + +private: + // std::vector _length_vec; + std::vector _length_index; + std::vector _map_vec; + bool _is_reverse; + bool _is_bi; + +}; + +/* analogue std::conditional */ +template struct conditional {}; +template struct conditional { + typedef T type; +}; +template struct conditional { + typedef F type; +}; + +template struct conditional3 {}; +template +struct conditional3 { + typedef T type; +}; +template +struct conditional3 { + typedef FT type; +}; +template +struct conditional3 { + typedef FF type; +}; + +template struct conditional_v {}; +template struct conditional_v { + static constexpr U value = t; +}; +template struct conditional_v { + static constexpr U value = f; +}; + +template +inline const T& min(const T& a, const T& b) { + return a < b ? a : b; } +template +inline const T& max(const T& a, const T& b) { + return a > b ? 
a : b; +} + +template +inline typename std::remove_reference::type zero() { + auto zero = typename std::remove_reference::type(); + return zero; +} +template +inline bool everyone_is(T val, P item) { + return val == item; +} +template +inline bool everyone_is(T val, P item, Args... item_others) { + return val == item && everyone_is(val, item_others...); +} + +template +inline bool one_of(T val, P item) { + return val == item; +} +template +inline bool one_of(T val, P item, Args... item_others) { + return val == item || one_of(val, item_others...); +} + +template +inline bool any_null(Args... ptrs) { + return one_of(nullptr, ptrs...); +} + +inline bool implication(bool cause, bool effect) { + return !cause || effect; +} + +template +inline void array_copy(T* dst, const T* src, size_t size) { + for (size_t i = 0; i < size; ++i) { + dst[i] = src[i]; + } +} + +template +inline bool array_cmp(const T* a1, const T* a2, size_t size) { + for (size_t i = 0; i < size; ++i) if (a1[i] != a2[i]) { + return false; + } + + return true; +} + +template +inline void array_set(T* arr, const U& val, size_t size) { + for (size_t i = 0; i < size; ++i) { + arr[i] = static_cast(val); + } +} + +namespace product_impl { + +template struct int2type {}; + +template +constexpr int product_impl(const T* arr, int2type<0>) { + return arr[0]; +} + +template +inline T product_impl(const T* arr, int2type) { + return arr[0] * product_impl(arr + 1, int2type < num - 1 > ()); +} +} + +template +inline T array_product(const T* arr) { + return product_impl::product_impl(arr, product_impl::int2type < num - 1 > ()); +} + +template +inline R array_product(const T* arr, size_t size) { + R prod = 1; + + for (size_t i = 0; i < size; ++i) { + prod *= arr[i]; + } + + return prod; +} + +template +inline typename std::remove_reference::type div_up(const T a, const U b) { + assert(b); + return (a + b - 1) / b; +} + +template +inline typename std::remove_reference::type rnd_up(const T a, const U b) { + return div_up(a, b) * b; +} + +template +inline typename std::remove_reference::type rnd_dn(const T a, const U b) { + return (a / b) * b; +} + +template +inline U this_block_size(const T offset, const U max, const V block_size) { + assert(offset < max); + // TODO (Roma): can't use nstl::max() due to circular dependency... we + // need to fix this + const T block_boundary = offset + block_size; + + if (block_boundary > max) { + return max - offset; + } else { + return block_size; + } +} + +template +struct array_offset_calculator { + template + array_offset_calculator(Telem* base, Targs... Fargs) : _dims{ Fargs... } { + _base_ptr = base; + } + + template + inline Telem& operator()(Targs... Fargs) { + return *(_base_ptr + _offset(1, Fargs...)); + } + +private: + template + inline size_t _offset(size_t const dimension, size_t element) { + return element; + } + + template + inline size_t _offset(size_t const dimension, size_t theta, size_t element) { + return element + (_dims[dimension] * theta); + } + + template + inline size_t _offset(size_t const dimension, size_t theta, size_t element, + Targs... 
Fargs) { + size_t t_prime = element + (_dims[dimension] * theta); + return _offset(dimension + 1, t_prime, Fargs...); + } + + Telem* _base_ptr; + const int _dims[Tdims]; +}; + +}//fin utils namespace + +template struct is_integral { + static constexpr bool value = false; +}; + +template<> struct is_integral { + static constexpr bool value = true; +}; +template<> struct is_integral { + static constexpr bool value = true; +}; +template<> struct is_integral { + static constexpr bool value = true; +}; +template<> struct is_integral { + static constexpr bool value = true; +}; + +template +inline typename std::enable_if < !is_integral::value, + typename std::remove_reference::type >::type +saturate(const acc_t& x) { + return x; +} + +template +inline typename std::enable_if::value, + typename std::remove_reference::type>::type +saturate(const acc_t& x) { + acc_t v = x; + + if (v < (acc_t)std::numeric_limits::lowest()) { + v = (acc_t)std::numeric_limits::lowest(); + } + + if (v > (acc_t)std::numeric_limits::max()) { + v = (acc_t)std::numeric_limits::max(); + } + + return (typename std::remove_reference::type)v; +} + +template +inline out_t round_and_saturate(float f, round_mode rmode) { + switch (rmode) { + case round_mode::nearest: + f = nearbyintf(f); + break; + + case round_mode::down: + f = floorf(f); + break; + } + + return saturate(f); +} + +/* Quantization with beta == 0 */ +template struct qz_b0 { + out_t operator()(in_t in, float alpha, round_mode rmode) { + return round_and_saturate(alpha * in, rmode); + } +}; + +inline size_t datatype_size(DataType data_type) { + switch (data_type) { + case AK_FLOAT: + return sizeof(float); + + case AK_INT32: + return sizeof(int32_t); + + case AK_HALF: + return sizeof(int16_t); + + case AK_INT8: + return sizeof(int8_t); + + case AK_UINT8: + return sizeof(uint8_t); + + case AK_INVALID: + default: + assert(!"unknown data_type"); + } + + return 0; +} + +/** returns floor(log2(v)), aka the position of the leftmost non-0 bit */ +inline int ilog2q(size_t v) { + if (v == 0) { + return -1; + } + + int p = 0; +# define CP(pw) do { if (v >= (1ull << pw)) { v >>= pw; p += pw; } } while(0) + CP(32); + CP(16); + CP(8); + CP(4); + CP(2); + CP(1); +# undef CP + return p; +} + +struct scratchpad_t { + virtual ~scratchpad_t() {} + virtual char* get() const = 0; +}; + +template +inline void balance2D(U nthr, U ithr, T ny, T& ny_start, T& ny_end, + T nx, T& nx_start, T& nx_end, T nx_divider) { + const T grp_size = utils::div_up(nthr, nx_divider); + const T grp_count = utils::div_up(nthr, grp_size); + + T grp = ithr / grp_size; + T grp_ithr = ithr % grp_size; + T grp_nthr = grp_size; + T first_grps = nthr % grp_count; + + if (first_grps > 0 && grp >= first_grps) { + ithr -= first_grps * grp_size; + grp_nthr--; + grp = ithr / grp_nthr + first_grps; + grp_ithr = ithr % grp_nthr; + } + + balance211(nx, grp_count, grp, nx_start, nx_end); + balance211(ny, grp_nthr, grp_ithr, ny_start, ny_end); +} + +template +inline U this_block_size(const T offset, const U max, const V block_size) { + assert(offset < max); + // TODO (Roma): can't use nstl::max() due to circular dependency... we + // need to fix this + const T block_boundary = offset + block_size; + + if (block_boundary > max) { + return max - offset; + } else { + return block_size; + } } } diff --git a/saber/funcs/saturate.h b/saber/funcs/saturate.h new file mode 100644 index 000000000..4b3db19d6 --- /dev/null +++ b/saber/funcs/saturate.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. 
+ Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_SATURATE_H +#define ANAKIN_SABER_FUNCS_SATURATE_H + +#include "saber/core/common.h" +#include +#include +namespace anakin { +namespace saber{ + +template static inline _Tp saturate_cast(uint8_t v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(int8_t v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(uint16_t v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(int16_t v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(uint32_t v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(int32_t v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(float v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(double v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(int64_t v) { return _Tp(v); } +/** @overload */ +template static inline _Tp saturate_cast(uint64_t v) { return _Tp(v); } + +template<> inline uint8_t saturate_cast(int8_t v) { return (uint8_t)std::max((int)v, 0); } +template<> inline uint8_t saturate_cast(uint16_t v) { return (uint8_t)std::min((unsigned)v, (unsigned)UCHAR_MAX); } +template<> inline uint8_t saturate_cast(int v) { return (uint8_t)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } +template<> inline uint8_t saturate_cast(short v) { return saturate_cast((int)v); } +template<> inline uint8_t saturate_cast(unsigned v) { return (uint8_t)std::min(v, (unsigned)UCHAR_MAX); } +template<> inline uint8_t saturate_cast(float v) { int iv = (int)roundf(v); return saturate_cast(iv); } +template<> inline uint8_t saturate_cast(double v) { int iv = (int)round(v); return saturate_cast(iv); } +template<> inline uint8_t saturate_cast(int64_t v) { return (uint8_t)((uint64_t)v <= (uint64_t)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } +template<> inline uint8_t saturate_cast(uint64_t v) { return (uint8_t)std::min(v, (uint64_t)UCHAR_MAX); } + +template<> inline int8_t saturate_cast(uint8_t v) { return (int8_t)std::min((int)v, SCHAR_MAX); } +template<> inline int8_t saturate_cast(uint16_t v) { return (int8_t)std::min((unsigned)v, (unsigned)SCHAR_MAX); } +template<> inline int8_t saturate_cast(int v) { return (int8_t)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } +template<> inline int8_t saturate_cast(short v) { return saturate_cast((int)v); } +template<> inline int8_t saturate_cast(unsigned v) { return (int8_t)std::min(v, (unsigned)SCHAR_MAX); } +template<> inline int8_t saturate_cast(float v) { int iv = (int)roundf(v); return saturate_cast(iv); } +template<> inline int8_t saturate_cast(double v) { int iv = (int)round(v); return saturate_cast(iv); } +template<> inline int8_t saturate_cast(int64_t v) { return (int8_t)((uint64_t)((int64_t)v-SCHAR_MIN) <= (uint64_t)UCHAR_MAX ? v : v > 0 ? 
SCHAR_MAX : SCHAR_MIN); } +template<> inline int8_t saturate_cast(uint64_t v) { return (int8_t)std::min(v, (uint64_t)SCHAR_MAX); } + +template<> inline uint16_t saturate_cast(int8_t v) { return (uint16_t)std::max((int)v, 0); } +template<> inline uint16_t saturate_cast(short v) { return (uint16_t)std::max((int)v, 0); } +template<> inline uint16_t saturate_cast(int v) { return (uint16_t)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); } +template<> inline uint16_t saturate_cast(unsigned v) { return (uint16_t)std::min(v, (unsigned)USHRT_MAX); } +template<> inline uint16_t saturate_cast(float v) { int iv = (int)roundf(v); return saturate_cast(iv); } +template<> inline uint16_t saturate_cast(double v) { int iv = (int)round(v); return saturate_cast(iv); } +template<> inline uint16_t saturate_cast(int64_t v) { return (uint16_t)((uint64_t)v <= (uint64_t)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); } +template<> inline uint16_t saturate_cast(uint64_t v) { return (uint16_t)std::min(v, (uint64_t)USHRT_MAX); } + +template<> inline short saturate_cast(uint16_t v) { return (short)std::min((int)v, SHRT_MAX); } +template<> inline short saturate_cast(int v) { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } +template<> inline short saturate_cast(unsigned v) { return (short)std::min(v, (unsigned)SHRT_MAX); } +template<> inline short saturate_cast(float v) { int iv = (int)roundf(v); return saturate_cast(iv); } +template<> inline short saturate_cast(double v) { int iv = (int)round(v); return saturate_cast(iv); } +template<> inline short saturate_cast(int64_t v) { return (short)((uint64_t)((int64_t)v - SHRT_MIN) <= (uint64_t)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } +template<> inline short saturate_cast(uint64_t v) { return (short)std::min(v, (uint64_t)SHRT_MAX); } + +template<> inline int saturate_cast(unsigned v) { return (int)std::min(v, (unsigned)INT_MAX); } +template<> inline int saturate_cast(int64_t v) { return (int)((uint64_t)(v - INT_MIN) <= (uint64_t)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); } +template<> inline int saturate_cast(uint64_t v) { return (int)std::min(v, (uint64_t)INT_MAX); } +template<> inline int saturate_cast(float v) { return (int)roundf(v); } +template<> inline int saturate_cast(double v) { return (int)round(v); } + +template<> inline unsigned saturate_cast(int8_t v) { return (unsigned)std::max(v, (int8_t)0); } +template<> inline unsigned saturate_cast(short v) { return (unsigned)std::max(v, (short)0); } +template<> inline unsigned saturate_cast(int v) { return (unsigned)std::max(v, (int)0); } +template<> inline unsigned saturate_cast(int64_t v) { return (unsigned)((uint64_t)v <= (uint64_t)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); } +template<> inline unsigned saturate_cast(uint64_t v) { return (unsigned)std::min(v, (uint64_t)UINT_MAX); } +// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc. 
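+// Illustrative sketch (hypothetical helper, not part of the original header and not
+// used elsewhere in this patch): quantizing a float buffer to uint8_t is just a loop
+// over saturate_cast, which rounds to the nearest integer and clamps to [0, 255].
+static inline void saturate_cast_u8_sketch(const float* src, uint8_t* dst, size_t n) {
+    for (size_t i = 0; i < n; ++i) {
+        // goes through the float specialization (roundf) and then the int clamp
+        dst[i] = saturate_cast<uint8_t>(src[i]);
+    }
+}
+// The float/double -> unsigned casts below keep the "no negative clipping"
+// behaviour described in the comment above.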
+template<> inline unsigned saturate_cast(float v) { return static_cast(roundf(v)); } +template<> inline unsigned saturate_cast(double v) { return static_cast(round(v)); } + +template<> inline uint64_t saturate_cast(int8_t v) { return (uint64_t)std::max(v, (int8_t)0); } +template<> inline uint64_t saturate_cast(short v) { return (uint64_t)std::max(v, (short)0); } +template<> inline uint64_t saturate_cast(int v) { return (uint64_t)std::max(v, (int)0); } +template<> inline uint64_t saturate_cast(int64_t v) { return (uint64_t)std::max(v, (int64_t)0); } + +template<> inline int64_t saturate_cast(uint64_t v) { return (int64_t)std::min(v, (uint64_t)LLONG_MAX); } +#if 0 //FP16 +/** @overload */ +template static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); } + +// in theory, we could use a LUT for 8u/8s->16f conversion, +// but with hardware support for FP32->FP16 conversion the current approach is preferable +template<> inline float16_t saturate_cast(uint8_t v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(int8_t v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(uint16_t v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(short v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(unsigned v){ return float16_t((float)v); } +template<> inline float16_t saturate_cast(int v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(uint64_t v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(int64_t v) { return float16_t((float)v); } +template<> inline float16_t saturate_cast(float v) { return float16_t(v); } +template<> inline float16_t saturate_cast(double v) { return float16_t((float)v); } +#endif +} //namespace saber +} //namespace anakin +#endif // ANAKIN_SABER_FUNCS_SATURATE_H diff --git a/saber/funcs/scale.h b/saber/funcs/scale.h index 255d80564..71637b9c0 100644 --- a/saber/funcs/scale.h +++ b/saber/funcs/scale.h @@ -21,6 +21,9 @@ #include "saber/saber_funcs_param.h" #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_scale.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_scale.h" +#endif #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_scale.h" @@ -29,6 +32,9 @@ #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_scale.h" #endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_scale.h" +#endif namespace anakin { namespace saber { diff --git a/saber/funcs/seq_concat_seq_pool_soft_sign.h b/saber/funcs/seq_concat_seq_pool_soft_sign.h new file mode 100644 index 000000000..eafedbcf8 --- /dev/null +++ b/saber/funcs/seq_concat_seq_pool_soft_sign.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H +#define ANAKIN_SABER_FUNCS_SEQ_CONCAT_SEQ_POOL_SOFT_SIGN_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_seq_concat_seq_pool_soft_sign.h" + +#ifdef NVIDIA_GPU +//#include "saber/funcs/impl/cuda/saber_seq_concat_seq_pool_soft_sign.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_seq_concat_seq_pool_soft_sign.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_seq_concat_seq_pool_soft_sign.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_seq_concat_seq_pool_soft_sign.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_seq_concat_seq_pool_soft_sign.h" +#endif + + +namespace anakin { +namespace saber { + +template +class SeqConcatSeqPoolSoftSign : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SeqConcatSeqPoolSoftSignParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SeqConcatSeqPoolSoftSignParam>::BaseFunc; + + SeqConcatSeqPoolSoftSign() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SeqConcatSeqPoolSoftSignParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + int seq_num = input[0]->get_seq_offset()[0].size() - 1; + int emb_size = input[0]->valid_size() / input[0]->num(); + Shape output_shape({seq_num, emb_size, 1, 1}, Layout_NCHW); + std::vector> out_offset; + out_offset.resize(1); + out_offset[0].push_back(0); + for (int i = 0; i < seq_num; i++) { + out_offset[0].push_back(i); + } + + output[0]->set_seq_offset(out_offset); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSeqConcatSeqPoolSoftSign ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSeqConcatSeqPoolSoftSign ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/sequence_concat.h b/saber/funcs/sequence_concat.h new file mode 100644 index 000000000..e8806f10b --- /dev/null +++ b/saber/funcs/sequence_concat.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_SEQUENCE_CONCAT_H +#define ANAKIN_SABER_FUNCS_SEQUENCE_CONCAT_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_sequence_concat.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_sequence_concat.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_sequence_concat.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_sequence_concat.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_sequence_concat.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_sequence_concat.h" +#endif + + +namespace anakin { +namespace saber { + +template +class SequenceConcat : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequenceConcatParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequenceConcatParam>::BaseFunc; + + SequenceConcat() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SequenceConcatParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + Shape output_shape = (input[0]->valid_shape()); + CHECK_EQ(input[0]->num_index(), 0) << "num index must be zero"; + for (int i = 1; i < input.size(); i++) { + output_shape[0] += input[i]->num(); + } + std::vector> out_offset; + out_offset.resize(1); + int seq_len = input[0]->get_seq_offset()[0].size() - 1; + out_offset[0].push_back(0); + int cur_off = 0; + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < input.size(); j++) { + cur_off += input[j]->get_seq_offset()[0][i + 1]; + } + out_offset[0].push_back(cur_off); + } + + output[0]->set_seq_offset(out_offset); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSequenceConcat ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSequenceConcat ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/sequence_depadding.h b/saber/funcs/sequence_depadding.h new file mode 100644 index 000000000..684957982 --- /dev/null +++ b/saber/funcs/sequence_depadding.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_SEQUENCE_DEPADDING_H +#define ANAKIN_SABER_FUNCS_SEQUENCE_DEPADDING_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_sequence_depadding.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_sequence_depadding.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_sequence_depadding.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/saber_sequence_depadding.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_sequence_depadding.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_sequence_depadding.h" +#endif + + +namespace anakin { +namespace saber { + +template +class SequenceDePadding : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequenceDePaddingParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequenceDePaddingParam>::BaseFunc; + + SequenceDePadding() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SequenceDePaddingParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + auto seq_offset = input[1]->get_seq_offset()[0]; + Shape output_shape = input[0]->valid_shape(); + output_shape[0] = seq_offset.back(); + + output[0]->set_seq_offset(input[1]->get_seq_offset()); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSequenceDePadding ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSequenceDePadding ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/sequence_expand.h b/saber/funcs/sequence_expand.h index 207cdff52..03a645567 100644 --- a/saber/funcs/sequence_expand.h +++ b/saber/funcs/sequence_expand.h @@ -94,7 +94,6 @@ class SequenceExpand : public BaseFunc < output_shape[0] = cum; output[0]->set_seq_offset({off}); - } @@ -133,4 +132,4 @@ class SequenceExpand : public BaseFunc < } // namespace saber } // namespace anakin -#endif \ No newline at end of file +#endif diff --git a/saber/funcs/sequence_padding.h b/saber/funcs/sequence_padding.h new file mode 100644 index 000000000..bf8e1fdd4 --- /dev/null +++ b/saber/funcs/sequence_padding.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_SEQUENCE_PADDING_H +#define ANAKIN_SABER_FUNCS_SEQUENCE_PADDING_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_sequence_padding.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_sequence_padding.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_sequence_padding.h" +#endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/saber_sequence_padding.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_sequence_padding.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_sequence_padding.h" +#endif + + +namespace anakin { +namespace saber { + +template +class SequencePadding : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequencePaddingParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequencePaddingParam>::BaseFunc; + + SequencePadding() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SequencePaddingParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + int max_len = 0; + auto seq_offset = input[0]->get_seq_offset()[0]; + int seq_num = seq_offset.size() - 1; + int emb_size = input[0]->count_valid(1, input[0]->dims()); + for (int i = 0; i < seq_num; i++) { + int cur_len = seq_offset[i+1] - seq_offset[i]; + max_len = cur_len > max_len ? cur_len : max_len; + } + Shape output_shape = input[0]->valid_shape(); + output_shape[0] = seq_num * max_len; + + std::vector out_offset; + for (int i = 0; i < seq_num + 1; i++) { + out_offset.push_back(i * max_len); + } + output[0]->set_seq_offset({out_offset}); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSequencePadding ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSequencePadding ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/sequence_pool.h b/saber/funcs/sequence_pool.h index 9654b717a..4d30aef99 100644 --- a/saber/funcs/sequence_pool.h +++ b/saber/funcs/sequence_pool.h @@ -65,7 +65,7 @@ class SequencePool : public BaseFunc< std::vector > offset = input[0]->get_seq_offset(); //CHECK_GT(offset.size(), 1) << "seq num error! " << offset.size(); int output_shape_num=0; - if (offset.size() > 1) { + if (offset.size() >=1 && offset[0].size() > 1) { output_shape_num = offset[0].size() - 1; } else { output_shape_num = input[0]->num(); diff --git a/saber/funcs/sequence_pool_concat.h b/saber/funcs/sequence_pool_concat.h new file mode 100644 index 000000000..176dfb5b7 --- /dev/null +++ b/saber/funcs/sequence_pool_concat.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_SEQUENCE_POOL_CONCAT_H +#define ANAKIN_SABER_FUNCS_SEQUENCE_POOL_CONCAT_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_sequence_pool_concat.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_sequence_pool_concat.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_sequence_pool_concat.h" +#endif + +namespace anakin { +namespace saber { + +template +class SequencePoolConcat : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequencePoolConcatParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SequencePoolConcatParam>::BaseFunc; + + SequencePoolConcat() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SequencePoolConcatParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v& input, + Output_v &output, Param_t& param) override { + int xdim = input[0]->width(); + auto offset = input[0]->get_seq_offset(); + int slot_num = param.slot_num; + // batch need to check the max batch + int batch = 0; + if (offset.size() >= 1 && offset[0].size() > 1) { + batch = (offset[0].size() - 1) / slot_num; + } else { + batch = input[0]->num(); + } + Shape output_shape({batch, slot_num * input[0]->width(), 1, 1}, Layout_NCHW); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSequencePoolConcat ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberSequencePoolConcat ); + return SaberSuccess; + default: + return SaberUnImplError; + } + } +private: + + virtual void pick_best_static() override { + if (true) // some condition? + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; +} +} +#endif diff --git a/saber/funcs/shuffle_channel.h b/saber/funcs/shuffle_channel.h index bfd827b08..585db3be1 100644 --- a/saber/funcs/shuffle_channel.h +++ b/saber/funcs/shuffle_channel.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_SHUFFLE_CHANNEL_H @@ -21,7 +21,12 @@ #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_shuffle_channel.h" #endif - +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_shuffle_channel.h" +#endif +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_shuffle_channel.h" +#endif namespace anakin { namespace saber { diff --git a/saber/funcs/slice.h b/saber/funcs/slice.h index b89345c04..dc8a466ea 100644 --- a/saber/funcs/slice.h +++ b/saber/funcs/slice.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_SABER_FUNCS_SLICE_H @@ -19,6 +19,9 @@ #include "saber/funcs/base.h" #include "saber/funcs/impl/impl_base.h" #include "saber/funcs/impl/impl_slice.h" +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_slice.h" +#endif #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_slice.h" #endif @@ -98,16 +101,16 @@ class Slice : public BaseFunc } virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderSlice ); - return SaberSuccess; - case SABER_IMPL: - this->_impl.push_back(new SaberSlice ); - return SaberSuccess; - default: + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSlice ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberSlice ); + return SaberSuccess; + default: return SaberUnImplError; - } + } } private: diff --git a/saber/funcs/slice_v2.h b/saber/funcs/slice_v2.h new file mode 100644 index 000000000..9c72bf872 --- /dev/null +++ b/saber/funcs/slice_v2.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_SLICE_V2_H +#define ANAKIN_SABER_FUNCS_SLICE_V2_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_slice_v2.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_slice_v2.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_slice_v2.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/include/saber_slice_v2.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_slice_v2.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_slice_v2.h" +#endif + + +namespace anakin { +namespace saber { + +template +class SliceV2 : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SliceV2Param> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SliceV2Param>::BaseFunc; + + SliceV2() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SliceV2Param Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + Shape output_shape = input[0]->valid_shape(); + Shape in_shape = input[0]->valid_shape(); + auto starts = param.starts; + auto ends = param.ends; + auto axes = param.axes; + CHECK_EQ(axes.size(), starts.size()) << "the size of axes and starts are not equal "; + CHECK_EQ(ends.size(), starts.size()) << "the size of starts and ends are not valid"; + for (int i = 0; i < starts.size(); i++) { + int dim_value = in_shape[axes[i]]; + int start = starts[i] < 0 ? starts[i] + dim_value : starts[i]; + int end = ends[i] < 0 ? ends[i] + dim_value : ends[i]; + start = std::max(start, 0); + start = std::min(start, dim_value); + end = std::max(end, 0); + end = std::min(end, dim_value); + output_shape[axes[i]] = end - start; + } + if (axes[0] != 0) { + output[0]->set_seq_offset(input[0]->get_seq_offset()); + } + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + //this->_impl.push_back(new VenderSliceV2 _impl.push_back(new VenderSliceV2 ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSliceV2 ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/soft_sign.h b/saber/funcs/soft_sign.h new file mode 100644 index 000000000..9a6b1f26e --- /dev/null +++ b/saber/funcs/soft_sign.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_SOFT_SIGN_H +#define ANAKIN_SABER_FUNCS_SOFT_SIGN_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_soft_sign.h" + +#ifdef NVIDIA_GPU +#include "saber/funcs/impl/cuda/saber_soft_sign.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_soft_sign.h" +#endif + +#ifdef AMD_GPU +//#include "saber/funcs/impl/amd/saber_soft_sign.h" +#endif + +#ifdef USE_ARM_PLACE +//#include "saber/funcs/impl/arm/saber_soft_sign.h" +#endif + +#ifdef USE_BM_PLACE +//#include "saber/funcs/impl/bm/vender_soft_sign.h" +#endif + + +namespace anakin { +namespace saber { + +template +class SoftSign : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SoftSignParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SoftSignParam>::BaseFunc; + + SoftSign() = default; + + typedef Tensor InDataTensor; + typedef Tensor OutDataTensor; + typedef Tensor OpTensor; + typedef SoftSignParam Param_t; + typedef std::vector Input_v; + typedef std::vector Output_v; + typedef std::vector Shape_v; + + virtual SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) override { + + Shape output_shape = (input[0]->valid_shape()); + output[0]->set_seq_offset(input[0]->get_seq_offset()); + return output[0]->set_shape(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSoftSign ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSoftSign ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + + + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/softmax.h b/saber/funcs/softmax.h index 9c8354588..c4e0d5d28 100644 --- a/saber/funcs/softmax.h +++ b/saber/funcs/softmax.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_SOFTMAX_H @@ -24,7 +24,7 @@ #include "saber/funcs/impl/cuda/vender_softmax.h" #endif #ifdef AMD_GPU -#include "saber/funcs/impl/amd/saber_softmax.h" +//#include "saber/funcs/impl/amd/saber_softmax.h" #endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_softmax.h" @@ -33,6 +33,11 @@ #ifdef USE_ARM_PLACE #include "saber/funcs/impl/arm/saber_softmax.h" #endif + +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_softmax.h" +#include "saber/funcs/impl/amd/include/vender_softmax.h" +#endif namespace anakin{ namespace saber{ @@ -60,16 +65,16 @@ class Softmax : public BaseFunc } virtual SaberStatus init_impl(ImplEnum implenum) override { - switch (implenum) { - case VENDER_IMPL: - this->_impl.push_back(new VenderSoftmax ); - return SaberSuccess; - case SABER_IMPL: - this->_impl.push_back(new SaberSoftmax ); - return SaberSuccess; - default: + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSoftmax ); + return SaberSuccess; + case SABER_IMPL: + this->_impl.push_back(new SaberSoftmax ); + return SaberSuccess; + default: return SaberUnImplError; - } + } } private: diff --git a/saber/funcs/sproposal.h b/saber/funcs/sproposal.h new file mode 100644 index 000000000..274148a65 --- /dev/null +++ b/saber/funcs/sproposal.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 Baidu, Inc. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANAKIN_SABER_FUNCS_SPROPOSAL_H +#define ANAKIN_SABER_FUNCS_SPROPOSAL_H + +#include "saber/funcs/base.h" +#include "saber/saber_funcs_param.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_sproposal.h" + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_sproposal.h" +#endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_sproposal.h" +#endif + +namespace anakin { +namespace saber { + +template +class SProposal : public BaseFunc < + TargetType, OpDtype, + ImplBase, SProposalParam +> { +public: + typedef TargetType targetType_t; + typedef Tensor OpTensor; + typedef SProposalParam Param_t; + typedef const std::vector Input_v; + typedef std::vector Output_v; + + SProposal() = default; + SaberStatus compute_output_shape(const Input_v &input, + Output_v &output, Param_t ¶m) { + + // need to make sure the max size of this op. + Shape output_shape({param.post_nms_topn, 5, 1, 1}, Layout_NCHW); + return output[0]->set_shape_without_layout(output_shape); + } + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSProposal); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSProposal); + return SaberSuccess; + + default: + return SaberUnImplError; + } + + }; +private: + virtual void pick_best_static() override { + if (true) { // some condition? 
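+            // _impl is populated by init_impl() above with a vendor backend (VenderSProposal)
+            // and/or the hand-written SaberSProposal; the guard here is a placeholder, so
+            // the first registered implementation is always chosen.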
+ this->_best_impl = this->_impl[0]; + } + } + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } +}; +} +} +#endif //ANAKIN_SABER_FUNCS_CONV_H diff --git a/saber/funcs/sroi_align.h b/saber/funcs/sroi_align.h new file mode 100644 index 000000000..4e4202804 --- /dev/null +++ b/saber/funcs/sroi_align.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef ANAKIN_SABER_FUNCS_SROI_ALIGN_H +#define ANAKIN_SABER_FUNCS_SROI_ALIGN_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_sroi_align.h" + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_sroi_align.h" +#endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_sroi_align.h" +#endif + +namespace anakin { +namespace saber { + +template +class SRoiAlign : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + SRoiAlignParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + SRoiAlignParam>::BaseFunc; + + SRoiAlign() = default; + + virtual SaberStatus compute_output_shape( + const std::vector *> &input, + std::vector *> &output, + SRoiAlignParam ¶m) override { + + //input[1] is roi. + Shape output_shape = input[0]->valid_shape(); + CHECK_EQ(input.size(), 2) << " input's size must be 2."; + + int num_index = input[0]->num_index(); + int channel_index = input[0]->channel_index(); + int height_index = input[0]->height_index(); + int width_index = input[0]->width_index(); + + output_shape[num_index] = input[1]->num(); + output_shape[channel_index] = input[0]->channel(); + output_shape[height_index] = param.pooled_h; + output_shape[width_index] = param.pooled_w; + + return output[0]->set_shape_without_layout(output_shape); + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderSRoiAlign ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberSRoiAlign ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/funcs/timer.h b/saber/funcs/timer.h index 1cb716094..8ec585a46 100644 --- a/saber/funcs/timer.h +++ b/saber/funcs/timer.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_SABER_FUNCS_TIMER_H @@ -90,7 +90,6 @@ class SaberTimer final { std::chrono::time_point tend; std::list ms_time; }; - #ifdef USE_CUDA template <> class SaberTimer final { @@ -167,7 +166,7 @@ class SaberTimer final { #endif -#ifdef AMD_GPU +#ifdef AMD_GPU typedef TargetWrapper AMD_API; @@ -206,7 +205,7 @@ class SaberTimer final { AMD_API::sync_event(_e_end); cl_ulong start; - clGetEventProfilingInfo(_e_start, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &start,NULL); + clGetEventProfilingInfo(_e_start, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start,NULL); cl_ulong end; clGetEventProfilingInfo(_e_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); @@ -232,7 +231,7 @@ class SaberTimer final { } #if 0 for(auto time : ms_time) - LOG(INFO) << time; + LOG(INFO) << time; #endif ms_time.sort(); LOG(INFO) << ms_time.front() <<" - " << ms_time.back(); diff --git a/saber/funcs/transpose.h b/saber/funcs/transpose.h index ac596a85f..fa84a9660 100644 --- a/saber/funcs/transpose.h +++ b/saber/funcs/transpose.h @@ -24,6 +24,10 @@ #include "saber/funcs/impl/cuda/saber_transpose.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_transpose.h" +#endif + #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_transpose.h" #endif diff --git a/saber/funcs/type_trans.cpp b/saber/funcs/type_trans.cpp new file mode 100644 index 000000000..af81bf17d --- /dev/null +++ b/saber/funcs/type_trans.cpp @@ -0,0 +1,988 @@ +#include "saber/funcs/type_trans.h" + +namespace anakin { +namespace saber { + +#ifdef USE_ARM_PLACE + +template +void int32_to_dtype(const int* din, dtype* dout, const float* scale, + int axis_size, long long outer_size, long long inner_size); + +void fp32_to_int8(const float* din, signed char* dout, const float* scale, \ + int axis_size, long long outer_size, long long inner_size) { + + int cnt = inner_size / 16; + int remain = inner_size & 15; + long long loop_size = outer_size * axis_size; + +#pragma omp parallel for + for (int j = 0; j < loop_size; ++j) { + float inv_scale = 1.f / scale[j % axis_size]; + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vscale = vdupq_n_f32(inv_scale); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + const float* din_c = din + j * inner_size; + signed char* dout_c = dout + j * inner_size; + if (cnt > 0) { + int cnt_loop = cnt; + const float* din_ptr = din_c; + signed char* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "ldp q2, q3, [%[in]], #32 \n" + "0: \n" /* main loop */ + "fmul v4.4s, v0.4s, %[scale].4s \n" + "fmul v5.4s, v1.4s, %[scale].4s \n" + "fmul v6.4s, v2.4s, %[scale].4s \n" + "fmul v7.4s, v3.4s, %[scale].4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "subs %[cnt], %[cnt], #1 \n" + "FCVTAS v8.4s, v4.4s \n" + "FCVTAS v9.4s, v5.4s \n" + "FCVTAS v10.4s, v6.4s \n" + "FCVTAS v11.4s, v7.4s \n" + "ldp q2, q3, [%[in]], #32 \n" + "sqxtn v4.4h, v8.4s \n" + "sqxtn2 v4.8h, v9.4s \n" + "sqxtn v5.4h, v10.4s \n" + "sqxtn2 v5.8h, v11.4s \n" + "sqxtn v8.8b, v4.8h \n" + "sqxtn2 v8.16b, v5.8h \n" + "str q8, [%[out]], #16 \n" + "bne 0b \n" + : [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop) + : [scale] "w" (vscale) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vld1.32 {d4-d7}, [%[din]]! 
@ load in8~in16\n" + "0: @ main loop\n" + "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q5, q4, q4 @ set offset, 0.5\n" + "vand.i32 q6, q4, q4 @ set offset, 0.5\n" + "vand.i32 q7, q4, q4 @ set offset, 0.5\n" + "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" + "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" + "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" + "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" + "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" + "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" + "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" + "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q0, q4 @ cvt to int32\n" + "vcvt.s32.f32 q1, q5 @ cvt to int32\n" + "vcvt.s32.f32 q2, q6 @ cvt to int32\n" + "vcvt.s32.f32 q3, q7 @ cvt to int32\n" + "vqmovn.s32 d8, q0 @ cnt to int16\n" + "vqmovn.s32 d9, q1 @ cnt to int16\n" + "vqmovn.s32 d10, q2 @ cnt to int16\n" + "vqmovn.s32 d11, q3 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vqmovn.s16 d12, q4 @ cnt to int8\n" + "vqmovn.s16 d13, q5 @ cnt to int8\n" + "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" + "vst1.32 {d12-d13}, [%[dout]]! @ write to output\n" + "subs %[cnt], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + + :[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop) + :[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" + ); +#endif + } + const float* din_r = din_c + 16 * cnt; + signed char* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); + } + } +} + +void fp32_to_int16(const float* din, int16_t* dout, const float* scale, \ + int axis_size, long long outer_size, long long inner_size) { + + int cnt = inner_size / 8; + int remain = inner_size & 7; + long long loop_size = outer_size * axis_size; + +#pragma omp parallel for + for (int j = 0; j < loop_size; ++j) { + float inv_scale = 1.f / scale[j % axis_size]; + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vscale = vdupq_n_f32(inv_scale); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + const float* din_c = din + j * inner_size; + int16_t* dout_c = dout + j * inner_size; + if (cnt > 0) { + int cnt_loop = cnt; + const float* din_ptr = din_c; + int16_t* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "0: \n" /* main loop */ + "fmul v4.4s, v0.4s, %[scale].4s \n" + "fmul v5.4s, v1.4s, %[scale].4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "subs %[cnt], %[cnt], #1 \n" + "FCVTAS v8.4s, v4.4s \n" + "FCVTAS v9.4s, v5.4s \n" + "sqxtn v4.4h, v8.4s \n" + "sqxtn2 v4.8h, v9.4s \n" + "str q4, [%[out]], #16 \n" + "bne 0b \n" + : [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop) + : [scale] "w" (vscale) + : "v0", "v1", "v4", "v5", "v8", "v9" + ); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" + "0: @ main loop\n" + "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q5, q4, q4 @ set offset, 0.5\n" + "vand.i32 q6, q4, q4 @ set offset, 0.5\n" + "vand.i32 q7, q4, q4 @ set offset, 0.5\n" + "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" + "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" + "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q0, q4 @ cvt to int32\n" + "vcvt.s32.f32 q1, q5 @ cvt to int32\n" + "vqmovn.s32 d8, q0 @ cnt to int16\n" + "vqmovn.s32 d9, q1 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" + "subs %[cnt], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + + :[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop) + :[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero) + :"q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9" + ); +#endif + } + const float* din_r = din_c + 8 * cnt; + int16_t* dout_r = dout_c + 8 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); + } + } +} + +void int8_to_fp32(const signed char* in, float* out, const float* scale, \ + int axis_size, long long outer_size, long long inner_size) { + + int cnt = inner_size / 16; + int remain = inner_size & 15; + long long loop_size = axis_size * outer_size; +#pragma omp parallel for + for (long long n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const signed char* din_c = in + n * inner_size; + float* dout_c = out + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + if (cnt > 0) { + int loop = cnt; + const signed char* din_ptr = din_c; + float* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ + "0: \n" /* main loop */ + "sshll v2.8h, v0.8b, #0 \n" /* trans to int16*/ + "sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/ + + "sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/ + "sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/ + "sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/ + "sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/ + + "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ + + "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ + "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ + "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ + "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ + + "subs %[loop], %[loop], #1 \n" + + "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ + + "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ + "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ + + "bne 0b \n" + :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) + :[scale] "w" (vscale) + :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +#else + asm volatile( + "vld1.32 {d0-d1}, [%[in]]! 
@ load 16 int8\n" + "0: @ main loop\n" + "vmovl.s8 q2, d0 @ trans to int16\n" + "vmovl.s8 q3, d1 @ trans to int16\n" + "vmovl.s16 q4, d4 @ trans to int32\n" + "vmovl.s16 q5, d5 @ trans to int32\n" + "vmovl.s16 q6, d6 @ trans to int32\n" + "vmovl.s16 q7, d7 @ trans to int32\n" + "vcvt.f32.s32 q0, q4 @ trans to fp32\n" + "vcvt.f32.s32 q1, q5 @ trans to fp32\n" + "vcvt.f32.s32 q2, q6 @ trans to fp32\n" + "vcvt.f32.s32 q3, q7 @ trans to fp32\n" + "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" + "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" + "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" + "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" + + "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" + + "subs %[loop], #1 \n" + + "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" + "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n" + + "bne 0b \n" + :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) + :[scale] "w" (vscale) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); +#endif //__aarch64__ + } + const signed char* din_r = din_c + 16 * cnt; + float* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = in_scale * din_r[i]; + } + } +} + +void int16_to_fp32(const short* in, float* out, const float* scale, \ + int axis_size, long long outer_size, long long inner_size) { + + int cnt = inner_size / 16; + int remain = inner_size & 15; + long long loop_size = axis_size * outer_size; +#pragma omp parallel for + for (long long n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const short* din_c = in + n * inner_size; + float* dout_c = out + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + if (cnt > 0) { + int loop = cnt; + const short* din_ptr = din_c; + float* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ + "0: \n" /* main loop */ + "sshll v4.4s, v0.4h, #0 \n" /* trans to int32*/ + "sshll2 v5.4s, v0.8h, #0 \n" /* trans to int32*/ + "sshll v6.4s, v1.4h, #0 \n" /* trans to int32*/ + "sshll2 v7.4s, v1.8h, #0 \n" /* trans to int32*/ + + "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ + + "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ + "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ + "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ + "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ + + "subs %[loop], %[loop], #1 \n" + + "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ + "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ + + "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ + "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ + + "bne 0b \n" + :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) + :[scale] "w" (vscale) + :"v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" + "0: @ main loop\n" + "vmovl.s16 q4, d0 @ trans to int32\n" + "vmovl.s16 q5, d1 @ trans to int32\n" + "vmovl.s16 q6, d2 @ trans to int32\n" + "vmovl.s16 q7, d3 @ trans to int32\n" + "vcvt.f32.s32 q0, q4 @ trans to fp32\n" + "vcvt.f32.s32 q1, q5 @ trans to fp32\n" + "vcvt.f32.s32 q2, q6 @ trans to fp32\n" + "vcvt.f32.s32 q3, q7 @ trans to fp32\n" + "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" + "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" + "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" + "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" + + "vld1.32 {d0-d3}, [%[in]]! 
@ load 16 int8\n" + + "subs %[loop], #1 \n" + + "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" + "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n" + + "bne 0b \n" + :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) + :[scale] "w" (vscale) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); +#endif //__aarch64__ + } + const short* din_r = din_c + 16 * cnt; + float* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = in_scale * din_r[i]; + } + } +} + +void int32_to_fp32(const int* din, float* dout, const float* scale, \ + int axis_size, long long outer_size, long long inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + long long loop_size = axis_size * outer_size; +#pragma omp parallel for + for (long long n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const int* din_c = din + n * inner_size; + float* dout_c = dout + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + if (cnt > 0) { + int loop = cnt; + const int* din_ptr = din_c; + float* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "ldp q2, q3, [%[in]], #32 \n" + "0: \n" + "scvtf v4.4s, v0.4s \n" + "scvtf v5.4s, v1.4s \n" + "scvtf v6.4s, v2.4s \n" + "scvtf v7.4s, v3.4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "fmul v8.4s, v4.4s, %[scale].4s \n" + "fmul v9.4s, v5.4s, %[scale].4s \n" + "fmul v10.4s, v6.4s, %[scale].4s \n" + "fmul v11.4s, v7.4s, %[scale].4s \n" + "ldp q2, q3, [%[in]], #32 \n" + "stp q8, q9, [%[out]], #32 \n" + "stp q10, q11, [%[out]], #32 \n" + "subs %[loop], %[loop], #1 \n" + "bne 0b \n" + :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) + :[scale] "w" (vscale) + :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +#else + asm volatile( + "vld1.s32 {d0-d3}, [%[in]]! \n" + "vld1.s32 {d4-d7}, [%[in]]! \n" + "0: \n" + "vcvt.f32.s32 q4, q0 \n" + "vcvt.f32.s32 q5, q1 \n" + "vcvt.f32.s32 q6, q2 \n" + "vcvt.f32.s32 q7, q3 \n" + "vld1.s32 {d0-d3}, [%[in]]! \n" + "vmul.f32 q8, q4, %q[scale] \n" + "vmul.f32 q9, q5, %q[scale] \n" + "vmul.f32 q10, q6, %q[scale] \n" + "vmul.f32 q11, q7, %q[scale] \n" + "vld1.s32 {d4-d7}, [%[in]]! \n" + "subs %[loop], #1 \n" + "vst1.f32 {d16-d19}, [%[out]]! \n" + "vst1.f32 {d20-d23}, [%[out]]! 
\n" + "bne 0b \n" + :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) + :[scale] "w" (vscale) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" + ); +#endif //__aarch64__ + } + const int* din_r = din_c + 16 * cnt; + float* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = in_scale * din_r[i]; + } + } +} + +void int32_to_int8(const int* din, signed char* dout, const float* scale, \ + int axis_size, long long outer_size, long long inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + long long loop_size = outer_size * axis_size; +#pragma omp parallel for + for (long long n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const int* din_c = din + n * inner_size; + signed char* dout_c = dout + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + if (cnt > 0) { + int loop = cnt; + const int* din_ptr = din_c; + signed char* dout_ptr = dout_c; +#ifdef __aarch64__ + asm volatile( + "0: \n" + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + + "scvtf v4.4s, v0.4s \n" + "scvtf v5.4s, v1.4s \n" + "scvtf v6.4s, v2.4s \n" + "scvtf v7.4s, v3.4s \n" + + "fmul v0.4s, v4.4s, %[scale].4s \n" + "fmul v1.4s, v5.4s, %[scale].4s \n" + "fmul v2.4s, v6.4s, %[scale].4s \n" + "fmul v3.4s, v7.4s, %[scale].4s \n" + + "fcvtas v4.4s, v0.4s \n" + "fcvtas v5.4s, v1.4s \n" + "fcvtas v6.4s, v2.4s \n" + "fcvtas v7.4s, v3.4s \n" + + "sqxtn v0.4h, v4.4s \n" + "sqxtn2 v0.8h, v5.4s \n" + "sqxtn v1.4h, v6.4s \n" + "sqxtn2 v1.8h, v7.4s \n" + + "sqxtn v2.8b, v0.8h \n" + "sqxtn2 v2.16b, v1.8h \n" + + "st1 {v2.16b}, [%[out]], #16 \n" + "subs %[loop], %[loop], #1 \n" + "bne 0b \n" + :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) + :[scale] "w" (vscale) + :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" + "0: @ main loop\n" + "vcvt.f32.s32 q4, q0 @ cvt to float\n" + "vcvt.f32.s32 q5, q1 @ cvt to float\n" + "vcvt.f32.s32 q6, q2 @ cvt to float\n" + "vcvt.f32.s32 q7, q3 @ cvt to float\n" + "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q1, q0, q0 @ set offset, 0.5\n" + "vand.i32 q2, q0, q0 @ set offset, 0.5\n" + "vand.i32 q3, q0, q0 @ set offset, 0.5\n" + "vcgt.f32 q8, q4, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q5, %q[vzero] @ get mask > 0, in1\n" + "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3\n" + "vbif.f32 q0, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q1, %q[vnoff], q9 @ get right offset\n" + "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" + "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" + "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" + "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" + "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" + "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q4, q0 @ cvt to int32\n" + "vcvt.s32.f32 q5, q1 @ cvt to int32\n" + "vcvt.s32.f32 q6, q2 @ cvt to int32\n" + "vcvt.s32.f32 q7, q3 @ cvt to int32\n" + "vqmovn.s32 d16, q4 @ cnt to int16\n" + "vqmovn.s32 d17, q5 @ cnt to int16\n" + "vqmovn.s32 d18, q6 @ cnt to int16\n" + "vqmovn.s32 d19, q7 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" + "vqmovn.s16 d8, q8 @ cnt to int8\n" + "vqmovn.s16 d9, q9 @ cnt to int8\n" + "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" + "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" + "subs %[loop], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + :[loop] "+r" (loop), [din] "+r" (din_ptr), [dout] "+r" (dout_ptr) + :[vscale] "w" (vscale), [vzero] "w"(vzero), [vnoff] "w" (vnoff), [vpoff] "w" (vpoff) + :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" + ); +#endif //__aarch64__ + } + const int* din_r = din_c + 16 * cnt; + int8_t* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); + } + } +} + +void int32_to_int32(const int* din, int* dout, const float* scale, \ + int axis_size, long long outer_size, long long inner_size) { + int size_all = outer_size * axis_size * inner_size; + memmove(dout, din, size_all*sizeof(int)); +} + +template <> +void int32_to_dtype(const int* din, float* dout, const float* scale, + int axis_size, long long outer_size, long long inner_size) { + + return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size); +} + +template <> +void int32_to_dtype(const int* din, signed char* dout, const float* scale, + int axis_size, long long outer_size, long long inner_size) { + + return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size); +} + +template <> +void int32_to_dtype(const int* din, int* dout, const float* scale, + int axis_size, long long outer_size, long long inner_size) { + + return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size); +} + +SaberStatus trans_tensor_fp32_to_int8(const Tensor& tin, Tensor& tout, \ + float input_scale) { + if (tin.get_dtype() != AK_FLOAT) { + return SaberInvalidValue; + } + if (tout.get_dtype() != AK_INT8) { + tout.set_dtype(AK_INT8); + } + tout.reshape(tin.valid_shape()); + std::vector scale = {input_scale}; + + const float* din = static_cast(tin.data()); + signed char* dout = static_cast(tout.mutable_data()); + //! convert to int8 + fp32_to_int8(din, dout, scale.data(), 1, 1, tin.valid_size()); + return SaberSuccess; +} + +SaberStatus trans_tensor_int8_to_fp32(Tensor& tin, Tensor& tout, \ + float input_scale) { + + if (tin.get_dtype() != AK_INT8) { + return SaberInvalidValue; + } + if (tout.get_dtype() != AK_FLOAT) { + tout.set_dtype(AK_FLOAT); + } + tout.reshape(tin.valid_shape()); + + //! compute scale + std::vector scale = {input_scale}; + + const signed char* input = (const signed char*)tin.data(); + float* output = (float*)tout.mutable_data(); + + int inner_size = tin.valid_size(); + + //! convert to fp32 + int8_to_fp32(input, output, scale.data(), 1, 1, inner_size); + return SaberSuccess; +} + +SaberStatus trans_tensor_int32_to_fp32(const Tensor& tin, Tensor& tout, \ + float input_scale, std::vector weights_scale, int axis) { + if (tin.get_dtype() != AK_INT32) { + return SaberInvalidValue; + } + if (tout.get_dtype() != AK_FLOAT) { + tout.set_dtype(AK_FLOAT); + } + tout.reshape(tin.valid_shape()); + + //! 
compute scale + std::vector scale(weights_scale.size()); + for (int i = 0; i < weights_scale.size(); ++i){ + scale[i] = input_scale * weights_scale[i]; + } + const int* input = (const int*)tin.data(); + float* output = (float*)tout.mutable_data(); + + Shape shin = tin.valid_shape(); + int outer_size = shin.count(0, axis); + int axis_size = shin[axis]; + int inner_size = shin.count(axis + 1); +// if (tin.dims() < 3){ +// outer_size = tin.valid_shape()[0]; +// axis_size = tin.valid_shape()[1]; +// inner_size = 1; +// } else{ +// outer_size = tin.num(); +// axis_size = tin.channel(); +// inner_size = tin.width() * tin.height(); +// } + //! convert to fp32 + int32_to_fp32(input, output, scale.data(), axis_size, outer_size, inner_size); + return SaberSuccess; +} + +SaberStatus trans_tensor_int32_to_int8(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale, int axis) { + + if (tin.get_dtype() != AK_INT32) { + return SaberInvalidValue; + } + if (tout.get_dtype() != AK_INT8) { + tout.set_dtype(AK_INT8); + } + tout.reshape(tin.valid_shape()); + + //! compute scale + std::vector scale(weights_scale.size()); + for (int i = 0; i < weights_scale.size(); ++i){ + scale[i] = input_scale * weights_scale[i] / output_scale; + } + const int* input = (const int*)tin.data(); + signed char* output = (signed char*)tout.mutable_data(); + + Shape shin = tin.valid_shape(); + int outer_size = shin.count(0, axis); + int axis_size = shin[axis]; + int inner_size = shin.count(axis + 1); + +// int outer_size; +// int axis_size; +// int inner_size; +// if (tin.dims() < 3){ +// outer_size = tin.valid_shape()[0]; +// axis_size = tin.valid_shape()[1]; +// inner_size = 1; +// } else{ +// outer_size = tin.num(); +// axis_size = tin.channel(); +// inner_size = tin.width() * tin.height(); +// } + //! convert to int8 + int32_to_int8(input, output, scale.data(), axis_size, outer_size, inner_size); + return SaberSuccess; +} + +/******************************************/ +/******** kernel implement *********/ +/******************************************/ +float compute_max_kernel(const float* din, long long size) { + + float max_value = 0.f; + int cnt = size / 16; + int remain = size & 15; + float32x4_t vmax_val = vdupq_n_f32(0.f); + const float* ptr_in = din; + if (cnt > 0) { + int loop_cnt = cnt; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "0: \n" + "fabs v4.4s, v0.4s \n" + "fabs v5.4s, v1.4s \n" + "fabs v6.4s, v2.4s \n" + "fabs v7.4s, v3.4s \n" + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "fmax v2.4s, v4.4s, v5.4s \n" + "fmax v3.4s, v6.4s, v7.4s \n" + "fmax v4.4s, v2.4s, v3.4s \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "fmax %[max_val].4s, v4.4s, %[max_val].4s \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 0b \n" + : [in] "+r" (ptr_in), [cnt] "+r" (loop_cnt), [max_val] "+w" (vmax_val) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" + "vld1.32 {d4-d7}, [%[in]]! @ load 8 float\n" + "0: @ main loop\n" + "vabs.f32 q4, q0 @ abs \n" + "vabs.f32 q5, q1 @ abs \n" + "vabs.f32 q6, q2 @ abs \n" + "vabs.f32 q7, q3 @ abs \n" + "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" + "vmax.f32 q2, q4, q5 @ max \n" + "vmax.f32 q3, q6, q7 @ max \n" + "vmax.f32 q4, q2, q3 @ max \n" + "vld1.32 {d4-d7}, [%[in]]! 
@ load 8 float\n" + "vmax.f32 %q[max_val], q4, %q[max_val] @ max \n" + "subs %[cnt], #1 @ loop count -1\n" + "bne 0b @ jump to main loop\n" + + : [in] "+r" (ptr_in), [cnt] "+r" (loop_cnt), [max_val] "+w" (vmax_val) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); +#endif + float32x2_t vmax_p = vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val)); + float max0 = vget_lane_f32(vmax_p, 0); + float max1 = vget_lane_f32(vmax_p, 1); + float max2 = max0 > max1 ? max0 : max1; + max_value = max_value > max2 ? max_value : max2; + } + ptr_in = din + 16 * cnt; + for (int i = 0; i < remain; ++i) { + float data = fabsf(*(ptr_in++)); + max_value = fmaxf(max_value, data); + } + return max_value; +} + +std::vector get_tensor_scale_n(const float* in_data, int axis_size, \ + long long inner_size, float scale_factor) { + + std::vector scale_out(axis_size); +#pragma omp parallel for + for (int c = 0; c < axis_size; ++c) {//num + const float* ptr_in = in_data + c * inner_size;//channel*width*height + scale_out[c] = compute_max_kernel(ptr_in, inner_size) / scale_factor; + } + return scale_out; +} + +std::vector get_tensor_scale_chw(const float* in_data, int axis_size, long long outer_size, \ +long long inner_size, float scale_factor) { + std::vector scale_out(axis_size); + long long inner_size_with_axis = axis_size * inner_size; +#pragma omp parallel for + for (int c = 0; c < axis_size; ++c) { + const float* din = in_data + c * inner_size; + float max_val = 0.f; + for (int j = 0; j < outer_size; ++j) { + const float *ptr_in = din + j * inner_size_with_axis; + max_val = fmaxf(compute_max_kernel(ptr_in, inner_size),max_val); + } + scale_out[c] = max_val / scale_factor; + } + return scale_out; +} + +SaberStatus get_tensor_scale(const Tensor& tin, std::vector& scale_out, \ + int axis, float scale_factor) { + if (tin.get_dtype() != AK_FLOAT && tin.get_dtype() != AK_INT8) { + LOG(ERROR) << "ERROR: Get tensor scale failed, unsupported data type"; + return SaberInvalidValue; + } + if (tin.get_dtype() == AK_INT8) { + if (tin.get_scale().size() <= 0) { + LOG(ERROR) << "ERROR: Get tensor scale failed, int8 tensor without scale"; + return SaberInvalidValue; + } else { + scale_out = tin.get_scale(); + return SaberSuccess; + } + } + int axis_size = 1; + if (axis >= 0 && axis < tin.dims()) { + axis_size = tin.valid_shape()[axis]; + } + int outer_size = 1; + if (axis >= 0) { + outer_size = tin.count_valid(0, axis); + } + long long inner_size = tin.count_valid(axis + 1, tin.dims()); + + const float* in_data = static_cast(tin.data()); + if (axis <= 0){ + scale_out = get_tensor_scale_n(in_data, axis_size, inner_size, scale_factor); + }else{ + scale_out = get_tensor_scale_chw(in_data, axis_size, outer_size, inner_size, scale_factor); + } + return SaberSuccess; +} + +template<> +SaberStatus trans_weights_dtype(Tensor& weights, DataType type, float scale_factor, \ + TRANS_TYPE op_type, int group) { + + if (weights.get_dtype() == type) { + return SaberSuccess; + } + if (type == AK_FLOAT && weights.get_dtype() == AK_INT8) { + //! trans int8 weights to fp32 weights + if (weights.get_scale().size() <= 0) { + LOG(ERROR) << "ERROR: Trans weights from int8 to fp32, without scale"; + return SaberInvalidValue; + } + Tensor tmp_tensor; + tmp_tensor.re_alloc(weights.valid_shape(), AK_FLOAT); + std::vector scale = weights.get_scale(); + const char* din = static_cast(weights.data()); + float* dout = static_cast(tmp_tensor.mutable_data()); + if (op_type == CONV_TYPE){ + //! 
for conv + int axis_size = weights.valid_shape()[0]; + int outer_size = 1; + int inner_size = weights.count_valid(1, weights.dims()); + int8_to_fp32(din, dout, scale.data(), axis_size, outer_size, inner_size); + } else if (op_type == DECONV_TYPE){ + //! for deconv + int axis_size = weights.valid_shape()[0] * group; + int outer_size = weights.valid_shape()[1] / group; + int inner_size = weights.valid_shape()[2] * weights.valid_shape()[3]; + int8_to_fp32(din, dout, scale.data(), axis_size, outer_size, inner_size); + } else if (op_type == FC_TYPE){ + //! for fc + int axis_size = weights.valid_shape()[2]; + int outer_size = 1; + int inner_size = weights.count_valid(3, weights.dims()); + int8_to_fp32(din, dout, scale.data(), axis_size, outer_size, inner_size); + } else { + LOG(ERROR) << "ERROR: Invalid Op type in trans weights"; + return SaberInvalidValue; + } + weights.re_alloc(weights.valid_shape(), AK_FLOAT); + weights.copy_from(tmp_tensor); + } else if (type == AK_INT8 && weights.get_dtype() == AK_FLOAT) { + //! trans fp32 weights to int8 weights + Tensor tmp_tensor; + tmp_tensor.re_alloc(weights.valid_shape(), AK_INT8); + std::vector scale; + const float* din = static_cast(weights.data()); + char* dout = static_cast(tmp_tensor.mutable_data()); + if (op_type == CONV_TYPE){ + //! for conv + //! layout is: chout, chin, kh, kw + int axis_size = weights.valid_shape()[0]; + int inner_size = weights.valid_size() / axis_size; + scale = get_tensor_scale_n(din, axis_size, inner_size, scale_factor); + fp32_to_int8(din, dout, scale.data(), axis_size, 1, inner_size); + } else if (op_type == DECONV_TYPE){ + //! for deconv, chout and chin in inversed + //! real layout is: chin, chout, kh, kw + int axis_size = weights.valid_shape()[0] * group; + int outer_size = weights.valid_shape()[1] / group; + int inner_size = weights.valid_shape()[2] * weights.valid_shape()[3]; + scale = get_tensor_scale_chw(din, axis_size, outer_size, inner_size, scale_factor); + fp32_to_int8(din, dout, scale.data(), axis_size, outer_size, inner_size); + } else if (op_type == FC_TYPE){ + //! for fc + //! layout is: 1, 1, chout, chin + int axis_size = weights.valid_shape()[2]; + int inner_size = weights.count_valid(3, weights.dims()); + scale = get_tensor_scale_n(din, axis_size, inner_size, scale_factor); + fp32_to_int8(din, dout, scale.data(), axis_size, 1, inner_size); + } else { + LOG(ERROR) << "ERROR: Invalid Op type in trans weights"; + return SaberInvalidValue; + } + //! set weights scale + weights.set_scale(scale); + weights.re_alloc(weights.valid_shape(), AK_INT8); + weights.copy_from(tmp_tensor); + } else if (type == AK_INT16 && weights.get_dtype() == AK_FLOAT) { + //! trans fp32 weights to int16 weights + Tensor tmp_tensor; + tmp_tensor.re_alloc(weights.valid_shape(), AK_INT16); + std::vector scale; + const float* din = static_cast(weights.data()); + short* dout = static_cast(tmp_tensor.mutable_data()); + if (op_type == CONV_TYPE){ + //! for conv + //! layout is: chout, chin, kh, kw + int axis_size = weights.valid_shape()[0]; + int inner_size = weights.valid_size() / axis_size; + scale = get_tensor_scale_n(din, axis_size, inner_size, scale_factor); + fp32_to_int16(din, dout, scale.data(), axis_size, 1, inner_size); + } else if (op_type == DECONV_TYPE){ + //! for deconv, chout and chin in inversed + //! 
real layout is: chin, chout, kh, kw + int axis_size = weights.valid_shape()[0] * group; + int outer_size = weights.valid_shape()[1] / group; + int inner_size = weights.valid_shape()[2] * weights.valid_shape()[3]; + scale = get_tensor_scale_chw(din, axis_size, outer_size, inner_size, scale_factor); + fp32_to_int16(din, dout, scale.data(), axis_size, outer_size, inner_size); + } else if (op_type == FC_TYPE){ + //! for fc + //! layout is: 1, 1, chout, chin + int axis_size = weights.valid_shape()[2]; + int inner_size = weights.count_valid(3, weights.dims()); + scale = get_tensor_scale_n(din, axis_size, inner_size, scale_factor); + fp32_to_int16(din, dout, scale.data(), axis_size, 1, inner_size); + } else { + LOG(ERROR) << "ERROR: Invalid Op type in trans weights"; + return SaberInvalidValue; + } + //! set weights scale + weights.set_scale(scale); + weights.re_alloc(weights.valid_shape(), AK_INT16); + weights.copy_from(tmp_tensor); + } else { + LOG(ERROR) << "ERROR: Trans weights failed, unsupported data type"; + return SaberInvalidValue; + } + return SaberSuccess; +} + +template<> +SaberStatus trans_fp32_bias_to_int32(Tensor& tin, Tensor& tout, \ + float in_scale, std::vector vector_weight_scale) { + + if (tin.get_dtype() != AK_FLOAT || vector_weight_scale.size() != tin.valid_size()) { + return SaberInvalidValue; + } + tout.set_dtype(AK_INT32); + tout.reshape(tin.valid_shape()); + const float* in_data = static_cast(tin.data()); + int* out_data = static_cast(tout.mutable_data()); + for (int i = 0; i < tin.valid_size(); ++i) { + out_data[i] = saturate_cast(roundf(in_data[i] / in_scale / vector_weight_scale[i])); + } + return SaberSuccess; +} + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale){ + return trans_tensor_fp32_to_int8(tin, tout, input_scale); +} + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale){ + return trans_tensor_int8_to_fp32(tin, tout, input_scale); +} + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale){ + return trans_tensor_int32_to_fp32(tin, tout, input_scale, weights_scale, 1); +} + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale){ + return trans_tensor_int32_to_int8(tin, tout, input_scale, output_scale, weights_scale, 1); +} + +#endif + +} // namespace saber +} // namespace anakin diff --git a/saber/funcs/type_trans.h b/saber/funcs/type_trans.h new file mode 100644 index 000000000..07fb4f8de --- /dev/null +++ b/saber/funcs/type_trans.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ + +#ifndef ANAKIN_SABER_FUNCS_TYPE_TRANS_H +#define ANAKIN_SABER_FUNCS_TYPE_TRANS_H + +#include "saber/core/tensor.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/saturate.h" +#include "saber/saber_types.h" + +namespace anakin { +namespace saber { + +typedef enum{ + CONV_TYPE = 0, + DECONV_TYPE = 1, + FC_TYPE = 2 +} TRANS_TYPE; + +template +SaberStatus trans_weights_dtype(Tensor& weights, DataType type, float scale_factor, TRANS_TYPE op_type, int group){ + LOG(ERROR) << "trans_weights_dtype has no impl"; + return SaberUnImplError; +} +template +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale){ + LOG(ERROR) << "trans_tensor_dtype has no impl"; + return SaberUnImplError; +} +template +SaberStatus trans_fp32_bias_to_int32(Tensor& tin, Tensor& tout, \ + float in_scale, std::vector vector_weight_scale){ + LOG(ERROR) << "trans_fp32_bias_to_int32 has no impl"; + return SaberUnImplError; +} + +#ifdef USE_ARM_PLACE + +template<> +SaberStatus trans_weights_dtype(Tensor& weights, DataType type, float scale_factor, \ + TRANS_TYPE op_type, int group); + +template<> +SaberStatus trans_fp32_bias_to_int32(Tensor& tin, Tensor& tout, \ + float in_scale, std::vector vector_weight_scale); + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale); + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale); + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale); + +template<> +SaberStatus trans_tensor_dtype(Tensor& tin, Tensor& tout, \ + float input_scale, float output_scale, std::vector weights_scale); + +SaberStatus get_tensor_scale(const Tensor& tin, std::vector& scale_out, \ + int axis, float scale_factor); + +template +void int32_to_dtype(const int* din, dtype* dout, const float* scale, + int axis_size, long long outer_size, long long inner_size); +#endif + +} // namespace saber +} // namespace anakin +#endif // ANAKIN_SABER_FUNCS_TYPE_TRANS_H \ No newline at end of file diff --git a/saber/funcs/unpool.h b/saber/funcs/unpool.h index aad437dd5..8f47304ab 100644 --- a/saber/funcs/unpool.h +++ b/saber/funcs/unpool.h @@ -22,6 +22,9 @@ #ifdef NVIDIA_GPU #include "saber/funcs/impl/cuda/saber_unpool.h" #endif +#ifdef AMD_GPU +#include "saber/funcs/impl/amd/include/saber_unpool.h" +#endif #ifdef USE_X86_PLACE #include "saber/funcs/impl/x86/saber_unpool.h" diff --git a/saber/funcs/yolo_box.h b/saber/funcs/yolo_box.h new file mode 100644 index 000000000..fe02c6c1e --- /dev/null +++ b/saber/funcs/yolo_box.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 Anakin Authors, Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef ANAKIN_SABER_FUNCS_YOLO_BOX_H +#define ANAKIN_SABER_FUNCS_YOLO_BOX_H + +#include "saber/funcs/base.h" +#include "saber/funcs/impl/impl_base.h" +#include "saber/funcs/impl/impl_yolo_box.h" + +#ifdef USE_CUDA +#include "saber/funcs/impl/cuda/saber_yolo_box.h" +#endif + +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/saber_yolo_box.h" +#endif +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/saber_yolo_box.h" +#endif +namespace anakin { +namespace saber { + +template +class YoloBox : public BaseFunc< + TargetType, + OpDtype, + ImplBase, + YoloBoxParam> { +public: + using BaseFunc< + TargetType, + OpDtype, + ImplBase, + YoloBoxParam>::BaseFunc; + + YoloBox() = default; + + virtual SaberStatus compute_output_shape( + const std::vector*>& input, + std::vector*> &output, + YoloBoxParam ¶m) override { + + auto dim_x = input[0]->valid_shape(); + auto dim_imgsize = input[1]->valid_shape(); + auto anchors = param.anchors; + int anchor_num = anchors.size() / 2; + auto class_num = param.class_num; + + + CHECK_EQ(dim_x[1], anchor_num * (5 + class_num)) + << "Input(X) dim[1] should be equal to (anchor_mask_number * (5 + class_num))."; + CHECK_EQ(dim_imgsize[0], dim_x[0]) + << "Input(ImgSize) dim[0] and Input(X) dim[0] should be same."; + + CHECK_EQ(dim_imgsize[1], 2) << "Input(ImgSize) dim[1] should be 2."; + CHECK_GT(anchors.size(), 0) << "Attr(anchors) length should be greater than 0."; + CHECK_EQ(anchors.size() % 2, 0) << "Attr(anchors) length should be even integer."; + CHECK_GT(class_num, 0) << "Attr(class_num) should be an integer greater than 0."; + + int box_num = dim_x[2] * dim_x[3] * anchor_num; + Shape dim_boxes({dim_x[0], box_num, 4}, Layout_NHW); + output[0]->set_shape(dim_boxes); + + Shape dim_scores({dim_x[0], box_num, class_num}, Layout_NHW); + output[1]->set_shape(dim_scores); + + return SaberSuccess; + } + + virtual SaberStatus init_impl(ImplEnum implenum) override { + switch (implenum) { + case VENDER_IMPL: + this->_impl.push_back(new VenderYoloBox ); + return SaberSuccess; + + case SABER_IMPL: + this->_impl.push_back(new SaberYoloBox ); + return SaberSuccess; + + default: + return SaberUnImplError; + } + } + +private: + + virtual void pick_best_static() override { + this->_best_impl = this->_impl[0]; + } + + virtual void pick_best_specify(ImplEnum implenum) override { + this->_best_impl = this->_impl[0]; + } + +}; + +} // namespace saber +} // namespace anakin + +#endif diff --git a/saber/saber_funcs_param.h b/saber/saber_funcs_param.h index 0c3e2a5b5..e7322bdec 100644 --- a/saber/saber_funcs_param.h +++ b/saber/saber_funcs_param.h @@ -23,6 +23,17 @@ namespace anakin { namespace saber { +template +//bool compare_vector(std::vector vec1, std::vector vec2) { +bool compare_vector(Dtype vec1, Dtype vec2) { + bool flag = vec1.size() == vec2.size(); + if (flag) { + for (int i = 0; i < vec1.size(); i++) { + flag = flag && (vec1[i] == vec2[i]); + } + } + return flag; +} template struct PreluParam; @@ -102,11 +113,95 @@ template struct AffineChannelParam { AffineChannelParam() = default; - AffineChannelParam(const AffineChannelParam& right) {} + AffineChannelParam(Tensor* weight_in, + Tensor* bias_in): + weight_tensor(weight_in), bias_tensor(bias_in){} - AffineChannelParam& operator=(const AffineChannelParam& right) {} + AffineChannelParam(const AffineChannelParam& right): + weight_tensor(right.weight_tensor), + bias_tensor(right.bias_tensor) {} - bool operator==(const AffineChannelParam& right) {return true;} + AffineChannelParam& operator=(const AffineChannelParam& 
right) { + weight_tensor = right.weight_tensor; + bias_tensor = right.bias_tensor; + return *this; + } + + bool operator==(const AffineChannelParam& right) { + bool flag = true; + flag = flag && weight_tensor == right.weight_tensor; + flag = flag && bias_tensor == right.bias_tensor; + return flag; + } + + inline const Tensor* weight() { + return weight_tensor; + } + + inline const Tensor* bias() { + return bias_tensor; + } + + inline Tensor* mutable_weight() { + return weight_tensor; + } + + inline Tensor* mutable_bias() { + return bias_tensor; + } + + inline void set_weight(Tensor* weight_tensor_in) { + weight_tensor = weight_tensor_in; + } +private: + Tensor* weight_tensor; + Tensor* bias_tensor; +}; + +template +struct AnchorGeneratorParam { + AnchorGeneratorParam() = default; + AnchorGeneratorParam(std::vector anchor_sizes_in, + std::vector aspect_ratios_in, + std::vector variances_in, + std::vector stride_in, + float offset_in): anchor_sizes(anchor_sizes_in), + aspect_ratios(aspect_ratios_in), + variances(variances_in), + stride(stride_in), + offset(offset_in) { + } + + AnchorGeneratorParam(const AnchorGeneratorParam& right):anchor_sizes(right.anchor_sizes), + aspect_ratios(right.aspect_ratios), + variances(right.variances), + stride(right.stride), + offset(right.offset) {} + + AnchorGeneratorParam& operator=(const AnchorGeneratorParam& right) { + anchor_sizes = right.anchor_sizes; + aspect_ratios = right.aspect_ratios; + variances = right.variances; + stride = right.stride; + offset = right.offset; + return *this; + } + + bool operator==(const AnchorGeneratorParam& right) { + bool flag = true; + flag = flag && compare_vector(anchor_sizes, right.anchor_sizes); + flag = flag && compare_vector(aspect_ratios, right.aspect_ratios); + flag = flag && compare_vector(variances, right.variances); + flag = flag && compare_vector(stride, right.stride); + flag = flag && offset == right.offset; + return flag; + } + + std::vector anchor_sizes; + std::vector aspect_ratios; + std::vector variances; + std::vector stride; + float offset; }; @@ -270,6 +365,38 @@ struct BatchnormParam { std::vector variance; }; +template +struct BoxCoderParam { + BoxCoderParam() {}; + BoxCoderParam(Tensor* prior_box_var_in, bool box_normalized_in, int axis_in) : + box_normalized(box_normalized_in), axis(axis_in), var_tensor(prior_box_var_in) {} + BoxCoderParam(const BoxCoderParam& right): + box_normalized(right.box_normalized), + axis(right.axis), var_tensor(right.var_tensor) {} + BoxCoderParam& operator=(const BoxCoderParam& right) { + box_normalized = right.box_normalized; + axis = right.axis; + var_tensor = right.var_tensor; + return *this; + } + bool operator == (const BoxCoderParam& right) { + bool cmp_eq = true; + cmp_eq = cmp_eq && (box_normalized == right.box_normalized); + cmp_eq = cmp_eq && (axis == right.axis); + cmp_eq = cmp_eq && (var_tensor == right.var_tensor); + return cmp_eq; + } + + Tensor* variance() { + return var_tensor; + } + +public: + bool box_normalized{true}; + int axis{0}; + Tensor* var_tensor{nullptr}; +}; + template struct CastParam { CastParam() = default; @@ -324,20 +451,20 @@ struct ConvParam { ConvParam() : group(-1), pad_h(-1), pad_w(-1) , stride_h(-1), stride_w(-1) , dilation_h(-1), dilation_w(-1) , weight_tensor(NULL), bias_tensor(NULL) - , alpha(1.0), beta(0.0),rm(round_mode::nearest) + , alpha(1.0), beta(0.0), beta_type(AK_FLOAT), rm(round_mode::nearest) , activation_param(ActivationParam()) {} ConvParam(int group_in, int pad_h_in, int pad_w_in, int stride_h_in, int stride_w_in, int dilation_h_, int dilation_w_, Tensor* weight,
Tensor* bias, ActivationParam activation_param_in = ActivationParam(), - float alpha_in = 1.0, float beta_in = 0.0,round_mode rm_in = round_mode::nearest) + float alpha_in = 1.0, float beta_in = 0.0, DataType beta_type_in = AK_FLOAT, round_mode rm_in = round_mode::nearest) : group(group_in), pad_h(pad_h_in), pad_w(pad_w_in) , stride_h(stride_h_in), stride_w(stride_w_in) , dilation_h(dilation_h_), dilation_w(dilation_w_) , weight_tensor(weight), bias_tensor(bias) , activation_param(activation_param_in) - , alpha(alpha_in), beta(beta_in) + , alpha(alpha_in), beta(beta_in), beta_type(beta_type_in) , rm(rm_in) {} @@ -350,6 +477,7 @@ struct ConvParam { , bias_tensor(right.bias_tensor) , alpha(right.alpha) , beta(right.beta) + , beta_type(right.beta_type) , rm(right.rm) , activation_param(right.activation_param) {} @@ -366,6 +494,7 @@ struct ConvParam { bias_tensor = right.bias_tensor; alpha = right.alpha; beta = right.beta; + beta_type = right.beta_type; rm = right.rm; activation_param = right.activation_param; return *this; @@ -384,6 +513,7 @@ struct ConvParam { comp_eq = comp_eq && (bias_tensor == right.bias_tensor); comp_eq = comp_eq && (alpha == right.alpha); comp_eq = comp_eq && (beta == right.beta); + comp_eq = comp_eq && (beta_type == right.beta_type); comp_eq = comp_eq && (rm == right.rm); comp_eq = comp_eq && (activation_param == right.activation_param); return comp_eq; @@ -418,6 +548,7 @@ struct ConvParam { int dilation_w; float alpha; float beta; + DataType beta_type; round_mode rm; //add by intel,round mode in converting float to int ActivationParam activation_param; private: @@ -460,6 +591,32 @@ struct ConvEltwiseParam { EltwiseParam eltwise_param; }; +template +struct Coord2PatchParam { + Coord2PatchParam():img_h(128), output_h(1), output_w(72) {} + Coord2PatchParam(int in_img_h, int in_output_h, int in_output_w):img_h(in_img_h), \ + output_h(in_output_h), output_w(in_output_w) {} + Coord2PatchParam(const Coord2PatchParam &right): + img_h(right.img_h), output_h(right.output_h), output_w(right.output_w) {} + Coord2PatchParam &operator=(const Coord2PatchParam &right) { + img_h = right.img_h; + output_h = right.output_h; + output_w = right.output_w; + return *this; + } + bool operator==(const Coord2PatchParam &right) { + bool flag = img_h == right.img_h; + flag = flag && (output_h == right.output_h); + flag = flag && (output_w == right.output_w); + return flag; + } + +public: + int img_h; + int output_h; + int output_w; +}; + template struct PoolingParam; @@ -900,11 +1057,14 @@ struct EltwiseParam { , coeff() , activation_param(ActivationParam()) , has_eltwise(false) {} + EltwiseParam(EltwiseType operation_in , std::vector coeff_in = std::vector({1, 1}) - , ActivationParam activation_param_in = ActivationParam()) + , ActivationParam activation_param_in = ActivationParam() + , int axis_in = 0) : operation(operation_in) , coeff(coeff_in) + , axis(axis_in) , activation_param(activation_param_in) , has_eltwise(true) { if ((operation == Eltwise_sum) && (coeff.size() == 0)) { @@ -915,6 +1075,7 @@ struct EltwiseParam { EltwiseParam(const EltwiseParam& right) : operation(right.operation) , coeff(right.coeff) + , axis(right.axis) , activation_param(right.activation_param) , has_eltwise(right.has_eltwise) {} @@ -928,6 +1089,7 @@ struct EltwiseParam { activation_param = right.activation_param; has_eltwise = right.has_eltwise; + axis = right.axis; return *this; } bool operator==(const EltwiseParam& right) { @@ -936,6 +1098,7 @@ struct EltwiseParam { comp_eq = comp_eq && (coeff.size() == 
right.coeff.size()); comp_eq = comp_eq && (activation_param == right.activation_param); comp_eq = comp_eq && (has_eltwise == right.has_eltwise); + comp_eq = comp_eq && (axis == right.axis); if (!comp_eq) { return comp_eq; @@ -950,6 +1113,7 @@ struct EltwiseParam { ActivationParam activation_param; EltwiseType operation; bool has_eltwise{false}; + int axis{0}; std::vector coeff; }; @@ -1021,38 +1185,18 @@ struct EmptyParam{ } }; -template -struct FakeQuantizeAbsMaxParam { - FakeQuantizeAbsMaxParam() = default; - - FakeQuantizeAbsMaxParam(int bit_length_in): - bit_length(bit_length_in) {} - - FakeQuantizeAbsMaxParam(const FakeQuantizeAbsMaxParam& right): - bit_length(right.bit_length) {} - - FakeQuantizeAbsMaxParam& operator=(const FakeQuantizeAbsMaxParam& right) { - bit_length = right.bit_length; - } - - bool operator==(const FakeQuantizeAbsMaxParam& right) { - return bit_length == right.bit_length; - } - - int bit_length{8}; -}; - template struct ExpandParam{ ExpandParam() = default; ExpandParam(std::vector expand_times_in) : expand_times(expand_times_in) { } - ExpandParam(const ExpandParam& right) : + ExpandParam(const ExpandParam& right) : expand_times(right.expand_times) { } ExpandParam& operator=(const ExpandParam& right) { expand_times = right.expand_times; + return *this; } bool operator==(const ExpandParam& right) { bool flag = true; @@ -1138,6 +1282,55 @@ struct FlattenParam { int end_axis{-1}; }; +template +struct GenerateProposalsParam { + GenerateProposalsParam() = default; + + GenerateProposalsParam(int pre_nms_top_n_in, + int post_nms_top_n_in, + float nms_thresh_in, + float min_size_in, + float eta_in) : + pre_nms_top_n(pre_nms_top_n_in), + post_nms_top_n(post_nms_top_n_in), + nms_thresh(nms_thresh_in), + min_size(min_size_in), + eta(eta_in) {} + + GenerateProposalsParam(const GenerateProposalsParam& right): + pre_nms_top_n(right.pre_nms_top_n), + post_nms_top_n(right.post_nms_top_n), + nms_thresh(right.nms_thresh), + min_size(right.min_size), + eta(right.eta) { + } + + GenerateProposalsParam& operator=(const GenerateProposalsParam& right) { + pre_nms_top_n = right.pre_nms_top_n; + post_nms_top_n = right.post_nms_top_n; + nms_thresh = right.nms_thresh; + min_size = right.min_size; + eta = right.eta; + return *this; + } + + bool operator==(const GenerateProposalsParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (pre_nms_top_n == right.pre_nms_top_n); + comp_eq = comp_eq && (post_nms_top_n == right.post_nms_top_n); + comp_eq = comp_eq && (nms_thresh == right.nms_thresh); + comp_eq = comp_eq && (min_size == right.min_size); + comp_eq = comp_eq && (eta == right.eta); + return comp_eq; + } + + int pre_nms_top_n{1}; + int post_nms_top_n{1}; + float nms_thresh{0.f}; + float min_size{1.f}; + float eta{0.f}; +}; + /** * GRU_Formula,origin for paddle,Cudnn for cudnn,difference is w_h_r and weighted mean * weight for origin is [W_h_o][W_h_r,W_h_z] @@ -1427,7 +1620,9 @@ struct LstmParam { , candidate_activity(Active_tanh) , with_peephole(true) , skip_input(false) - + , skip_num(1) + , project_dim(-1) + , cell_dim(-1) {} LstmParam(opTensor* weight_in, opTensor* bias_in, @@ -1441,7 +1636,11 @@ struct LstmParam { bool is_reverse_in = false, float dropout_param_in = 1.f, int num_direction_in = 1, - int numLayers_in = 1) + int numLayers_in = 1, + int skip_num_in = 1, + int project_dim_in = -1, + int cell_dim_in = -1 + ) : weight_tensor(weight_in) , bias_tensor(bias_in) @@ -1456,6 +1655,9 @@ struct LstmParam { , init_hidden_tensor(hidden_init_in) , with_peephole(with_peephole_in) , 
skip_input(skip_input_in) + , skip_num(skip_num_in) + , project_dim(project_dim_in) + , cell_dim(cell_dim_in) {} @@ -1473,6 +1675,9 @@ struct LstmParam { skip_input = right.skip_input; is_reverse = right.is_reverse; init_hidden_tensor = right.init_hidden_tensor; + skip_num = right.skip_num; + project_dim=right.project_dim; + cell_dim=right.cell_dim; return *this; } @@ -1491,6 +1696,9 @@ struct LstmParam { comp_eq = comp_eq && (candidate_activity == right.candidate_activity); comp_eq = comp_eq && (is_reverse = right.is_reverse); comp_eq = comp_eq && (init_hidden_tensor == right.init_hidden_tensor); + comp_eq = comp_eq && (skip_num == right.skip_num); + comp_eq = comp_eq && (project_dim == right.project_dim); + comp_eq = comp_eq && (cell_dim == right.cell_dim); return comp_eq; } @@ -1498,6 +1706,10 @@ struct LstmParam { return weight_tensor; } + void set_weight(opTensor* weights_ptr) { + weight_tensor=weights_ptr; + } + inline const opTensor* bias() { return bias_tensor; } @@ -1520,6 +1732,10 @@ struct LstmParam { // and you should calc this information in fc layer before; // otherwise the input's memory layout should be total_seq_len * input_size; bool skip_input; + + int skip_num; + int project_dim; + int cell_dim; private: opTensor* weight_tensor; opTensor* bias_tensor; @@ -1529,20 +1745,25 @@ struct LstmParam { template struct MatMulParam { - MatMulParam(): _is_transpose_X(false), _is_transpose_Y(false) {} - MatMulParam(bool x, bool y): _is_transpose_X(x), _is_transpose_Y(y) {} + MatMulParam(): _is_transpose_X(false), _is_transpose_Y(false), _scale(1.0f) {} + MatMulParam(bool x, bool y): _is_transpose_X(x), _is_transpose_Y(y), _scale(1.0f) {} + MatMulParam(bool x, bool y, float scale): _is_transpose_X(x), _is_transpose_Y(y), _scale(scale) {} MatMulParam& operator=(const MatMulParam& right) { _is_transpose_X = right._is_transpose_X; _is_transpose_Y = right._is_transpose_Y; + _scale = right._scale; + return *this; } bool operator==(const MatMulParam& right) { bool comp_eq = true; comp_eq = comp_eq && (_is_transpose_X == right._is_transpose_X); comp_eq = comp_eq && (_is_transpose_Y == right._is_transpose_Y); + comp_eq = comp_eq && (_scale == right._scale); return comp_eq; } bool _is_transpose_X{false}; bool _is_transpose_Y{false}; + float _scale{1.0f}; int _m = 0; int _n = 0; int _k = 0; @@ -1602,7 +1823,25 @@ struct NormalizeParam { eps = eps_in; CHECK_EQ(p == 2 || p == 1, true) << "only support L1 and L2 norm"; } - + NormalizeParam(bool is_across_spatial, bool is_shared_channel, \ + float eps_in = 1e-6f, int pin = 2) { + across_spatial = is_across_spatial; + channel_shared = is_shared_channel; + p = pin; + has_scale = false; + scale = nullptr; + eps = eps_in; + CHECK_EQ(p == 2 || p == 1, true) << "only support L1 and L2 norm"; + } + NormalizeParam(bool with_scale, Tensor* input_scale, + bool with_bias, Tensor* input_bias, int group, float eps){ + this->scale = input_scale; + this->bias = input_bias; + this->has_scale = has_scale; + this->has_bias = has_bias; + this->group = group; + this->eps = eps; + } NormalizeParam(const NormalizeParam& right) { channel_shared = right.channel_shared; across_spatial = right.across_spatial; @@ -1610,6 +1849,9 @@ struct NormalizeParam { has_scale = right.has_scale; scale = right.scale; eps = right.eps; + has_bias = right.has_bias; + group = right.group; + bias = right.bias; } NormalizeParam& operator=(const NormalizeParam& right) { @@ -1619,6 +1861,9 @@ struct NormalizeParam { this->p = right.p; this->has_scale = right.has_scale; this->eps = right.eps; + 
has_bias = right.has_bias; + group = right.group; + bias = right.bias; return *this; } @@ -1628,6 +1873,9 @@ struct NormalizeParam { flag = flag && (this->has_scale == right.has_scale); flag = flag && (this->p == right.p); flag = flag && (fabsf(this->eps - right.eps) < 1e-7f); + flag = flag && (has_bias == right.has_bias); + flag = flag && (group == right.group); + flag = flag && (bias == right.bias); return flag && (this->scale == right.scale); } @@ -1642,7 +1890,38 @@ struct NormalizeParam { bool channel_shared{false}; //! scale tensor if has one Tensor* scale{nullptr}; + Tensor* bias{nullptr}; float eps{1e-6f}; + //!group, which can normalize + int group{-1}; + //!bias + bool has_bias{false}; +}; + +template +struct OneHotParam { + OneHotParam() = default; + ~OneHotParam() = default; + + OneHotParam(int depth_in) + : depth(depth_in) + {} + + OneHotParam(const OneHotParam& right) + :depth(right.depth) + {} + + OneHotParam& operator=(const OneHotParam& right) { + depth = right.depth; + return *this; + } + + bool operator==(const OneHotParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (depth == right.depth); + return comp_eq; + } + int depth{0}; }; template @@ -1676,6 +1955,41 @@ struct PadParam { std::vector pad_w; }; +template +struct Pad2DParam { + Pad2DParam():_mode(PAD_CONSTANT), _pad_value(0.f), _pad_h({0, 0}), _pad_w({0, 0}) {} + Pad2DParam(std::vector pad_h, std::vector pad_w, \ + float pad_value, PadMode mode = PAD_CONSTANT){ + mode = mode; + _pad_h = pad_h; + _pad_w = pad_w; + _pad_value = pad_value; + } + Pad2DParam(const Pad2DParam &right): + _mode(right._mode), _pad_value(right._pad_value), \ + _pad_h(right._pad_h), _pad_w(right._pad_w) {} + Pad2DParam &operator=(const Pad2DParam &right) { + _mode = right._mode; + _pad_h = right._pad_h; + _pad_w = right._pad_w; + _pad_value = right._pad_value; + return *this; + } + bool operator==(const Pad2DParam &right) { + bool flag = _mode == right._mode; + flag = flag && _pad_h == right._pad_h; + flag = flag && _pad_w == right._pad_w; + flag = flag && _pad_value == right._pad_value; + return flag; + } + +public: + PadMode _mode{PAD_CONSTANT}; + std::vector _pad_h; + std::vector _pad_w; + float _pad_value = 0.f; +}; + template struct PermuteParam { PermuteParam() {} @@ -1701,10 +2015,10 @@ struct PermuteParam { template struct PermutePowerParam { PermutePowerParam() {} - PermutePowerParam(PermuteParam permute_param): - power_param(power_param), has_power_param(false) {} - PermutePowerParam(PermuteParam permute_param, PowerParam power_param): - power_param(power_param), permute_param(permute_param), has_power_param(true) {} + PermutePowerParam(PermuteParam permute_param_in): + permute_param(permute_param_in), has_power_param(false) {} + PermutePowerParam(PermuteParam permute_param_in, PowerParam power_param_in): + power_param(power_param_in), permute_param(permute_param_in), has_power_param(true) {} PermutePowerParam(const PermutePowerParam& right): power_param(right.power_param), permute_param(right.permute_param), has_power_param(right.has_power_param) {} @@ -1719,6 +2033,33 @@ struct PermutePowerParam { bool has_power_param; }; +template +struct PixelShuffleParam { + PixelShuffleParam() {} + PixelShuffleParam(int h, int w): rh(h), rw(w), channel_first(true) {}; + PixelShuffleParam(int h, int w, bool flag): rh(h), rw(w), channel_first(flag) {}; + PixelShuffleParam(const PixelShuffleParam& right): + rh(right.rh), rw(right.rw), channel_first(right.channel_first) {} + PixelShuffleParam& operator=(const PixelShuffleParam right){ + 
rh = right.rh; + rw = right.rw; + channel_first = right.channel_first; + return *this; + } + bool operator==(const PixelShuffleParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (rh == right.rh); + comp_eq = comp_eq && (rw == right.rw); + comp_eq = comp_eq && (channel_first == right.channel_first); + return comp_eq; + } + + int rh; + int rw; + bool channel_first; +}; + + template struct PoolingParam { PoolingParam() : window_h(-1), window_w(-1) @@ -2053,6 +2394,57 @@ struct PriorBoxParam { std::vector order; }; +template +struct PsRoiPoolParam { + PsRoiPoolParam() = default; + PsRoiPoolParam(int ph, int pw, int ch, int cw) : + pooled_height(ph), pooled_width(pw), crop_height(ch), crop_width(cw){} + PsRoiPoolParam(int ph, int pw, int ch, int cw, bool pool, float scale, int m, float exv) : + pooled_height(ph), pooled_width(pw), crop_height(ch), crop_width(cw), + method(m), extra_value(exv), global_pooling(pool), spatial_scale(scale){} + PsRoiPoolParam(const PsRoiPoolParam& right) { + pooled_width = right.pooled_width; + pooled_height = right.pooled_height; + crop_height = right.crop_height; + crop_width = right.crop_width; + global_pooling = right.global_pooling; + spatial_scale = right.spatial_scale; + method = right.method; + extra_value = right.extra_value; + } + PsRoiPoolParam& operator=(const PsRoiPoolParam& right) { + pooled_width = right.pooled_width; + pooled_height = right.pooled_height; + crop_height = right.crop_height; + crop_width = right.crop_width; + global_pooling = right.global_pooling; + spatial_scale = right.spatial_scale; + method = right.method; + extra_value = right.extra_value; + return *this; + } + bool operator==(const PsRoiPoolParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && pooled_width == right.pooled_width; + comp_eq = comp_eq && pooled_height == right.pooled_height; + comp_eq = comp_eq && spatial_scale == right.spatial_scale; + comp_eq = comp_eq && crop_height == right.crop_height; + comp_eq = comp_eq && crop_width == right.crop_width; + comp_eq = comp_eq && global_pooling == right.global_pooling; + comp_eq = comp_eq && method == right.method; + comp_eq = comp_eq && extra_value == right.extra_value; + return comp_eq; + } + int pooled_height; + int pooled_width; + int crop_height; + int crop_width; + bool global_pooling{true}; + float spatial_scale{1.}; + int method{0}; + float extra_value{0.}; +}; + template struct ReshapeParam { ReshapeParam() = default; @@ -2094,29 +2486,43 @@ struct ReshapeParam { template struct ResizeParam { ResizeParam() = default; - explicit ResizeParam(float scale_w, float scale_h) { - bool flag = scale_w > 0.f && scale_h > 0.f; + explicit ResizeParam(ResizeType type, float scale_w, float scale_h, int out_w = -1, int out_h = -1) { + bool flag = (scale_w > 0.f && scale_h > 0.f) || (out_w > 0 && out_h > 0); CHECK_EQ(flag, true) << "wrong parameters"; + resize_type = type; width_scale = scale_w; height_scale = scale_h; + out_width = out_w; + out_height = out_h; } ResizeParam(const ResizeParam& right) { + resize_type = right.resize_type; width_scale = right.width_scale; height_scale = right.height_scale; + out_width = right.out_width; + out_height = right.out_height; } ResizeParam& operator=(const ResizeParam& right) { + this->resize_type = right.resize_type; this->width_scale = right.width_scale; this->height_scale = right.height_scale; + this->out_width = right.out_width; + this->out_height = right.out_height; return *this; } bool operator==(const ResizeParam& right) { float eps = 1e-6; bool flag = fabsf(width_scale 
- right.width_scale) < eps; flag &= fabsf(height_scale - right.height_scale) < eps; + flag &= (resize_type == right.resize_type); + flag &= (out_width == right.out_width) && (out_height == right.out_height); return flag; } float width_scale{0.0f}; float height_scale{0.0f}; + int out_width{-1}; + int out_height{-1}; + ResizeType resize_type; }; template @@ -2344,6 +2750,55 @@ struct SliceParam { std::vector slice_points; }; +template +struct SliceV2Param { + SliceV2Param() = default; + explicit SliceV2Param(std::vector axes_in, + std::vector starts_in, + std::vector ends_in) { + axes = axes_in; + starts = starts_in; + ends = ends_in; + } + SliceV2Param(const SliceV2Param& right) { + axes = right.axes; + starts = right.starts; + ends = right.ends; + } + SliceV2Param& operator=(const SliceV2Param& right) { + axes = right.axes; + starts = right.starts; + ends = right.ends; + return *this; + } + bool operator==(const SliceV2Param& right) { + bool comp_eq = starts.size() == right.starts.size(); + comp_eq = comp_eq && ends.size() == right.ends.size(); + comp_eq = comp_eq && starts.size() == ends.size(); + + for (int i = 0; i < starts.size(); ++i) { + if (!comp_eq) { + return false; + } + + comp_eq = starts[i] == right.starts[i]; + comp_eq = comp_eq && ends[i] == right.ends[i]; + } + for (int i = 0; i < axes.size(); i++) { + if (!comp_eq) { + return false; + } + + comp_eq = axes[i] == right.axes[i]; + } + + return comp_eq; + } + std::vector axes; + std::vector starts; + std::vector ends; +}; + template struct SoftmaxParam { SoftmaxParam() = default; @@ -2391,12 +2846,109 @@ struct SPPParam { PoolingType pool_type; }; +template +struct SProposalParam { + SProposalParam() = default; + SProposalParam(std::vector scale_in, + std::vector ratio_in, + int feat_stride_in, + int basesize_in, + int boxminsize_in, + int pre_nms_topn_in, + int post_nms_topn_in, + float nms_thresh_in) + : scale(scale_in) + , ratio(ratio_in) + , feat_stride(feat_stride_in) + , basesize(basesize_in) + , boxminsize(boxminsize_in) + , pre_nms_topn(pre_nms_topn_in) + , post_nms_topn(post_nms_topn_in) + , nms_thresh(nms_thresh_in) + {} + SProposalParam(const SProposalParam& right) + : scale(right.scale) + , ratio(right.ratio) + , feat_stride(right.feat_stride) + , basesize(right.basesize) + , boxminsize(right.boxminsize) + , pre_nms_topn(right.pre_nms_topn) + , post_nms_topn(right.post_nms_topn) + , nms_thresh(right.nms_thresh) + {} + SProposalParam& operator=(const SProposalParam& right) { + scale = right.scale; + ratio = right.ratio; + feat_stride = right.feat_stride; + basesize = right.basesize; + boxminsize = right.boxminsize; + pre_nms_topn = right.pre_nms_topn; + post_nms_topn = right.post_nms_topn; + nms_thresh = right.nms_thresh; + return *this; + } + bool operator==(const SProposalParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && compare_vectors(scale, right.scale); + comp_eq = comp_eq && compare_vectors(ratio, right.ratio); + comp_eq = comp_eq && (feat_stride == right.feat_stride); + comp_eq = comp_eq && (basesize == right.basesize); + comp_eq = comp_eq && (boxminsize == right.boxminsize); + comp_eq = comp_eq && (pre_nms_topn == right.pre_nms_topn); + comp_eq = comp_eq && (post_nms_topn == right.post_nms_topn); + comp_eq = comp_eq && (nms_thresh == right.nms_thresh); + return comp_eq; + } + std::vector scale; + std::vector ratio; + int feat_stride{16}; + int basesize{16}; + int boxminsize{1000}; + int pre_nms_topn{400}; + int post_nms_topn{120}; + float nms_thresh{0.7}; +}; + +template +struct SRoiAlignParam { 
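+ //! Pooled output size (pooled_h x pooled_w) and the input-to-feature-map scale used when pooling each RoI; e.g. SRoiAlignParam(7, 7, 1.f / 16.f) would pool every RoI to a 7x7 grid at 1/16 scale (illustrative values only).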
+ SRoiAlignParam() = default; + SRoiAlignParam(int pooled_h_in, int pooled_w_in, float spatial_scale_in) + : pooled_h(pooled_h_in) + , pooled_w(pooled_w_in) + , spatial_scale(spatial_scale_in) + {} + + SRoiAlignParam(const SRoiAlignParam& right) + : pooled_h(right.pooled_h) + , pooled_w(right.pooled_w) + , spatial_scale(right.spatial_scale) + {} + SRoiAlignParam& operator=(const SRoiAlignParam& right) { + pooled_h = right.pooled_h; + pooled_w = right.pooled_w; + spatial_scale = right.spatial_scale; + return *this; + } + bool operator==(const SRoiAlignParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && (pooled_h == right.pooled_h); + comp_eq = comp_eq && (pooled_w == right.pooled_w); + comp_eq = comp_eq && (spatial_scale == right.spatial_scale); + return comp_eq; + } + + int pooled_h{1}; + int pooled_w{1}; + float spatial_scale{1}; +}; template struct TransposeParam { TransposeParam() = default; TransposeParam(const TransposeParam& right) {} - TransposeParam& operator=(const TransposeParam& right) {} + TransposeParam& operator=(const TransposeParam& right) { + return *this; + } bool operator==(const TransposeParam& right) { return true; } @@ -2427,7 +2979,7 @@ struct TopKPoolingParam { template struct TopKAvgPoolingParam { TopKAvgPoolingParam() = default; - TopKAvgPoolingParam(std::vector top_ks_in, + TopKAvgPoolingParam(std::vector top_ks_in, int feat_map_num_in, bool is_pooling_by_row_in): top_ks(top_ks_in), feat_map_num(feat_map_num_in), @@ -2464,28 +3016,43 @@ struct MatchMatrixParam { dim_t(dim_t_in), linear_term(false), bias_term(false), + is_l_same(true), weight_tensor(weight) {} - MatchMatrixParam(int dim_in_in, + MatchMatrixParam(int dim_in_in, + int dim_t_in, + bool is_l_same_in, + opTensor* weight): + dim_in(dim_in_in), + dim_t(dim_t_in), + linear_term(false), + bias_term(false), + is_l_same(is_l_same_in), + weight_tensor(weight) {} + MatchMatrixParam(int dim_in_in, int dim_t_in, bool linear_term_in, bool bias_term_in, + bool is_l_same_in, opTensor* weight): dim_in(dim_in_in), dim_t(dim_t_in), linear_term(linear_term_in), bias_term(bias_term_in), + is_l_same(is_l_same_in), weight_tensor(weight) {} MatchMatrixParam(const MatchMatrixParam& right): dim_in(right.dim_in), dim_t(right.dim_t), linear_term(right.linear_term), bias_term(right.bias_term), + is_l_same(right.is_l_same), weight_tensor(right.weight_tensor) {} MatchMatrixParam& operator=(const MatchMatrixParam& right) { dim_in = right.dim_in; dim_t = right.dim_t; linear_term = right.linear_term; bias_term = right.bias_term; + is_l_same = right.is_l_same; weight_tensor = right.weight_tensor; return *this; } @@ -2494,7 +3061,8 @@ struct MatchMatrixParam { flag = flag && (dim_in == right.dim_in); flag = flag && (dim_t == right.dim_t); flag = flag && (linear_term == right.linear_term); - flag = flag && (bias_term == right.bias_term); + flag = flag && (bias_term == right.bias_term); + flag = flag && (is_l_same == right.is_l_same); flag = flag && (weight_tensor == right.weight_tensor); return flag; } @@ -2508,6 +3076,7 @@ struct MatchMatrixParam { int dim_t{2}; bool linear_term{false}; bool bias_term{false}; + bool is_l_same{true}; private: opTensor* weight_tensor{nullptr}; }; @@ -2534,7 +3103,9 @@ template struct MeanParam { MeanParam() = default; MeanParam(const MeanParam& right) {} - MeanParam& operator=(const MeanParam& right) {} + MeanParam& operator=(const MeanParam& right) { + return *this; + } bool operator==(const MeanParam& right) { return true; } @@ -2563,7 +3134,588 @@ struct ShuffleChannelParam { int group; }; 
-} +template +struct ReduceParam { + ReduceParam() = default; + ReduceParam(std::vector& reduce_dim_in, + ReduceType reduce_type_in, + bool keep_dim_in, + bool reduce_all_in, + float coeff_in = 1.f) + : reduce_dim(reduce_dim_in) + , reduce_type(reduce_type_in) + , keep_dim(keep_dim_in) + , reduce_all(reduce_all_in) + , coeff(coeff_in) + {} + ReduceParam(const ReduceParam& right) + : reduce_dim(right.reduce_dim) + , reduce_type(right.reduce_type) + , keep_dim(right.keep_dim) + , reduce_all(right.reduce_all) + , coeff(right.coeff) + {} + + ReduceParam& operator=(const ReduceParam& right) { + reduce_dim = right.reduce_dim; + reduce_type = right.reduce_type; + keep_dim = right.keep_dim; + reduce_all = right.reduce_all; + coeff = right.coeff; + return *this; + } + + bool operator==(const ReduceParam& right) { + bool comp_eq = true; + comp_eq = comp_eq && compare_vectors(reduce_dim, right.reduce_dim); + comp_eq = comp_eq && (reduce_type == right.reduce_type); + comp_eq = comp_eq && (keep_dim == right.keep_dim); + comp_eq = comp_eq && (reduce_all == right.reduce_all); + comp_eq = comp_eq && (coeff == right.coeff); + return comp_eq; + } + + std::vector reduce_dim; + ReduceType reduce_type{Reduce_unknow}; + bool keep_dim{false}; + bool reduce_all{false}; + float coeff{1.f}; // output coeff +}; + +template +struct ReduceMinParam { + ReduceMinParam() = default; + ReduceMinParam(std::vectorreduce_dim_in, bool keep_dim_in = false) : + reduce_dim(reduce_dim_in), keep_dim(keep_dim_in){} + + ReduceMinParam(const ReduceMinParam& right) { + keep_dim = right.keep_dim; + reduce_dim = right.reduce_dim; + } + ReduceMinParam& operator=(const ReduceMinParam& right) { + keep_dim = right.keep_dim; + reduce_dim = right.reduce_dim; + return *this; + } + bool operator==(const ReduceMinParam& right) { + return (keep_dim == right.keep_dim) && (reduce_dim == right.reduce_dim); + } + + std::vector reduce_dim; + bool keep_dim{false}; +}; + +template +struct RoiAlignParam { + RoiAlignParam() = default; + RoiAlignParam(int pooled_height_in, int pooled_width_in, float spatial_scale_in, int sampling_ratio_in) : + pooled_height(pooled_height_in), pooled_width(pooled_width_in), \ + spatial_scale(spatial_scale_in), sampling_ratio(sampling_ratio_in) {} + RoiAlignParam(const RoiAlignParam& right) { + pooled_height = right.pooled_height; + pooled_width = right.pooled_width; + spatial_scale = right.spatial_scale; + sampling_ratio = right.sampling_ratio; + } + RoiAlignParam& operator=(const RoiAlignParam& right) { + pooled_height = right.pooled_height; + pooled_width = right.pooled_width; + spatial_scale = right.spatial_scale; + sampling_ratio = right.sampling_ratio; + return *this; + } + bool operator==(const RoiAlignParam& right) { + return (pooled_height == right.pooled_height) && + (pooled_width == right.pooled_width) && + (spatial_scale == right.spatial_scale) && + (sampling_ratio == right.sampling_ratio); + } + + int pooled_height; + int pooled_width; + float spatial_scale; + int sampling_ratio; +}; + +template +struct SequenceConcatParam{ + SequenceConcatParam() = default; + SequenceConcatParam(const SequenceConcatParam& right) {} + SequenceConcatParam& operator=(const SequenceConcatParam& right) { return *this;} + bool operator==(const SequenceConcatParam& right) {return true;} +}; + +template +struct SequenceConcatByColParam { + SequenceConcatByColParam() = default; + SequenceConcatByColParam(const SequenceConcatByColParam &right) {} + SequenceConcatByColParam &operator=(const SequenceConcatByColParam &right) { return *this; 
} + bool operator==(const SequenceConcatByColParam &right) { return true; } +}; + +template +struct SequencePoolConcatParam{ + SequencePoolConcatParam() + : sequence_pool_param() + , concat_param() + , slot_num{0} + {} + SequencePoolConcatParam(SequencePoolParam sequence_pool_param_in, + ConcatParam concat_param, int slot_num_in) + : sequence_pool_param(sequence_pool_param_in) + , concat_param(concat_param) + , slot_num(slot_num_in) + {} + + SequencePoolConcatParam(const SequencePoolConcatParam& right) + : sequence_pool_param(right.sequence_pool_param) + , concat_param(right.concat_param) + , slot_num(right.slot_num) + {} + + SequencePoolConcatParam& operator=(const SequencePoolConcatParam& right) { + sequence_pool_param = right.sequence_pool_param; + concat_param = right.concat_param; + slot_num = right.slot_num; + return *this; + } + + bool operator==(const SequencePoolConcatParam& right) { + bool comp_eq = true; + comp_eq &= (sequence_pool_param == right.sequence_pool_param); + comp_eq &= (concat_param == right.concat_param); + comp_eq &= (slot_num == right.slot_num); + return comp_eq; + } + + SequencePoolParam sequence_pool_param; + ConcatParam concat_param; + int slot_num; +}; + +template +struct SoftSignParam{ + SoftSignParam() = default; + SoftSignParam(const SoftSignParam& right) {} + SoftSignParam& operator=(const SoftSignParam& right) { return *this;} + bool operator==(const SoftSignParam& right) {return true;} +}; + +template +struct CosSimParam{ + CosSimParam() = default; + + CosSimParam(float epsilon_in):epsilon(epsilon_in) {} + + CosSimParam(const CosSimParam& right):epsilon(right.epsilon) {} + + CosSimParam& operator=(const CosSimParam& right) { + epsilon = right.epsilon; + return *this; + } + + bool operator==(const CosSimParam& right) { + return epsilon == right.epsilon; + } + + float epsilon{0.f}; +}; + +template +struct ProductQuantEmbeddingWithVsumParam { + ProductQuantEmbeddingWithVsumParam() = default; + ProductQuantEmbeddingWithVsumParam(int word_emb_in, + int word_voc_in, + int top_unigram_in, + int top_bigram_in, + int top_collocation_in, + int sec_unigram_in, + int sec_bigram_in, + int sec_collocation_in, + int thd_unigram_in, + int thd_bigram_in, + int thd_collocation_in, + int max_seq_len_in, + Tensor* embedding_0_in, + Tensor* embedding_1_in, + Tensor* embedding_2_in, + Tensor* quant_dict_0_in, + Tensor* quant_dict_1_in, + Tensor* quant_dict_2_in):word_emb(word_emb_in), + word_voc(word_voc_in), + top_unigram(top_unigram_in), + top_bigram(top_bigram_in), + top_collocation(top_collocation_in), + sec_unigram(sec_unigram_in), + sec_bigram(sec_bigram_in), + sec_collocation(sec_collocation_in), + thd_unigram(thd_unigram_in), + thd_bigram(thd_bigram_in), + thd_collocation(thd_collocation_in), + max_seq_len(max_seq_len_in), + embedding_0(embedding_0_in), + embedding_1(embedding_1_in), + embedding_2(embedding_2_in), + quant_dict_0(quant_dict_0_in), + quant_dict_1(quant_dict_1_in), + quant_dict_2(quant_dict_2_in) { } + + ProductQuantEmbeddingWithVsumParam(const ProductQuantEmbeddingWithVsumParam& right) :word_emb(right.word_emb), + word_voc(right.word_voc), + top_unigram(right.top_unigram), + top_bigram(right.top_bigram), + top_collocation(right.top_collocation), + sec_unigram(right.sec_unigram), + sec_bigram(right.sec_bigram), + sec_collocation(right.sec_collocation), + thd_unigram(right.thd_unigram), + thd_bigram(right.thd_bigram), + thd_collocation(right.thd_collocation), + max_seq_len(right.max_seq_len), + embedding_0(right.embedding_0), + 
embedding_1(right.embedding_1), + embedding_2(right.embedding_2), + quant_dict_0(right.quant_dict_0), + quant_dict_1(right.quant_dict_1), + quant_dict_2(right.quant_dict_2) {} + ProductQuantEmbeddingWithVsumParam& operator=(const ProductQuantEmbeddingWithVsumParam& right) { + word_emb = right.word_emb; + word_voc = right.word_voc; + top_unigram = right.top_unigram; + top_bigram = right.top_bigram; + top_collocation = right.top_collocation; + sec_unigram = right.sec_unigram; + sec_bigram = right.sec_bigram; + sec_collocation = right.sec_collocation; + thd_unigram = right.thd_unigram; + thd_bigram = right.thd_bigram; + thd_collocation = right.thd_collocation; + max_seq_len = right.max_seq_len; + embedding_0 = right.embedding_0; + embedding_1 = right.embedding_1; + embedding_2 = right.embedding_2; + quant_dict_0 = right.quant_dict_0; + quant_dict_1 = right.quant_dict_1; + quant_dict_2 = right.quant_dict_2; + return *this; + } + bool operator==(const ProductQuantEmbeddingWithVsumParam& right) { + bool flag = true; + flag = flag && word_emb == right.word_emb; + flag = flag && word_voc == right.word_voc; + flag = flag && top_unigram == right.top_unigram; + flag = flag && top_bigram == right.top_bigram; + flag = flag && top_collocation == right.top_collocation; + flag = flag && sec_unigram == right.sec_unigram; + flag = flag && sec_bigram == right.sec_bigram; + flag = flag && sec_collocation == right.sec_collocation; + flag = flag && thd_unigram == right.thd_unigram; + flag = flag && thd_bigram == right.thd_bigram; + flag = flag && thd_collocation == right.thd_collocation; + flag = flag && max_seq_len == right.max_seq_len; + flag = flag && embedding_0 == right.embedding_0; + flag = flag && embedding_1 == right.embedding_1; + flag = flag && embedding_2 == right.embedding_2; + flag = flag && quant_dict_0 == right.quant_dict_0; + flag = flag && quant_dict_1 == right.quant_dict_1; + flag = flag && quant_dict_2 == right.quant_dict_2; + return flag; + } + + int word_emb{128}; + int word_voc{1}; + int top_unigram{0}; + int top_bigram{0}; + int top_collocation{0}; + int sec_unigram{0}; + int sec_bigram{0}; + int sec_collocation{0}; + int thd_unigram{0}; + int thd_bigram{0}; + int thd_collocation{0}; + int max_seq_len{0}; + Tensor* embedding_0{NULL}; + Tensor* embedding_1{NULL}; + Tensor* embedding_2{NULL}; + Tensor* quant_dict_0{NULL}; + Tensor* quant_dict_1{NULL}; + Tensor* quant_dict_2{NULL}; + +}; + +template +struct ArithmeticParam{ + ArithmeticParam() = default; + + ArithmeticParam(ArithmeticType op_type_in):op_type(op_type_in) {} + + ArithmeticParam(const ArithmeticParam& right):op_type(right.op_type) {} + + ArithmeticParam& operator=(const ArithmeticParam& right) { + op_type = right.op_type; + return *this; + } + + bool operator==(const ArithmeticParam& right) { + return op_type == right.op_type; + } + + ArithmeticType op_type; +}; + +template +struct AlignedMatMulParam{ + AlignedMatMulParam() = default; + + AlignedMatMulParam(bool is_transpose_X_in, + bool is_transpose_Y_in, + float scale_in):is_transpose_X(is_transpose_X_in), + is_transpose_Y(is_transpose_Y_in), + scale(scale_in) {} + + AlignedMatMulParam(const AlignedMatMulParam& right): + is_transpose_X(right.is_transpose_X), + is_transpose_Y(right.is_transpose_Y), + scale(right.scale){} + + AlignedMatMulParam& operator=(const AlignedMatMulParam& right) { + is_transpose_X = right.is_transpose_X; + is_transpose_Y = right.is_transpose_Y; + scale = right.scale; + return *this; + } + + bool operator==(const AlignedMatMulParam& right) { + bool flag 
= true; + flag = flag && is_transpose_X == right.is_transpose_X; + flag = flag && is_transpose_Y == right.is_transpose_Y; + flag = flag && scale == right.scale; + return flag; + } + + bool is_transpose_X{false}; + bool is_transpose_Y{false}; + float scale{1.0f}; +}; + +template +struct SequencePaddingParam{ + SequencePaddingParam() = default; + + SequencePaddingParam(const SequencePaddingParam& right) { } + + SequencePaddingParam& operator=(const SequencePaddingParam& right) { + return *this; + } + + bool operator==(const SequencePaddingParam& right) { + return true; + } +}; + +template +struct SequenceDePaddingParam{ + SequenceDePaddingParam() = default; + + SequenceDePaddingParam(const SequenceDePaddingParam& right) { } + + SequenceDePaddingParam& operator=(const SequenceDePaddingParam& right) { + return *this; + } + + bool operator==(const SequenceDePaddingParam& right) { + return true; + } +}; + +template +struct AttentionPaddingMaskParam{ + AttentionPaddingMaskParam() = default; + AttentionPaddingMaskParam(float mask_in, + int pad_id_in): + mask(mask_in), + pad_id(pad_id_in){} + AttentionPaddingMaskParam(const AttentionPaddingMaskParam& right):mask(right.mask), pad_id(right.pad_id) {} + AttentionPaddingMaskParam& operator=(const AttentionPaddingMaskParam& right) { + mask = right.mask; + pad_id = right.pad_id; + return *this; + } + bool operator== (const AttentionPaddingMaskParam& right) { + bool flag = mask == right.mask; + flag = flag && pad_id == right.pad_id; + return flag; + } + + float mask{900000000.0f}; + int pad_id{12800001}; + +}; + +template +struct PyramidHashQuantEmbeddingParam{ + PyramidHashQuantEmbeddingParam() = default; + PyramidHashQuantEmbeddingParam(int space_size_in, + int emb_size_in, + int pyramid_layer_in, + int rand_len_in, + int white_list_len_in, + int black_list_len_in, + float dropout_percent_in, + Tensor* quant_dict_in, + Tensor* hash_space_in, + Tensor* white_filter_in, + Tensor* black_filter_in): + space_size(space_size_in), + emb_size(emb_size_in), + pyramid_layer(pyramid_layer_in), + rand_len(rand_len_in), + white_list_len(white_list_len_in), + black_list_len(black_list_len_in), + dropout_percent(dropout_percent_in), + quant_dict(quant_dict_in), + hash_space(hash_space_in), + white_filter(white_filter_in), + black_filter(black_filter_in) {}; + + PyramidHashQuantEmbeddingParam(const PyramidHashQuantEmbeddingParam& right): + space_size(right.space_size), + emb_size(right.emb_size), + pyramid_layer(right.pyramid_layer), + rand_len(right.rand_len), + white_list_len(right.white_list_len), + black_list_len(right.black_list_len), + dropout_percent(right.dropout_percent), + quant_dict(right.quant_dict), + hash_space(right.hash_space), + white_filter(right.white_filter), + black_filter(right.black_filter) {} + + PyramidHashQuantEmbeddingParam& operator=(const PyramidHashQuantEmbeddingParam& right) { + space_size = right.space_size; + emb_size = right.emb_size; + pyramid_layer = right.pyramid_layer; + rand_len = right.rand_len; + white_list_len = right.white_list_len; + black_list_len = right.black_list_len; + dropout_percent = right.dropout_percent; + quant_dict = right.quant_dict; + hash_space = right.hash_space; + white_filter = right.white_filter; + black_filter = right.black_filter; + return *this; + } + + bool operator==(const PyramidHashQuantEmbeddingParam& right) { + bool flag = true; + flag = flag && space_size == right.space_size; + flag = flag && emb_size == right.emb_size; + flag = flag && pyramid_layer == right.pyramid_layer; + flag = flag && 
rand_len == right.rand_len; + flag = flag && white_list_len == right.white_list_len; + flag = flag && black_list_len == right.black_list_len; + flag = flag && dropout_percent == right.dropout_percent; + flag = flag && quant_dict == right.quant_dict; + flag = flag && hash_space == right.hash_space; + flag = flag && white_filter == right.white_filter; + flag = flag && black_filter == right.black_filter; + return flag; + } + + int space_size; + int emb_size; + int pyramid_layer; + int rand_len; + int white_list_len; + int black_list_len; + float dropout_percent; + Tensor* quant_dict; + Tensor* hash_space; + Tensor* white_filter; + Tensor* black_filter; +}; + +template +struct SeqConcatSeqPoolSoftSignParam{ + SeqConcatSeqPoolSoftSignParam() = default; + + SeqConcatSeqPoolSoftSignParam(SequenceConcatParam seq_concat_in, + SequencePoolParam seq_pool_in, + SoftSignParam soft_sign_in): + seq_pool(seq_pool_in), + seq_concat(seq_concat_in), + soft_sign(soft_sign_in) {} + + SeqConcatSeqPoolSoftSignParam(const SeqConcatSeqPoolSoftSignParam& right) : seq_pool(right.seq_pool), + seq_concat(right.seq_concat), + soft_sign(right.soft_sign) {} + + SeqConcatSeqPoolSoftSignParam& operator=(const SeqConcatSeqPoolSoftSignParam& right) { + seq_concat = right.seq_concat; + seq_pool = right.seq_pool; + soft_sign = right.soft_sign; + return *this; + } + + bool operator==(const SeqConcatSeqPoolSoftSignParam& right) { + bool flag = true; + flag = flag && seq_concat == right.seq_concat; + flag = flag && seq_pool == right.seq_pool; + flag = flag && soft_sign == right.soft_sign; + return flag; + } + + SequenceConcatParam seq_concat; + SequencePoolParam seq_pool; + SoftSignParam soft_sign; +}; + +template +struct YoloBoxParam { + + YoloBoxParam() = default; + + YoloBoxParam(std::vector anchors_in, + int class_num_in, + float conf_thresh_in, + int downsample_ratio_in) + : anchors(anchors_in) + , class_num(class_num_in) + , conf_thresh(conf_thresh_in) + , downsample_ratio(downsample_ratio_in) + {} + + YoloBoxParam(const YoloBoxParam& right) + : anchors(right.anchors) + , class_num(right.class_num) + , conf_thresh(right.conf_thresh) + , downsample_ratio(right.downsample_ratio) + {} + + YoloBoxParam& operator=(const YoloBoxParam& right) { + anchors = right.anchors; + class_num = right.class_num; + conf_thresh = right.conf_thresh; + downsample_ratio = right.downsample_ratio; + return *this; + } + + bool operator==(const YoloBoxParam& right) { + bool flag = true; + flag = flag && (anchors == right.anchors); + flag = flag && (class_num == right.class_num); + flag = flag && (conf_thresh == right.conf_thresh); + flag = flag && (downsample_ratio == right.downsample_ratio); + return flag; + } + + std::vector anchors; + int class_num{0}; + float conf_thresh{0.f}; + int downsample_ratio{0}; +}; + +} } #endif //SABER_FUNCS_PARAM_H diff --git a/saber/saber_types.h b/saber/saber_types.h index 0534b27a3..754d27570 100644 --- a/saber/saber_types.h +++ b/saber/saber_types.h @@ -61,7 +61,9 @@ enum LayoutType { Layout_NCHW_C8 = 11, Layout_NCHW_C16 = 12, Layout_OIHW16I16O = 13, - Layout_GOIHW16I16O = 14 + Layout_GOIHW16I16O = 14, + Layout_NCHW_C8R=15, + Layout_NCHW_C16R=16, }; //! 
target_type struct struct Layout { @@ -71,6 +73,7 @@ struct Layout { virtual int width_index() {return -1;} virtual int depth_index() {return -1;} virtual int inner_c() {return -1;} + virtual int aligned_length() {return -1;} virtual int dims() {return -1;} virtual LayoutType type() {return Layout_invalid;} }; @@ -137,8 +140,7 @@ struct NCHW_C4 : public Layout { int channel_index() {return 1;} int height_index() {return 2;} int width_index() {return 3;} - int inner_c() {return 4;} - int dims() {return 5;} + int dims() {return 4;} LayoutType type() {return Layout_NCHW_C4;} }; struct NCHW_C8 : public Layout { @@ -150,6 +152,15 @@ struct NCHW_C8 : public Layout { int dims() {return 5;} LayoutType type() {return Layout_NCHW_C8;} }; +struct NCHW_C8R : public Layout { + int num_index() {return 0;} + int channel_index() {return 1;} + int height_index() {return 2;} + int width_index() {return 3;} + int dims() {return 4;} + int aligned_length(){ return 8;} + LayoutType type() {return Layout_NCHW_C8R;} +}; struct NCHW_C16 : public Layout { int num_index() {return 0;} int channel_index() {return 1;} @@ -159,6 +170,17 @@ struct NCHW_C16 : public Layout { int dims() {return 5;} LayoutType type() {return Layout_NCHW_C16;} }; + +struct NCHW_C16R : public Layout { + int num_index() {return 0;} + int channel_index() {return 1;} + int height_index() {return 2;} + int width_index() {return 3;} + int dims() {return 4;} + int aligned_length(){ return 16;} + LayoutType type() {return Layout_NCHW_C16R;} +}; + enum DataType { AK_INVALID = -1, AK_HALF = 0, @@ -171,10 +193,11 @@ enum DataType { AK_UINT8 = 7, AK_UINT16 = 8, AK_UINT32 = 9, - AK_STRING = 10, - AK_BOOL = 11, - AK_SHAPE = 12, - AK_TENSOR = 13 + AK_UINT64 = 10, + AK_STRING = 11, + AK_BOOL = 12, + AK_SHAPE = 13, + AK_TENSOR = 14 }; typedef enum { SaberSuccess = -1, /*!< No errors */ @@ -194,6 +217,19 @@ typedef enum{ UNKNOWN = 4 }SaberImplStrategy; +//! 
arm arch +enum ARMArch{ + APPLE = 0, + A53 = 53, + A55 = 55, + A57 = 57, + A72 = 72, + A73 = 73, + A75 = 75, + A76 = 76, + ARM_UNKOWN = -1 +}; + typedef enum { nearest = 0, down @@ -231,8 +267,20 @@ typedef enum{ Active_elu = 5, Active_identity = 6, Active_stanh = 9, - Active_prelu = 10 + Active_prelu = 10, + Active_gelu = 11, + Active_swish = 12 } ActiveType; + +typedef enum { + Reduce_unknow = 0, + Reduce_min, + Reduce_max, + Reduce_sum, + Reduce_avg, + Reduce_prod +} ReduceType; + typedef enum{ Pooling_unknow = 0, Pooling_max = 1, @@ -244,7 +292,8 @@ typedef enum{ Eltwise_unknow = 0, Eltwise_prod = 1, Eltwise_sum = 2, - Eltwise_max = 3 + Eltwise_max = 3, + Eltwise_div = 4 } EltwiseType; typedef enum{ ACROSS_CHANNELS = 0, @@ -276,16 +325,36 @@ typedef enum { PRIOR_MAX = 1, PRIOR_COM = 2 } PriorType; - + typedef enum{ RANDOM=0, SPECIAL, CUSTOM } TestDataType; + typedef enum{ ENTROPY= 0, MAXABS = 1 } CalibrationAlgoType; + +typedef enum{ + BILINEAR_ALIGN = 0, + BILINEAR_NO_ALIGN = 1, + RESIZE_CUSTOM = 2, + NEAREST_ALIGN = 3 +} ResizeType; + +typedef enum{ + PAD_CONSTANT = 0, + PAD_EDGE = 1, + PAD_REFLECT = 2, +} PadMode; + +typedef enum{ + SUM = 0, + SUB = 1, + MUL = 2, +} ArithmeticType; } //namespace saber } //namespace anakin #endif //ANAKIN_SABER_CORE_TYPES_H diff --git a/sgx/CMakeLists.txt b/sgx/CMakeLists.txt new file mode 100644 index 000000000..f5fed8662 --- /dev/null +++ b/sgx/CMakeLists.txt @@ -0,0 +1,162 @@ +set(SGX_ENCLAVE_SIGNER ${SGX_SDK}/bin/x64/sgx_sign) +set(SGX_EDGER8R ${SGX_SDK}/bin/x64/sgx_edger8r) + +set(TRUSTED_DIR ${CMAKE_CURRENT_BINARY_DIR}/trusted) +set(UNTRUSTED_DIR ${CMAKE_CURRENT_BINARY_DIR}/untrusted) + +macro(anakin_sgx_copy_example part file) + add_custom_command( + OUTPUT ${ANAKIN_SGX}/${part}/${file} + COMMAND ${CMAKE_COMMAND} + ARGS -E copy + ${ANAKIN_SGX}/${part}/example/${file} + ${ANAKIN_SGX}/${part}/${file} + DEPENDS ${ANAKIN_SGX}/${part}/example/${file} + COMMENT "Using the example ${file} for SGX ${part}" + ) +endmacro() + +file(GLOB examples "enclave/example/*") +foreach(example ${examples}) + get_filename_component(file ${example} NAME) + anakin_sgx_copy_example("enclave" ${file}) +endforeach() + +file(GLOB examples "app/example/*") +foreach(example ${examples}) + get_filename_component(file ${example} NAME) + anakin_sgx_copy_example("app" ${file}) +endforeach() + +set(ENCLAVE_EDL ${ANAKIN_SGX}/enclave/enclave.edl) +set(ENCLAVE_LDS ${ANAKIN_SGX}/enclave/version.lds) +set(ENCLAVE_KEY ${ANAKIN_SGX}/enclave/sign_enclave.pem) +set(ENCLAVE_CONFIG ${ANAKIN_SGX}/enclave/config.xml) +set(ECALL_EDL ${ANAKIN_SGX}/enclave/ecall.edl) +set(ECALL_SRC ${ANAKIN_SGX}/enclave/ecall.cpp) +set(APP_SRC ${ANAKIN_SGX}/app/app.cpp) +set(OCALL_SRC ${ANAKIN_SGX}/app/ocall.c) + +add_custom_command( + OUTPUT ${TRUSTED_DIR}/enclave_t.c ${TRUSTED_DIR}/enclave_t.h + ${UNTRUSTED_DIR}/enclave_u.c ${UNTRUSTED_DIR}/enclave_u.h + COMMAND ${CMAKE_COMMAND} -E make_directory ${TRUSTED_DIR} + COMMAND ${CMAKE_COMMAND} -E make_directory ${UNTRUSTED_DIR} + COMMAND ${SGX_EDGER8R} + ARGS ${ENCLAVE_EDL} + --trusted --trusted-dir ${TRUSTED_DIR} + --untrusted --untrusted-dir ${UNTRUSTED_DIR} + --search-path ${SGX_SDK}/include + --search-path ${ANAKIN_SGX}/enclave + DEPENDS ${SGX_EDGER8R} ${ENCLAVE_EDL} ${ECALL_EDL} + COMMENT "Generatring enclave bridge for ${ENCLAVE_EDL}..." 
+)
+
+set(TRUSTED_SRC ${TRUSTED_DIR}/enclave_t.c)
+anakin_fetch_files_with_suffix("${ANAKIN_SGX}/enclave/src" "c" TRUSTED_SRC)
+anakin_fetch_files_with_suffix("${ANAKIN_SGX}/enclave/src" "cpp" TRUSTED_SRC)
+add_library(anakin_trusted STATIC ${TRUSTED_SRC})
+target_link_libraries(anakin_trusted ${SGX_CONFIG_INTERFACE})
+
+set(UNTRUSTED_SRC ${UNTRUSTED_DIR}/enclave_u.c)
+add_library(anakin_untrusted STATIC ${UNTRUSTED_SRC})
+
+target_include_directories(anakin_trusted PUBLIC
+    ${TRUSTED_DIR}
+    ${ANAKIN_FRAMEWORK}/graph
+    ${ANAKIN_FRAMEWORK}/core/net
+    ${ANAKIN_SABER}
+)
+
+target_include_directories(anakin_untrusted PUBLIC ${UNTRUSTED_DIR})
+target_include_directories(anakin_untrusted PUBLIC ${SGX_SDK}/include)
+
+set(MKL_PATCHED_DIR ${ANAKIN_ROOT}/third-party/mkl-patched)
+set(MKL_PATCHED_LIB ${MKL_PATCHED_DIR}/libmkl_patch.a)
+set(MKL_PATCHED_URL "https://raw.githubusercontent.com/rdzhou/mkl_patch/master/libmkl_patch.a")
+
+add_executable(anakin_enclave ${ECALL_SRC} ${ENCLAVE_LDS} ${MKL_PATCHED_LIB})
+
+add_custom_command(
+    OUTPUT ${MKL_PATCHED_LIB}
+    COMMAND ${CMAKE_COMMAND} ARGS -E make_directory ${MKL_PATCHED_DIR}
+    COMMAND wget ARGS -O ${MKL_PATCHED_LIB} ${MKL_PATCHED_URL}
+    COMMENT "Downloading MKL patch for SGX build from ${MKL_PATCHED_URL}..."
+    VERBATIM
+)
+
+add_dependencies(anakin_enclave
+    ${anakin_lib_static}
+    ${ANAKIN_SABER_LIB_TARGET}
+    anakin_trusted
+)
+
+add_custom_target(enclave_assets DEPENDS
+    ${ENCLAVE_LDS} ${ENCLAVE_KEY} ${ENCLAVE_CONFIG}
+)
+
+add_dependencies(anakin_enclave enclave_assets)
+
+set(SGX_JIT_LDS ${ANAKIN_SGX}/enclave/enclave.lds)
+
+set_target_properties(anakin_enclave PROPERTIES LINK_DEPENDS ${SGX_JIT_LDS})
+
+target_link_libraries(anakin_enclave
+    -Wl,-T,${SGX_JIT_LDS}
+    -L${SGX_SDK}/lib64
+    -Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles
+    -Wl,-Bstatic -Wl,-Bsymbolic -Wl,-pie,-eenclave_entry
+    -Wl,--undefined,__anakin_enclave_init_status
+    -Wl,--defsym,__ImageBase=0 -Wl,--gc-sections
+    -Wl,--version-script=${ENCLAVE_LDS}
+)
+
+if(SGX_SIM_MODE)
+    set(SGX_LIB_TYPE "_sim")
+endif()
+
+# link anakin components
+target_link_libraries(anakin_enclave
+    -Wl,--whole-archive
+    -lsgx_trts${SGX_LIB_TYPE} anakin_trusted ${anakin_lib_static} ${ANAKIN_SABER_LIB_TARGET}
+    -Wl,--no-whole-archive
+)
+
+# link mkl
+target_link_libraries(anakin_enclave
+    -Wl,--start-group
+    -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
+    -Wl,--end-group
+    ${MKL_PATCHED_LIB}
+)
+
+# link standard sgx libraries
+target_link_libraries(anakin_enclave
+    -Wl,--start-group
+    -lsgx_tcxx -lsgx_tstdc -lsgx_tcrypto -lsgx_tservice${SGX_LIB_TYPE}
+    -Wl,--end-group
+)
+
+# sign the enclave
+add_custom_command(
+    OUTPUT anakin_enclave.signed
+    DEPENDS anakin_enclave ${ENCLAVE_KEY} ${ENCLAVE_CONFIG}
+    COMMAND ${SGX_ENCLAVE_SIGNER}
+    ARGS sign -key ${ENCLAVE_KEY} -enclave anakin_enclave
+         -out anakin_enclave.signed -config ${ENCLAVE_CONFIG}
+    COMMENT "Signing the enclave using\n key: ${ENCLAVE_KEY}\n config: ${ENCLAVE_CONFIG}"
+)
+
+add_custom_target(anakin_enclave_signed ALL
+    DEPENDS anakin_enclave.signed
+)
+
+add_executable(anakin_app ${APP_SRC} ${OCALL_SRC})
+target_compile_options(anakin_app PUBLIC -UNDEBUG)
+target_link_libraries(anakin_app
+    anakin_untrusted
+    -L${SGX_SDK}/sdk_libs
+    -lsgx_urts${SGX_LIB_TYPE}
+    -lsgx_uae_service${SGX_LIB_TYPE}
+    -lpthread
+)
diff --git a/sgx/app/example/app.cpp b/sgx/app/example/app.cpp
new file mode 100644
index 000000000..c0715cc03
--- /dev/null
+++ b/sgx/app/example/app.cpp
@@ -0,0 +1,150 @@
+#include
+#include
+#include "enclave_u.h"
+#include "sgx_urts.h"
+
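+/* Note on the ecall interface used below: setup_model() and infer() are the
+ * untrusted proxy functions that sgx_edger8r generates into enclave_u.h from
+ * ecall.edl. Judging from the EDL and the call sites in this file, the
+ * generated declarations should look roughly like the following sketch (the
+ * exact prototypes live in the generated enclave_u.h, not here):
+ *
+ *   sgx_status_t setup_model(sgx_enclave_id_t eid, int* retval,
+ *                            const char* model_name);
+ *   sgx_status_t infer(sgx_enclave_id_t eid, int* retval,
+ *                      size_t in_size, const void* input,
+ *                      size_t max_out_size, void* results,
+ *                      size_t* result_size);
+ *
+ * The sgx_status_t return value reports SGX transport status, while *retval
+ * carries the enclave function's own return code.
+ */
+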
+/* Initialize the enclave:
+ * Step 1: try to retrieve the launch token saved by last transaction
+ * Step 2: call sgx_create_enclave to initialize an enclave instance
+ * Step 3: save the launch token if it is updated
+ */
+int initialize_enclave(sgx_enclave_id_t* eid, const char *token_path, const char *enclave_name) {
+    sgx_launch_token_t token = {0};
+    sgx_status_t ret = SGX_ERROR_UNEXPECTED;
+    int updated = 0;
+
+    /* Step 1: try to retrieve the launch token saved by last transaction
+     * if there is no token, then create a new one.
+     */
+    /* try to get the token saved in $HOME */
+    FILE* fp = fopen(token_path, "rb");
+    if (fp == nullptr && (fp = fopen(token_path, "wb+")) == NULL) {
+        printf("Warning: Failed to create/open the launch token file \"%s\".\n", token_path);
+    }
+
+    if (fp != nullptr) {
+        /* read the token from saved file */
+        size_t read_num = fread(token, 1, sizeof(sgx_launch_token_t), fp);
+        if (read_num != 0 && read_num != sizeof(sgx_launch_token_t)) {
+            /* if token is invalid, clear the buffer */
+            memset(&token, 0x0, sizeof(sgx_launch_token_t));
+            printf("Warning: Invalid launch token read from \"%s\".\n", token_path);
+        }
+    }
+
+    /* Step 2: call sgx_create_enclave to initialize an enclave instance */
+    ret = sgx_create_enclave(enclave_name, SGX_DEBUG_FLAG, &token, &updated, eid, nullptr);
+    if (ret != SGX_SUCCESS) {
+        if (fp != nullptr) fclose(fp);
+        return -1;
+    }
+
+    /* Step 3: save the launch token if it is updated */
+    if (updated == false || fp == nullptr) {
+        /* if the token is not updated, or the file handle is invalid, do not perform saving */
+        if (fp != nullptr) fclose(fp);
+        return 0;
+    }
+
+    /* reopen the file with write capability */
+    fp = freopen(token_path, "wb", fp);
+    if (fp == nullptr) return 0;
+    size_t write_num = fwrite(token, 1, sizeof(sgx_launch_token_t), fp);
+    if (write_num != sizeof(sgx_launch_token_t))
+        printf("Warning: Failed to save launch token to \"%s\".\n", token_path);
+    fclose(fp);
+    return 0;
+}
+
+/* Global EID shared by multiple threads */
+sgx_enclave_id_t global_eid = 0;
+
+#define SGX_INPUT_MAX (1024U * 1024U * 1U)
+uint8_t sgx_input[SGX_INPUT_MAX];
+
+#define SGX_OUTPUT_MAX (1024U * 1024U * 1U)
+uint8_t sgx_output[SGX_OUTPUT_MAX];
+
+int main(int argc, char const *argv[]) {
+    if (argc != 2 && argc != 3) {
+        fprintf(stderr, "usage: %s model_name [input_file]\n", argv[0]);
+        return 1;
+    }
+
+    size_t input_size = 0;
+    if (argc == 3) {
+        FILE *input_file = fopen(argv[2], "rb");
+
+        if (!input_file) {
+            fprintf(stderr, "error: cannot open input file %s\n", argv[2]);
+            return 1;
+        }
+
+        fseek(input_file, 0, SEEK_END);
+        long int fend = ftell(input_file);
+        fseek(input_file, 0, SEEK_SET);
+
+        if (fend > sizeof(sgx_input)) {
+            fprintf(stderr, "error: oversized input\n");
+            return 1;
+        }
+
+        if (fend <= 0) {
+            fprintf(stderr, "error: cannot read input file\n");
+            return 1;
+        }
+
+        input_size = fend;
+        if (input_size != fread(sgx_input, 1, input_size, input_file)) {
+            fprintf(stderr, "error: cannot read input file\n");
+            return 1;
+        }
+
+        fclose(input_file);
+    }
+
+    if (initialize_enclave(&global_eid, "anakin_enclave.token", "anakin_enclave.signed") < 0) {
+        printf("Failed to initialize enclave.\n");
+        return 1;
+    }
+
+    int ecall_retcode = -1;
+    sgx_status_t status = setup_model(global_eid, &ecall_retcode, argv[1]);
+
+    if (status != SGX_SUCCESS) {
+        fprintf(stderr, "error: SGX ecall 'setup_model' failed.\n");
+        return 1;
+    }
+
+    if (ecall_retcode) {
+        fprintf(stderr, "error: invalid anakin model.\n");
+        return 1;
+    }
+
+    clock_t begin =
clock(); + + size_t result_size = 0; + ecall_retcode = -1; + + status = infer(global_eid, &ecall_retcode, input_size, sgx_input, + sizeof(sgx_output), sgx_output, &result_size); + + if (status != SGX_SUCCESS) { + fprintf(stderr, "error: SGX ecall 'infer' failed.\n"); + return 1; + } else if (ecall_retcode) { + fprintf(stderr, "error: invalid inference parameters.\n"); + } + + clock_t end = clock(); + + fprintf(stderr, "%lf seconds elapsed during inference\n", (double)(end - begin) / CLOCKS_PER_SEC); + + auto f = reinterpret_cast(sgx_output); + auto n = result_size / sizeof(float); + for (int i = 0; i < n; ++i) { + printf("%f\n", f[i]); + } + + return 0; +} diff --git a/sgx/app/example/ocall.c b/sgx/app/example/ocall.c new file mode 100644 index 000000000..ef726526c --- /dev/null +++ b/sgx/app/example/ocall.c @@ -0,0 +1,46 @@ +#include +#include "enclave_u.h" + +uintptr_t ocall_fopen(const char *filename, const char *mode) { + return (uintptr_t)fopen(filename, mode); +} + +size_t ocall_fread(void *buf, size_t size, size_t count, uintptr_t f) { + return fread(buf, size, count, (FILE *)f); +} + +size_t ocall_fwrite(const void *buf, size_t size, size_t count, uintptr_t f) { + return fwrite(buf, size, count, (FILE *)f); +} + +int ocall_fseek(uintptr_t file, long int offset, int origin) { + return fseek((FILE *)file, offset, origin); +} + +long int ocall_ftell(uintptr_t file) { + return ftell((FILE *)file); +} + +size_t ocall_fsize(uintptr_t f) { + FILE *file = (FILE *)f; + size_t size = 0; + long int saved = ftell(file); + fseek(file, 0, SEEK_END); + + long int end = ftell(file); + fseek(file, saved, SEEK_SET); + + if (end > 0) { + size = (size_t)end; + } + + return size; +} + +int ocall_fclose(uintptr_t f) { + return fclose((FILE *)f); +} + +void ocall_print_string(const char *str) { + printf("%s", str); +} diff --git a/sgx/enclave/.gitignore b/sgx/enclave/.gitignore new file mode 100644 index 000000000..72db84639 --- /dev/null +++ b/sgx/enclave/.gitignore @@ -0,0 +1,4 @@ +anakin_ecall.cpp +anakin_ecall.edl +anakin_enclave.pem +anakin_enclave.config.xml diff --git a/sgx/enclave/enclave.edl b/sgx/enclave/enclave.edl new file mode 100644 index 000000000..99724dde5 --- /dev/null +++ b/sgx/enclave/enclave.edl @@ -0,0 +1,37 @@ +enclave { + include "stdint.h" + + from "sgx_tstdc.edl" import *; + from "ecall.edl" import *; + + untrusted { + uintptr_t ocall_fopen( + [in, string] const char *filename, + [in, string] const char *mode + ); + + size_t ocall_fread( + [out, size=size, count=count] void *buf, + size_t size, + size_t count, + uintptr_t f + ); + + size_t ocall_fwrite( + [in, size=size, count=count] const void *buf, + size_t size, + size_t count, + uintptr_t f + ); + + int ocall_fseek(uintptr_t file, long int offset, int origin); + + long int ocall_ftell(uintptr_t file); + + size_t ocall_fsize(uintptr_t f); + + int ocall_fclose(uintptr_t f); + + void ocall_print_string([in, string] const char *str); + }; +}; diff --git a/sgx/enclave/enclave.lds b/sgx/enclave/enclave.lds new file mode 100644 index 000000000..0f2e5ce2c --- /dev/null +++ b/sgx/enclave/enclave.lds @@ -0,0 +1,228 @@ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(_start) +SEARCH_DIR("=/usr/local/lib/x86_64-linux-gnu"); SEARCH_DIR("=/lib/x86_64-linux-gnu"); SEARCH_DIR("=/usr/lib/x86_64-linux-gnu"); SEARCH_DIR("=/usr/local/lib64"); SEARCH_DIR("=/lib64"); SEARCH_DIR("=/usr/lib64"); SEARCH_DIR("=/usr/local/lib"); SEARCH_DIR("=/lib"); SEARCH_DIR("=/usr/lib"); 
SEARCH_DIR("=/usr/x86_64-linux-gnu/lib64"); SEARCH_DIR("=/usr/x86_64-linux-gnu/lib"); +PHDRS +{ + headers PT_PHDR PHDRS FLAGS(5); + interp PT_INTERP FLAGS(4); + text PT_LOAD FILEHDR PHDRS FLAGS(5); + data PT_LOAD FLAGS(6); + jit PT_LOAD FLAGS(7); + dynamic PT_DYNAMIC FLAGS(6); + note PT_NOTE FLAGS(4); + tls PT_TLS FLAGS(4); + gnu_eh_frame PT_GNU_EH_FRAME FLAGS(4); + gnu_stack PT_GNU_STACK FLAGS(7); + gnu_relro 0x6474e552 FLAGS(4); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + PROVIDE (__executable_start = SEGMENT_START("text", 0)); . = SEGMENT_START("text", 0) + SIZEOF_HEADERS; + .interp : { *(.interp) } :interp :text + .note.gnu.build-id : { *(.note.gnu.build-id) } :note :text + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .rela.dyn : + { + *(.rela.init) + *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) + *(.rela.fini) + *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) + *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) + *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) + *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) + *(.rela.ctors) + *(.rela.dtors) + *(.rela.got) + *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) + *(.rela.lbss .rela.lbss.* .rela.gnu.linkonce.lb.*) + *(.rela.lrodata .rela.lrodata.* .rela.gnu.linkonce.lr.*) + *(.rela.ifunc) + } + .rela.plt : + { + *(.rela.plt) + PROVIDE_HIDDEN (__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN (__rela_iplt_end = .); + } + .init : + { + KEEP (*(SORT_NONE(.init))) + } + .plt : { *(.plt) *(.iplt) } +.plt.got : { *(.plt.got) } +.plt.bnd : { *(.plt.bnd) } + .text : + { + *(.text.unlikely .text.*_unlikely .text.unlikely.*) + *(.text.exit .text.exit.*) + *(.text.startup .text.startup.*) + *(.text.hot .text.hot.*) + *(.text .stub .text.* .gnu.linkonce.t.*) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } + .fini : + { + KEEP (*(SORT_NONE(.fini))) + } + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + .eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) } :gnu_eh_frame :text + .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) } :text + .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table + .gcc_except_table.*) } + .gnu_extab : ONLY_IF_RO { *(.gnu_extab*) } + /* These sections are generated by the Sun/Oracle C++ compiler. */ + .exception_ranges : ONLY_IF_RO { *(.exception_ranges + .exception_ranges*) } + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . 
= DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE)); + /* Exception handling */ + .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) } :data + .gnu_extab : ONLY_IF_RW { *(.gnu_extab) } + .gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) } + .exception_ranges : ONLY_IF_RW { *(.exception_ranges .exception_ranges*) } + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :tls + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } + .preinit_array : + { + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + } :gnu_relro :data + .init_array : + { + PROVIDE_HIDDEN (__init_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)) + PROVIDE_HIDDEN (__init_array_end = .); + } + .fini_array : + { + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors)) + PROVIDE_HIDDEN (__fini_array_end = .); + } + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin.o(.ctors)) + KEEP (*crtbegin?.o(.ctors)) + /* We don't want to include the .ctor section from + the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } + .dtors : + { + KEEP (*crtbegin.o(.dtors)) + KEEP (*crtbegin?.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } + .jcr : { KEEP (*(.jcr)) } + .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) } + .dynamic : { *(.dynamic) } :dynamic :gnu_relro :data + .got : { *(.got) *(.igot) } :gnu_relro :data + . = DATA_SEGMENT_RELRO_END (SIZEOF (.got.plt) >= 24 ? 24 : 0, .); + .got.plt : { *(.got.plt) *(.igot.plt) } :data + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + SORT(CONSTRUCTORS) + } + .data1 : { *(.data1) } :data + _edata = .; PROVIDE (edata = .); + . = .; + . = .; + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. + FIXME: Why do we need it? When there is no .bss section, we don't + pad the .data section. */ + . = ALIGN(. != 0 ? 64 / 8 : 1); + } :data + . = ALIGN(64 / 8); + _end = .; PROVIDE (end = .); + . = DATA_SEGMENT_END (.); + . = . + ALIGN (CONSTANT (MAXPAGESIZE)); + .jit : { + PROVIDE (__jit_size = 128 * 256 * CONSTANT (COMMONPAGESIZE)); + PROVIDE (__jit_start = .); + . = . + __jit_size; + PROVIDE (__jit_end = .); + } :jit + /* Stabs debugging sections. 
*/ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end ) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF Extension. */ + .debug_macro 0 : { *(.debug_macro) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) } +} diff --git a/sgx/enclave/example/config.xml b/sgx/enclave/example/config.xml new file mode 100644 index 000000000..e76ca7509 --- /dev/null +++ b/sgx/enclave/example/config.xml @@ -0,0 +1,11 @@ + + 0 + 0 + 0x400000 + 0x8000000 + 1 + 0 + 0 + 0 + 0xFFFFFFFF + diff --git a/sgx/enclave/example/ecall.cpp b/sgx/enclave/example/ecall.cpp new file mode 100644 index 000000000..7ed7a0bd8 --- /dev/null +++ b/sgx/enclave/example/ecall.cpp @@ -0,0 +1,128 @@ +#include "anakin_config.h" + +#include +#include "stdio.h" + +#include "graph.h" +#include "net.h" +#include "saber/core/tensor_op.h" +#include "mkl.h" + +#include + +namespace { + +using namespace anakin; + +std::unique_ptr> ModelGraph; +std::unique_ptr> ModelNet; + +} + +namespace anakin { + +extern "C" int setup_model(const char *model_name) { + ModelGraph.reset(new graph::Graph()); + ModelGraph->load(model_name); +#ifdef ENABLE_DEBUG + printf("model loaded\n"); +#endif + + ModelGraph->Optimize(); +#ifdef ENABLE_DEBUG + printf("model optimized\n"); +#endif + + ModelNet.reset(new Net(*ModelGraph, true)); + + return 0; +} + +extern "C" int seal_data(size_t input_size, const void *input, + size_t output_max_size, void *output, + size_t *result_size) { + uint32_t output_len = sgx_calc_sealed_data_size(0, input_size); + + if (output_len > output_max_size) return -1; + + auto rc = sgx_seal_data(0, NULL, input_size, static_cast(input), + output_len, static_cast(output)); + + if (rc != SGX_SUCCESS) return -2; + + *result_size = output_len; + + return 0; +} + +extern "C" int unseal_data(size_t input_size, const void *input, + size_t output_max_size, void *output, + size_t *result_size) { + auto sealed_data = static_cast(input); + uint32_t input_len = sgx_get_encrypt_txt_len(sealed_data); + + if (input_len > output_max_size) return -1; + + uint32_t mac_length = 0; + auto rc = sgx_unseal_data(sealed_data, NULL, &mac_length, + static_cast(output), &input_len); + + if (rc != SGX_SUCCESS) return -2; + + *result_size = input_len; + + return 0; +} + 
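+
+/* Usage sketch for the two sealing ecalls above, as seen from the untrusted
+ * app through the edger8r-generated proxies (buffer and size names here are
+ * illustrative only):
+ *
+ *   int rc = -1;
+ *   size_t sealed_size = 0, plain_size = 0;
+ *   // plaintext model -> sealed blob; the blob is slightly larger than the
+ *   // input because sgx_seal_data() prepends metadata and a MAC
+ *   seal_data(eid, &rc, model_size, model_buf,
+ *             sizeof(sealed_buf), sealed_buf, &sealed_size);
+ *   // sealed blob -> plaintext; only an enclave with the same sealing
+ *   // identity can recover the data
+ *   unseal_data(eid, &rc, sealed_size, sealed_buf,
+ *               sizeof(plain_buf), plain_buf, &plain_size);
+ */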
+extern "C" int infer(size_t input_size, const void *input, + size_t output_max_size, void *output, + size_t *result_size) { + + if (!ModelNet) return -1; + + // Check input size requirement + if (input_size != 0) { + auto h_in = ModelNet->get_in_list().at(0); + auto input_tensor_size = h_in->get_dtype_size() * h_in->valid_size(); + if (input_size != input_tensor_size) return -2; + } + + // Check output size requirement + auto h_out = ModelNet->get_out_list().at(0); + auto output_tensor_size = h_out->get_dtype_size() * h_out->valid_size(); + if (output_tensor_size > output_max_size) return -3; + + if (input_size == 0) { + for (auto h_in : ModelNet->get_in_list()) { + fill_tensor_const(*h_in, 1); + } + } else { + auto start = static_cast(input); + for (auto h_in : ModelNet->get_in_list()) { + auto end = start + h_in->valid_size(); + std::copy(start, end, static_cast(h_in->data())); + start = end; + } + } + + ModelNet->prediction(); + mkl_free_buffers(); + + auto p_float = static_cast(h_out->data()); + +#ifdef ENABLE_DEBUG + auto c = h_out->valid_size(); + for (int i = 0; i < c; i++) { + float f = p_float[i]; + printf("%f\n", f); + } +#endif + + std::copy(p_float, p_float + h_out->valid_size(), static_cast(output)); + + *result_size = output_tensor_size; + + return 0; +} + +} diff --git a/sgx/enclave/example/ecall.edl b/sgx/enclave/example/ecall.edl new file mode 100644 index 000000000..046792dc8 --- /dev/null +++ b/sgx/enclave/example/ecall.edl @@ -0,0 +1,29 @@ +enclave { + trusted { + public int setup_model([in, string] const char *model_name); + + public int seal_data( + size_t in_size, + [in, size=in_size] const void *input, + size_t max_out_size, + [out, size=max_out_size] void *results, + [out] size_t *result_size + ); + + public int unseal_data( + size_t in_size, + [in, size=in_size] const void *input, + size_t max_out_size, + [out, size=max_out_size] void *results, + [out] size_t *result_size + ); + + public int infer( + size_t in_size, + [in, size=in_size] const void *input, + size_t max_out_size, + [out, size=max_out_size] void *results, + [out] size_t *result_size + ); + }; +}; diff --git a/sgx/enclave/example/sign_enclave.pem b/sgx/enclave/example/sign_enclave.pem new file mode 100644 index 000000000..b031c55ad --- /dev/null +++ b/sgx/enclave/example/sign_enclave.pem @@ -0,0 +1,39 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIG4wIBAAKCAYEA01Ng4Ns3Xhp2MwzTn9vaCNnO/Jou2Sgnl8Xrte8xsSIFjgCW +aeeBbrWwzUhnMXlw8xz5TvBpfxw8cXqm0fr5eJEykCF4aytm/dRD4K9Vbp7BjWPB +9M7s8MCWrymWYjzEfgXAHRMfwneLh8xwBtk0DmXU283AXucNWOe2SBnLe2PTCkyP +dqrsH5/tvM/btlVVU8hyoXQerh//yRCBCiFFeC2Vy5HRGoffDuWgdBmF27TFgE9n +nYjypwpPBpvn7+tczPWsEZVZ1zkhlY+x65jo80zJ8zdGfhunNVVa39U15KX0hFnE +0OSP3/LW5gmlhFqhxwwKPK0iT/ANfeZBQy8+1GQX1aW/MBRISNKkikLc+UTR2lxa +3wE0i8SkJUh5bbub4Xq0luGQManRCh3k95YfGGJQRUeC7gijutMtM4DEaFSwWatn +pqcuwdCnhgbxfAbWnnIP0dy6Iv5HZENiDBExmmXFGTzkl2MMjli8+FN3l7n7EmAZ +ohlWqdht73K3Ud0JAgEDAoIBgQCM4kCV53o+vE7Msze/5+awkTSoZsnmGsUP2UfO +n3Z2Fq5eqw7xRQD0eSCI2u92UPX3aKY0oEZUvX2g/G82p1D7C3cKwPryHO9T4tfr +H45Jvyuzl9ajNJ311bnKG7mW0y2pWSq+DL/W+l0FMvVZ5iK0Q+M9M9WUmgjl787a +u9z87TdcMwpPHJ1qap593+fO44432vcWTWnJaqqGCwCxa4Olc7kyYTYRr+oJ7mr4 +EQPnzdkANO++W0xvXDSvEpqf8jynDGUIGJlryfrIsSeYzy5pqcPhGZg1Btg/dSmt +h+SkGQEsPZUTLFNy2tW33Uomi2GhdDqPghId2QFkOkWfRWfTVOqBv/Nck2ZHXeEt +MqKXp9fW+lUdYkfhNTmU30eoZp+2E8c5aRPptbJGUyv3eXj9BChtgo5ZrxgoPKI9 +JQhXTrTz5cwFon/rXYjwX6DypVFC5eQW/290/vZ5DZ0dt6NxhjOsyhEQi83WZ/gX +04cOOwHiCRsBK4BAPqP7KQp6R4MCgcEA+ev+62ivyXX/w9xqzMmLexO6FBqMKP3k 
+Lo9OwbTNBq4QBTIsVP2pbyCq5bHGZRvBLBEb3F9rSgwSYvRN2K2xD7dgpyyeWOR4 +KO0dVQ1W5Qrzk7D3DDYdtGkuuyZCoki69OOWggWV29ZUJIGIS+EMux1lL6iwvKcm +6jZXO18eVu8HOlTdhxqj6xJnn/fA3vEuKUPg3Sm8fyKGJFd+O7dlL1twRz0PnMct +Dbtgjmrz3Pc/os2OTeSwb4aIfn3EkgOrAoHBANh3FZoHw+wUKaSpi7mYl9m6ag12 +VgWTfqeWTZnUkefSYrzLON8kaUSJ62yL7+VtzijMlokKm9keeQSaivuZcgetV2vZ +M8xStrTYtXFpkjC+GoQz5Ca3qwWLnwrTS07Y8Vt5cz6+XHdC8Xwfmrh+3OG+rnFa +/Kra2JRB4pxqGY5vmbF92BcYyvWx8n1/vzEdrpDVWNIz1nRdT4pXeCPGV0DBB07Q +u2HKKr8BaEYrOSVqOJyE4tJzZdnz73g3YwhuGwKBwQCmnVSc8HUw+VUtPZyIhlz8 +t9FivF1wqULJtN8rzd4EdAquIXLjU8ZKFcdDy9mYvStythKS6keGsrbsot6Qc8tf +z5XEyGmQmFAbSL44s49DXKJidfoIJBPNm3R8xCxsMHyjQmRWrmPn5DgYVlrdQLMn +aO4fxcsob29GzuTSP2mPSgTRjekEvG1HYZpqpSs/S3QbgpXoxn2qFwQYOlQnz5jK +PPWE01+92h4JJ5W0R009+ipsiQmJQyBKWbBUU9hhV8cCgcEAkE9jvAUtSA1xGHEH +0RBlO9GcCPmOrmJUb7mJEThhRTbsfdzQlMLw2FvySF1KmPPexd25sLG9O2mmAxGx +/RD2r8jk8pDNMuHPIzsjoPEMIH68WCKYGc/HWQe/XIzc3ztLklD3fymS+iyg/Wpn +Janoln8e9jyocec7DYFBvZwRCZ+7y6k6uhCHTnahqP/Uy2kfCzjl4XfkTZOKXDpQ +F9mPgICvieB869wcf1ZFhBzQw5wlva3sjEzukU1KUCTssElnAoHAPoTE9Deoa1jl +H8DTNH55G+i9f7t3q9+/mfKjtAJ7QQCcPBKSBB2XhGVqOcIiewtBsYMqVpzukJlG +AB0V0ZfuI2d3VChoCLsDTwBqbcjMYlYV72gAD6RvOkb7zvOpHvlx2qgG7fndwfmh +DBozbYvaQ8gGhJL+ALoLUPHrljauCbvlygayG0pnkLtt2ijhmnOXTfU0X149jHz1 +UDZvQyoJM36nNpKCQD0ToZl2uiPde6ikgXqProtBWM5GvUCmKlTo +-----END RSA PRIVATE KEY----- diff --git a/sgx/enclave/include/cpuid.h b/sgx/enclave/include/cpuid.h new file mode 100644 index 000000000..7435c8db2 --- /dev/null +++ b/sgx/enclave/include/cpuid.h @@ -0,0 +1,80 @@ +// -*- c++ -*- +#ifndef ANAKIN_SGX_CPUID_H +#define ANAKIN_SGX_CPUID_H + +#include + +#undef __cpuid +#define __cpuid(LV, A, B, C, D) \ + do { \ + const uint32_t __eax = LV; \ + if (__eax == 0) \ + (A) = 0x00000016, (B) = 0x756e6547, \ + (C) = 0x6c65746e, (D) = 0x49656e69; \ + else if (__eax == 1) \ + (A) = 0x000906ea, (B) = 0x06100800, \ + (C) = 0x7ffafbff, (D) = 0xbfebfbff; \ + else if (__eax == 0x80000001) \ + (A) = 0x00000000, (B) = 0x00000000, \ + (C) = 0x00000121, (D) = 0x2c100800; \ + else if (__eax == 0x80000008) \ + (A) = 0x00003027, (B) = 0x00000000, \ + (C) = 0x00000000, (D) = 0x00000000; \ + else \ + __assert(__FILE__, __LINE__, __func__, \ + "unsupported cpuid query"); \ + } while (0) + +#undef __cpuid_count +#define __cpuid_count(LV, CNT, A, B, C, D) \ + do { \ + const uint32_t __eax = LV; \ + const uint32_t __ecx = CNT; \ + if (__eax == 0) \ + (A) = 0x00000016, (B) = 0x756e6547, \ + (C) = 0x6c65746e, (D) = 0x49656e69; \ + else if (__eax == 1) \ + (A) = 0x000906ea, (B) = 0x06100800, \ + (C) = 0x7ffafbff, (D) = 0xbfebfbff; \ + else if (__eax == 0x80000001) \ + (A) = 0x00000000, (B) = 0x00000000, \ + (C) = 0x00000121, (D) = 0x2c100800; \ + else if (__eax == 0x80000008) \ + (A) = 0x00003027, (B) = 0x00000000, \ + (C) = 0x00000000, (D) = 0x00000000; \ + else if (__eax == 4 && __ecx == 0) \ + (A) = 0x1c004121, (B) = 0x01c0003f, \ + (C) = 0x0000003f, (D) = 0x00000000; \ + else if (__eax == 4 && __ecx == 1) \ + (A) = 0x1c004122, (B) = 0x01c0003f, \ + (C) = 0x0000003f, (D) = 0x00000000; \ + else if (__eax == 4 && __ecx == 2) \ + (A) = 0x1c004143, (B) = 0x00c0003f, \ + (C) = 0x000003ff, (D) = 0x00000000; \ + else if (__eax == 4 && __ecx == 3) \ + (A) = 0x1c03c163, (B) = 0x03c0003f, \ + (C) = 0x00002fff, (D) = 0x00000006; \ + else if (__eax == 4 && __ecx == 4) \ + (A) = 0x00000000, (B) = 0x00000000, \ + (C) = 0x00000000, (D) = 0x00000000; \ + else if (__eax == 0xb && __ecx == 0) \ + (A) = 0x00000001, (B) = 0x00000002, \ + (C) = 0x00000100, (D) = 0x00000006; \ + else if (__eax == 0xb && __ecx 
== 1) \ + (A) = 0x00000004, (B) = 0x0000000c, \ + (C) = 0x00000201, (D) = 0x00000006; \ + else if (__eax == 7 && __ecx == 0) \ + (A) = 0x00000000, (B) = 0x029c6fbf, \ + (C) = 0x40000000, (D) = 0x9c000000; \ + else if (__eax == 0x14 && __ecx == 0) \ + (A) = 0x00000001, (B) = 0x0000000f, \ + (C) = 0x00000007, (D) = 0x00000000; \ + else if (__eax == 0x14 && __ecx == 1) \ + (A) = 0x02490002, (B) = 0x003f3fff, \ + (C) = 0x00000000, (D) = 0x00000000; \ + else \ + __assert(__FILE__, __LINE__, __func__, \ + "unsupported cpuid query"); \ + } while (0) + +#endif diff --git a/sgx/enclave/include/iostream b/sgx/enclave/include/iostream new file mode 100644 index 000000000..b8a66f50a --- /dev/null +++ b/sgx/enclave/include/iostream @@ -0,0 +1,16 @@ +#ifndef ANAKIN_SGX_IOSTREAM +#define ANAKIN_SGX_IOSTREAM + +#include + +namespace std { + struct basic_ostream { + template + constexpr const basic_ostream &operator<<(const T &) const { return *this; } + }; + + extern basic_ostream cout, cerr; + extern void *endl; +} + +#endif diff --git a/sgx/enclave/include/mm_malloc.h b/sgx/enclave/include/mm_malloc.h new file mode 100644 index 000000000..9e163fa54 --- /dev/null +++ b/sgx/enclave/include/mm_malloc.h @@ -0,0 +1,25 @@ +#ifndef ANAKIN_SGX_MM_MALLOC_H +#define ANAKIN_SGX_MM_MALLOC_H + +#include + +static inline void *_mm_malloc(size_t size, size_t alignment) { + void *ptr = NULL; + if (alignment == 1) { + return malloc(size); + } + if (alignment == 2 || (sizeof(void *) == 8 && alignment == 4)) { + alignment = sizeof(void *); + } + if (posix_memalign(&ptr, alignment, size) == 0) { + return ptr; + } else { + return NULL; + } +} + +static inline void _mm_free(void * ptr) { + free(ptr); +} + +#endif diff --git a/sgx/enclave/include/random b/sgx/enclave/include/random new file mode 100644 index 000000000..467b1f0ec --- /dev/null +++ b/sgx/enclave/include/random @@ -0,0 +1,40 @@ +#ifndef ANAKIN_SGX_RANDOM_H +#define ANAKIN_SGX_RANDOM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int rand(); + +#ifdef __cplusplus +} + +namespace std { + +using ::rand; + +struct random_device { + int operator()(); +}; + +struct mt19937 { + mt19937(random_device rd); + mt19937(int seed); +}; + +template +struct uniform_real_distribution { + uniform_real_distribution(T start, T end) {} + + template + T operator()(Generator &g) { return static_cast(0); } +}; + +} + +#endif + +#endif diff --git a/sgx/enclave/include/stdio.h b/sgx/enclave/include/stdio.h new file mode 100644 index 000000000..022008d2c --- /dev/null +++ b/sgx/enclave/include/stdio.h @@ -0,0 +1,41 @@ +#ifndef ANAKIN_SGX_STDIO_H +#define ANAKIN_SGX_STDIO_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct _FILE; +typedef struct _FILE FILE; + +int printf(const char *, ...); +int putchar(int); +// fprintf is currently a nop +int fprintf(FILE *, const char *, ...); + +// the following functions require ocall to untrusted code +FILE *fopen(const char *name, const char *mode); +size_t fwrite(const void *buf, size_t size, size_t count, FILE *f); +size_t fread(void *buf, size_t size, size_t count, FILE *f); + +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 + +int fseek(FILE *stream, long int offset, int origin); +long int ftell(FILE *stream); +size_t fsize(FILE *f); // not really in stdio.h + +int fclose(FILE *f); + +extern FILE *stdout; +extern FILE *stdin; +extern FILE *stderr; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/sgx/enclave/include/stdlib.h b/sgx/enclave/include/stdlib.h new file mode 100644 index 
000000000..fcea0922c --- /dev/null +++ b/sgx/enclave/include/stdlib.h @@ -0,0 +1,22 @@ +#ifndef ANAKIN_SGX_STDLIB_H +#define ANAKIN_SGX_STDLIB_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void exit(int exit_code); +int posix_memalign(void **memptr, size_t alignment, size_t size); + +#ifdef __cplusplus +} + +namespace std { + using ::exit; +} + +#endif + +#endif diff --git a/sgx/enclave/src/sgx_enclave_init.cpp b/sgx/enclave/src/sgx_enclave_init.cpp new file mode 100644 index 000000000..417e73b32 --- /dev/null +++ b/sgx/enclave/src/sgx_enclave_init.cpp @@ -0,0 +1,44 @@ +#include +#include "cpuid.h" +#include "stdio.h" +#include "stdlib.h" + +#if defined(_M_X64) || defined(__x86_64__) +#define REG(INFO, REG) ((INFO)->r##REG) +#define RD_REG32(INFO, REG) static_cast(0xFFFFFFFFLLU & ((INFO)->r##REG)) +#define WR_REG32_O(INFO, REG) ((INFO)->r##REG) +#else +#define REG(INFO, REG) ((INFO)->e##REG) +#define RD_REG32(INFO, REG) ((INFO)->e##REG) +#define WR_REG32_O(INFO, REG) RD_REG32(INFO, REG) +#endif + +static int illegal_inst_handler(sgx_exception_info_t *info) { + static constexpr uint16_t cpuid_inst = 0xa20f; + + if (info->exception_vector != SGX_EXCEPTION_VECTOR_UD) + return EXCEPTION_CONTINUE_SEARCH; + + auto *cpu_ctx = &info->cpu_context; + if (*reinterpret_cast(REG(cpu_ctx, ip)) == cpuid_inst) { + __cpuid_count(RD_REG32(cpu_ctx, ax), RD_REG32(cpu_ctx, cx), + REG(cpu_ctx, ax), REG(cpu_ctx, bx), + REG(cpu_ctx, cx), REG(cpu_ctx, dx)); + + REG(cpu_ctx, ip) += 2; + + return EXCEPTION_CONTINUE_EXECUTION; + } + + return EXCEPTION_CONTINUE_SEARCH; +} + +static int anakin_enclave_init() { + if (!sgx_register_exception_handler(true, illegal_inst_handler)) { + abort(); + } + + return 0; +} + +extern "C" const int __anakin_enclave_init_status = anakin_enclave_init(); diff --git a/sgx/enclave/src/sgx_iostream.cpp b/sgx/enclave/src/sgx_iostream.cpp new file mode 100644 index 000000000..7155e2245 --- /dev/null +++ b/sgx/enclave/src/sgx_iostream.cpp @@ -0,0 +1,6 @@ +#include "iostream" + +std::basic_ostream std::cout; +std::basic_ostream std::cerr; + +void *std::endl = nullptr; diff --git a/sgx/enclave/src/sgx_random.cpp b/sgx/enclave/src/sgx_random.cpp new file mode 100644 index 000000000..b2e0cbeee --- /dev/null +++ b/sgx/enclave/src/sgx_random.cpp @@ -0,0 +1,16 @@ +#include "random" + +int rand() { + return 0; +} + +#ifdef __cplusplus + +int std::random_device::operator()() { + return 0; +} + +std::mt19937::mt19937(random_device rd) {} +std::mt19937::mt19937(int seed) {} + +#endif diff --git a/sgx/enclave/src/sgx_stdio.c b/sgx/enclave/src/sgx_stdio.c new file mode 100644 index 000000000..1aba0b18c --- /dev/null +++ b/sgx/enclave/src/sgx_stdio.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include + +#include "stdio.h" +#include "enclave_t.h" + +struct _FILE { + uintptr_t untrusted; + size_t bytes_left; + unsigned char *buffer; + unsigned char *curp; +}; + +FILE *stdout = NULL; +FILE *stdin = NULL; +FILE *stderr = NULL; + +#define SGX_PRINTF_BUFSIZE 4096 +#define SGX_FILE_IO_BUFSIZE 4096 + +int printf(const char *fmt, ...) { + char buf[SGX_PRINTF_BUFSIZE]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, SGX_PRINTF_BUFSIZE, fmt, ap); + va_end(ap); + return ocall_print_string(buf); +} + +int fprintf(FILE *f, const char *format, ...) 
{ + return 0; +} + +int putchar(int character) { + char buf[2] = { character, '\0' }; + return ocall_print_string(buf); +} + +#define FILE_MODE_READ 0 +#define FILE_MODE_WRITE 1 +#define FILE_MODE_ERROR 2 + +FILE *fopen(const char *name, const char *mode) { + int fmode = FILE_MODE_ERROR; + + if (strncmp(mode, "rb", 3) == 0) + fmode = FILE_MODE_READ; + else if (strncmp(mode, "wb", 3) == 0) + fmode = FILE_MODE_WRITE; + else + return NULL; + + uintptr_t f = 0; + sgx_status_t ec = ocall_fopen(&f, name, mode); + + if (ec != SGX_SUCCESS) + return NULL; + + FILE *ret = malloc(sizeof(FILE)); + + ret->untrusted = f; + ret->buffer = malloc(SGX_FILE_IO_BUFSIZE); + ret->curp = ret->buffer; + + if (fmode == FILE_MODE_READ) + ret->bytes_left = 0; + else + ret->bytes_left = SGX_FILE_IO_BUFSIZE; + + return ret; +} + +size_t fwrite(const void *buf, size_t size, size_t count, FILE *f) { + size_t bytes_written = 0; + sgx_status_t ec = ocall_fwrite(&bytes_written, buf, size, count, f->untrusted); + + if (ec != SGX_SUCCESS) + return 0; + + return bytes_written; +} + +size_t fread(void *buf, size_t size, size_t count, FILE *f) { + const size_t total = size * count; + size_t left = total; + unsigned char *_buf = buf; + + do { + size_t round = f->bytes_left < left ? f->bytes_left : left; + if (round != 0) { + memcpy(_buf, f->curp, round); + f->curp += round; + _buf += round; + left -= round; + f->bytes_left -= round; + } + + if (f->bytes_left == 0) { + f->curp = f->buffer; + + sgx_status_t ec; + + ec = ocall_fread(&f->bytes_left, f->buffer, + 1, SGX_FILE_IO_BUFSIZE, f->untrusted); + + if (ec != SGX_SUCCESS) { + return total - left; + } + + if (f->bytes_left == 0) + break; + } + } while (left > 0); + + return total - left; +} + +int fseek(FILE *f, long int offset, int origin) { + int ret = -1; + sgx_status_t ec = ocall_fseek(&ret, f->untrusted, offset, origin); + + if (ec != SGX_SUCCESS) + return -1; + + return ret; +}; + +long int ftell(FILE *f) { + long int ret = -1L; + sgx_status_t ec = ocall_ftell(&ret, f->untrusted); + + if (ec != SGX_SUCCESS) + return -1L; + + return ret; +}; + +size_t fsize(FILE *f) { + size_t size = 0; + sgx_status_t ec = ocall_fsize(&size, f->untrusted); + + if (ec != SGX_SUCCESS) + return 0; + + return size; +} + +int fclose(FILE *f) { + int r = EOF; + + sgx_status_t ec = ocall_fclose(&r, f->untrusted); + + free(f->buffer); + free(f); + + if (ec != SGX_SUCCESS) + return EOF; + + return r; +} diff --git a/sgx/enclave/version.lds b/sgx/enclave/version.lds new file mode 100644 index 000000000..2efbca509 --- /dev/null +++ b/sgx/enclave/version.lds @@ -0,0 +1,11 @@ +anakin_enclave +{ + global: + g_global_data_sim; + g_global_data; + enclave_entry; + g_peak_heap_used; + local: + *; +}; + diff --git a/test/.DS_Store b/test/.DS_Store new file mode 100644 index 000000000..a085bceb3 Binary files /dev/null and b/test/.DS_Store differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index eceaac2ec..42f9891ab 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -51,6 +51,12 @@ foreach(SRC_NAME ${ANAKIN_TEST_CASE_SRC}) string(REPLACE "." 
";" SEXY_LIST ${TEST_CASE_NAME}) list(GET SEXY_LIST 0 TEST_CASE_NAME) add_executable(${TEST_CASE_NAME} ${SRC_NAME}) + if(USE_ARM_PLACE) + if (USE_OPENCV) + target_link_libraries(${TEST_CASE_NAME} -lopencv_core -lopencv_highgui -lopencv_imgproc + -ltbb -llibtiff -llibpng -llibjpeg -llibjasper -lIlmImf -lc -lz -llog -ldl) + endif() + endif() if(BUILD_SHARED) if(BUILD_WITH_FRAMEWORK) target_link_libraries(${TEST_CASE_NAME} ${anakin_lib_so} ${ANAKIN_LINKER_LIBS}) @@ -58,13 +64,13 @@ foreach(SRC_NAME ${ANAKIN_TEST_CASE_SRC}) target_link_libraries(${TEST_CASE_NAME} ${ANAKIN_SABER_LIB_TARGET}) endif() else() - if(BUILD_WITH_FRAMEWORK) + if(BUILD_WITH_FRAMEWORK) target_link_libraries(${TEST_CASE_NAME} -Wl,--whole-archive ${anakin_lib_static} -Wl,--no-whole-archive) - else() + else() target_link_libraries(${TEST_CASE_NAME} -Wl,--whole-archive ${ANAKIN_SABER_LIB_TARGET} -Wl,--no-whole-archive) endif() endif() - set_target_properties(${TEST_CASE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + set_target_properties(${TEST_CASE_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/${AK_OUTPUT_PATH}/unit_test) endforeach() diff --git a/test/framework/.DS_Store b/test/framework/.DS_Store new file mode 100644 index 000000000..72e29e061 Binary files /dev/null and b/test/framework/.DS_Store differ diff --git a/test/framework/graph/graph_parser_from_model_test.cpp b/test/framework/graph/graph_parser_from_model_test.cpp index eb0dc974b..3cec38b4a 100644 --- a/test/framework/graph/graph_parser_from_model_test.cpp +++ b/test/framework/graph/graph_parser_from_model_test.cpp @@ -21,6 +21,7 @@ TEST(GraphTest, graph_load_model) { graph->Optimize(); */ } +#ifndef USE_NANOPB #ifdef USE_CUDA TEST(GraphTest, nvidia_graph_save_model) { Graph* graph = new Graph(); @@ -77,6 +78,7 @@ TEST(GraphTest, arm_graph_save_model) { Status status = graph->save(save_model_path); } #endif +#endif int main(int argc, const char** argv) { // initial logger diff --git a/test/framework/net/classification_accuracy.cpp b/test/framework/net/classification_accuracy.cpp index de34e5e69..08d99aeda 100644 --- a/test/framework/net/classification_accuracy.cpp +++ b/test/framework/net/classification_accuracy.cpp @@ -250,12 +250,11 @@ void test_accuracy(std::string model_path, if (!status) { LOG(FATAL) << " [ERROR] " << status.info(); } + graph->load_calibrator_config("net_pt_config", "calibrate_file.txt"); graph->Optimize(); Net net_executer(true); - net_executer.load_calibrator_config("net_pt_config.txt", "./calibrator.txt"); net_executer.init(*graph); - auto d_tensor_in_p = net_executer.get_in("input_0"); auto d_tensor_out_p = net_executer.get_out("ip1_out"); diff --git a/test/framework/net/faster_rcnn_test.cpp b/test/framework/net/faster_rcnn_test.cpp new file mode 100644 index 000000000..cb493ea27 --- /dev/null +++ b/test/framework/net/faster_rcnn_test.cpp @@ -0,0 +1,202 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#include +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" +#endif + +void read_tensor_from_file(float* data, int length, const char* path) { + std::fstream fs(path); + int i = 0; + if (fs.is_open()) { + std::string str; + while(true) { + std::getline(fs, str); + std::size_t found = str.find(" "); + if (found != std::string::npos) { + std::cout << "first 'needle' found at: " << found << '\n'; + break; + } + data[i++] = (atof)(str.c_str()); + } + fs.close(); + } +} +#if defined(USE_OPENCV) && defined(USE_CUDA) +void fill_image_data(const cv::Mat& img, float * gpu_data, float* 
gpu_info, int batch){ + int elem_num = img.channels() * img.rows * img.cols; + float * cpu_data = new float[elem_num]; + // eliminate the padding added by opencv: NHWC + int idx = 0; + float scale = 1.0f / 255; + for(int c = 0; c < img.channels(); c++){ + for(int h = 0; h < img.rows; h++){ + for(int w = 0; w < img.cols; w++) + cpu_data[idx++] = img.data[h * img.step + w * img.channels() + c] * scale; + } + } + float* cpu_info = new float[3]; + cpu_info[0] = float(img.rows); + cpu_info[1] = float(img.cols); + cpu_info[2] = 1.f; + // TODO: use anakin API + for (int i = 0; i < batch; i++) { + cudaMemcpy(gpu_data + i * elem_num, cpu_data, elem_num* sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_info + i * 3, cpu_info, 3* sizeof(float), cudaMemcpyHostToDevice); + } + + delete[] cpu_data; + delete[] cpu_info; +} +#endif + +//#define USE_DIEPSE + +std::string g_model_path = "/path/to/your/anakin_model"; + +std::string model_saved_path = g_model_path + ".saved"; +int g_batch_size = 1; +int g_warm_up = 10; +int g_epoch = 1000; +int g_device_id = 0; +int g_start = 0; +int g_end = 0; +std::string g_image_list = ""; +//#define TEST_FAST_RCNN + +#ifdef TEST_FAST_RCNN +#ifdef USE_CUDA + +TEST(NetTest, net_execute_base_test) { + + std::ifstream ifs(g_image_list.c_str(), std::ifstream::in); + CHECK(ifs.is_open()) << g_image_list << " can not be opened"; + std::vector file_list; + while (ifs.good()) { + std::string new_file; + std::getline(ifs, new_file); + file_list.push_back(new_file); + } + + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + // reshape the input_0 's shape for graph model + graph->ResetBatchSize("input_0", g_batch_size); + + //anakin graph optimization + graph->Optimize(); + + // constructs the executer net + Net net_executer(true); + + //net_executer.load_calibrator_config("net_pt_config.txt","cal_file"); + net_executer.init(*graph); + // get in + auto d_image = net_executer.get_in("input_0"); + auto d_image_info = net_executer.get_in("input_1"); + Tensor4d h_image; + Tensor4d h_image_info; + + auto image_shape = d_image->valid_shape(); + auto image_info_shape = d_image_info->valid_shape(); + for (int i = 0; i < image_shape.size(); i++) { + LOG(INFO) << "detect input_0 dims[" << i << "]" << image_shape[i]; + } + for (int i = 0; i < image_info_shape.size(); i++) { + LOG(INFO) << "detect input_1 dims[" << i << "]" << image_info_shape[i]; + } + + Context ctx(g_device_id, 0, 0); + saber::SaberTimer my_time; +#ifdef USE_OPENCV + for (int i = g_start; i < file_list.size() && i < g_end; i++) { + int img_id = 0; + cv::Mat img = cv::imread(file_list[img_id], cv::IMREAD_COLOR); + if (img.empty()) { + LOG(FATAL) << "load image " << file_list[img_id] << " failed"; + } + Shape image_shape({g_batch_size, img.channels(), img.rows, img.cols}, Layout_NCHW); + Shape info_shape({g_batch_size, 3, 1, 1}, Layout_NCHW); + d_image->reshape(image_shape); + d_image_info->reshape(info_shape); + float* gpu_image = (float*)d_image->mutable_data(); + float* gpu_image_info = (float*)d_image_info->mutable_data(); + fill_image_data(img, gpu_image, gpu_image_info, g_batch_size); + cudaDeviceSynchronize(); + //write_tensorfile(*d_image, "image.txt"); + //write_tensorfile(*d_image_info, "image_info.txt"); + net_executer.prediction(); + if (i - g_start == g_warm_up) { +#ifdef ENABLE_OP_TIMER + 
net_executer.reset_op_time(); +#endif + my_time.start(ctx); + } + } +#endif + cudaDeviceSynchronize(); + my_time.end(ctx); +#ifdef ENABLE_OP_TIMER + net_executer.print_and_reset_optime_summary(g_epoch); +#endif + + LOG(INFO)<<"aveage time "< 1) { + g_model_path = std::string(argv[1]); + } + if (argc > 2) { + g_image_list = std::string(argv[2]); + } + if (argc > 3) { + g_batch_size = atoi(argv[3]); + } + if (argc > 4) { + g_warm_up = atoi(argv[4]); + } + if (argc > 5) { + g_epoch = atoi(argv[5]); + } + if (argc > 6) { + g_device_id = atoi(argv[6]); + } + if (argc > 7) { + g_start = atoi(argv[7]); + } + if (argc > 8) { + g_end = atoi(argv[8]); + } + +#ifdef USE_CUDA + TargetWrapper::set_device(g_device_id); + Env::env_init(); +#endif + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/faster_rcnn_test_x86.cpp b/test/framework/net/faster_rcnn_test_x86.cpp new file mode 100644 index 000000000..d5fbf4d45 --- /dev/null +++ b/test/framework/net/faster_rcnn_test_x86.cpp @@ -0,0 +1,167 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#include + +void read_tensor_from_file(float* data, int length, const char* path) { + std::fstream fs(path); + int i = 0; + if (fs.is_open()) { + std::string str; + while(true) { + std::getline(fs, str); + std::size_t found = str.find(" "); + if (found != std::string::npos) { + //std::cout << "first 'needle' found at: " << found << '\n'; + break; + } + data[i++] = (atof)(str.c_str()); + } + fs.close(); + } else { + LOG(FATAL) << path << "can not be opened"; + } +} + +//#define USE_DIEPSE + +std::string g_model_path = "/path/to/your/anakin_model"; + +std::string model_saved_path = g_model_path + ".saved"; +int g_batch_size = 1; +int g_warm_up = 10; +int g_epoch = 1000; +int g_device_id = 0; +//#define TEST_FAST_RCNN +#ifdef USE_X86_PLACE +#ifdef TEST_FAST_RCNN + +TEST(NetTest, net_execute_base_test) { + std::string image_file = "/home/chengyujuan/baidu/sys-hic-gpu/Anakin-2.0/generate_proposal/image_data.txt"; + std::string image_info_file = "/home/chengyujuan/baidu/sys-hic-gpu/Anakin-2.0/generate_proposal/im_info_data.txt"; + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. 
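+    // NOTE: image_file and image_info_file above point at text dumps read by
+    // read_tensor_from_file(), which parses one float per line and stops at the first line
+    // containing a space; the length argument is not bound-checked, so the destination
+    // tensor's valid_size() must be at least the number of values in the dump.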
+ auto status = graph->load(g_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + // reshape the input_0 's shape for graph model + graph->ResetBatchSize("input_0", g_batch_size); + + //anakin graph optimization + graph->Optimize(); + + // constructs the executer net + Net net_executer(true); + + net_executer.init(*graph); + // get in + auto d_image = net_executer.get_in("input_0"); + auto d_image_info = net_executer.get_in("input_1"); + Shape image_shape({g_batch_size, 3, 426, 640}, Layout_NCHW); + Shape info_shape({g_batch_size, 3, 1, 1}, Layout_NCHW); + + d_image->reshape(image_shape); + d_image_info->reshape(info_shape); + //d_image->re_alloc(image_shape); + //d_image_info->re_alloc(image_info_shape); + for (int i = 0; i < image_shape.size(); i++) { + LOG(INFO) << "detect input_0 dims[" << i << "]" << image_shape[i]; + } + for (int i = 0; i < info_shape.size(); i++) { + LOG(INFO) << "detect input_1 dims[" << i << "]" << info_shape[i]; + } + + + float* image_data = (float*)(d_image->mutable_data()); + float* image_info_data = (float*)(d_image_info->mutable_data()); + read_tensor_from_file(image_data, d_image->valid_size(), image_file.c_str()); + read_tensor_from_file(image_info_data, d_image_info->valid_size(), image_info_file.c_str()); + + //int g_epoch = 1000; + //int g_warm_up=10; + // do inference + Context ctx(g_device_id, 0, 0); + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! "; + // warm up + for (int i = 0; i < g_warm_up; i++) { + read_tensor_from_file(image_data, d_image->valid_size(), image_file.c_str()); + read_tensor_from_file(image_info_data, d_image_info->valid_size(), image_info_file.c_str()); + net_executer.prediction(); + } + +#ifdef ENABLE_OP_TIMER + net_executer.reset_op_time(); +#endif + + my_time.start(ctx); + + for (int i = 0; i < g_epoch; i++) { + read_tensor_from_file(image_data, d_image->valid_size(), image_file.c_str()); + read_tensor_from_file(image_info_data, d_image_info->valid_size(), image_info_file.c_str()); + net_executer.prediction(); + } + + my_time.end(ctx); +#ifdef ENABLE_OP_TIMER + net_executer.print_and_reset_optime_summary(g_epoch); +#endif + + LOG(INFO)<<"aveage time "<save(save_g_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + if (!graph) { + delete graph; + } +} +#endif +#endif + + +int main(int argc, const char** argv){ + if (argc < 2){ + LOG(ERROR)<<"no input!!!"; + return -1; + } + if (argc > 1) { + g_model_path = std::string(argv[1]); + } + if (argc > 2) { + g_batch_size = atoi(argv[2]); + } + if (argc > 3) { + g_warm_up = atoi(argv[3]); + } + if (argc > 4) { + g_epoch = atoi(argv[4]); + } + if (argc > 5) { + g_device_id = atoi(argv[5]); + } +#ifdef USE_X86_PLACE + //TargetWrapper::set_device(g_device_id); + Env::env_init(); +#endif + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/generate_calibrator_config.cpp b/test/framework/net/generate_calibrator_config.cpp new file mode 100644 index 000000000..6b5f18a9d --- /dev/null +++ b/test/framework/net/generate_calibrator_config.cpp @@ -0,0 +1,56 @@ +#include "framework/graph/graph.h" +#include "framework/core/net/calibrator_parse.h" +#include "net_test.h" +int main(int argc, char** argv){ + + std::string model_path = ""; + std::string config_name = "net_pt_config"; + std::string default_precision = "fp32"; + std::string default_target = "NV"; + if (argc<2){ + LOG(ERROR) << "usage: generate_calibrator_config model config_name 
config_prec config_target"; + LOG(FATAL) << "no model to generate config"; + } + if (argc<3){ + LOG(ERROR) << "no config name, will use default name 'net_pt_config' "; + } + if (argc<4){ + LOG(ERROR) << "no config precision, will use default precision 'fp32' "; + } + if (argc<5){ + LOG(ERROR) << "no config target, will use default target 'NV' "; + } + + if (argc>=2){ + model_path = std::string(argv[1]); + } + if (argc>=3){ + config_name = std::string(argv[2]); + } + if (argc>=4){ + default_precision = std::string(argv[3]); + } + if (argc>=5){ + default_target = std::string(argv[4]); + } +#ifdef USE_CUDA + Graph graph; +#elif defined(USE_X86_PLACE) + Graph graph; +#endif +#if defined USE_CUDA || defined USE_X86_PLACE + graph.load(model_path); + std::vector node_names_in_order; + std::vector op_names; + + auto get_node_names = [&](NodePtr& node_ptr){ + node_names_in_order.push_back(node_ptr->name()); + op_names.push_back(node_ptr->get_op_name()); + }; + graph.Scanner->BFS(get_node_names); + + CalibratorParser parser; + parser.auto_config(node_names_in_order, op_names, config_name, default_precision, default_target); +#endif + return 0; +} diff --git a/test/framework/net/generate_calibrator_from_image.cpp b/test/framework/net/generate_calibrator_from_image.cpp index e8511cf35..eba27b1af 100644 --- a/test/framework/net/generate_calibrator_from_image.cpp +++ b/test/framework/net/generate_calibrator_from_image.cpp @@ -3,7 +3,7 @@ #include "framework/core/net/entropy_calibrator.h" #include "saber/funcs/timer.h" #include -#ifdef USE_CUDA +#if defined(USE_CUDA)||defined(USE_X86_PLACE) #if defined(NVIDIA_GPU) using Target = NV; @@ -26,10 +26,10 @@ std::string g_data_file = "./data_list.txt"; std::string g_calibrator_file = "./calibrator.txt"; int g_batch_size = 1; int g_bin_num = 2048; -#if defined(NVIDIA_GPU) + TEST(NetTest, calibrator) { #ifdef USE_OPENCV - Graph* graph = new Graph(); + Graph* graph = new Graph(); // load anakin model files. auto status = graph->load(g_model_path); if (!status ) { @@ -39,12 +39,19 @@ TEST(NetTest, calibrator) { } //anakin graph optimization - graph->Optimize(); - + graph->Optimize(false); // constructs the executer net - Net net_executer(*graph); - BatchStream batch_stream(g_data_file, 1, 3, 192, 192, {104.008f, 116.669f, 122.675f}, {1.f, 1.f, 1.f}); - EntropyCalibrator entropy_calibrator(&batch_stream, g_batch_size, g_calibrator_file, &net_executer, g_bin_num); + Net net_executer(*graph); + // resnet 50 params. 
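+    // NOTE: assuming BatchStream applies (pixel - mean) * scale per channel, the "fluid"
+    // mean/scale pair below folds the usual (x / 255 - mean) / std preprocessing into one
+    // affine transform on raw 0..255 pixels, since
+    //   (x - 255 * mean) * (1 / (255 * std)) == (x / 255 - mean) / std,
+    // e.g. for the first channel: (x - 255 * 0.485) * (1 / (0.229 * 255)). The commented-out
+    // lines appear to keep the Caffe-style ResNet-50 and MobileNet settings for reference.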
+ +// BatchStream batch_stream(g_data_file, 3, 224, 224, {103.939f, 116.779f, 123.68f}, {1.f, 1.f, 1.f}); + // fluid + BatchStream batch_stream(g_data_file, 3, 224, 224, + {255.f * 0.485, 255.f * 0.456, 255.f * 0.406}, + {1.f / 0.229 / 255.f, 1.f / 0.224f/255.f, 1.f / 0.225 / 255.f}); +// BatchStream batch_stream(g_data_file, 3, 224, 224, {103.939f, 116.779f, 123.68f}, {0.017, 0.017, 0.017});// mobilenet + EntropyCalibrator entropy_calibrator(&batch_stream, g_batch_size, g_calibrator_file, &net_executer, g_bin_num); + entropy_calibrator.generate_calibrator_table(); delete graph; @@ -52,7 +59,7 @@ TEST(NetTest, calibrator) { LOG(ERROR) << "turn on USE_OPENCV first"; #endif } -#endif + int main(int argc, const char** argv){ @@ -65,7 +72,7 @@ int main(int argc, const char** argv){ LOG(INFO) << " lite_model: path to anakin lite model"; LOG(INFO) << " data_file: path to image data list"; LOG(INFO) << " calibrate file: path to calibrate data path"; - if(argc < 4) { + if (argc < 4) { LOG(ERROR) << "useage: " << argv[0] << " "; return 0; } diff --git a/test/framework/net/generate_calibrator_from_tensor.cpp b/test/framework/net/generate_calibrator_from_tensor.cpp new file mode 100644 index 000000000..caabf5a2b --- /dev/null +++ b/test/framework/net/generate_calibrator_from_tensor.cpp @@ -0,0 +1,107 @@ +#include +#include "net_test.h" +#include "framework/core/net/entropy_calibrator.h" +#include "saber/funcs/timer.h" +#include +#if defined(NVIDIA_GPU)|| defined(USE_X86_PLACE) + +#if defined(NVIDIA_GPU) +using Target = NV; +using Target_H = NVHX86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = AMDHX86; +#endif + +//#define USE_DIEPSE + +std::string g_model_path; +std::string g_data_file = "./data_list.txt"; +std::string g_calibrator_file = "./calibrator.txt"; +int g_batch_size = 1; +int g_bin_num = 2048; + +Tensor g_tensor; +Shape g_shape; +std::vector> g_seq_offset; +Tensor* data_producer() { + static int cnt = 0; + const int data_num = 5; + cnt++; + g_tensor.reshape(g_shape); + fill_tensor_const(g_tensor, 1.f); + g_tensor.set_seq_offset(g_seq_offset); + + if (cnt <= data_num) { + return &g_tensor; + } else { + return nullptr; + } +} +TEST(NetTest, calibrator) { + Graph* graph = new Graph(); + // load anakin model files. 
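+    // NOTE: this calibrator variant is fed by the data_producer() callback above instead of
+    // an image list: it returns the constant-filled g_tensor (reshaped to the net's input
+    // shape and given g_seq_offset) for a fixed number of batches and then nullptr, which
+    // the producer-based BatchStream constructor appears to treat as end-of-stream.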
+ auto status = graph->load(g_model_path); + + if (!status) { + delete graph; + LOG(FATAL) << " [ERROR] " << status.info(); + exit(-1); + } + + auto input_names = graph->get_ins(); + graph->ResetBatchSize(input_names[0], g_batch_size); + //anakin graph optimization + graph->Optimize(false); + // constructs the executer net + g_seq_offset.push_back({0, g_batch_size}); + + Net net_executer(*graph); + g_shape = net_executer.get_in(input_names[0])->valid_shape(); + BatchStream batch_stream(data_producer); + EntropyCalibrator entropy_calibrator(&batch_stream, g_batch_size, g_calibrator_file, + &net_executer, g_bin_num); + entropy_calibrator.generate_calibrator_table(); + + delete graph; + +} + + +int main(int argc, const char** argv) { + + Env::env_init(); + // initial logger + logger::init(argv[0]); + + LOG(INFO) << "usage:"; + LOG(INFO) << argv[0] << " "; + LOG(INFO) << " lite_model: path to anakin lite model"; + LOG(INFO) << " data_file: path to image data list"; + LOG(INFO) << " calibrate file: path to calibrate data path"; + + if (argc < 5) { + LOG(ERROR) << "useage: " << argv[0] << " "; + return 0; + } + + g_model_path = argv[1]; + g_data_file = argv[2]; + g_calibrator_file = argv[3]; + g_batch_size = atoi(argv[4]); + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} +#else +int main(int argc, const char** argv) { + return 0; +} +#endif diff --git a/test/framework/net/generate_layout_config.cpp b/test/framework/net/generate_layout_config.cpp new file mode 100644 index 000000000..ab6604ca9 --- /dev/null +++ b/test/framework/net/generate_layout_config.cpp @@ -0,0 +1,44 @@ +#include "framework/graph/graph.h" +#include "framework/core/net/calibrator_parse.h" +#include "net_test.h" +int main(int argc, char** argv){ + + std::string model_path = ""; + std::string config_name = "model_layout_config"; + if (argc < 2) { + LOG(ERROR) << "usage: generate_layout_config model config_name"; + LOG(FATAL) << "no model to generate config"; + } + if (argc < 3) { + LOG(ERROR) << "no config name, will use default name 'model_layout_config' "; + } + if (argc >= 2) { + model_path = std::string(argv[1]); + } + if (argc >= 3) { + config_name = std::string(argv[2]); + } +#ifdef USE_CUDA + Graph graph; + using Ttype = NV; +#elif defined(USE_X86_PLACE) + Graph graph; + using Ttype = X86; +#endif +#if defined USE_CUDA || defined USE_X86_PLACE + graph.load(model_path); + std::vector edge_names_in_order; + std::vector edge_layouts; + + auto get_edge_names = [&](Edge& edge){ + edge_names_in_order.push_back(edge.name()); + edge_layouts.push_back(edge.layout()); + }; + graph.Scanner->BFS_Edge(get_edge_names); + + CalibratorParser parser; + parser.auto_config_layout(edge_names_in_order, edge_layouts, config_name); +#endif + return 0; +} + diff --git a/test/framework/net/int8_accuracy_arm.cpp b/test/framework/net/int8_accuracy_arm.cpp new file mode 100644 index 000000000..c75877080 --- /dev/null +++ b/test/framework/net/int8_accuracy_arm.cpp @@ -0,0 +1,286 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#ifdef ENABLE_OP_TIMER +#include"saber/funcs/impl/impl_base.h" +#endif +#ifdef USE_ARM_PLACE +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" +using namespace cv; + +std::string g_model_path = ""; + +std::string model_saved_path = g_model_path + ".saved"; +int g_batch_size = 1; +std::string g_img_path = "val_list.txt"; +std::string g_img_file = "/data/local/tmp"; +int g_thread_num = 1; +int g_cluster = 0; +bool g_set_archs = false; +ARMArch g_arch = A73; + +static void 
fill_tensor_with_cvmat(const Mat& im, float* dout, const int num, const int channel, \ + const int width, const int height, const float* mean, const float* scale) { + int stride = width * height; + for (int i = 0; i < num; i++) { + float* ptr_out = dout + i * channel * height * width; + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + ptr_out[r * width + c] = (im.at(r, c)[2] - mean[0]) * scale[0]; + ptr_out[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_out[2 * stride + r * width + c] = (im.at(r, c)[0] - mean[2]) * scale[2]; + } + } + } +} + +int calc_top1(float* data, int size, int label){ + float max = -1.f; + int max_idx = -1; + for(int i = 0; i < size; ++i){ + if (data[i] > max){ + max = data[i]; + max_idx = i; + } + } + return int(max_idx == label); +} + +int calc_top5(float* data, int size, int label){ + float max = -1.f; + int max_idx = -1; + bool flag = false; + for (int k = 0; k < 5; ++k) { + for (int i = 0; i < size; ++i) { + if (data[i] > max) { + max = data[i]; + max_idx = i; + } + } + flag = flag || (max_idx == label); + data[max_idx] = -1.f; + max = -1.f; + } + return int(flag); +} + +Mat pre_process_img(Mat& im, int width, int height){ + float percent = 256.f / std::min(im.cols, im.rows); + int resized_width = int(roundf(im.cols * percent)); + int resized_height = int(roundf(im.rows * percent)); + resize(im ,im, Size(resized_width, resized_height), INTER_LANCZOS4); + int crop_width = width; + int crop_height = height; + int w_start = (im.cols - crop_width) / 2; + int h_start = (im.rows - crop_height) / 2; + Rect roi; + roi.x = w_start; + roi.y = h_start; + roi.width = crop_width; + roi.height = crop_height; + Mat crop = im(roi); + return crop; +} +//! set your mean value and scale value here +//float mean_mb[3] = {103.939, 116.779, 123.68}; +float mean_mb[3] = {0.485, 0.456, 0.406}; +//float scale_mb[3] = {1.f, 1.f, 1.f}; // for resnet +float scale_mb[3] = {1.f / 0.229, 1.f / 0.224, 1.f / 0.225}; // mobilenet + +TEST(NetTest, net_execute_base_test) { + LOG(INFO) << "begin test"; + Context ctx1; + ctx1.set_run_mode((PowerMode)g_cluster, g_thread_num); + if (g_set_archs) { + ctx1.set_arch(g_arch); + LOG(INFO) << "arm arc: " << g_arch; + } + ctx1.set_cache(32 * 1024, 512* 1024, 0); +#ifdef USE_OPENCV + using namespace cv; +#endif + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. 
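+    // NOTE: each test image below is run through pre_process_img() (resize the shorter side
+    // to 256, then center-crop to the network input size), converted to float with
+    // convertTo(..., 1.f / 255), and packed NCHW by fill_tensor_with_cvmat() using the
+    // per-channel mean_mb/scale_mb defined above (hence those values are in the 0..1 range).
+    // Top-1/top-5 hits from calc_top1()/calc_top5() are accumulated and logged after every
+    // image together with its prediction time.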
+ auto status = graph->load(g_model_path); + + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + std::vector& vin_name = graph->get_ins(); + LOG(INFO) << "number of input tensor: " << vin_name.size(); + + for (int j = 0; j < vin_name.size(); ++j) { + graph->ResetBatchSize("input_0", g_batch_size); + } + + graph->Optimize(); + + Net net_executer(true); + net_executer.init(*graph); + + for (int j = 0; j < vin_name.size(); ++j) { + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + Shape shin = d_tensor_in_p->valid_shape(); + //tin->reshape(Shape(1, 3, 224, 224)); + LOG(INFO) << "input tensor size: "; + //Shape shin = tin->valid_shape(); + LOG(INFO) << "input name: " << vin_name[j]; + for (int k = 0; k < d_tensor_in_p->dims(); ++k) { + LOG(INFO) << "|---: " << shin[k]; + } + fill_tensor_const(*d_tensor_in_p, 1.f); + } + printf("------------ start to test\n"); + std::vector& out_name = graph->get_outs(); + LOG(INFO) << "number of output tensor: " << out_name.size(); + for (int i = 0; i < out_name.size(); i++) { + Tensor* vout = net_executer.get_out(out_name[i]); + LOG(INFO) << "output tensor size: "; + Shape shout = vout->valid_shape(); + for (int j = 0; j < vout->dims(); ++j) { + LOG(INFO) << "|---: " << shout[j]; + } + } + + LOG(WARNING) << "pre-deal !!!!!!!! "; + // ==================== precision =================== + float top1_sum = 0; + float top5_sum = 0; + int total_count = 0; + // ================================================== + std::vector img_list; + std::vector labels; + //! load test image list + std::fstream fp_img(g_img_path); + std::string line; + while (getline(fp_img, line)) { + std::string path = line.substr(0, line.find(" ")); + std::string label = line.substr(line.find(" ")); + path = g_img_file + path; + LOG(INFO) << "img_file_path: " < ctx(0, 0, 0); + // do inference + double to = 0; + double tmin = 1000000; + double tmax = 0; + saber::SaberTimer t1; + // Tensor* vtin = net_executer.get_in(vin_name[0]); + Tensor* vtin = net_executer.get_in_list()[0]; + // Tensor* vtout = net_executer.get_out(out_name[0]); + Tensor* vtout = net_executer.get_out_list()[0]; + for (int i = 0; i < img_num; ++i){ + Mat im = imread(img_list[i]); + CHECK_NOTNULL(im.data) << "read image " << img_list[i] << " failed"; + im = pre_process_img(im, vtin->width(), vtin->height()); + //resize(im, im, Size(vtin[0]->width(), vtin[0]->height())); + im.convertTo(im, CV_32FC3, 1.f / 255); + fill_tensor_with_cvmat(im, (float*)vtin->mutable_data(), 1, 3, vtin->width(), \ + vtin->height(), mean_mb, scale_mb); + //! 
net prediction + Context ctx2(0, 0, 0); + t1.clear(); + t1.start(ctx2); + net_executer.prediction(); + t1.end(ctx2);float tdiff = t1.get_average_ms(); + if (tdiff > tmax) { + tmax = tdiff; + } + if (tdiff < tmin) { + tmin = tdiff; + } + to += tdiff; + int top1 = calc_top1((float*)vtout->mutable_data(), vtout->valid_size(), labels[i]); + int top5 = calc_top5((float*)vtout->mutable_data(), vtout->valid_size(), labels[i]); + top1_sum += top1; + top5_sum += top5; + LOG(INFO) <<"( "<< i << " ), " << img_list[i] << ",top1 accuracy: " << top1_sum / img_num \ + << ", top5 accuracy: " << top5_sum / img_num << ", prediction time: " << tdiff; + } + LOG(INFO) << "total, prediction time avg: " << to / img_num << ", min: " << tmin << ", max: " << tmax; + // std::string save_g_model_path = g_model_path + std::string(".saved"); + // status = graph->save(save_g_model_path); + delete graph; +} +#endif +/** + * g_model_path 模型地址 + * g_batch_size batch大小,默认1 + * img_path 图像路径 + * label_path 标签路径 + * g_cluster 用到的核数,默认0, 大核 + * g_thread_num 用到的线程数,默认1 + * @param argc + * @param argv + * @return + */ + +int main(int argc, const char** argv) { + LOG(INFO)<< "usage:"; + LOG(INFO)<< argv[0] << " "; + LOG(INFO)<< " lite_model: path to anakin lite model"; + LOG(INFO)<< " num: batchSize default to 1"; + LOG(INFO)<< " img_path: images list path"; + LOG(INFO)<< " img_file: images list path"; + LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores, 2: all cores, 3: threads not bind to specify cores"; + LOG(INFO)<< " threads: set openmp threads"; + + if(argc < 2) { + LOG(ERROR) << "You should fill in the variable lite model at least."; + return 0; + } + g_model_path = std::string(argv[1]); + + if (argc > 2) { + g_batch_size = atoi(argv[2]); + } + if (argc > 3) { + g_img_path = std::string(argv[3]); + } + if (argc > 4) { + g_img_file= std::string(argv[4]); + } + if (argc > 5) { + g_cluster = atoi(argv[5]); + if (g_cluster < 0) { + g_cluster = 0; + } + if (g_cluster > 5) { + g_cluster = 5; + } + } + if (argc > 6) { + g_thread_num = atoi(argv[6]); + } + if (argc > 7) { + g_set_archs = true; + if (atoi(argv[7]) > 0) { + g_arch = (ARMArch)atoi(argv[7]); + } else { + g_arch = ARM_UNKOWN; + } + } + + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + + return 0; +} +#else +int main(int argc, const char** argv) { + return 0; +} +#endif diff --git a/test/framework/net/model_int8_accuracy.cpp b/test/framework/net/model_int8_accuracy.cpp new file mode 100644 index 000000000..7bed257b3 --- /dev/null +++ b/test/framework/net/model_int8_accuracy.cpp @@ -0,0 +1,370 @@ +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include +#include +#include +#include +#include +#include +#include "saber/funcs/debug.h" +#include "saber/core/tensor_op.h" + +#ifdef USE_OPENCV +#include +#endif + +#define DEFINE_GLOBAL(type, var, value) \ + type (GLB_##var) = (value) + +DEFINE_GLOBAL(int, gpu, 0); +DEFINE_GLOBAL(std::string, model_path, ""); +DEFINE_GLOBAL(std::string, image_root, ""); +DEFINE_GLOBAL(std::string, image_list, ""); +DEFINE_GLOBAL(int, num, 1); +DEFINE_GLOBAL(int, img_num, -1); +DEFINE_GLOBAL(int, offset_y, 0); +DEFINE_GLOBAL(bool, graph_reset_bs, true); +DEFINE_GLOBAL(bool, rgb, false); +DEFINE_GLOBAL(bool, vis, false); + +DEFINE_GLOBAL(std::string, input_data_source, "1"); +DEFINE_GLOBAL(int, max_num, 32); +DEFINE_GLOBAL(bool, dynamic_batch, false); + +#ifdef USE_OPENCV +template +void fill_tensor_with_cvmat(const cv::Mat& img_in, Tensor& 
tout, const int num, \ + const int width, const int height, const float* mean, const float* scale) { + cv::Mat im; + cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); + float* ptr_data_in = (float*)tout.mutable_data(); + int stride = width * height; + for (int i = 0; i < num; i++) { + float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + ptr_in[r * width + c] = (im.at(r, c)[2] - mean[0]) * scale[0]; + ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_in[2 * stride + r * width + c] = (im.at(r, c)[0] - mean[2]) * scale[2]; + } + } + } +} +#endif + +void SplitString(const std::string& s, + std::vector& v, const std::string& c) { + + std::string::size_type pos1, pos2; + pos2 = s.find(c); + pos1 = 0; + while(std::string::npos != pos2) { + v.push_back(s.substr(pos1, pos2-pos1)); + pos1 = pos2 + c.size(); + pos2 = s.find(c, pos1); + } + if(pos1 != s.length()) { + v.push_back(s.substr(pos1)); + } +} + +bool read_image_list(std::string &filename, + std::vector &results, std::vector &label) { + + //std::cout << "image list: " << filename << std::endl; + std::ifstream infile(filename.c_str()); + if (!infile.good()) { + std::cout << "Cannot open " << std::endl; + return false; + } + std::string line; + while (std::getline(infile, line)) { + std::vector v; + SplitString(line, v, " "); + if (v.size() < 2) { + LOG(FATAL) << "wrong file list! [path label]"; + } + results.push_back(v[0]); + label.push_back(atoi(v[1].c_str())); + } + return true; +} + +int print_topk(const float* scores, const int size, const int topk, \ + const std::vector& labels) { + + std::vector< std::pair > vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(scores[i], i); + } + + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), + std::greater< std::pair >()); + +// LOG(INFO) << " out: " << vec[0].second <<" label: "<< labels[0]; + // print topk and score + for (int i = 0; i < topk; i++) { +// float score = vec[i].first; +// int index = vec[i].second; + if (vec[i].second == labels[0]) { + return 1; + } +// LOG(INFO) << i <<": " << index << " " << labels[index] << " " << score; + } + return 0; +} + +//! set your mean value and scale value here +//float mean_mb[3] = {103.939, 116.779, 123.68}; +//float mean_mb[3] = {103.94, 116.78, 123.68}; +//float scale_mb[3] = {1.f, 1.f, 1.f}; // for resnet +//float scale_mb[3] = {0.017, 0.017, 0.017}; // mobilenet + +// fluid +float mean_mb[3] = {255.f * 0.485, 255.f * 0.456, 255.f * 0.406}; +float scale_mb[3] = {1.f / 0.229 / 255.f, 1.f / 0.224f/255.f, 1.f / 0.225 / 255.f}; + +template +void model_test() { +#ifdef USE_OPENCV + using namespace cv; +#endif + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << GLB_model_path << " ..."; + + // load anakin model files. + auto status = graph->load(GLB_model_path); + if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + auto in_list = graph->get_ins(); + + int max_batch_size = (GLB_max_num > GLB_num) ? 
GLB_max_num : GLB_num; + int batch_size = GLB_num; + + //reshape shape batch-size + // set batch + graph->ResetBatchSize("input_0", max_batch_size); + LOG(INFO) << "set max_batch_size : " << max_batch_size; + + //anakin graph optimization +// graph->load_layout_config("model_layout_config"); + graph->load_calibrator_config("net_pt_config", "calibrate_file.txt"); + graph->Optimize(); + + // constructs the executer net + Net net_executer(true); + net_executer.init(*graph); + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + d_tensor_in_p->set_num(batch_size); + LOG(INFO) << "set batch_size : " << batch_size; + if ( ! GLB_graph_reset_bs ) { + // get in + auto init_shape_in = d_tensor_in_p->valid_shape(); + Shape new_shape({GLB_num, init_shape_in[1], init_shape_in[2], init_shape_in[3]}, Layout_NCHW); + d_tensor_in_p->reshape(new_shape); + } + + Tensor4d h_tensor_in; + Tensor out_host; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + int width = d_tensor_in_p->width(); + int height = d_tensor_in_p->height(); + int num = d_tensor_in_p->num(); + + // ==================== precision =================== + int top1_count = 0; + int top5_count = 0; + int total_count = 0; + // ================================================== + +// for (int img_num = 0; img_num < image_file_list.size(); ++img_num) + int new_batch_size = batch_size; + std::vector image_labels; + char pro[102]; + memset(pro, '\0', sizeof(pro)); + const char* spin="-\\|/"; + int ratio = 0; +#ifdef USE_OPENCV + std::vector image_file_list; + + CHECK(read_image_list(GLB_image_list, image_file_list, image_labels)); + int image_file_list_size = image_file_list.size(); + total_count = image_file_list_size; + if (GLB_img_num != -1) { + image_file_list_size = GLB_img_num + 1; + } else { + GLB_img_num = 0; + } + + for (int img_num = GLB_img_num; img_num < image_file_list_size; ++img_num) +#else + int img_num = 0; +#endif + { + if (GLB_dynamic_batch) { + new_batch_size = (img_num % (max_batch_size)) + 1; + } + d_tensor_in_p->set_num(new_batch_size); + valid_shape_in = d_tensor_in_p->valid_shape(); + h_tensor_in.re_alloc(valid_shape_in); + /*================fill tensor=================*/ +#ifdef USE_OPENCV + fflush(stdout); + ratio = (int)(100.f * (float)img_num / (float)image_file_list_size); + printf("[%-100s][%d\%][%c]\r", pro, ratio, spin[ratio & 3]); + pro[ratio] = '='; + + std::string image_path = GLB_image_root + image_file_list[img_num]; +// LOG(INFO) << "loading image " << image_path << " ..."; + Mat img = imread(image_path, CV_LOAD_IMAGE_COLOR); + if (img.empty()) { + LOG(FATAL) << "opencv read image " << image_path << " failed"; + } + + // FOR NHWC + if (h_tensor_in.width() == 3) { + fill_tensor_with_cvmat(img, h_tensor_in, batch_size, h_tensor_in.height(), + h_tensor_in.channel(), mean_mb, scale_mb); + } else { + fill_tensor_with_cvmat(img, h_tensor_in, batch_size, h_tensor_in.width(), + h_tensor_in.height(), mean_mb, scale_mb); + } +#else + fill_tensor_const(h_tensor_in, 1.f); +#endif + d_tensor_in_p->copy_from(h_tensor_in); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + std::string input_file_name = "record_In_0_image_"; + std::ostringstream ss; + ss << input_file_name << img_num << ".txt"; + input_file_name = ss.str(); +// write_tensorfile(*d_tensor_in_p, input_file_name.c_str()); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + /*================ launch =======================*/ + Context ctx(GLB_gpu, 0, 0); + + net_executer.prediction(); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + 
/*=============no dump======================*/ + auto graph_outs = graph->get_outs(); + auto tensor_out_p = net_executer.get_out(graph_outs[0]); + out_host.reshape(tensor_out_p->valid_shape()); + out_host.copy_from(*tensor_out_p); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + top1_count += print_topk((const float*)out_host.data(), 1000, 1, {image_labels[img_num]}); + top5_count += print_topk((const float*)out_host.data(), 1000, 5, {image_labels[img_num]}); +// for (int out_id = 0; out_id < graph_outs.size(); ++out_id) { +// auto tensor_out_p = net_executer.get_out(graph_outs[out_id]); +// write_tensorfile(*tensor_out_p, +// ("record_" + graph_outs[out_id] + "_image_" + std::to_string(img_num) + ".txt").c_str()); +// } + } + float top1 = (float)top1_count / (float)total_count; + float top5 = (float)top5_count/ (float)total_count; + LOG(INFO) << " top1: " << top1 << " top5: " << top5; +#ifndef ENABLE_DEBUG + { + auto d_tensor_in_p = net_executer.get_in("input_0"); + //Shape new_shape({1, 14, 800, 1408}); + //d_tensor_in_p->reshape(new_shape); + // performance check + int warm_up = 100; + int ts = 1000; + for (int i = 0; i < warm_up; ++i) { + net_executer.prediction(); + } +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + Context ctx(GLB_gpu, 0, 0); + saber::SaberTimer my_time; + for (int i = 0; i < ts; ++i) { + my_time.start(ctx); + net_executer.prediction(); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + my_time.end(ctx); + } + std::cout << "==========================Performance Statistics =============================\n"; + std::cout << "==================== Input_shape: [" + << d_tensor_in_p->num() << ", " + << d_tensor_in_p->channel() << ", " + << d_tensor_in_p->height() << ", " + << d_tensor_in_p->width() << "]\n"; + std::cout << "==================== Warm_up: " << warm_up << "\n"; + std::cout << "==================== Iteration: " << ts << "\n"; + std::cout << "==================== Average time: " << my_time.get_average_ms() << "ms\n"; + std::cout << "==================== 10% Quantile time: " << my_time.get_tile_time(10) << "ms\n"; + std::cout << "==================== 25% Quantile time: " << my_time.get_tile_time(25) << "ms\n"; + std::cout << "==================== 50% Quantile time: " << my_time.get_tile_time(50) << "ms\n"; + std::cout << "==================== 75% Quantile time: " << my_time.get_tile_time(75) << "ms\n"; + std::cout << "==================== 90% Quantile time: " << my_time.get_tile_time(90) << "ms\n"; + std::cout << "==================== 95% Quantile time: " << my_time.get_tile_time(95) << "ms\n"; + std::cout << "==================== 99% Quantile time: " << my_time.get_tile_time(99) << "ms" << std::endl; + } +#endif + delete graph; +} + +TEST(NetTest, net_execute_base_test) { +#ifdef USE_CUDA + model_test(); +#endif +#ifdef USE_X86_PLACE + model_test(); +#endif +} + +int main(int argc, const char** argv) { +#ifdef USE_OPENCV + if (argc < 4) { + LOG(FATAL) << "bad param \n ./anakin_model_test + model_path + img_root + img_list + [batch]"; + } else if (argc >= 4) { + GLB_model_path = argv[1]; + GLB_image_root = argv[2]; + GLB_image_list = argv[3]; + } + GLB_num = argc >= 5 ? atoi(argv[4]) : 1; + GLB_gpu = argc >= 6 ? atoi(argv[5]) : 0; + GLB_img_num = argc >= 7 ? 
atoi(argv[6]) : -1; +#else + if (argc < 2) { + LOG(FATAL) << "bad param \n ./anakin_model_test + model_path + [batch]"; + } else if (argc >= 2) { + GLB_model_path = argv[1]; + } +#endif + + LOG(INFO) << " model path: " << GLB_model_path; + LOG(INFO) << " image root: " << GLB_image_root; + LOG(INFO) << " image list: " << GLB_image_list; + LOG(INFO) << " GLB_num: " << GLB_num; + LOG(INFO) << " using GPU: " << GLB_gpu; + +#ifdef USE_CUDA + cudaSetDevice(GLB_gpu); + anakin::saber::Env::env_init(); + anakin::saber::Env::env_init(); + cudaSetDevice(GLB_gpu); +#endif + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_audit_exec.cpp b/test/framework/net/net_audit_exec.cpp new file mode 100644 index 000000000..52cace01b --- /dev/null +++ b/test/framework/net/net_audit_exec.cpp @@ -0,0 +1,191 @@ +#include +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include "framework/core/mem_info.h" +#include +#include "debug.h" +#include +#include +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + +std::string g_model_path = ""; +std::string g_input_path = ""; + +int g_batch_size=1; +int g_thread_num=1; +int g_warm_up = 10; +int g_epoch = 1000; + +std::string model_saved_path = g_model_path + ".saved"; + +float Random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + +void fill_with_file(Tensor4d* d_tensor_in_p) { + Tensor4d h_tensor_in; + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; i>tmp; + h_data[i] = tmp; + } + d_tensor_in_p->copy_from(h_tensor_in); + file.close(); +} + +void fill_with_random(Tensor4d* d_tensor_in_p) { + Tensor4d h_tensor_in; + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); +} + +double InferencePerf(graph::Graph* graph, int thread_idx) { + LOG(INFO) << "Thread (" << thread_idx << ") processing"; + // constructs the executer net + Net net_executer(true); + + net_executer.init(*graph); + + // get ins + for(auto& input_name : graph->get_ins()) { + auto d_tensor_in_p = net_executer.get_in(input_name); + + if(g_input_path != std::string("")) { + LOG(INFO) << "Use input file: " << g_input_path; + fill_with_file(d_tensor_in_p); + } else { + fill_with_random(d_tensor_in_p); + } + } + + + // do inference warm up + for(int i = 0; i < g_warm_up; i++) { + net_executer.prediction(); + } + + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + my_time.start(ctx); + double count = 0.f; + + for(int i = 0; i < g_epoch; i++) { + saber::SaberTimer my_time; + my_time.start(ctx); + //auto t0 = std::chrono::high_resolution_clock::now(); + + net_executer.prediction(); + + //auto t1 = std::chrono::high_resolution_clock::now(); + //count += std::chrono::duration_cast(t1-t0).count(); + my_time.end(ctx); + //LOG(INFO)<<"immed time : "<::Global().get_used_mem_in_mb(); + LOG(INFO) << "Checking_mem_used: " << mem_used; + } + } + + LOG(INFO)<<"InferencePerf aveage time: "<* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. 
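+    // NOTE: the graph is loaded and optimized once here; each worker thread then runs
+    // InferencePerf(), which builds its own Net over the shared graph, fills the inputs
+    // (from g_input_path when given, otherwise with uniform random data) and times g_epoch
+    // predictions after g_warm_up untimed warm-up runs. The overall throughput reported
+    // below is QPS = g_epoch * g_thread_num / total_seconds.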
+ auto status = graph->load(g_model_path); + if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + // reshape the input 's shape for graph model + //graph->Reshape("data", {1, 3, 195, 758}); // face_box1 + //graph->Reshape("data", {1, 3, 227, 958}); // face_box1 not fusion + //graph->Reshape("image", {1, 3, 210, 216}); // face_box2 + + //anakin graph optimization + graph->Optimize(); + + // launch multi thread + std::vector work_pool; + double counter = 0.0f; + auto t0 = std::chrono::high_resolution_clock::now(); + for(int i=0; i(t1-t0).count(); + int QPS = g_epoch * g_thread_num / (counter / 1e6); + LOG(ERROR) << " QPS : " << QPS; + delete graph; +} + +int main(int argc, const char** argv){ + if(argc < 4){ + LOG(INFO) << "@Anakin@ model audit"; + LOG(INFO) << "usage:"; + LOG(INFO) << " Param 1: thread_num ( thread number )"; + LOG(INFO) << " Param 2: batch_size ( batch size )"; + LOG(INFO) << " Param 3: model_path ( anakin binary model file path )"; + LOG(INFO) << " Param 4: input_file_path ( anakin input_file_path )"; + exit(-1); + } + g_thread_num = atoi(argv[1]); + g_batch_size = atoi(argv[2]); + g_model_path = argv[3]; + if(argc > 4) { + g_input_path = argv[4]; + } + + Env::env_init(); + + InferencePerfWithMultiThread(); + // initial logger + //logger::init(argv[0]); + //InitTest(); + //RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_exec_map_rnn.cpp b/test/framework/net/net_exec_map_rnn.cpp index 896d02fdb..067718716 100644 --- a/test/framework/net/net_exec_map_rnn.cpp +++ b/test/framework/net/net_exec_map_rnn.cpp @@ -17,10 +17,15 @@ #if defined(NVIDIA_GPU) using Target = NV; using Target_H = NVHX86; -#else if defined(USE_X86_PLACE) + +#elif defined(USE_X86_PLACE) using Target = X86; using Target_H = X86; #include "mkl_service.h" + +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; #endif @@ -332,19 +337,19 @@ void one_thread_run(std::string path, int thread_id) { printf("%f\n", static_cast(out.data())[seq_start + seq_len - 1]); } #else - auto out =net_executer.get_out("final_output.tmp_1_gout"); - int size = out->valid_size(); - - for (int seq_id = 0; seq_id < seq_offset.size() - 1; seq_id++) { - int seq_len = seq_offset[seq_id + 1] - seq_offset[seq_id]; - int seq_start = seq_offset[seq_id]; - - for (int i = 0; i < seq_len - 1; i++) { - printf("%f|", static_cast(out->data())[seq_start + i]); - } - - printf("%f\n", static_cast(out->data())[seq_start + seq_len - 1]); - } +// auto out =net_executer.get_out("final_output.tmp_1_gout"); +// int size = out->valid_size(); +// +// for (int seq_id = 0; seq_id < seq_offset.size() - 1; seq_id++) { +// int seq_len = seq_offset[seq_id + 1] - seq_offset[seq_id]; +// int seq_start = seq_offset[seq_id]; +// +// for (int i = 0; i < seq_len - 1; i++) { +// printf("%f|", static_cast(out->data())[seq_start + i]); +// } +// +// printf("%f\n", static_cast(out->data())[seq_start + seq_len - 1]); +// } #endif diff --git a/test/framework/net/net_exec_ps_new.cpp b/test/framework/net/net_exec_ps_new.cpp index 1cecb4e89..3cd3247b9 100644 --- a/test/framework/net/net_exec_ps_new.cpp +++ b/test/framework/net/net_exec_ps_new.cpp @@ -447,7 +447,6 @@ TEST(NetTest, net_execute_base_test) { Net net_executer(true); #endif - net_executer.load_calibrator_config("net_pt_config.txt", "cal_file"); net_executer.init(*graph); int epoch = 1; diff --git a/test/framework/net/net_exec_test.cpp b/test/framework/net/net_exec_test.cpp index 0d3d624cf..c3d4c33a7 100644 --- a/test/framework/net/net_exec_test.cpp +++ 
b/test/framework/net/net_exec_test.cpp @@ -31,46 +31,48 @@ int g_device_id = 0; #ifdef USE_CUDA #if 1 -TEST(NetTest, net_test_load_from_buffer) { - Graph* graph = new Graph(); - LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; - std::ifstream ifs; - ifs.open (g_model_path, std::ifstream::in); - if (!ifs.is_open()) { - LOG(FATAL) << "file open failed"; - } - ifs.seekg(0, ifs.end); - int length = ifs.tellg(); - ifs.seekg(0, ifs.beg); - char * buffer = new char [length]; - ifs.read(buffer, length); - ifs.close(); - - // load anakin model files. - auto status = graph->load(buffer, length); - if (!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - graph->ResetBatchSize("input_0", g_batch_size); - graph->Optimize(); - Net net_executer(true); - net_executer.load_calibrator_config("net_pt_config.txt","cal_file"); - net_executer.init(*graph); - auto d_tensor_in_p = net_executer.get_in("input_0"); - Tensor4d h_tensor_in; - - auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i=0; icopy_from(h_tensor_in); - cudaDeviceSynchronize(); - net_executer.prediction(); - write_tensorfile(*net_executer.get_out_list()[0],"output_b.txt"); -} +//TEST(NetTest, net_test_load_from_buffer) { +// Graph* graph = new Graph(); +// LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; +// std::ifstream ifs; +// ifs.open (g_model_path, std::ifstream::in); +// if (!ifs.is_open()) { +// LOG(FATAL) << "file open failed"; +// } +// ifs.seekg(0, ifs.end); +// int length = ifs.tellg(); +// ifs.seekg(0, ifs.beg); +// char * buffer = new char [length]; +// ifs.read(buffer, length); +// ifs.close(); +// +// // load anakin model files. +// auto status = graph->load(buffer, length); +// if (!status ) { +// LOG(FATAL) << " [ERROR] " << status.info(); +// } +// graph->ResetBatchSize("input_0", g_batch_size); +// graph->Optimize(); +// Net net_executer(true); +// net_executer.init(*graph); +// auto d_tensor_in_p = net_executer.get_in("input_0"); +// Tensor4d h_tensor_in; +// +// auto valid_shape_in = d_tensor_in_p->valid_shape(); +// for (int i=0; icopy_from(h_tensor_in); +// cudaDeviceSynchronize(); +// net_executer.prediction(); +// cudaDeviceSynchronize(); +// auto h_tensor_out = net_executer.get_out_list()[0]; +// LOG(INFO) << "output mean value: " << tensor_mean_value_valid(*h_tensor_out); +// write_tensorfile(*net_executer.get_out_list()[0],"output_b.txt"); +//} TEST(NetTest, net_execute_base_test) { Graph* graph = new Graph(); @@ -80,7 +82,7 @@ TEST(NetTest, net_execute_base_test) { if(!status ) { LOG(FATAL) << " [ERROR] " << status.info(); } - + LOG(INFO)<<"net_execute_base_test"; // reshape the input_0 's shape for graph model //graph->Reshape("input_0", {1, 8, 640, 640}); graph->ResetBatchSize("input_0", g_batch_size); @@ -105,7 +107,6 @@ TEST(NetTest, net_execute_base_test) { Net net_executer(true); #endif - net_executer.load_calibrator_config("net_pt_config.txt","cal_file"); net_executer.init(*graph); // get in auto d_tensor_in_p = net_executer.get_in("input_0"); @@ -124,6 +125,8 @@ TEST(NetTest, net_execute_base_test) { } d_tensor_in_p->copy_from(h_tensor_in); + std::vector> seq_offset={{0,g_batch_size}}; + d_tensor_in_p->set_seq_offset(seq_offset); #ifdef USE_DIEPSE // for diepse model @@ -220,9 +223,9 @@ TEST(NetTest, net_execute_base_test) { //} // inner scope over LOG(ERROR) << "inner net exe over !"; - for(auto x:net_executer.get_out_list()){ -// print_tensor(*x); - } + //for (auto x:net_executer.get_out_list()){ + // print_tensor(*x); + //} //auto& 
tensor_out_inner_p = net_executer.get_tensor_from_edge("data_perm", "conv1"); @@ -347,8 +350,8 @@ TEST(NetTest, net_execute_reconstruction_test) { int main(int argc, const char** argv){ if (argc < 2){ - LOG(ERROR)<<"no input!!!"; - return; + LOG(ERROR) << "no input!!!, usage: ./" << argv[0] << " model_path [batch size] [warm_up_iter] [test_iter] [device_id]"; + return -1; } if (argc > 1) { g_model_path = std::string(argv[1]); diff --git a/test/framework/net/net_exec_test_arm.cpp b/test/framework/net/net_exec_test_arm.cpp new file mode 100644 index 000000000..0c2dedb3f --- /dev/null +++ b/test/framework/net/net_exec_test_arm.cpp @@ -0,0 +1,253 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#ifdef ENABLE_OP_TIMER +#include "saber/funcs/impl/impl_base.h" +#endif + +std::string g_model_path = ""; + +std::string model_saved_path = g_model_path + ".saved"; +int g_batch_size = 1; +int g_warm_up = 0; +int g_epoch = 1; +int g_thread_num = 1; +bool g_random = 0; +int g_instance = 1; +int g_cluster = 0; +bool g_set_archs = false; +ARMArch g_arch = A73; +#ifdef USE_ARM_PLACE +template +double tensor_mean_value_host_impl(const Dtype* din, long long size) { + double sum = 0.0; + for (long long i = 0; i < size; ++i) { + sum += din[i]; + } + return sum / size; +} + +double tensor_mean(const Tensor& tensor) { + + const void* data_ptr = tensor.data(); + long long size = tensor.valid_size(); + DataType type = tensor.get_dtype(); + switch (type) { + //case AK_UINT8: return tensor_mean_value_host_impl((const unsigned char*)data_ptr, size); + case AK_INT8: return tensor_mean_value_host_impl((const signed char*)data_ptr, size); + //case AK_UINT16: return tensor_mean_value_host_impl((const unsigned short*)data_ptr, size); + //case AK_INT16: return tensor_mean_value_host_impl((const short*)data_ptr, size); + //case AK_UINT32: return tensor_mean_value_host_impl((const unsigned int*)data_ptr, size); + case AK_INT32: return tensor_mean_value_host_impl((const int*)data_ptr, size); + case AK_FLOAT: return tensor_mean_value_host_impl((const float*)data_ptr, size); + //case AK_DOUBLE: return tensor_mean_value_host_impl((const double*)data_ptr, size); + default: LOG(INFO) << "data type: " << (int)type << " is unsupported now"; + } + return 0.0; +} + +TEST(NetTest, net_execute_base_test) { + LOG(INFO) << "begin test"; + Context ctx1; + ctx1.set_run_mode((PowerMode)g_cluster, g_thread_num); + if (g_set_archs) { + ctx1.set_arch(g_arch); + LOG(INFO) << "arm arc: " << g_arch; + } + ctx1.set_cache(32 * 1024, 512* 1024, 0); + + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. 
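+    // NOTE: benchmark protocol in this test: inputs are filled with random values when
+    // g_random is set (constant 1.f otherwise), g_warm_up predictions run untimed, then each
+    // of the g_epoch timed iterations refills the inputs and records its latency; min,
+    // average and max times are logged at the end, and every output tensor is dumped via
+    // write_tensorfile() along with its mean value as a quick sanity check.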
+ auto status = graph->load(g_model_path); + + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + std::vector& vin_name = graph->get_ins(); + LOG(INFO) << "number of input tensor: " << vin_name.size(); + + for (int j = 0; j < vin_name.size(); ++j) { + graph->ResetBatchSize("input_0", g_batch_size); + } + + graph->Optimize(); + + Net net_executer(true); + net_executer.init(*graph); + + srand(12345); + + for (int j = 0; j < vin_name.size(); ++j) { + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + Shape shin = d_tensor_in_p->valid_shape(); + //tin->reshape(Shape(1, 3, 224, 224)); + LOG(INFO) << "input tensor size: "; + //Shape shin = tin->valid_shape(); + LOG(INFO) << "input name: " << vin_name[j]; + for (int k = 0; k < d_tensor_in_p->dims(); ++k) { + LOG(INFO) << "|---: " << shin[k]; + } + if (g_random) { + fill_tensor_rand(*d_tensor_in_p); + } else { + fill_tensor_const(*d_tensor_in_p, 1.f); + } + } + printf("------------ start to test\n"); + std::vector& out_name = graph->get_outs(); + LOG(INFO) << "number of output tensor: " << out_name.size(); + for (int i = 0; i < out_name.size(); i++) { + Tensor* vout = net_executer.get_out(out_name[i]); + LOG(INFO) << "output tensor size: "; + Shape shout = vout->valid_shape(); + for (int j = 0; j < vout->dims(); ++j) { + LOG(INFO) << "|---: " << shout[j]; + } + } + Context ctx(0, 0, 0); + // do inference + saber::SaberTimer my_time; + double to = 0; + double tmin = 1000000; + double tmax = 0; + saber::SaberTimer t1; + + LOG(WARNING) << "EXECUTER !!!!!!!! "; + + // warm up + for (int i = 0; i < g_warm_up; i++) { + net_executer.prediction(); + } + my_time.start(ctx); + Context ctx2(0, 0, 0); + + for (int i = 0; i < g_epoch; i++) { + for (int j = 0; j < vin_name.size(); ++j) { + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + if (g_random) { + fill_tensor_rand(*d_tensor_in_p); + } else { + fill_tensor_const(*d_tensor_in_p, 1.f); + } + } + t1.clear(); + t1.start(ctx2); + net_executer.prediction(); + t1.end(ctx2); + float tdiff = t1.get_average_ms(); + if (tdiff > tmax) { + tmax = tdiff; + } + if (tdiff < tmin) { + tmin = tdiff; + } + to += tdiff; + printf("------------ iter: %d/%d, time(ms): %f\n", i, g_epoch, tdiff); + LOG(INFO) << "iter: " << i << ", time: " << tdiff << "ms"; + } + for (int i = 0; i < out_name.size(); ++i) { + Tensor* vout = net_executer.get_out(out_name[i]); + write_tensorfile(*vout, out_name[i].c_str()); +#ifdef ENABLE_DEBUG + const float* ptr = vout->data(); + for (int j = 0; j < vout->valid_size(); ++j) { + printf("%f ", ptr[j]); + if ((j + 1) % 10 == 0) { + printf("\n"); + } + } + printf("\n"); +#endif + double mean_val = tensor_mean_value_valid(*vout); //tensor_mean(*vout); + LOG(INFO) << "output mean: " << mean_val; + } + my_time.end(ctx); + LOG(INFO) << "M:" << g_model_path << " th:" << g_thread_num << " batch_size " << g_batch_size << " average time " << to / g_epoch + << ", min time: " << tmin << "ms, max time: " << tmax << " ms"; +#ifdef ENABLE_OP_TIMER + OpTimer::print_timer(ctx1); + // std::cout << "MC:" << lite_model << " total-ops:" << OpTimer::get_timer("total").ops / FLAGS_epoch << std::endl; + LOG(INFO) << "MC:" << g_model_path << " total-ops:" << OpTimer::get_timer("total").ops / g_epoch ; +#endif //ENABLE_OP_TIMER + // std::string save_g_model_path = g_model_path + std::string(".saved"); + // status = graph->save(save_g_model_path); + delete graph; +} + +/** + * g_model_path 模型地址 + * g_batch_size batch大小,默认1 + * g_warm_up 预热次数,默认0 + * g_epoch 计时次数,默认1 + * g_thread_num 
用到的线程数,默认1 + * g_random 是否是随机数输入,默认是,0代表常量输入 + * @param argc + * @param argv + * @return + */ + +int main(int argc, const char** argv) { + LOG(INFO)<< "usage:"; + LOG(INFO)<< argv[0] << " "; + LOG(INFO)<< " lite_model: path to anakin lite model"; + LOG(INFO)<< " num: batchSize default to 1"; + LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; + LOG(INFO)<< " epoch: time statistic epoch default to 10"; + LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores, 2: all cores, 3: threads not bind to specify cores"; + LOG(INFO)<< " threads: set openmp threads"; + + if(argc < 2) { + LOG(ERROR) << "You should fill in the variable lite model at least."; + return 0; + } + g_model_path = std::string(argv[1]); + + if (argc > 2) { + g_batch_size = atoi(argv[2]); + } + if (argc > 3) { + g_warm_up = atoi(argv[3]); + } + if (argc > 4) { + g_epoch = atoi(argv[4]); + } + if (argc > 5) { + g_cluster = atoi(argv[5]); + if (g_cluster < 0) { + g_cluster = 0; + } + if (g_cluster > 5) { + g_cluster = 5; + } + } + if (argc > 6) { + g_thread_num = atoi(argv[6]); + } + if (argc > 7) { + g_set_archs = true; + if (atoi(argv[7]) > 0) { + g_arch = (ARMArch)atoi(argv[7]); + } else { + g_arch = ARM_UNKOWN; + } + } + if (argc > 8) { + g_random = atoi(argv[8]); + } + + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + + return 0; +} +#else +int main(int argc, const char** argv) { + return 0; +} +#endif diff --git a/test/framework/net/net_exec_test_arm_int8.cpp b/test/framework/net/net_exec_test_arm_int8.cpp new file mode 100644 index 000000000..52f8d4fdf --- /dev/null +++ b/test/framework/net/net_exec_test_arm_int8.cpp @@ -0,0 +1,252 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#ifdef ENABLE_OP_TIMER +#include "saber/funcs/impl/impl_base.h" +#endif +std::string g_model_path = ""; + +std::string model_saved_path = g_model_path + ".saved"; +int g_batch_size = 1; +int g_warm_up = 0; +int g_epoch = 1; +int g_thread_num = 1; +bool g_random = 0; +int g_instance = 1; +int g_cluster = 0; +bool g_set_archs = false; +ARMArch g_arch = A73; +#ifdef USE_ARM_PLACE +template +double tensor_mean_value_host_impl(const Dtype* din, long long size) { + double sum = 0.0; + for (long long i = 0; i < size; ++i) { + sum += din[i]; + } + return sum / size; +} + +double tensor_mean(const Tensor& tensor) { + + const void* data_ptr = tensor.data(); + long long size = tensor.valid_size(); + DataType type = tensor.get_dtype(); + switch (type) { + //case AK_UINT8: return tensor_mean_value_host_impl((const unsigned char*)data_ptr, size); + case AK_INT8: return tensor_mean_value_host_impl((const signed char*)data_ptr, size); + //case AK_UINT16: return tensor_mean_value_host_impl((const unsigned short*)data_ptr, size); + //case AK_INT16: return tensor_mean_value_host_impl((const short*)data_ptr, size); + //case AK_UINT32: return tensor_mean_value_host_impl((const unsigned int*)data_ptr, size); + case AK_INT32: return tensor_mean_value_host_impl((const int*)data_ptr, size); + case AK_FLOAT: return tensor_mean_value_host_impl((const float*)data_ptr, size); + //case AK_DOUBLE: return tensor_mean_value_host_impl((const double*)data_ptr, size); + default: LOG(INFO) << "data type: " << (int)type << " is unsupported now"; + } + return 0.0; +} + +TEST(NetTest, net_execute_base_test) { + LOG(INFO) << "begin test"; + Context ctx1; + ctx1.set_run_mode((PowerMode)g_cluster, g_thread_num); + if (g_set_archs) { + 
ctx1.set_arch(g_arch); + LOG(INFO) << "arm arc: " << g_arch; + } + ctx1.set_cache(32 * 1024, 512* 1024, 0); + + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + std::vector& vin_name = graph->get_ins(); + LOG(INFO) << "number of input tensor: " << vin_name.size(); + + for (int j = 0; j < vin_name.size(); ++j) { + graph->ResetBatchSize("input_0", g_batch_size); + } + + graph->Optimize(); + + Net net_executer(true); + net_executer.init(*graph); + + srand(12345); + + for (int j = 0; j < vin_name.size(); ++j) { + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + Shape shin = d_tensor_in_p->valid_shape(); + //tin->reshape(Shape(1, 3, 224, 224)); + LOG(INFO) << "input tensor size: "; + //Shape shin = tin->valid_shape(); + LOG(INFO) << "input name: " << vin_name[j]; + for (int k = 0; k < d_tensor_in_p->dims(); ++k) { + LOG(INFO) << "|---: " << shin[k]; + } + if (g_random) { + fill_tensor_rand(*d_tensor_in_p); + } else { + fill_tensor_const(*d_tensor_in_p, 1.f); + } + } + printf("------------ start to test\n"); + std::vector& out_name = graph->get_outs(); + LOG(INFO) << "number of output tensor: " << out_name.size(); + for (int i = 0; i < out_name.size(); i++) { + Tensor* vout = net_executer.get_out(out_name[i]); + LOG(INFO) << "output tensor size: "; + Shape shout = vout->valid_shape(); + for (int j = 0; j < vout->dims(); ++j) { + LOG(INFO) << "|---: " << shout[j]; + } + } + Context ctx(0, 0, 0); + // do inference + saber::SaberTimer my_time; + double to = 0; + double tmin = 1000000; + double tmax = 0; + saber::SaberTimer t1; + + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; + + // warm up + for (int i = 0; i < g_warm_up; i++) { + net_executer.prediction(); + } + my_time.start(ctx); + Context ctx2(0, 0, 0); + + for (int i = 0; i < g_epoch; i++) { + for (int j = 0; j < vin_name.size(); ++j) { + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + if (g_random) { + fill_tensor_rand(*d_tensor_in_p); + } else { + fill_tensor_const(*d_tensor_in_p, 1.f); + } + } + t1.clear(); + t1.start(ctx2); + net_executer.prediction(); + t1.end(ctx2); + float tdiff = t1.get_average_ms(); + if (tdiff > tmax) { + tmax = tdiff; + } + if (tdiff < tmin) { + tmin = tdiff; + } + to += tdiff; + printf("------------ iter: %d/%d, time(ms): %f\n", i, g_epoch, tdiff); + LOG(INFO) << "iter: " << i << ", time: " << tdiff << "ms"; + } + for (int i = 0; i < out_name.size(); ++i) { + Tensor* vout = net_executer.get_out(out_name[i]); + write_tensorfile(*vout, out_name[i].c_str()); +#ifdef ENABLE_DEBUG + const float* ptr = vout->data(); + for (int j = 0; j < vout->valid_size(); ++j) { + printf("%f ", ptr[j]); + if ((j + 1) % 10 == 0) { + printf("\n"); + } + } + printf("\n"); +#endif + double mean_val = tensor_mean_value_valid(*vout); //tensor_mean(*vout); + LOG(INFO) << "output mean: " << mean_val; + } + my_time.end(ctx); + LOG(INFO) << "M:" << g_model_path << " th:" << g_thread_num << " batch_size " << g_batch_size << " average time " << to / g_epoch + << ", min time: " << tmin << "ms, max time: " << tmax << " ms"; +#ifdef ENABLE_OP_TIMER + OpTimer::print_timer(ctx1); + // std::cout << "MC:" << lite_model << " total-ops:" << OpTimer::get_timer("total").ops / FLAGS_epoch << std::endl; + LOG(INFO) << "MC:" << g_model_path << " total-ops:" << OpTimer::get_timer("total").ops / g_epoch ; +#endif //ENABLE_OP_TIMER + // std::string save_g_model_path = g_model_path + std::string(".saved"); + // status = graph->save(save_g_model_path); + delete graph; +} + +/** + * g_model_path 模型地址 + * g_batch_size batch大小,默认1 + * g_warm_up 预热次数,默认0 + * g_epoch 计时次数,默认1 + * g_thread_num 用到的线程数,默认1 + * g_random 是否是随机数输入,默认是,0代表常量输入 + * @param argc + * @param argv + * @return + */ + +int main(int argc, const char** argv) { + LOG(INFO)<< "usage:"; + LOG(INFO)<< argv[0] << " "; + LOG(INFO)<< " lite_model: path to anakin lite model"; + LOG(INFO)<< " num: batchSize default to 1"; + LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; + LOG(INFO)<< " epoch: time statistic epoch default to 10"; + LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores, 2: all cores, 3: threads not bind to specify cores"; + LOG(INFO)<< " threads: set openmp threads"; + + if(argc < 2) { + LOG(ERROR) << "You should fill in the variable lite model at least."; + return 0; + } + g_model_path = std::string(argv[1]); + + if (argc > 2) { + g_batch_size = atoi(argv[2]); + } + if (argc > 3) { + g_warm_up = atoi(argv[3]); + } + if (argc > 4) { + g_epoch = atoi(argv[4]); + } + if (argc > 5) { + g_cluster = atoi(argv[5]); + if (g_cluster < 0) { + g_cluster = 0; + } + if (g_cluster > 5) { + g_cluster = 5; + } + } + if (argc > 6) { + g_thread_num = atoi(argv[6]); + } + if (argc > 7) { + g_set_archs = true; + if (atoi(argv[7]) > 0) { + g_arch = (ARMArch)atoi(argv[7]); + } else { + g_arch = ARM_UNKOWN; + } + } + if (argc > 8) { + g_random = atoi(argv[8]); + } + + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + + return 0; +} +#else +int main(int argc, const char** argv) { + return 0; +} +#endif diff --git a/test/framework/net/net_exec_test_cv_topk.cpp 
b/test/framework/net/net_exec_test_cv_topk.cpp new file mode 100644 index 000000000..41be38545 --- /dev/null +++ b/test/framework/net/net_exec_test_cv_topk.cpp @@ -0,0 +1,348 @@ +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include +#include +#include +#include +#include +#include +#include "saber/funcs/debug.h" +#include "saber/core/tensor_op.h" + +#ifdef USE_OPENCV +#include +#endif + +#define DEFINE_GLOBAL(type, var, value) \ + type (GLB_##var) = (value) + +DEFINE_GLOBAL(int, gpu, 0); +DEFINE_GLOBAL(std::string, model_path, ""); +DEFINE_GLOBAL(std::string, image_root, ""); +DEFINE_GLOBAL(std::string, image_list, ""); +DEFINE_GLOBAL(int, num, 1); +DEFINE_GLOBAL(int, img_num, -1); +DEFINE_GLOBAL(int, offset_y, 0); +DEFINE_GLOBAL(bool, graph_reset_bs, true); +DEFINE_GLOBAL(bool, rgb, false); +DEFINE_GLOBAL(bool, vis, false); + +DEFINE_GLOBAL(std::string, input_data_source, "1"); +DEFINE_GLOBAL(int, max_num, 32); +DEFINE_GLOBAL(bool, dynamic_batch, false); + +#ifdef USE_OPENCV +template +void fill_tensor_with_cvmat(const cv::Mat& img_in, Tensor& tout, const int num, \ + const int width, const int height, const float* mean, const float* scale) { + cv::Mat im; + cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); + float* ptr_data_in = (float*)tout.mutable_data(); + int stride = width * height; + + for (int i = 0; i < num; i++) { + float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); + + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; + ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; + } + } + } +} +#endif + +void SplitString(const std::string& s, + std::vector& v, const std::string& c) { + + std::string::size_type pos1, pos2; + pos2 = s.find(c); + pos1 = 0; + + while (std::string::npos != pos2) { + v.push_back(s.substr(pos1, pos2 - pos1)); + pos1 = pos2 + c.size(); + pos2 = s.find(c, pos1); + } + + if (pos1 != s.length()) { + v.push_back(s.substr(pos1)); + } +} + +bool read_image_list(std::string& filename, + std::vector& results, std::vector& label) { + + //std::cout << "image list: " << filename << std::endl; + std::ifstream infile(filename.c_str()); + + if (!infile.good()) { + std::cout << "Cannot open " << std::endl; + return false; + } + + std::string line; + + while (std::getline(infile, line)) { + std::vector v; + SplitString(line, v, " "); + + if (v.size() < 2) { + LOG(FATAL) << "wrong file list! [path label]"; + } + + results.push_back(v[0]); + label.push_back(atoi(v[1].c_str())); + } + + return true; +} + +int print_topk(const float* scores, const int size, const int topk, \ + const std::vector& labels) { + + std::vector< std::pair > vec; + vec.resize(size); + + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(scores[i], i); + } + + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), + std::greater< std::pair >()); + + // LOG(INFO) << " out: " << vec[0].second <<" label: "<< labels[0]; + // print topk and score + for (int i = 0; i < topk; i++) { + // float score = vec[i].first; + // int index = vec[i].second; + if (vec[i].second == labels[0]) { + return 1; + } + + // LOG(INFO) << i <<": " << index << " " << labels[index] << " " << score; + } + + return 0; +} + +//! 
set your mean value and scale value here +float mean_mb[3] = {103.939, 116.779, 123.68}; +float scale_mb[3] = {1.f, 1.f, 1.f}; + +template +void model_test() { +#ifdef USE_OPENCV + using namespace cv; +#endif + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << GLB_model_path << " ..."; + + // load anakin model files. + auto status = graph->load(GLB_model_path); + + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + auto in_list = graph->get_ins(); + + int max_batch_size = (GLB_max_num > GLB_num) ? GLB_max_num : GLB_num; + int batch_size = GLB_num; + + //reshape shape batch-size + // set batch + graph->ResetBatchSize("input_0", max_batch_size); + LOG(INFO) << "set max_batch_size : " << max_batch_size; + + //anakin graph optimization + + graph->load_calibrator_config("net_pt_config", "calibrate_file.txt"); + graph->Optimize(); + + // constructs the executer net + Net net_executer(true); + net_executer.init(*graph); + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + d_tensor_in_p->set_num(batch_size); + LOG(INFO) << "set batch_size : " << batch_size; + + if (! GLB_graph_reset_bs) { + // get in + auto init_shape_in = d_tensor_in_p->valid_shape(); + Shape new_shape({GLB_num, init_shape_in[1], init_shape_in[2], init_shape_in[3]}, Layout_NCHW); + d_tensor_in_p->reshape(new_shape); + } + + Tensor4d h_tensor_in; + Tensor out_host; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + int width = d_tensor_in_p->width(); + int height = d_tensor_in_p->height(); + int num = d_tensor_in_p->num(); + + // ==================== precision =================== + int top1_count = 0; + int top5_count = 0; + int total_count = 0; + // ================================================== + + // for (int img_num = 0; img_num < image_file_list.size(); ++img_num) + int new_batch_size = batch_size; + std::vector image_labels; + char pro[102]; + memset(pro, '\0', sizeof(pro)); + const char* spin = "-\\|/"; + int ratio = 0; +#ifdef USE_OPENCV + std::vector image_file_list; + + CHECK(read_image_list(GLB_image_list, image_file_list, image_labels)); + int image_file_list_size = image_file_list.size(); + total_count = image_file_list_size; + + if (GLB_img_num != -1) { + image_file_list_size = GLB_img_num + 1; + } else { + GLB_img_num = 0; + } + + for (int img_num = GLB_img_num; img_num < image_file_list_size; ++img_num) +#else + int img_num = 0; + +#endif + { + if (GLB_dynamic_batch) { + new_batch_size = (img_num % (max_batch_size)) + 1; + } + + d_tensor_in_p->set_num(new_batch_size); + valid_shape_in = d_tensor_in_p->valid_shape(); + h_tensor_in.re_alloc(valid_shape_in); + /*================fill tensor=================*/ +#ifdef USE_OPENCV + fflush(stdout); + ratio = (int)(100.f * (float)img_num / (float)image_file_list_size); + printf("[%-100s][%d\%][%c]\r", pro, ratio, spin[ratio & 3]); + pro[ratio] = '='; + + std::string image_path = GLB_image_root + image_file_list[img_num]; + // LOG(INFO) << "loading image " << image_path << " ..."; + Mat img = imread(image_path, CV_LOAD_IMAGE_COLOR); + + if (img.empty()) { + LOG(FATAL) << "opencv read image " << image_path << " failed"; + } + + // FOR NHWC + if (h_tensor_in.width() == 3) { + fill_tensor_with_cvmat(img, h_tensor_in, batch_size, h_tensor_in.height(), + h_tensor_in.channel(), mean_mb, scale_mb); + } else { + fill_tensor_with_cvmat(img, h_tensor_in, batch_size, h_tensor_in.width(), + h_tensor_in.height(), mean_mb, scale_mb); + } + +#else + fill_tensor_const(h_tensor_in, 1.f); +#endif + 
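
Editor's note: the top-k check that print_topk implements above can be read in isolation as the sketch below (plain STL, not tied to Anakin's tensors; in_top_k is an illustrative name, not an API in this patch).

#include <algorithm>
#include <functional>
#include <utility>
#include <vector>

// Returns 1 if `label` ranks among the k highest scores, else 0, using the
// same partial_sort over (score, index) pairs as print_topk above.
int in_top_k(const std::vector<float>& scores, int k, int label) {
    const int k_eff = std::min<int>(k, static_cast<int>(scores.size()));
    std::vector<std::pair<float, int>> ranked(scores.size());
    for (int i = 0; i < static_cast<int>(scores.size()); ++i) {
        ranked[i] = std::make_pair(scores[i], i);
    }
    std::partial_sort(ranked.begin(), ranked.begin() + k_eff, ranked.end(),
                      std::greater<std::pair<float, int>>());
    for (int i = 0; i < k_eff; ++i) {
        if (ranked[i].second == label) {
            return 1;
        }
    }
    return 0;
}
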
d_tensor_in_p->copy_from(h_tensor_in); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + std::string input_file_name = "record_In_0_image_"; + std::ostringstream ss; + ss << input_file_name << img_num << ".txt"; + input_file_name = ss.str(); + // write_tensorfile(*d_tensor_in_p, input_file_name.c_str()); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + /*================launch=======================*/ + Context ctx(GLB_gpu, 0, 0); + + net_executer.prediction(); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + /*=============no dump======================*/ + auto graph_outs = graph->get_outs(); + auto tensor_out_p = net_executer.get_out(graph_outs[0]); + out_host.reshape(tensor_out_p->valid_shape()); + out_host.copy_from(*tensor_out_p); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + top1_count += print_topk((const float*)out_host.data(), 1000, 1, {image_labels[img_num]}); + top5_count += print_topk((const float*)out_host.data(), 1000, 5, {image_labels[img_num]}); + // for (int out_id = 0; out_id < graph_outs.size(); ++out_id) { + // auto tensor_out_p = net_executer.get_out(graph_outs[out_id]); + // write_tensorfile(*tensor_out_p, + // ("record_" + graph_outs[out_id] + "_image_" + std::to_string(img_num) + ".txt").c_str()); + // } + } + float top1 = (float)top1_count / (float)total_count; + float top5 = (float)top5_count / (float)total_count; + LOG(INFO) << " top1: " << top1 << " top5: " << top5; + + delete graph; +} + +TEST(NetTest, net_execute_base_test) { +#ifdef USE_CUDA + model_test(); +#endif +#ifdef USE_X86_PLACE + model_test(); +#endif +} + +int main(int argc, const char** argv) { +#ifdef USE_OPENCV + + if (argc < 4) { + LOG(FATAL) << "bad param \n ./anakin_model_test + model_path + img_root + img_list + [batch]"; + } else if (argc >= 4) { + GLB_model_path = argv[1]; + GLB_image_root = argv[2]; + GLB_image_list = argv[3]; + } + + GLB_num = argc >= 5 ? atoi(argv[4]) : 1; + GLB_gpu = argc >= 6 ? atoi(argv[5]) : 0; + GLB_img_num = argc >= 7 ? 
atoi(argv[6]) : -1; +#else + + if (argc < 2) { + LOG(FATAL) << "bad param \n ./anakin_model_test + model_path + [batch]"; + } else if (argc >= 2) { + GLB_model_path = argv[1]; + } + +#endif + + LOG(INFO) << " model path: " << GLB_model_path; + LOG(INFO) << " image root: " << GLB_image_root; + LOG(INFO) << " image list: " << GLB_image_list; + LOG(INFO) << " GLB_num: " << GLB_num; + LOG(INFO) << " using GPU: " << GLB_gpu; + +#ifdef USE_CUDA + cudaSetDevice(GLB_gpu); + anakin::saber::Env::env_init(); + anakin::saber::Env::env_init(); + cudaSetDevice(GLB_gpu); +#endif + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_exec_test_for_feed.cpp b/test/framework/net/net_exec_test_for_feed.cpp new file mode 100644 index 000000000..8a223103b --- /dev/null +++ b/test/framework/net/net_exec_test_for_feed.cpp @@ -0,0 +1,229 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#include + +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + +//#define USE_DIEPSE + +std::string g_model_path = "/path/to/your/anakin_model"; + +std::string model_saved_path = g_model_path + ".saved"; +int g_batch_size = 0; // 0 means not set max batch +int g_feature_size = 10; // we support different feature size in different slots. +int g_warm_up = 100; +int g_epoch = 1000; +int g_device_id = 0; +int g_thread_num = 1; +std::string g_data_path=""; + +std::vector + split_string(const std::string& s, char delim) { + + std::stringstream ss(s); + std::string item; + std::vector elems; + while (std::getline(ss, item, delim)) { + elems.push_back(item); + } + return elems; +} + +void read_slot_file(std::vector>& input_data, std::string& data_path, int max_batch = 0) { + + std::ifstream infile(data_path); + if (!infile.good()) { + LOG(FATAL) <<"Cannot open " << data_path; + } + int max_feature = 0; + LOG(INFO) << "found filename: " << data_path; + std::string line; + int line_num = 0; + while (std::getline(infile, line)) { + std::vector line_vector; + std::vector split_line = split_string(line,'\t'); + std::string line_key = split_line[0]; + std::vector line_data = + split_string(split_line[1],' '); + for (auto c : line_data) { + line_vector.push_back((float)atof(c.c_str())); + } + if (max_feature < line_vector.size()) { + max_feature = line_vector.size(); + } + input_data.push_back(line_vector); + if (max_batch != 0) { + ++line_num; + if (line_num >= (412 * max_batch)) { +// LOG(INFO) << "line_num = " << line_num << " max_batch = " << max_batch; + break; + } + } + } + LOG(INFO) << "max_feature = " << max_feature; +} + +#if defined(USE_CUDA)||defined(USE_X86_PLACE) +#if defined(USE_X86_PLACE) +#include "mkl_service.h" +#include "omp.h" +#endif + +TEST(NetTest, net_execute_base_test) { +#if defined(USE_X86_PLACE) + if (g_thread_num != 0) { + omp_set_dynamic(0); + omp_set_num_threads(g_thread_num); + mkl_set_num_threads(g_thread_num); + } else { + LOG(INFO) << "use all core on CPU!!"; + } +#endif + std::vector> input_data; + read_slot_file(input_data, g_data_path, g_batch_size); + + CHECK_EQ((input_data.size() % 412), 0) << " FATAL ERROR slot num is not right!!! 
"; + + std::vector seq_offset{0}; + for (int i = 1; i < input_data.size() + 1; ++i) { + seq_offset.push_back(seq_offset[i - 1] + input_data[i - 1].size() / 11); + } +// printf_pointer(seq_offset.data(), seq_offset.size()); + + Graph *graph = new Graph(); + LOG(INFO) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + int total_feature_size = seq_offset[seq_offset.size() - 1]; // this is feature_size + int slot = 412; // this is slots num + int max_batch = 2048; // the possible max batch + + // reshape the input_0 's shape for graph model + Shape shape({max_batch, 1, total_feature_size / max_batch, 11}, Layout_NCHW); + + graph->Reshape("input_0", shape); +// graph->ResetBatchSize("input_0", g_batch_size); + LOG(INFO) << "g_batch_size = " << g_batch_size; + //anakin graph optimization + graph->Optimize(); + Net net_executer(true); + + net_executer.init(*graph); + // get in + auto ins = graph->get_ins(); + auto d_tensor_in_p = net_executer.get_in(ins[0]); + Shape new_shape({1, 1, total_feature_size, 11}, Layout_NCHW); + + d_tensor_in_p->reshape(new_shape); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i = 0; i < valid_shape_in.size(); i++) { + LOG(INFO) << "detect input_0 dims[" << i << "]" << valid_shape_in[i]; + } + + h_tensor_in.re_alloc(valid_shape_in); + float *h_data = (float *) (h_tensor_in.mutable_data()); + + int idx = 0; + for (auto i : input_data) { + for (auto j : i) { + h_data[idx++] = j; + } + } + d_tensor_in_p->copy_from(h_tensor_in); + d_tensor_in_p->set_seq_offset({seq_offset}); + // do inference + Context ctx(g_device_id, 0, 0); + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! "; + // warm up + for (int i = 0; i < g_warm_up; i++) { + net_executer.prediction(); + } + Tensor h_tensor_out; + h_tensor_out.re_alloc(net_executer.get_out_list()[0]->valid_shape(), AK_FLOAT); + +#ifdef ENABLE_OP_TIMER + net_executer.reset_op_time(); +#endif + + my_time.start(ctx); + //auto start = std::chrono::system_clock::now(); + for (int i = 0; i < g_epoch; i++) { +// d_tensor_in_p->copy_from(h_tensor_in); + //DLOG(ERROR) << " g_epoch(" << i << "/" << g_epoch << ") "; + net_executer.prediction(); +// h_tensor_out.copy_from(*net_executer.get_out_list()[0]); + } +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + my_time.end(ctx); +#ifdef ENABLE_OP_TIMER + net_executer.print_and_reset_optime_summary(g_epoch); +#endif + + LOG(INFO) << "aveage time " << my_time.get_average_ms() / g_epoch << " ms"; + write_tensorfile(*net_executer.get_out_list()[0], "output.txt"); + //} // inner scope over + + LOG(ERROR) << "inner net exe over !"; + + // save the optimized model to disk. 
+ std::string save_g_model_path = g_model_path + std::string(".saved"); + status = graph->save(save_g_model_path); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + if (!graph) { + delete graph; + } +} + +#endif + +int main(int argc, const char **argv) { + if (argc < 2) { + LOG(FATAL) << "no input!!!, usage: ./" << argv[0] + << " model_path input_data_path [batch_size] [device_id]"; + return -1; + } + if (argc > 1) { + g_model_path = std::string(argv[1]); + } + if (argc > 2) { + g_data_path = std::string(argv[2]); + } + if (argc > 3) { + g_batch_size = atoi(argv[3]); + } + if (argc > 4) { + g_device_id = atoi(argv[4]); + } + TargetWrapper::set_device(g_device_id); + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_exec_test_int8.cpp b/test/framework/net/net_exec_test_int8.cpp index a72a29f82..ebf81e8dd 100644 --- a/test/framework/net/net_exec_test_int8.cpp +++ b/test/framework/net/net_exec_test_int8.cpp @@ -19,18 +19,22 @@ using Target_H = X86; //#define USE_DIEPSE -std::string model_path = "/home/zhangshuai20/workspace/baidu/sys-hic-gpu/anakin-models/adu/anakin_models/yolo_camera_detector/yolo_camera_detector.anakin.bin"; -//std::string model_path = "/home/zhangshuai20/workspace/baidu/sys-hic-gpu/anakin-models/public/anakin_models/Resnet50/Resnet50.anakin.bin"; - -std::string model_saved_path = model_path + ".saved"; +//std::string g_model_path = "/home/zhangshuai20/workspace/baidu/sys-hic-gpu/anakin-models/adu/anakin_models/yolo_camera_detector/yolo_camera_detector.anakin.bin"; +std::string g_model_path = "/path/to/your/anakin_model"; +int g_batch_size = 1; +int g_warm_up = 10; +int g_epoch = 1000; +int g_device_id = 0; +//std::string model_path = "/home/zhangshuai20/workspace/baidu/sys-hic-gpu/anakin-models/public/anakin_models/vgg16/vgg16.anakin.bin"; +std::string model_saved_path = g_model_path + ".saved"; #ifdef USE_CUDA #if 1 TEST(NetTest, net_execute_base_test) { - Graph* graph = new Graph(); - LOG(WARNING) << "load anakin model file from " << model_path << " ..."; + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; // load anakin model files. 
- auto status = graph->load(model_path); + auto status = graph->load(g_model_path); if (!status ) { LOG(FATAL) << " [ERROR] " << status.info(); } @@ -41,7 +45,7 @@ TEST(NetTest, net_execute_base_test) { // register all tensor inside graph // graph->RegistAllOut(); - +// graph->load_calibrator_config("net_pt_config", "calibrate_file.txt"); // register edge // graph->RegistOut("conv2_2/expand/scale", "relu2_2/expand"); // graph->RegistOut("relu#3(conv2d_0)","pool2d#4(pool2d_0)"); @@ -53,31 +57,30 @@ TEST(NetTest, net_execute_base_test) { //{ // inner scope #ifdef USE_DIEPSE //Net net_executer(*graph, true); - Net net_executer(true); + Net net_executer(true); #else //Net net_executer(*graph, true); - Net net_executer(true); + Net net_executer(true); #endif - - net_executer.load_calibrator_config("net_pt_config.txt", "cal_file.txt"); +// net_executer.load_x86_layout_config("layout_config.txt"); net_executer.init(*graph); // get in auto d_tensor_in_p = net_executer.get_in("input_0"); Tensor4d h_tensor_in; auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i=0; iget_dtype()); +// float* h_data = (float*)(h_tensor_in.mutable_data()); +// +// for (int i=0; icopy_from(h_tensor_in); +// d_tensor_in_p->copy_from(h_tensor_in); #ifdef USE_DIEPSE // for diepse model @@ -114,23 +117,24 @@ TEST(NetTest, net_execute_base_test) { d_tensor_in_2_p->copy_from(h_tensor_in_2); #endif - int epoch = 1; + int epoch = g_epoch; // do inference Context ctx(0, 0, 0); saber::SaberTimer my_time; LOG(WARNING) << "EXECUTER !!!!!!!! "; // warm up - /*for(int i=0; i<10; i++) { + for (int i = 0; i(end - start).count(); //LOG(WARNING) << "avg time : " << time/epoch <<" ms"; - my_time.end(ctx); - LOG(INFO)<<"aveage time "<save(save_model_path); - if (!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - } +// auto tensor_out_0_p = net_executer.get_out("dim_pred_out"); +// +// +// // get out result +// //LOG(WARNING)<< "result avg: " << tensor_average(tensor_out_0_p); +// test_print(tensor_out_0_p); +// +// // save the optimized model to disk. 
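
Editor's note: the warm-up/measure pattern that recurs in these tests (a few untimed iterations, then an average over g_epoch) can be sketched with plain std::chrono; `run` stands in for net_executer.prediction() and the helper name is illustrative.

#include <chrono>
#include <cstdio>
#include <functional>

// Warm up `warm_up` times, then time `epoch` runs and report avg/min/max in ms.
// On CUDA targets a device synchronization belongs inside the timed region,
// as the cudaDeviceSynchronize() calls in the tests above show.
void benchmark(const std::function<void()>& run, int warm_up, int epoch) {
    for (int i = 0; i < warm_up; ++i) {
        run();
    }
    double total = 0.0, tmin = 1e30, tmax = 0.0;
    for (int i = 0; i < epoch; ++i) {
        auto t0 = std::chrono::steady_clock::now();
        run();
        auto t1 = std::chrono::steady_clock::now();
        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        total += ms;
        tmin = ms < tmin ? ms : tmin;
        tmax = ms > tmax ? ms : tmax;
    }
    std::printf("avg %.3f ms, min %.3f ms, max %.3f ms\n",
                total / epoch, tmin, tmax);
}
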
+// std::string save_model_path = g_model_path + std::string(".saved"); +// status = graph->save(save_model_path); +// if (!status ) { +// LOG(FATAL) << " [ERROR] " << status.info(); +// } if (!graph){ delete graph; } } -#endif +#endif #endif -#ifdef USE_CUDA +#ifdef USE_CUDA2 TEST(NetTest, net_execute_reconstruction_test) { Graph* graph = new Graph(); LOG(WARNING) << "load anakin model file from optimized model " << model_saved_path << " ..."; @@ -234,7 +246,6 @@ TEST(NetTest, net_execute_reconstruction_test) { // constructs the executer net Net net_executer(true); - net_executer.load_calibrator_config("net_pt_config.txt", "cal_file.txt"); net_executer.init(*graph); // get in auto d_tensor_in_p = net_executer.get_in("input_0"); @@ -281,11 +292,29 @@ TEST(NetTest, net_execute_reconstruction_test) { } #endif int main(int argc, const char** argv){ - + if (argc < 2) { + LOG(ERROR) << "no input!!!, usage: ./" << argv[0] << " model_path [batch size] [warm_up_iter] [test_iter] [device_id]"; + return -1; + } + if (argc > 1) { + g_model_path = std::string(argv[1]); + } + if (argc > 2) { + g_batch_size = atoi(argv[2]); + } + if (argc > 3) { + g_warm_up = atoi(argv[3]); + } + if (argc > 4) { + g_epoch = atoi(argv[4]); + } + if (argc > 5) { + g_device_id = atoi(argv[5]); + } Env::env_init(); // initial logger logger::init(argv[0]); InitTest(); - RUN_ALL_TESTS(argv[0]); + RUN_ALL_TESTS(argv[0]); return 0; } diff --git a/test/framework/net/net_exec_test_rt.cpp b/test/framework/net/net_exec_test_rt.cpp deleted file mode 100644 index 8fcb34326..000000000 --- a/test/framework/net/net_exec_test_rt.cpp +++ /dev/null @@ -1,121 +0,0 @@ - -#include -#include "graph_base.h" -#include "graph.h" -#include -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include "saber/funcs/timer.h" -#include -#include "debug.h" -#include - -#ifdef USE_TENSORRT -#include "rt_net.h" -using namespace anakin; -using ::anakin::test::Test; - -using namespace anakin::graph; -std::string g_model_path = "/path/to/your/anakin_model"; - -std::string model_saved_path = g_model_path + ".saved"; -int g_batch_size = 1; -int g_warm_up = 10; -int g_epoch = 1000; -int g_device_id = 0; - - -void rt_net_test() { - Graph* graph = new Graph(); - LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; - // load anakin model files. - auto status = graph->load(g_model_path); - if(!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - - graph->ResetBatchSize("input_0", g_batch_size); - - graph->Optimize(true); - - RTNet net_executer(*graph, NULL); - - // get in - auto d_tensor_in_p = net_executer.get_in("input_0"); - Tensor4d h_tensor_in; - - auto valid_shape_in = d_tensor_in_p->valid_shape(); - for (int i=0; icopy_from(h_tensor_in); - - - //int g_epoch = 1000; - //int g_warm_up=10; - // do inference - LOG(WARNING) << "EXECUTER !!!!!!!! "; - // warm up - for(int i = 0; i < g_warm_up; i++) { - net_executer.prediction(); - } - - //auto start = std::chrono::system_clock::now(); - for(int i = 0; i < g_epoch; i++) { - //DLOG(ERROR) << " g_epoch(" << i << "/" << g_epoch << ") "; - net_executer.prediction(); - } - cudaDeviceSynchronize(); - - //write_tensorfile(*net_executer.get_out_list()[0],"output.txt"); - - LOG(ERROR) << "inner net exe over !"; - for(auto x:net_executer.get_out_list()){ - print_tensor(*x); - } - // save the optimized model to disk. 
- if (!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); - } - if (!graph){ - delete graph; - } -} -#endif - -int main(int argc, const char** argv){ - if (argc < 2){ - LOG(ERROR)<<"no input!!!"; - return; - } -#ifdef USE_TENSORRT - if (argc > 1) { - g_model_path = std::string(argv[1]); - } - if (argc > 2) { - g_batch_size = atoi(argv[2]); - } - if (argc > 3) { - g_warm_up = atoi(argv[3]); - } - if (argc > 4) { - g_epoch = atoi(argv[4]); - } - if (argc > 5) { - g_device_id = atoi(argv[5]); - } - cudaSetDevice(g_device_id); - // initial logger - logger::init(argv[0]); - rt_net_test(); -#endif - return 0; -} diff --git a/test/framework/net/net_exec_test_rt.cpp:28:17: b/test/framework/net/net_exec_test_rt.cpp:28:17: deleted file mode 100644 index c91891721..000000000 --- a/test/framework/net/net_exec_test_rt.cpp:28:17: +++ /dev/null @@ -1 +0,0 @@ -f USE diff --git a/test/framework/net/net_exec_test_x86.cpp b/test/framework/net/net_exec_test_x86.cpp index 4ec56d59c..8b604f8cb 100644 --- a/test/framework/net/net_exec_test_x86.cpp +++ b/test/framework/net/net_exec_test_x86.cpp @@ -7,98 +7,318 @@ //#define USE_DIEPSE -std::string g_model_path = "/home/liujunjie03/py_anakin/tools/external_converter_v2/output/vggish.anakin.bin"; +std::string g_model_path = ""; std::string model_saved_path = g_model_path + ".saved"; int g_batch_size = 1; -int g_warm_up = 10; -int g_epoch = 1000; +int g_warm_up = 0; +int g_epoch = 1; +int g_thread_num = 1; +bool g_random = 0; +int g_instance = 1; +int g_change_batch = 0; +int g_auto_config_layout = 0; +#define USE_FROZEN_INT8 0 #ifdef USE_X86_PLACE -#include +#include "mkl_service.h" #include "omp.h" #if 1 -TEST(NetTest, net_execute_base_test) { +void instance_run() { + + if (g_thread_num != 0) { + omp_set_dynamic(0); + omp_set_num_threads(g_thread_num); + mkl_set_num_threads(g_thread_num); + } else { + LOG(INFO) << "use all core!!"; + } + + LOG(INFO) << "set thread = " << g_thread_num << " , " << mkl_get_max_threads() << "," << + omp_get_max_threads(); + +#if USE_FROZEN_INT8 + Graph* graph = new Graph(); +#else Graph* graph = new Graph(); +#endif LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; // load anakin model files. 
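
Editor's note: instance_run above pins OpenMP and MKL to g_thread_num before running; the same idea in isolation (the MKL call is left as a comment so the snippet builds without mkl_service.h; names are illustrative):

#include <omp.h>
#include <cstdio>

// Fix the CPU math thread count; 0 means "leave the default, i.e. all cores".
void set_cpu_threads(int n) {
    if (n > 0) {
        omp_set_dynamic(0);          // stop the runtime from resizing the team
        omp_set_num_threads(n);
        // mkl_set_num_threads(n);   // enable when linking MKL (mkl_service.h)
    } else {
        std::printf("use all cores\n");
    }
}
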
auto status = graph->load(g_model_path); + if (!status) { LOG(FATAL) << " [ERROR] " << status.info(); } - graph->ResetBatchSize("input_0", g_batch_size); +#if USE_FROZEN_INT8 + +#else + graph->load_calibrator_config("net_pt_config", "cal_file"); + graph->load_layout_config("model_layout_config"); +#endif + // graph->Reshape("input_0",Shape({1,3,400,600},Layout_NCHW)); + std::vector& vin_name = graph->get_ins(); + + for (int j = 0; j < vin_name.size(); ++j) { + graph->ResetBatchSize("input_0", g_batch_size); + } + +#if USE_FROZEN_INT8 + graph->Optimize(false); +#else graph->Optimize(); +#endif +#if USE_FROZEN_INT8 + Net net_executer(true); +#else Net net_executer(true); - net_executer.load_calibrator_config("net_pt_config.txt","cal_file"); - net_executer.init(*graph); +#endif + if (g_auto_config_layout){ + LOG(INFO) << "===================auto_config_layout===================="; + net_executer.init(*graph,true); + }else { +// net_executer.load_x86_layout_config("layout_config_me.txt"); + net_executer.init(*graph); + } // get in -// auto d_tensor_in_p = net_executer.get_in("input_0"); - std::vector& vin_name = graph->get_ins(); + std::vector> seq_offset={{0,g_batch_size}}; + srand(12345); + for (int j = 0; j < vin_name.size(); ++j) { - auto d_tensor_in_p = net_executer.get_in(vin_name[j]); - fill_tensor_const(*d_tensor_in_p, 1.f); + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + // d_tensor_in_p->reshape(Shape({1,3,400,600},Layout_NCHW)); + LOG(INFO) << "input name: " << vin_name[j] << " , " << d_tensor_in_p->valid_shape(); + d_tensor_in_p->set_seq_offset(seq_offset); + if (g_random) { + fill_tensor_rand(*d_tensor_in_p); + } else { + fill_tensor_const(*d_tensor_in_p, 1.f); + } } - // do inference Context ctx(0, 0, 0); saber::SaberTimer my_time; LOG(WARNING) << "EXECUTER !!!!!!!! "; - // warm up - for (int i = 0; i < g_warm_up; i++) { - net_executer.prediction(); - } + + // warm up + for (int i = 0; i < g_warm_up; i++) { + net_executer.prediction(); + } my_time.start(ctx); + + int real_batch=1; for (int i = 0; i < g_epoch; i++) { + if (g_change_batch > 0) { + real_batch = real_batch < g_batch_size ? 
real_batch + 1 : 1; + for (int j = 0; j < vin_name.size(); ++j) { + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + Shape old_shape=d_tensor_in_p->valid_shape(); + old_shape.set_num(real_batch); + d_tensor_in_p->reshape(old_shape); + if (g_random) { + fill_tensor_rand(*d_tensor_in_p); + } else { + fill_tensor_const(*d_tensor_in_p, 1.f); + } + } + } + net_executer.prediction(); } + my_time.end(ctx); - LOG(INFO)<<"aveage time "<save(save_g_model_path); - if (!status ) { - LOG(FATAL) << " [ERROR] " << status.info(); + LOG(INFO) << "g_auto_config_layout:" << g_auto_config_layout; + LOG(INFO) << "average time " << my_time.get_average_ms() / g_epoch << " ms"; + + std::vector& out_name = graph->get_outs(); + + for (int j = 0; j < out_name.size(); ++j) { + LOG(INFO) << "output tensor : " << out_name[j]<<","<valid_shape(); + write_tensorfile(*net_executer.get_out(out_name[j]), out_name[j].c_str()); + } + +#ifdef ENABLE_OP_TIMER + net_executer.print_and_reset_optime_summary(g_warm_up + g_epoch); +#endif + + // std::string save_g_model_path = g_model_path + std::string(".saved"); + // status = graph->save(save_g_model_path); + delete graph; +} +#endif + +void multi_instance_run(){ + std::vector> instances_vec; + for (int i = 0; i < g_instance; ++i) { + instances_vec.emplace_back( + new std::thread(&instance_run)); } - if (!graph){ - delete graph; + for (int i = 0; i < g_instance; ++i) { + instances_vec[i]->join(); } } -#endif +#if 0 +void net_execute_base_test_int8() { + + if (g_thread_num != 0) { + omp_set_dynamic(0); + omp_set_num_threads(g_thread_num); + mkl_set_num_threads(g_thread_num); + } else { + LOG(INFO) << "use all core!!"; + } + + LOG(INFO) << "set thread = " << g_thread_num << " , " << mkl_get_max_threads() << "," << + omp_get_max_threads(); + + Graph* graph = new Graph(); + + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + graph->load_calibrator_config("net_pt_config.txt", "cal_file"); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + // graph->Reshape("input_0",Shape({1,3,400,600},Layout_NCHW)); + std::vector& vin_name = graph->get_ins(); + + for (int j = 0; j < vin_name.size(); ++j) { + graph->ResetBatchSize("input_0", g_batch_size); + } + + graph->Optimize(); + + Net net_executer(true); + net_executer.load_x86_layout_config("layout_config_me.txt"); + net_executer.init(*graph); + // get in + + srand(12345); + + for (int j = 0; j < vin_name.size(); ++j) { + Tensor* d_tensor_in_p = net_executer.get_in(vin_name[j]); + // d_tensor_in_p->reshape(Shape({1,3,400,600},Layout_NCHW)); + LOG(INFO) << "input name: " << vin_name[j] << " , " << d_tensor_in_p->valid_shape(); + + if (g_random) { + fill_tensor_rand(*d_tensor_in_p); + } else { + fill_tensor_const(*d_tensor_in_p, 1.f); + } + } + + // do inference + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; + + // warm up + for (int i = 0; i < g_warm_up; i++) { + net_executer.prediction(); + } + + my_time.start(ctx); + + for (int i = 0; i < g_epoch; i++) { + net_executer.prediction(); + } + + my_time.end(ctx); + LOG(INFO) << "average time " << my_time.get_average_ms() / g_epoch << " ms"; + std::vector& out_name = graph->get_outs(); -int main(int argc, const char** argv){ - if (argc < 2){ - LOG(ERROR)<<"no input!!!"; - return; + for (int j = 0; j < out_name.size(); ++j) { + LOG(INFO) << "output tensor : " << out_name[j]<<","<valid_shape(); + write_tensorfile(*net_executer.get_out(out_name[j]), out_name[j].c_str()); } + +#ifdef ENABLE_OP_TIMER + net_executer.print_and_reset_optime_summary(g_warm_up + g_epoch); +#endif + + // std::string save_g_model_path = g_model_path + std::string(".saved"); + // status = graph->save(save_g_model_path); +} +#endif + +/** + * g_model_path 模型地址 + * g_batch_size batch大小,默认1 + * g_warm_up 预热次数,默认0 + * g_epoch 计时次数,默认1 + * g_thread_num 用到的线程数,默认1 + * g_random 是否是随机数输入,默认是,0代表常量输入 + * @param argc + * @param argv + * @return + */ + +int main(int argc, const char** argv) { +// LOG(INFO)<<"kmp_get_affinity_max_proc = "< 1) { g_model_path = std::string(argv[1]); } + if (argc > 2) { g_batch_size = atoi(argv[2]); } + if (argc > 3) { g_warm_up = atoi(argv[3]); } + if (argc > 4) { g_epoch = atoi(argv[4]); } - Env::env_init(); + if (argc > 5) { + g_thread_num = atoi(argv[5]); + } + + if (argc > 6) { + g_random = atoi(argv[6]); + } + + if (argc > 7) { + g_auto_config_layout = atoi(argv[7]); + } + + if (argc > 8) { + g_instance = atoi(argv[8]); + } + + if (argc > 9) { + g_change_batch = atoi(argv[9]); + } + + + + Env::env_init(); // initial logger logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; + + multi_instance_run(); + + return 0; } #else -int main(int argc, const char** argv){ - +int main(int argc, const char** argv) { + return 0; } #endif diff --git a/test/framework/net/net_subgraph_test.cpp b/test/framework/net/net_subgraph_test.cpp new file mode 100644 index 000000000..af2a53b71 --- /dev/null +++ b/test/framework/net/net_subgraph_test.cpp @@ -0,0 +1,613 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#include +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + + +TEST(NetTest, net_execute_subgraph_0) { + Graph* graph = new Graph(); + + std::vector input{"x"}; + std::vector output{"y"}; + + graph->AddOp("op1", "Dense", input, output); + graph->AddOpAttr("op1", "out_dim", 2); + graph->AddOpAttr("op1", "bias_term", false); + graph->AddOpAttr("op1", "axis", 3); + std::vector shape = {1, 1, 3, 2}; + anakin::saber::Shape tmp_shape{shape}; + PBlock weight1(tmp_shape); + float *cpu_data = static_cast(weight1.h_tensor().mutable_data()); + for (int i = 0; i < 2 * 3; i++) { cpu_data[i] = i + 1; } + + weight1.d_tensor().set_shape(tmp_shape); + weight1.d_tensor().copy_from(weight1.h_tensor()); + + graph->AddOpAttr("op1", "weight_1", weight1); + + graph->Freeze(); + + for (auto in : graph->get_ins()) { + LOG(INFO) << "get in: " << in; + } + + for (auto out : graph->get_outs()) { + LOG(INFO) << "get out: " << out; + } + + //anakin graph optimization + graph->Optimize(); + + anakin::PTuple input_shape = {1, 1, 1, 3}; + graph->AddOpAttr("x", "input_shape", input_shape); 
+ + //Net net_executer(true); + std::unique_ptr > net_executer_p(new Net(true)); + + + net_executer_p->init(*graph); + + auto d_tensor_in_p = net_executer_p->get_in("x"); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + + net_executer_p->prediction(); + + auto tensor_out = net_executer_p->get_out("y"); + LOG(INFO) << "get output tensor:"; + test_print(tensor_out); +} + +TEST(NetTest, net_execute_subgraph_three_fc_with_split) { + Graph* graph = new Graph(); + + auto add_fc_op = [&](const std::string& fc_name, + const std::vector& input, + const std::vector& output) { + graph->AddOp(fc_name, "Dense", input, output); + graph->AddOpAttr(fc_name, "out_dim", 5); + graph->AddOpAttr(fc_name, "bias_term", false); + graph->AddOpAttr(fc_name, "axis", 1); + std::vector shape = {1, 1, 5, 5}; + anakin::saber::Shape tmp_shape{shape}; + PBlock weight1(tmp_shape); + float *cpu_data = static_cast(weight1.h_tensor().mutable_data()); + for (int i = 0; i < 5*5; i++) { cpu_data[i] = i + 1; } + + weight1.d_tensor().set_shape(tmp_shape); + weight1.d_tensor().copy_from(weight1.h_tensor()); + + graph->AddOpAttr(fc_name, "weight_1", weight1); + + }; + + add_fc_op("op1", {"op1_in"}, {"temp"}); + add_fc_op("op2", {"temp"}, {"op2_out"}); + add_fc_op("op3", {"temp"}, {"op3_out"}); + + auto status = graph->Freeze(); + if (!status){ + LOG(FATAL) << "Freeze error"; + } + + for (auto in : graph->get_ins()) { + LOG(INFO) << "get in: " << in; + } + + for (auto out : graph->get_outs()) { + LOG(INFO) << "get out: " << out; + } + + //anakin graph optimization + graph->Optimize(); + + // save the optimized model to disk. + std::string save_model_path = std::string("subgraph.saved"); + status = graph->save(save_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + + anakin::PTuple input_shape = {1, 5, 1, 1}; + graph->AddOpAttr("op1_in", "input_shape", input_shape); + + //Net net_executer(true); + std::unique_ptr > net_executer_p(new Net(true)); + + + net_executer_p->init(*graph); + + auto d_tensor_in_p = net_executer_p->get_in("op1_in"); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + + net_executer_p->prediction(); + + auto tensor_out_2 = net_executer_p->get_out("op2_out"); + LOG(INFO) << "get output tensor 2:"; + test_print(tensor_out_2); + auto tensor_out_3 = net_executer_p->get_out("op3_out"); + LOG(INFO) << "get output tensor 3:"; + test_print(tensor_out_3); + + +} + +TEST(NetTest, net_execute_subgraph_mult_fc) { + Graph* graph = new Graph(); + + auto add_fc_op = [&](const std::string& fc_name, + const std::vector& input, + const std::vector& output) { + graph->AddOp(fc_name, "Dense", input, output); + graph->AddOpAttr(fc_name, "out_dim", 1); + graph->AddOpAttr(fc_name, "bias_term", false); + graph->AddOpAttr(fc_name, "axis", 1); + std::vector shape = {1, 1, 1, 1}; + anakin::saber::Shape tmp_shape{shape}; + PBlock weight1(tmp_shape); + float *cpu_data = static_cast(weight1.h_tensor().mutable_data()); + for (int i = 0; i < 1*1; i++) { cpu_data[i] = i + 1; } + + weight1.d_tensor().set_shape(tmp_shape); + weight1.d_tensor().copy_from(weight1.h_tensor()); + + graph->AddOpAttr(fc_name, "weight_1", weight1); + + }; + auto add_concat_op = [&](const std::string& cc_name, + const std::vector& input, + const std::vector& output) { + graph->AddOp(cc_name, "concat", input, output); + graph->AddOpAttr(cc_name, "axis", 3); + }; + + auto add_relu_op = [&](const 
std::string& relu_name, + const std::vector& input, + const std::vector& output){ + graph->AddOp(relu_name, "ReLU", input, output); + graph->AddOpAttr(relu_name, "alpha", 0.0f); + }; + + + + add_fc_op("op0", {"x"}, {"out0"}); + add_fc_op("op1", {"x"}, {"out1"}); + add_fc_op("op2", {"x"}, {"out2"}); + add_fc_op("op3", {"x"}, {"out3"}); + add_fc_op("op4", {"x"}, {"out4"}); + add_fc_op("op5", {"x"}, {"out5"}); + add_fc_op("op6", {"x"}, {"out6"}); + add_concat_op("concat", {"out0", "out1", "out2", "out3", "out4", "out5", "out6"}, {"out_concat"}); + add_relu_op("relu", {"out_concat"}, {"out"}); + + + // this api should be called before freeze + graph->RegistVar("out0"); + + auto status = graph->Freeze(); + if (!status){ + LOG(FATAL) << "Freeze error"; + } + + for (auto in : graph->get_ins()) { + LOG(INFO) << "get in: " << in; + } + + for (auto out : graph->get_outs()) { + LOG(INFO) << "get out: " << out; + } + + //anakin graph optimization + graph->Optimize(); + + // save the optimized model to disk. + std::string save_model_path = std::string("multi_fc_subgraph_with_regist_input.saved2"); + status = graph->save(save_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + + anakin::PTuple input_shape = {1, 1, 1, 1}; + graph->AddOpAttr("x", "input_shape", input_shape); + + //Net net_executer(true); + std::unique_ptr > net_executer_p(new Net(true)); + + + net_executer_p->init(*graph); + + auto d_tensor_in_p = net_executer_p->get_in("x"); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + + net_executer_p->prediction(); + + //auto tensor_out = net_executer_p->get_out("out"); + //LOG(INFO) << "get output tensor"; + //test_print(tensor_out); +} + +TEST(NetTest, net_execute_subgraph_concat) { + Graph* graph = new Graph(); + + auto add_concat_op = [&](const std::string& cc_name, + const std::vector& input, + const std::vector& output) { + graph->AddOp(cc_name, "concat", input, output); + graph->AddOpAttr(cc_name, "axis", 3); + }; + + add_concat_op("concat_1", {"x", "y"}, {"out"}); + + auto status = graph->Freeze(); + if (!status){ + LOG(FATAL) << "Freeze error"; + } + + for (auto in : graph->get_ins()) { + LOG(INFO) << "get in: " << in; + } + + for (auto out : graph->get_outs()) { + LOG(INFO) << "get out: " << out; + } + + //anakin graph optimization + graph->Optimize(); + + // save the optimized model to disk. 
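
Editor's note: the concat op above joins its inputs along axis 3, the innermost W dimension of NCHW, so for row-major storage it is a per-row interleave; with the {1,1,5,1} and {1,1,5,3} inputs used in this test the result is {1,1,5,4}. A minimal sketch with illustrative names:

#include <algorithm>
#include <cstddef>
#include <vector>

// Concatenate two row-major NCHW tensors along axis 3 (W). `rows` is N*C*H.
std::vector<float> concat_axis3(const std::vector<float>& a, int wa,
                                const std::vector<float>& b, int wb, int rows) {
    std::vector<float> out(static_cast<size_t>(rows) * (wa + wb));
    for (int r = 0; r < rows; ++r) {
        std::copy(a.begin() + r * wa, a.begin() + (r + 1) * wa,
                  out.begin() + static_cast<size_t>(r) * (wa + wb));
        std::copy(b.begin() + r * wb, b.begin() + (r + 1) * wb,
                  out.begin() + static_cast<size_t>(r) * (wa + wb) + wa);
    }
    return out;
}
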
+ std::string save_model_path = std::string("concat_subgraph.saved2"); + status = graph->save(save_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + + anakin::PTuple input_shape_x = {1, 1, 5, 1}; + graph->AddOpAttr("x", "input_shape", input_shape_x); + anakin::PTuple input_shape_y = {1, 1, 5, 3}; + graph->AddOpAttr("y", "input_shape", input_shape_y); + + + //Net net_executer(true); + std::unique_ptr > net_executer_p(new Net(true)); + + + net_executer_p->init(*graph); + + auto xd_tensor_in_p = net_executer_p->get_in("x"); + auto yd_tensor_in_p = net_executer_p->get_in("y"); + auto fill_tensor = [&](Tensor4d * d_tensor_p, float val) { + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + }; + + fill_tensor(xd_tensor_in_p, 1.0); + fill_tensor(yd_tensor_in_p, 2.0); + + + net_executer_p->prediction(); + + auto tensor_out = net_executer_p->get_out("out"); + LOG(INFO) << "get output tensor"; + test_print(tensor_out); +} + +TEST(NetTest, net_execute_subgraph_eltwise) { + Graph* graph = new Graph(); + + auto add_eltwise_op = [&](const std::string& eltwise_name, + const std::vector& input, + const std::vector& output) { + graph->AddOp(eltwise_name, "Eltwise", input, output); + graph->AddOpAttr(eltwise_name, "type", std::string("Add")); + anakin::PTuple coeff; + coeff.push_back(1.0); + coeff.push_back(-1.0); + LOG(INFO) << "coeff[0] " << coeff[0]; + //LOG(INFO) << "coeff[1] " << coeff[1]; + graph->AddOpAttr(eltwise_name, "coeff", coeff); + + }; + + add_eltwise_op("eltwise", {"x", "y"}, {"out"}); + + auto status = graph->Freeze(); + if (!status){ + LOG(FATAL) << "Freeze error"; + } + + for (auto in : graph->get_ins()) { + LOG(INFO) << "get in: " << in; + } + + for (auto out : graph->get_outs()) { + LOG(INFO) << "get out: " << out; + } + + //anakin graph optimization + graph->Optimize(); + + // save the optimized model to disk. 
+ std::string save_model_path = std::string("eltwise_subgraph.saved2"); + status = graph->save(save_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + + anakin::PTuple input_shape_x = {1, 1, 1, 3}; + graph->AddOpAttr("x", "input_shape", input_shape_x); + anakin::PTuple input_shape_y = {1, 1, 1, 3}; + graph->AddOpAttr("y", "input_shape", input_shape_y); + + + //Net net_executer(true); + std::unique_ptr > net_executer_p(new Net(true)); + + + net_executer_p->init(*graph); + + auto xd_tensor_in_p = net_executer_p->get_in("x"); + auto yd_tensor_in_p = net_executer_p->get_in("y"); + auto fill_tensor = [&](Tensor4d * d_tensor_p, float val) { + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + }; + + fill_tensor(xd_tensor_in_p, 2.0); + fill_tensor(yd_tensor_in_p, 3.0); + + + net_executer_p->prediction(); + + auto tensor_out = net_executer_p->get_out("out"); + LOG(INFO) << "get output tensor"; + test_print(tensor_out); +} + +TEST(NetTest, net_execute_subgraph_resnet_base_arch) { + Graph* graph = new Graph(); + + auto add_conv_op = [&](const std::string& conv_name, + const std::vector& input, + const std::vector& output) { + graph->AddOp(conv_name, "Convolution", input, output); + graph->AddOpAttr(conv_name, "group", 1); + graph->AddOpAttr(conv_name, "bias_term", false); + graph->AddOpAttr>(conv_name, "padding", {0, 0}); + graph->AddOpAttr>(conv_name, "strides", {1, 1}); + graph->AddOpAttr>(conv_name, "dilation_rate", {0, 0}); + graph->AddOpAttr(conv_name, "filter_num", 1); + graph->AddOpAttr>(conv_name, "kernel_size", {1, 1}); + graph->AddOpAttr(conv_name, "axis", 1); + + std::vector shape = {1, 1, 1, 1}; + anakin::saber::Shape tmp_shape{shape}; + auto* weight1 = graph::GraphGlobalMem::Global().template new_block(tmp_shape); + float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); + for (int i = 0; i < 1*1; i++) { cpu_data[i] = i + 1; } + + weight1->d_tensor().set_shape(tmp_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + + graph->AddOpAttr(conv_name, "weight_1", *weight1); + + }; + auto add_relu_op = [&](const std::string& relu_name, + const std::vector& input, + const std::vector& output){ + graph->AddOp(relu_name, "ReLU", input, output); + graph->AddOpAttr(relu_name, "alpha", 0.0f); + }; + + auto add_eltwise_op = [&](const std::string& eltwise_name, + const std::vector& input, + const std::vector& output) { + graph->AddOp(eltwise_name, "Eltwise", input, output); + graph->AddOpAttr(eltwise_name, "type", std::string("Add")); + anakin::PTuple coeff; + coeff.push_back(1.0); + coeff.push_back(1.0); + graph->AddOpAttr(eltwise_name, "coeff", coeff); + + }; + + add_conv_op("conv_0", {"x"}, {"conv_0_out"}); + add_relu_op("conv_0_relu", {"conv_0_out"}, {"conv_0_relu_out"}); + add_conv_op("conv_1", {"conv_0_relu_out"}, {"conv_1_out"}); + add_eltwise_op("eltwise", {"conv_1_out", "conv_0_relu_out"}, {"out"}); + + auto status = graph->Freeze(); + if (!status){ + LOG(FATAL) << "Freeze error"; + } + + for (auto in : graph->get_ins()) { + LOG(INFO) << "get in: " << in; + } + + for (auto out : graph->get_outs()) { + LOG(INFO) << "get out: " << out; + } + + //anakin graph optimization + graph->Optimize(); + + // save the optimized model to disk. 
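
Editor's note: reading the Eltwise "Add" op with coefficients as out[i] = sum_k coeff[k] * in_k[i] (which is what the coeff attribute suggests, stated here as an assumption), the eltwise test above that fills x with 2 and y with 3 under coeff {1, -1} should yield -1 everywhere, while the residual-style test uses coeff {1, 1} as a plain skip-connection sum. A standalone sketch:

#include <cassert>
#include <cstddef>
#include <vector>

// out[i] = sum_k coeff[k] * inputs[k][i]; with coeff {1, -1} and inputs filled
// with 2 and 3 this is -1 everywhere, with coeff {1, 1} it is a plain sum.
std::vector<float> eltwise_weighted_add(
        const std::vector<std::vector<float>>& inputs,
        const std::vector<float>& coeff) {
    assert(!inputs.empty() && inputs.size() == coeff.size());
    std::vector<float> out(inputs[0].size(), 0.f);
    for (size_t k = 0; k < inputs.size(); ++k) {
        for (size_t i = 0; i < out.size(); ++i) {
            out[i] += coeff[k] * inputs[k][i];
        }
    }
    return out;
}
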
+ std::string save_model_path = std::string("resnet_subgraph.saved2"); + status = graph->save(save_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + + anakin::PTuple input_shape_x = {1, 1, 1, 1}; + graph->AddOpAttr("x", "input_shape", input_shape_x); + + + //Net net_executer(true); + std::unique_ptr > net_executer_p(new Net(true)); + + + net_executer_p->init(*graph); + + auto xd_tensor_in_p = net_executer_p->get_in("x"); + auto fill_tensor = [&](Tensor4d * d_tensor_p, float val) { + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + }; + + fill_tensor(xd_tensor_in_p, 1.0); + + + net_executer_p->prediction(); + + auto tensor_out = net_executer_p->get_out("out"); + LOG(INFO) << "get output tensor"; + test_print(tensor_out); +} + +TEST(NetTest, net_execute_subgraph_test_share_from) { + // construct base gpu tensor + std::vector shape = {1, 1, 1, 5}; + anakin::saber::Shape tmp_shape{shape}; + Tensor4d d_tensor(tmp_shape); + + auto fill_tensor = [&](Tensor4d * d_tensor_p, float val) { + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + }; + + fill_tensor(&d_tensor, 42.0f); + + + Tensor4d shallow_d_tensor;//(tmp_shape); + shallow_d_tensor.reshape(tmp_shape); + { + // construct shallow copy gpu tensor + //Context ctx(0, 0, 0); + //saber::SaberTimer my_time; + //my_time.start(ctx); + Tensor4d temp_tensor(d_tensor.mutable_data(), Target(), 0, tmp_shape); + //my_time.end(ctx); + //LOG(INFO)<<"aveage time "<::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_subgraph_test2.cpp b/test/framework/net/net_subgraph_test2.cpp new file mode 100644 index 000000000..2114eac0f --- /dev/null +++ b/test/framework/net/net_subgraph_test2.cpp @@ -0,0 +1,82 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#include +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + +std::string g_model_path = "/home/cuichaowen/baidu/Anakin-2.0/buil/not_fuse_before_net_init.bin"; + +TEST(NetTest, net_execute_base_test) { + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. 
+ auto status = graph->load(g_model_path); + if(!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + LOG(INFO)<<"net_execute_base_test"; + +#if 0 + graph->Reshape("data", {1, 3, 227, 958}); // right results +#else + graph->Reshape("data", {1, 3, 1500, 1500}); // wrong results +#endif + + graph->Optimize(); + + Net net_executer(true); + + net_executer.init(*graph); + + auto d_tensor_in_p = net_executer.get_in("data"); + + d_tensor_in_p->reshape(Shape({1, 3, 227, 958})); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + + net_executer.prediction(); + + auto* tensor_out_0_p = net_executer.get_out("detection_output_0.tmp_0662"); + print_tensor_valid(*tensor_out_0_p); + + + delete graph; +} + + + +int main(int argc, const char** argv){ + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/net_test.h b/test/framework/net/net_test.h index af0e21987..7a662da8d 100644 --- a/test/framework/net/net_test.h +++ b/test/framework/net/net_test.h @@ -5,12 +5,12 @@ You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. */ #ifndef ANAKIN_NET_TEST_H @@ -31,7 +31,7 @@ using ::anakin::test::Test; using namespace anakin::graph; /** - * \brief Graph test is base Test class for anakin graph funciton. + * \brief Graph test is base Test class for anakin graph funciton. 
*/ class NetTest: public Test { public: @@ -69,6 +69,17 @@ void test_print(Tensor4dPtr& out_tensor_p) { } #endif +#ifdef USE_ARM_PLACE +void test_print(Tensor4dPtr& out_tensor_p) { + LOG(ERROR) << "result count : " << out_tensor_p->valid_shape().count(); + LOG(INFO) << "output num:" << out_tensor_p->valid_size(); + float * data = (float*)(out_tensor_p->mutable_data()); + for (int i = 0; i < out_tensor_p->valid_size(); i++) { + LOG(INFO) << " GET OUT (" << i << ") " << data[i]; + } +} +#endif + template double tensor_average(Tensor4dPtr& out_tensor_p) { double sum = 0.0f; diff --git a/test/framework/net/ps_content2_test.cpp b/test/framework/net/ps_content2_test.cpp new file mode 100644 index 000000000..c0ad7a958 --- /dev/null +++ b/test/framework/net/ps_content2_test.cpp @@ -0,0 +1,628 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include +#include "saber/core/tensor_op.h" + +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + + + +//#define USE_DIEPSE + +//std::string g_model_path = "/home/lxc/projects/models/converter_lego/output/ps.anakin.bin"; +std::string g_model_path = "/home/chengyujuan/baidu/sys-hic-gpu/anakin-models/ps/content2.0/content_dnn_2.0.anakin.bin"; +//std::string g_model_path = "/home/lxc/projects/anakin_icode/Anakin-2.0/tools/external_converter_v2/output/ps.anakin.bin"; +//std::string g_data_path = "/home/lxc/projects/models/test_data/test_40.txt"; +//std::string g_data_path = "/home/lxc/projects/models/test_data/fake.txt"; +//std::string g_data_path = "/home/lxc/projects/models/test_data/sample_by_query_length.expand.sample_url"; +std::string g_data_path = "/home/chengyujuan/ps_content_test_data"; +int g_batch_size = 1; +int g_epoch = 1; + + +std::string model_saved_path = g_model_path + ".saved"; + +// some data pre-handle funcs. 
+namespace test_ps { + std::vector input_names{"q_basic_input", "q_bigram0_input", "q_bigram1_input", "pt_basic_input", + "pt_bigram0_input", "pt_bigram1_input", "pa_basic_input", "pa_bigram0_input", "pa_bigram1_input"}; + size_t query_len = 50; + size_t batch_size = 1; + std::vector inputed_lines; + void set_batch_size (int bs) {batch_size = bs;} + + void load_input_lines(char *filename) { + static const int max_line_buf_size = 100 * 1024 * 1024; + char *line_buffer = (char *)calloc(max_line_buf_size, sizeof(char)); + FILE *input_file = fopen(filename, "r"); + + while (fgets(line_buffer, max_line_buf_size, input_file)) { + // trim newline at end + char *pos = NULL; + if ((pos = strchr(line_buffer, '\n')) != NULL){ + *pos = 0; + } + inputed_lines.push_back(line_buffer); + } + free(line_buffer); + line_buffer = NULL; + fclose(input_file); + } + + void split2( + const std::string& main_str, + std::vector& str_list, + const std::string & delimiter) { + size_t pre_pos = 0; + size_t position = 0; + std::string tmp_str; + + str_list.clear(); + if (main_str.empty()) { + return; + } + + while ((position = main_str.find(delimiter, pre_pos)) != std::string::npos) { + tmp_str.assign(main_str, pre_pos, position - pre_pos); + str_list.push_back(tmp_str); + pre_pos = position + 1; + } + + tmp_str.assign(main_str, pre_pos, main_str.length() - pre_pos); + + if (!tmp_str.empty()) { + str_list.push_back(tmp_str); + } + } + +/* + int string_to_id_buffer( + float* out_buffer, const int capacity, const std::string& str) { + std::vector id_strs; + split2(str, id_strs, std::string(" ")); + if ((int)id_strs.size() > capacity){ + fprintf(stderr, "input length(%lu) is larger than capacity(%d)\n", + id_strs.size(), capacity); + return -1; + } + for (size_t i = 0; i < id_strs.size(); i++){ + out_buffer[i] = static_cast(atof(id_strs[i].c_str())); + } + return id_strs.size(); + } +*/ +#ifdef USE_CUDA + int batch_string_to_input(const std::vector &line_vec, Net& net_executer){ + + size_t input_size = input_names.size(); + std::vector > h_inputs(input_size); + std::vector* > d_inputs(input_size); + for (size_t i = 0; i < input_size; i++) { + d_inputs[i] = net_executer.get_in(input_names[i]); + } + + std::vector > offset; + offset.resize(input_size); + int batch = line_vec.size(); + for (size_t i = 0; i < input_size; i++) { + offset[i].resize(batch + 1); + offset[i][0] = 0; + } + // determin inputs' shape. + std::vector> number_strs(line_vec.size()); + std::vector query_shapes(input_size); + for (size_t i = 0; i < input_size; i++) { + query_shapes[i][0] = 0; + query_shapes[i][1] = 1; + query_shapes[i][2] = 1; + query_shapes[i][3] = 1; + } + + for (size_t i = 0; i < line_vec.size(); i++) { + split2(line_vec[i], number_strs[i], ";"); + if (number_strs[i].size() < input_size + 1){ + fprintf(stderr, "input slots is no enough, has %lu expect %lu", + number_strs[i].size(), input_size); + return -1; + } + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { +// split2(number_strs[i][j+1], tmp, std::string(" ")); +// query_shapes[j][0] += tmp.size(); +// add the case that input's empty + if (number_strs[i][j+1].empty()) { + query_shapes[j][0] += 1; + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + query_shapes[j][0] += tmp.size(); + } + offset[j][i+1] = query_shapes[j][0]; + } + } + + //reshape + for (size_t i = 0; i < input_size; i++) { + h_inputs[i].reshape(query_shapes[i]); + d_inputs[i]->reshape(query_shapes[i]); + } + // feed inputs. 
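
Editor's note: what batch_string_to_input is doing above, stripped of the Anakin tensor plumbing: each line carries several slots separated by ';' (field 0 is a key), each slot is a space-separated id list, an empty slot still occupies one row that is later filled with the padding id -1, and the per-slot offsets are running row totals. A sketch with illustrative names:

#include <cstddef>
#include <string>
#include <vector>

// Count how many id rows a slot contributes; an empty slot is padded to one row.
static size_t slot_length(const std::string& slot) {
    if (slot.empty()) {
        return 1;               // padding row, filled with -1 when feeding
    }
    size_t n = 1;
    for (char c : slot) {
        if (c == ' ') {
            ++n;
        }
    }
    return n;
}

// offsets[j] holds the running row counts for slot j across the batch:
// offsets[j][b+1] - offsets[j][b] is the length of slot j in line b.
std::vector<std::vector<int>> slot_offsets(
        const std::vector<std::vector<std::string>>& batch /* [line][slot] */) {
    const size_t num_slots = batch.empty() ? 0 : batch[0].size();
    std::vector<std::vector<int>> offsets(num_slots, std::vector<int>{0});
    for (const auto& line : batch) {
        for (size_t j = 0; j < num_slots; ++j) {
            offsets[j].push_back(offsets[j].back() +
                                 static_cast<int>(slot_length(line[j])));
        }
    }
    return offsets;
}
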
+ for (size_t i = 0; i < line_vec.size(); i++) { + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + float* h_data = (float*)h_inputs[j].mutable_data(); + h_data = h_data + offset[j][i]; + if (number_strs[i][j+1].empty()) { + h_data[0] = -1; //padding_idx == -1. + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + for (size_t i = 0; i < tmp.size(); i++){ + h_data[i] = static_cast(atof(tmp[i].c_str())); + } + } + } + } + for (size_t i = 0; i < input_size; i++) { + d_inputs[i]->copy_from(h_inputs[i]); + d_inputs[i]->set_seq_offset({offset[i]}); + } + + return 0; + } +#endif + +#ifdef USE_X86_PLACE +int batch_string_to_input(const std::vector &line_vec, Net& net_executer){ + + size_t input_size = input_names.size(); + std::vector > h_inputs(input_size); + std::vector* > d_inputs(input_size); + for (size_t i = 0; i < input_size; i++) { + d_inputs[i] = net_executer.get_in(input_names[i]); + } + + std::vector > offset; + offset.resize(input_size); + int batch = line_vec.size(); + for (size_t i = 0; i < input_size; i++) { + offset[i].resize(batch + 1); + offset[i][0] = 0; + } + // determin inputs' shape. + std::vector> number_strs(line_vec.size()); + std::vector query_shapes(input_size); + for (size_t i = 0; i < input_size; i++) { + query_shapes[i][0] = 0; + query_shapes[i][1] = 1; + query_shapes[i][2] = 1; + query_shapes[i][3] = 1; + } + + for (size_t i = 0; i < line_vec.size(); i++) { + split2(line_vec[i], number_strs[i], ";"); + if (number_strs[i].size() < input_size + 1){ + fprintf(stderr, "input slots is no enough, has %lu expect %lu", + number_strs[i].size(), input_size); + return -1; + } + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { +// split2(number_strs[i][j+1], tmp, std::string(" ")); +// query_shapes[j][0] += tmp.size(); +// add the case that input's empty + if (number_strs[i][j+1].empty()) { + query_shapes[j][0] += 1; + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + query_shapes[j][0] += tmp.size(); + } + offset[j][i+1] = query_shapes[j][0]; + } + } + + //reshape + for (size_t i = 0; i < input_size; i++) { + h_inputs[i].reshape(query_shapes[i]); + d_inputs[i]->reshape(query_shapes[i]); + } + // feed inputs. + for (size_t i = 0; i < line_vec.size(); i++) { + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + float* h_data = (float*)h_inputs[j].mutable_data(); + h_data = h_data + offset[j][i]; + if (number_strs[i][j+1].empty()) { + h_data[0] = -1; //padding_idx == -1. + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + for (size_t i = 0; i < tmp.size(); i++){ + h_data[i] = static_cast(atof(tmp[i].c_str())); + } + } + } + } + for (size_t i = 0; i < input_size; i++) { + d_inputs[i]->copy_from(h_inputs[i]); + d_inputs[i]->set_seq_offset({offset[i]}); + } + + return 0; + } +#endif + +#ifdef USE_CUDA + int batch_string_to_input(const std::vector &line_vec, Net& net_executer){ + + size_t input_size = input_names.size(); + std::vector > h_inputs(input_size); + std::vector* > d_inputs(input_size); + for (size_t i = 0; i < input_size; i++) { + d_inputs[i] = net_executer.get_in(input_names[i]); + } + + std::vector > offset; + offset.resize(input_size); + int batch = line_vec.size(); + for (size_t i = 0; i < input_size; i++) { + offset[i].resize(batch + 1); + offset[i][0] = 0; + } + // determin inputs' shape. 
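+    // How shapes and offsets are built (worked example, values illustrative):
+    // every query_shapes[j] starts as {0, 1, 1, 1}; dim 0 accumulates the total
+    // number of ids for slot j across the batch, and offset[j] keeps the running
+    // prefix sum per line. E.g. a batch of two lines contributing 3 and 5 ids to
+    // slot j ends with query_shapes[j][0] == 8 and offset[j] == {0, 3, 8}.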
+ std::vector> number_strs(line_vec.size()); + Shape temp({0, 0, 0, 0}); + std::vector query_shapes(input_size, temp); + // for (size_t i = 0; i < input_size; i++) { + // query_shapes[i]({0, 0, 0, 0}); + // } + for (size_t i = 0; i < input_size; i++) { + query_shapes[i][0] = 0; + query_shapes[i][1] = 1; + query_shapes[i][2] = 1; + query_shapes[i][3] = 1; + } + + for (size_t i = 0; i < line_vec.size(); i++) { + split2(line_vec[i], number_strs[i], ";"); + if (number_strs[i].size() < input_size + 1){ + fprintf(stderr, "input slots is no enough, has %lu expect %lu", + number_strs[i].size(), input_size); + return -1; + } + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + // split2(number_strs[i][j+1], tmp, std::string(" ")); + // query_shapes[j][0] += tmp.size(); + // add the case that input's empty + if (number_strs[i][j+1].empty()) { + query_shapes[j][0] += 1; + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + query_shapes[j][0] += tmp.size(); + } + offset[j][i+1] = query_shapes[j][0]; + } + } + + //reshape + for (size_t i = 0; i < input_size; i++) { + h_inputs[i].reshape(query_shapes[i]); + d_inputs[i]->reshape(query_shapes[i]); + } + // feed inputs. + for (size_t i = 0; i < line_vec.size(); i++) { + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + float* h_data = (float*)h_inputs[j].mutable_data(); + h_data = h_data + offset[j][i]; + if (number_strs[i][j+1].empty()) { + h_data[0] = -1; //padding_idx == -1. + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + for (size_t i = 0; i < tmp.size(); i++){ + h_data[i] = static_cast(atof(tmp[i].c_str())); + } + } + } + } + for (size_t i = 0; i < input_size; i++) { + d_inputs[i]->copy_from(h_inputs[i]); + d_inputs[i]->set_seq_offset({offset[i]}); + } + + return 0; + } +#endif + +// X86 +#ifdef USE_X86_PLACE + int batch_string_to_input(const std::vector &line_vec, Net& net_executer){ + + size_t input_size = input_names.size(); + std::vector > h_inputs(input_size); + std::vector* > d_inputs(input_size); + for (size_t i = 0; i < input_size; i++) { + d_inputs[i] = net_executer.get_in(input_names[i]); + } + + std::vector > offset; + offset.resize(input_size); + int batch = line_vec.size(); + for (size_t i = 0; i < input_size; i++) { + offset[i].resize(batch + 1); + offset[i][0] = 0; + } + // determin inputs' shape. + std::vector> number_strs(line_vec.size()); + Shape temp({0, 0, 0, 0}); + std::vector query_shapes(input_size, temp); + // for (size_t i = 0; i < input_size; i++) { + // query_shapes[i]({0, 0, 0, 0}); + // } + for (size_t i = 0; i < input_size; i++) { + query_shapes[i][0] = 0; + query_shapes[i][1] = 1; + query_shapes[i][2] = 1; + query_shapes[i][3] = 1; + } + + for (size_t i = 0; i < line_vec.size(); i++) { + split2(line_vec[i], number_strs[i], ";"); + if (number_strs[i].size() < input_size + 1){ + fprintf(stderr, "input slots is no enough, has %lu expect %lu", + number_strs[i].size(), input_size); + return -1; + } + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + // split2(number_strs[i][j+1], tmp, std::string(" ")); + // query_shapes[j][0] += tmp.size(); + // add the case that input's empty + if (number_strs[i][j+1].empty()) { + query_shapes[j][0] += 1; + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + query_shapes[j][0] += tmp.size(); + } + offset[j][i+1] = query_shapes[j][0]; + } + } + + //reshape + for (size_t i = 0; i < input_size; i++) { + h_inputs[i].reshape(query_shapes[i]); + d_inputs[i]->reshape(query_shapes[i]); + } + // feed inputs. 
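+    // Note on empty slots (mirrors the CUDA variants above): an empty field has
+    // already reserved exactly one element in query_shapes[j][0], and the loop
+    // below fills that element with the padding id -1.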
+ for (size_t i = 0; i < line_vec.size(); i++) { + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + float* h_data = (float*)h_inputs[j].mutable_data(); + h_data = h_data + offset[j][i]; + if (number_strs[i][j+1].empty()) { + h_data[0] = -1; //padding_idx == -1. + }else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + for (size_t i = 0; i < tmp.size(); i++){ + h_data[i] = static_cast(atof(tmp[i].c_str())); + } + } + } + } + for (size_t i = 0; i < input_size; i++) { + d_inputs[i]->copy_from(h_inputs[i]); + d_inputs[i]->set_seq_offset({offset[i]}); + } + + return 0; + } +#endif + +} // namespace test_ps. + +#ifdef USE_CUDA +#if 1 +TEST(NetTest, net_execute_base_test) { + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + graph->Optimize(); + + // constructs the executer net + //{ // inner scope +#ifdef USE_DIEPSE + Net net_executer(true); +#else + Net net_executer(true); +#endif + + net_executer.init(*graph); + + int epoch = 1; + // do inference + Context ctx(0, 0, 0); + saber::SaberTimer my_time; + saber::SaberTimer my_time1; + LOG(WARNING) << "EXECUTER !!!!!!!! "; + // warm up + /*for(int i=0; i<10; i++) { + net_executer.prediction(); + }*/ + + // feed inputs. + test_ps::load_input_lines(g_data_path.c_str()); + int count = 0; + float elapsedTime = 0.0f; + my_time.start(ctx); + //for (int i = 0; i < test_ps::inputed_lines.size(); i+= test_ps::batch_size) { + for (int i = 0; i < test_ps::inputed_lines.size() && i < g_epoch * test_ps::batch_size; i+= test_ps::batch_size) { + std::vector line_vec; + int pre_query_index = -1; + for (int j = i; j < test_ps::batch_size + i && j < test_ps::inputed_lines.size(); j++) { + auto line = test_ps::inputed_lines[j]; + std::vector number_strs; + std::vector tmp; + test_ps::split2(line, number_strs, ";"); + test_ps::split2(number_strs[0], tmp, std::string(" ")); + int cur_query_index = atoi(tmp[0].c_str()); + //LOG(INFO) << "raw str" << line; + //LOG(INFO) << "pre_query_index:" << pre_query_index; + //LOG(INFO) << "cur_query_index:" << cur_query_index; + if (pre_query_index != -1 && cur_query_index != pre_query_index) { break; + } else { + line_vec.push_back(line); + pre_query_index = cur_query_index; + } + } + i -= (test_ps::batch_size - line_vec.size()); +// LOG(INFO) << "this is line:"<<(i+1); + int flag = test_ps::batch_string_to_input(line_vec, net_executer); + if (flag == -1){ + fprintf(stderr, + "[ERROR]line %d string to input returned error %d\n", i, flag); + continue; + } +// cudaDeviceSynchronize(); + net_executer.prediction(); + //if (count >= 10) { + // my_time1.start(ctx); + //} + //for (int k = 0; k< 1000; k++) { + // net_executer.prediction(); + //} + // + //if (count >=10) { + // my_time1.end(ctx); + // elapsedTime += my_time1.get_average_ms(); + //} +// cudaDeviceSynchronize(); + auto tensor_out_0_p = net_executer.get_out("qps_out"); + LOG(INFO) << "start: " << i<< " batch_size: "<< line_vec.size(); + test_print(tensor_out_0_p); + //count++; + //if (count>=1) + // break; + } + my_time.end(ctx); +// LOG(INFO) << "average execute time:" << elapsedTime / (count) << "ms"; + LOG(INFO) << "average execute time:" << elapsedTime / (count-10) << "ms"; +// LOG(INFO) << "average execute time:" << elapsedTime / count << "ms"; + LOG(INFO) << "all execute time:" << my_time.get_average_ms() / (count) << "ms"; + + + // save the optimized 
model to disk. + std::string save_g_model_path = g_model_path + std::string(".saved"); + status = graph->save(save_g_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + delete graph; +} +#endif +#endif + +#ifdef USE_X86_PLACE +#if 0 +TEST(NetTest, net_execute_performance) { + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + graph->Optimize(); + + // constructs the executer net + //{ // inner scope +#ifdef USE_DIEPSE + Net net_executer(true); +#else + Net net_executer(true); +#endif + net_executer.load_calibrator_config("net_pt_config_x86.txt", "cal_file"); + net_executer.init(*graph); + + // feed inputs. + test_ps::load_input_lines(g_data_path.c_str()); + for (int i = 0; i < 1/*test_ps::inputed_lines.size()*/; i+= test_ps::batch_size) { + std::vector line_vec; + for (int j = i; j < test_ps::batch_size + i && j < test_ps::inputed_lines.size(); j++) { + line_vec.push_back(test_ps::inputed_lines[j]); + } + LOG(INFO) << "this is line:"<<(i+1); + int flag = test_ps::batch_string_to_input(line_vec, net_executer); + if (flag == -1){ + fprintf(stderr, + "[ERROR]line %d string to input returned error %d\n", i, flag); + continue; + } + + // warm up +// for (int i = 0; i < 50; i++) { +// net_executer.prediction(); +// } + + int epoch = 1; +// Context ctx(0, 0, 0); + Context ctx; + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! "; + my_time.start(ctx); + for (int i = 0; i < epoch; i++) { + net_executer.prediction(); + } + my_time.end(ctx); + LOG(INFO)<<"average time "<< my_time.get_average_ms()/epoch << " ms"; + auto tensor_out_0_p = net_executer.get_out("qps_out"); + test_print(tensor_out_0_p); + } + + delete graph; +} +#endif +#endif +int main(int argc, const char** argv){ + if (argc >=2) { + g_model_path = argv[1]; + } + if (argc >= 3){ + g_data_path = argv[2]; + } + if (argc >= 4){ + g_epoch = atoi(argv[3]); + } + if (argc >= 5){ + g_batch_size = atoi(argv[4]); + } + test_ps::set_batch_size(g_batch_size); + LOG(INFO) << "g_batch_size" << g_batch_size; + + Env::env_init(); +// Env::env_init(); +// Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/pyramid_dnn_test.cpp b/test/framework/net/pyramid_dnn_test.cpp new file mode 100644 index 000000000..d32de9ff9 --- /dev/null +++ b/test/framework/net/pyramid_dnn_test.cpp @@ -0,0 +1,316 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include +#include "saber/core/tensor_op.h" +#include + +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + +std::string g_model_path = "./ps_shared.anakin.bin"; +std::string g_data_path = "/home/chaowen/4u8/cuichaowen/backup/ps_anakin/sample_by_query_length.expand.sample_url"; +int g_epoch = 1; +int g_num_threads = 1; +int g_batch_size = 1; + + +std::string g_model_saved_path = g_model_path + ".saved"; + +// some data pre-handle funcs. 
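+// Assumed data layout for this test (inferred from the parsing code below):
+// each line of g_data_path is ';'-separated, with fields 1..4 holding
+// space-separated ids for qb_input, qp_input, p_tb_input and p_tp_input; an
+// empty field is replaced by the padding id -1. Two X86 feeding variants
+// follow: the first stages data in host tensors and copies it into the net's
+// input tensors, the second writes straight into the input tensors after
+// pre-sizing them to batch_size * max_length.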
+namespace test_ps { + std::vector input_names{"qb_input", "qp_input", "p_tb_input", "p_tp_input"}; + size_t query_len = 50; + size_t batch_size = g_batch_size; + std::vector inputed_lines; + void set_batch_size(int batch_size_in) { + batch_size = batch_size_in; + } + + void load_input_lines(const char *filename) { + static const int max_line_buf_size = 100 * 1024 * 1024; + char *line_buffer = (char *)calloc(max_line_buf_size, sizeof(char)); + FILE *input_file = fopen(filename, "r"); + + while (fgets(line_buffer, max_line_buf_size, input_file)) { + // trim newline at end + char *pos = NULL; + if ((pos = strchr(line_buffer, '\n')) != NULL){ + *pos = 0; + } + inputed_lines.push_back(line_buffer); + } + free(line_buffer); + line_buffer = NULL; + fclose(input_file); + } + + void split2( + const std::string& main_str, + std::vector& str_list, + const std::string & delimiter) { + size_t pre_pos = 0; + size_t position = 0; + std::string tmp_str; + + str_list.clear(); + if (main_str.empty()) { + return; + } + + while ((position = main_str.find(delimiter, pre_pos)) != std::string::npos) { + tmp_str.assign(main_str, pre_pos, position - pre_pos); + str_list.push_back(tmp_str); + pre_pos = position + 1; + } + + tmp_str.assign(main_str, pre_pos, main_str.length() - pre_pos); + + if (!tmp_str.empty()) { + str_list.push_back(tmp_str); + } + } + +#ifdef USE_X86_PLACE +int batch_string_to_input(const std::vector &line_vec, Net& net_executer){ + + size_t input_size = input_names.size(); + std::vector > h_inputs(input_size); + std::vector* > d_inputs(input_size); + for (size_t i = 0; i < input_size; i++) { + d_inputs[i] = net_executer.get_in(input_names[i]); + } + + std::vector > offset; + offset.resize(input_size); + int batch = line_vec.size(); + for (size_t i = 0; i < input_size; i++) { + offset[i].resize(batch + 1); + offset[i][0] = 0; + } + // determin inputs' shape. + std::vector> number_strs(line_vec.size()); + std::vector query_shapes(input_size); + for (size_t i = 0; i < input_size; i++) { + query_shapes[i][0] = 0; + query_shapes[i][1] = 1; + query_shapes[i][2] = 1; + query_shapes[i][3] = 1; + } + + for (size_t i = 0; i < line_vec.size(); i++) { + split2(line_vec[i], number_strs[i], ";"); + if (number_strs[i].size() < input_size + 1){ + fprintf(stderr, "input slots is no enough, has %lu expect %lu", + number_strs[i].size(), input_size); + return -1; + } + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + if (number_strs[i][j+1].empty()) { + query_shapes[j][0] += 1; + } else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + query_shapes[j][0] += tmp.size(); + } + offset[j][i+1] = query_shapes[j][0]; + } + } + + //reshape + for (size_t i = 0; i < input_size; i++) { + h_inputs[i].reshape(query_shapes[i]); + d_inputs[i]->reshape(query_shapes[i]); + } + // feed inputs. + for (size_t i = 0; i < line_vec.size(); i++) { + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + float* h_data = (float*)h_inputs[j].mutable_data(); + h_data = h_data + offset[j][i]; + if (number_strs[i][j+1].empty()) { + h_data[0] = -1; //padding_idx == -1. 
+ } else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + for (size_t i = 0; i < tmp.size(); i++) { + h_data[i] = static_cast(atof(tmp[i].c_str())); + } + } + } + } + for (size_t i = 0; i < input_size; i++) { + d_inputs[i]->copy_from(h_inputs[i]); + d_inputs[i]->set_seq_offset({offset[i]}); + } + + return 0; + } +// X86 + int batch_string_to_input(const std::vector &line_vec, Net& net_executer){ + int max_length = 100; + size_t input_size = input_names.size(); + std::vector* > d_inputs(input_size); + for (size_t i = 0; i < input_size; i++) { + d_inputs[i] = net_executer.get_in(input_names[i]); + d_inputs[i]->reshape(Shape({test_ps::batch_size * max_length, 1, 1, 1}, Layout_NCHW)); + } + + std::vector > offset; + offset.resize(input_size); + int batch = line_vec.size(); + for (size_t i = 0; i < input_size; i++) { + offset[i].resize(batch + 1); + offset[i][0] = 0; + } + // determin inputs' shape. + std::vector> number_strs(line_vec.size()); + Shape temp({0, 0, 0, 0}); + std::vector query_shapes(input_size, temp); + // for (size_t i = 0; i < input_size; i++) { + // query_shapes[i]({0, 0, 0, 0}); + // } + for (size_t i = 0; i < input_size; i++) { + query_shapes[i][0] = 0; + query_shapes[i][1] = 1; + query_shapes[i][2] = 1; + query_shapes[i][3] = 1; + } + + for (size_t i = 0; i < line_vec.size(); i++) { + split2(line_vec[i], number_strs[i], ";"); + if (number_strs[i].size() < input_size + 1){ + fprintf(stderr, "input slots is no enough, has %lu expect %lu", + number_strs[i].size(), input_size); + return -1; + } + std::vector tmp; + for (size_t j = 0; j < input_size; j++) { + // add the case that input's empty + if (number_strs[i][j+1].empty()) { + query_shapes[j][0] += 0; + } else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + query_shapes[j][0] += tmp.size(); + } + float* h_data = (float*)d_inputs[j]->mutable_data(); + h_data = h_data + offset[j][i]; + if (number_strs[i][j+1].empty()) { + h_data[0] = -1; //padding_idx == -1. + } else { + split2(number_strs[i][j+1], tmp, std::string(" ")); + for (size_t k = 0; k < tmp.size(); k++){ + h_data[k] = static_cast(atof(tmp[k].c_str())); + } + } + offset[j][i+1] = query_shapes[j][0]; + } + } + + //reshape + for (size_t i = 0; i < input_size; i++) { + d_inputs[i]->reshape(query_shapes[i]); + d_inputs[i]->set_seq_offset({offset[i]}); + } + + return 0; + } +#endif +} // namespace test_ps. + +#ifdef USE_X86_PLACE +#if 1 +TEST(NetTest, net_execute_performance) { + omp_set_num_threads(g_num_threads); + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + if (!status) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + graph->Optimize(); + + // constructs the executer net + //{ // inner scope + Net net_executer(true); + //net_executer.load_calibrator_config("net_pt_config_x86.txt", "cal_file"); + net_executer.init(*graph); + + // feed inputs. + test_ps::load_input_lines(g_data_path.c_str()); + int batch_num = g_epoch * test_ps::batch_size; + Context ctx; + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; +#ifdef ENABLE_OP_TIMER + net_executer.reset_op_time(); +#endif + my_time.start(ctx); + for (int i = 0; i < test_ps::inputed_lines.size(); i+= test_ps::batch_size) { + std::vector line_vec; + int start = i % test_ps::inputed_lines.size(); + for (int j = start; j < test_ps::batch_size + start && j < test_ps::inputed_lines.size(); j++) { + line_vec.push_back(test_ps::inputed_lines[j]); + } + //LOG(INFO) << "this is line:"<<(i+1); + int flag = test_ps::batch_string_to_input(line_vec, net_executer); + if (flag == -1){ + fprintf(stderr, + "[ERROR]line %d string to input returned error %d\n", i, flag); + continue; + } + + //int epoch = 1; +// Context ctx(0, 0, 0); + net_executer.prediction(); + auto tensor_out_0_p = net_executer.get_out("ps_out"); + test_print(tensor_out_0_p); + } + my_time.end(ctx); +#ifdef ENABLE_OP_TIMER + net_executer.print_and_reset_optime_summary(g_epoch); +#endif + LOG(INFO)<<"average time "<< my_time.get_average_ms()/g_epoch << " ms"; + + delete graph; +} +#endif +#endif +int main(int argc, const char** argv){ + if (argc >=2) { + g_model_path = argv[1]; + } + if (argc >= 3){ + g_data_path = argv[2]; + } + if (argc >= 4){ + g_num_threads = atoi(argv[3]); + } + if (argc >= 5){ + g_epoch = atoi(argv[4]); + } + if (argc >= 6){ + g_batch_size = atoi(argv[5]); + } + test_ps::set_batch_size(g_batch_size); + LOG(INFO) << "g_batch_size" << g_batch_size; + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/framework/net/seg_precision_test.cpp b/test/framework/net/seg_precision_test.cpp new file mode 100644 index 000000000..3f5124725 --- /dev/null +++ b/test/framework/net/seg_precision_test.cpp @@ -0,0 +1,184 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include + +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" +using namespace cv; +std::string g_model_path = "path/to/your/anakin_model"; +std::string g_precition_path = "path/to/your/precision_file"; +std::string g_calibrate_path = "path/to/your/calib_file"; +std::string g_img_path = "path/to/your/image list"; +std::string g_gt_path = "path/to/your/ground truth list"; + +typedef Tensor TensorHf; + +void fill_tensor_with_cvmat(const Mat& img_in, TensorHf& tout, const int num, \ + const int width, const int height, const float* mean, const float* scale) { + cv::Mat im; + cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); + float* ptr_data_in = static_cast(tout.mutable_data()); + int stride = width * height; + for (int i = 0; i < num; i++) { + float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); + for (int r = 0; r < height; r++) { + for (int c = 0; c < width; c++) { + ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; + ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; + ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; + } + } + } +} + +void cmp_seg_result(const Mat& gt_img, const TensorHf& tin, long long& diff_count, double& accuracy) { + int height = tin.height(); + int width = tin.width(); + diff_count = 0; + const float* din = static_cast(tin.data()); + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + int 
gt = gt_img.at(h, w); + int test = *(din++) >= 0.5; + if (gt != test) { + diff_count++; + } + } + } + accuracy = (double)diff_count / (height * width); +} + +#ifdef USE_CUDA +TEST(NetTest, net_execute_base_test) { + + std::vector img_list; + std::vector gt_list; + //! load test image list and ground truth image list + std::fstream fp_img(g_img_path); + std::string line; + while (getline(fp_img, line)) { + img_list.push_back(line); + } + LOG(INFO) << "total test image number: " << img_list.size(); + fp_img.close(); + + std::fstream fp_gt(g_gt_path); + while (getline(fp_gt, line)) { + gt_list.push_back(line); + } + LOG(INFO) << "total ground truth image number: " << gt_list.size(); + CHECK_EQ(gt_list.size(), img_list.size()) << "test image number must = ground truth image number"; + + LOG(INFO) << "finish load test image list"; + + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. + auto status = graph->load(g_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + graph->load_calibrator_config(g_precition_path, g_calibrate_path); + //anakin graph optimization + graph->Optimize(); + Net net_executer(true); + graph->load_calibrator_config(g_precition_path, g_calibrate_path); + + net_executer.init(*graph); + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; ivalid_shape(); + + TensorHf h_tensor_in; + h_tensor_in.re_alloc(valid_shape_in); + + TensorHf h_tensor_out; + h_tensor_out.re_alloc(valid_shape_out); + + int hin = h_tensor_in.height(); + int win = h_tensor_in.width(); + + float mean_val[3] = {104.008f, 116.669f, 122.675f}; + float scale_val[3] = {1.f, 1.f, 1.f}; + + double acc = 0.0; + + for (int k = 0; k < img_list.size(); ++k) { + //! pre-processing + Mat img = imread(img_list[k], CV_LOAD_IMAGE_COLOR); + fill_tensor_with_cvmat(img, h_tensor_in, 1, win, hin, mean_val, scale_val); + LOG(INFO) << "test image name: " << img_list[k] << ", gt image name: " << gt_list[k]; + Mat img_gt = imread(gt_list[k], CV_LOAD_IMAGE_UNCHANGED); + if (img.empty() || img_gt.empty()) { + LOG(FATAL) << "load image failed"; + } + Mat img_gt_resize; + cv::resize(img_gt, img_gt_resize, cv::Size(192, 192)); + d_tensor_in_p->copy_from(h_tensor_in); + + net_executer.prediction(); + + TargetWrapper::device_sync(); + h_tensor_out.copy_from(*d_tensor_out_p); + + double mean = tensor_mean_value_valid(h_tensor_out); + LOG(INFO) << "output mean: " << mean; + + //! 
post processing + long long diff_count = 0; + double acc_curr = 0.0; + cmp_seg_result(img_gt_resize, h_tensor_out, diff_count, acc_curr); + acc += acc_curr; + LOG(INFO) << "image : " << img_list[k] << ", diff count: " << diff_count << ", accuracy: " << acc_curr; + } + LOG(INFO) << "test accuracy is: " << acc / img_list.size(); +} +#endif + +int main(int argc, const char** argv){ + if (argc < 6){ + LOG(ERROR) << "usage: " << argv[0] << " "; + return 0; + } + g_model_path = std::string(argv[1]); + g_precition_path = std::string(argv[2]); + g_calibrate_path = std::string(argv[3]); + g_img_path = std::string(argv[4]); + g_gt_path = std::string(argv[5]); + + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} +#else //opencv +int main(int argc, const char** argv){ + LOG(ERROR) << "turn on USE_OPENCV firstly"; + return 0; +} +#endif //opencv \ No newline at end of file diff --git a/test/framework/net/yolo_v3_test.cpp b/test/framework/net/yolo_v3_test.cpp new file mode 100644 index 000000000..ddc81c747 --- /dev/null +++ b/test/framework/net/yolo_v3_test.cpp @@ -0,0 +1,392 @@ +#include +#include "net_test.h" +#include "saber/funcs/timer.h" +#include +#include "debug.h" +#include +#if defined(USE_CUDA) +using Target = NV; +using Target_H = X86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + +//#define USE_DIEPSE + +std::string g_model_path = "/path/to/your/anakin_model"; + +std::string model_saved_path = g_model_path + ".saved"; +int g_batch_size = 1; +int g_warm_up = 10; +int g_epoch = 1000; +int g_device_id = 0; + +#ifdef USE_CUDA +#if 1 + +//TEST(NetTest, net_test_load_from_buffer) { +// Graph* graph = new Graph(); +// LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; +// std::ifstream ifs; +// ifs.open (g_model_path, std::ifstream::in); +// if (!ifs.is_open()) { +// LOG(FATAL) << "file open failed"; +// } +// ifs.seekg(0, ifs.end); +// int length = ifs.tellg(); +// ifs.seekg(0, ifs.beg); +// char * buffer = new char [length]; +// ifs.read(buffer, length); +// ifs.close(); +// +// // load anakin model files. +// auto status = graph->load(buffer, length); +// if (!status ) { +// LOG(FATAL) << " [ERROR] " << status.info(); +// } +// graph->ResetBatchSize("input_0", g_batch_size); +// graph->Optimize(); +// Net net_executer(true); +// net_executer.init(*graph); +// auto d_tensor_in_p = net_executer.get_in("input_0"); +// Tensor4d h_tensor_in; +// +// auto valid_shape_in = d_tensor_in_p->valid_shape(); +// for (int i=0; icopy_from(h_tensor_in); +// cudaDeviceSynchronize(); +// net_executer.prediction(); +// cudaDeviceSynchronize(); +// auto h_tensor_out = net_executer.get_out_list()[0]; +// LOG(INFO) << "output mean value: " << tensor_mean_value_valid(*h_tensor_out); +// write_tensorfile(*net_executer.get_out_list()[0],"output_b.txt"); +//} + +TEST(NetTest, net_execute_base_test) { + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from " << g_model_path << " ..."; + // load anakin model files. 
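+    // Rough summary of this test (descriptive only, see the code below): load
+    // the model, reset the batch size of input_0 and input_1 to g_batch_size,
+    // optimize and save the graph, build the Net, feed both inputs, run
+    // g_warm_up untimed predictions followed by g_epoch timed ones, and report
+    // the average latency (plus per-op times when ENABLE_OP_TIMER is defined).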
+ auto status = graph->load(g_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + LOG(INFO)<<"net_execute_base_test"; + // reshape the input_0 's shape for graph model + //graph->Reshape("input_0", {1, 8, 640, 640}); + graph->ResetBatchSize("input_0", g_batch_size); + graph->ResetBatchSize("input_1", g_batch_size); + + // register all tensor inside graph + // graph->RegistAllOut(); + + // register edge + // graph->RegistOut("conv2_2/expand/scale", "relu2_2/expand"); + // graph->RegistOut("relu#3(conv2d_0)","pool2d#4(pool2d_0)"); + + //anakin graph optimization + graph->Optimize(); + graph->save("/home/tianxiaogang/txg/bug_model/yolo_v3.anakin.bin.saved"); + + // constructs the executer net + //{ // inner scope +#ifdef USE_DIEPSE + //Net net_executer(*graph, true); + Net net_executer(true); +#else + //Net net_executer(*graph, true); + Net net_executer(true); +#endif + + net_executer.init(*graph); + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + Tensor4d h_tensor_in; + auto d_tensor_in_p1 = net_executer.get_in("input_1"); + Tensor4d h_tensor_in1; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; ivalid_shape(); + for (int i=0; icopy_from(h_tensor_in); + d_tensor_in_p1->copy_from(h_tensor_in1); + std::vector> seq_offset={{0,g_batch_size}}; + d_tensor_in_p->set_seq_offset(seq_offset); + +#ifdef USE_DIEPSE + // for diepse model + auto d_tensor_in_1_p = net_executer.get_in("input_1"); + Tensor4d h_tensor_in_1; + + h_tensor_in_1.re_alloc(d_tensor_in_1_p->valid_shape()); + for (int i=0; ivalid_shape().size(); i++) { + LOG(INFO) << "detect input_1 dims[" << i << "]" << d_tensor_in_1_p->valid_shape()[i]; + } + h_data = h_tensor_in_1.mutable_data(); + h_data[0] = 1408; + h_data[1] = 800; + h_data[2] = 0.733333; + h_data[3] = 0.733333; + h_data[4] = 0; + h_data[5] = 0; + d_tensor_in_1_p->copy_from(h_tensor_in_1); + + auto d_tensor_in_2_p = net_executer.get_in("input_2"); + Tensor4d h_tensor_in_2; + + h_tensor_in_2.re_alloc(d_tensor_in_2_p->valid_shape()); + for (int i=0; ivalid_shape().size(); i++) { + LOG(INFO) << "detect input_2 dims[" << i << "]" << d_tensor_in_2_p->valid_shape()[i]; + } + h_data = h_tensor_in_2.mutable_data(); + h_data[0] = 2022.56; + h_data[1] = 989.389; + h_data[2] = 2014.05; + h_data[3] = 570.615; + h_data[4] = 1.489; + h_data[5] = -0.02; + d_tensor_in_2_p->copy_from(h_tensor_in_2); +#endif + + //int g_epoch = 1000; + //int g_warm_up=10; + // do inference + Context ctx(g_device_id, 0, 0); + saber::SaberTimer my_time; + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; + // warm up + for (int i = 0; i < g_warm_up; i++) { + net_executer.prediction(); + } + for (auto x:net_executer.get_in_list()){ + fill_tensor_const(*x, 1); + } +#ifdef ENABLE_OP_TIMER + net_executer.reset_op_time(); +#endif + + my_time.start(ctx); + + //auto start = std::chrono::system_clock::now(); + for (int i = 0; i < g_epoch; i++) { + //DLOG(ERROR) << " g_epoch(" << i << "/" << g_epoch << ") "; + net_executer.prediction(); + } + /* // running part of model + net_executer.execute_stop_at_node("relu2_2/expand"); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + + // get inner tensor after stop + auto tensor_out_inner_p = net_executer.get_tensor_from_edge("conv2_2/expand", "relu2_2/expand"); + LOG(WARNING) << "inner tensor avg value : " << tensor_average(tensor_out_inner_p); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + + for (int i = 0; i < 3; i++) { + net_executer.execute_start_from_node("relu2_2/expand"); + } + +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif*/ + + //auto end = std::chrono::system_clock::now(); + + //double time = std::chrono::duration_cast(end - start).count(); + //LOG(WARNING) << "avg time : " << time/g_epoch <<" ms"; + cudaDeviceSynchronize(); + my_time.end(ctx); +#ifdef ENABLE_OP_TIMER + net_executer.print_and_reset_optime_summary(g_epoch); +#endif + + LOG(INFO)<<"aveage time "<save(save_g_model_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + if (!graph){ + delete graph; + } +} +#endif +#endif + +#if 0 +TEST(NetTest, net_execute_reconstruction_test) { + Graph* graph = new Graph(); + LOG(WARNING) << "load anakin model file from optimized model " << model_saved_path << " ..."; + // load anakin model files. + auto status = graph->load(model_saved_path); + if (!status ) { + LOG(FATAL) << " [ERROR] " << status.info(); + } + + // regisiter output tensor + //graph->RegistOut("data_perm", "data_scale"); + //graph->RegistOut("data_perm", "conv1"); + + //anakin graph optimization + graph->Optimize(); + + // constructs the executer net + Net net_executer(*graph); + + // get in + auto d_tensor_in_p = net_executer.get_in("input_0"); + Tensor4d h_tensor_in; + + auto valid_shape_in = d_tensor_in_p->valid_shape(); + for (int i=0; icopy_from(h_tensor_in); + + // do inference + Context ctx(g_device_id, 0, 0); + saber::SaberTimer my_time; + my_time.start(ctx); + + LOG(WARNING) << "EXECUTER !!!!!!!! 
"; + for (int i=0; i<1; i++) { + net_executer.prediction(); + + } + my_time.end(ctx); + LOG(INFO)<<"aveage time "< 1) { + g_model_path = std::string(argv[1]); + } + if (argc > 2) { + g_batch_size = atoi(argv[2]); + } + if (argc > 3) { + g_warm_up = atoi(argv[3]); + } + if (argc > 4) { + g_epoch = atoi(argv[4]); + } + if (argc > 5) { + g_device_id = atoi(argv[5]); + } + TargetWrapper::set_device(g_device_id); + Env::env_init(); + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/lite/test_activation_lite.cpp b/test/lite/test_activation_lite.cpp deleted file mode 100755 index b6932f473..000000000 --- a/test/lite/test_activation_lite.cpp +++ /dev/null @@ -1,224 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_activation.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; -int test_iter = 10; - -int w_in = 9; -int h_in = 9; -int ch_in = 9; -int num_in = 9; -int cluster = 0; -int threads = 4; -ActiveType active_type=Active_relu; -typedef Tensor TensorHf4; - -#define COMPARE_RESULT 1 - -template -void activation_basic(const TensorHf4& tin, TensorHf4& tout, ActivationParam& param) { - - int num = tin.num(); - int channel = tin.channel(); - int height = tin.height(); - int width = tin.width(); - - dtype* dout = (dtype*)tout.mutable_data(); - const dtype* din = (const dtype*)tin.data(); - int count = tin.valid_size(); - int size = height * width; - - switch (param._act_type) { - //x > 0 ? x : 0 - case Active_relu: - for (size_t i = 0; i < count; i++) { - dout[i] = din[i] > 0 ? din[i] : 0; - } - - break; - - // sigmoid: 1/(exp(-x) + 1) - case Active_sigmoid: - - for (size_t i = 0; i < count; i++) { - dout[i] = 1.0f / (exp(-din[i]) + 1.0f); - } - - break; - - // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) - case Active_tanh: - for (size_t i = 0; i < count; i++) { - dout[i] = tanh(din[i]);//(exp(din[i]) - exp(-din[i])) / (exp(din[i]) + exp(-din[i])); - } - - break; - - // stanh : b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} - case Active_stanh: - for (size_t i = 0; i < count; i++) { - dtype val = din[i] * param._neg_slope; - dout[i] = param._coef * tanh(val); - } - - break; - - // x > 0 ? x : 0; - // x < threshold ? x : threshold - case Active_clipped_relu: - for (size_t i = 0; i < count; i++) { - const dtype threshold = param._coef; - dout[i] = din[i] > 0 ? (din[i] < threshold ? din[i] : threshold) : 0; - } - - break; - - //elu: x > 0 ? x : coef * (exp(x) - 1) - case Active_elu: - for (size_t i = 0; i < count; i++) { - dout[i] = din[i] > 0 ? din[i] : param._coef * (exp(din[i]) - 1); - } - - break; - - - //prelu: x > 0 ? x : slope[c] * x - case Active_prelu: - for (int n = 0; n < num; n++) { - const dtype* in_ptr = din + n * channel * size; - dtype* out_ptr = dout + n * channel * size; - - // const dtype *slope_ptr = (const dtype*)prelu_param.slope->data(); - for (int c = 0; c < channel; c++) { - const dtype* in_ch_ptr = in_ptr + c * size; - dtype* out_ch_ptr = out_ptr + c * size; - float slope = param._prelu_channel_shared? param._prelu_weights[0] : \ - param._prelu_weights[c]; - - for (int k = 0; k < size; k++) { - out_ch_ptr[k] = in_ch_ptr[k] > 0 ? in_ch_ptr[k] : in_ch_ptr[k] * slope; - } - } - } - break; - default: - LOG(FATAL) << "unsupported activation type: " << param._act_type; - } -} - -TEST(TestSaberLite, test_func_activation_arm) { - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - - - Shape shape_in(num_in, ch_in, h_in, w_in); - Shape shape_out = shape_in; - - LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ - ch_in << ", height=" << h_in << ", width=" << w_in; - - SaberActivation activation_lite; - float slopes[ch_in]; - for (int i=0; i vin; - std::vector vout; - - Tensor thin(shape_in); - fill_tensor_rand(thin, -1.f, 1.f); - TensorHf4 tout; - TensorHf4 tout_basic(shape_out); - vin.push_back(&thin); - -#if COMPARE_RESULT - activation_basic(thin, tout_basic, param); - //print_tensor_host(tout_basic); -#endif - - vout.push_back(&tout); - activation_lite.compute_output_shape(vin, vout); - CHECK_EQ(shape_out == vout[0]->valid_shape(), true) << "compute shape error"; - - LOG(INFO) << "re-alloc tensor buffer"; - vout[0]->re_alloc(vout[0]->valid_shape()); - - LOG(INFO) << "activation initialized to saber impl"; - activation_lite.init(vin, vout, ctx1); - - SaberTimer t1; - - LOG(INFO) << "saber activation compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - activation_lite.dispatch(vin, vout); - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber activation total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - - tensor_cmp_host(tout_basic, tout, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif -} - -int main(int argc, const char** argv){ - // initial logger - //logger::init(argv[0]); - Env::env_init(); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc ==4 ) { - LOG(INFO)< 4 || argc < 2){ - LOG(ERROR)<<"please use "< -void test_buffer(){ - LOG(INFO) << "test buffer"; - typedef typename DataTrait::Dtype Dtype; - typedef Buffer BufferH; - - int n0 = 1024; - int n1 = 2048; - - void* tmp_ptr = nullptr; - Dtype* arm_ptr; - - tmp_ptr = fast_malloc(n0 * sizeof(Dtype)); - arm_ptr = static_cast(tmp_ptr); - for (int i = 0; i < n0; i++){ - arm_ptr[i] = static_cast(i); - } - - LOG(INFO) << "Buffer: test default(empty) constructor"; - BufferH arm_buf0; - - LOG(INFO) << "Buffer: test constructor with data size"; - BufferH arm_buf1(n0 * sizeof(Dtype)); - - LOG(INFO) << "Buffer: test constructor with data pointer, size and device id"; - BufferH arm_buf2(arm_ptr, n0 * sizeof(Dtype)); - - LOG(INFO) << "Buffer: test copy constructor"; - BufferH arm_buf3(arm_buf2); - CHECK_EQ(arm_buf3.get_capacity(), arm_buf2.get_capacity()) << "shared buffer should have same data count"; - - - LOG(INFO) << "Buffer: test operator ="; - arm_buf0 = arm_buf2; - CHECK_EQ(arm_buf0.get_capacity(), arm_buf2.get_capacity()) << "shared buffer should have same data count"; - - LOG(INFO) << "Buffer: test re_alloc"; - arm_buf1.re_alloc(n1 * sizeof(Dtype)); - CHECK_EQ(arm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer count error"; - - arm_buf1.re_alloc(n0 * sizeof(Dtype)); - CHECK_EQ(arm_buf1.get_capacity(), n1 * sizeof(Dtype)) << "buffer capacity error"; - - LOG(INFO) << "Buffer: test deep_cpy()"; - 
arm_buf1.copy_from(arm_buf2); - LOG(INFO) << "deep copy between two host buffer: "; - Dtype* data_ptr1 = (Dtype*)arm_buf1.get_data(); - LOG(INFO) << "data in buffer 1"; - for (int i = 0; i < n0; i++) { - printf("%.2f ", data_ptr1[i]); - if ((i + 1) % 10 == 0) { - printf("\n"); - } - } - printf("\n"); - Dtype* data_ptr2 = (Dtype*)arm_buf2.get_data(); - LOG(INFO) << "data in buffer2"; - for (int i = 0; i < n0; i++) { - printf("%.2f ", data_ptr2[i]); - if ((i + 1) % 10 == 0) { - printf("\n"); - } - } - printf("\n"); - CHECK_EQ(data_ptr1[n0 / 2], data_ptr2[n0 / 2]) << "deep copy between host is incorrect"; - LOG(INFO) << "deep copy from host buffer to device buffer"; -} - -TEST(TestSaberLite, test_buffer_lite) { - test_buffer(); -} - -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - - diff --git a/test/lite/test_calibrate_lite.cpp b/test/lite/test_calibrate_lite.cpp deleted file mode 100644 index 4f0548511..000000000 --- a/test/lite/test_calibrate_lite.cpp +++ /dev/null @@ -1,461 +0,0 @@ -#include "saber/lite/funcs/calibrate_lite.h" -#include "test_lite.h" -using namespace anakin::saber; -using namespace anakin::saber::lite; -int cluster = 0; -int threads = 1; -int iter = 1; -typedef Tensor TensorH; -std::vector get_scale_basic(const TensorH& tin, int axis, float scale_factor) { - int axis_dims = tin.valid_shape()[axis]; - std::vector scale_out; - scale_out.resize(axis_dims); - int out_dims = tin.count_valid(0, axis); - long long inner_dims = tin.count(axis + 1, tin.dims()); - long long inner_size = inner_dims * axis_dims; - // printf("inner_dims: %d, inner_size: %d \n", inner_dims, inner_size); - const float* in_data = static_cast(tin.data()); -#pragma omp parallel for - for (int c = 0; c < axis_dims; ++c) { - float max_val = 0.f; - const float* din = in_data + c * inner_dims; - for (int j = 0; j < out_dims; ++j) { - const float* ptr_in = din + j * inner_size; - for (int i = 0; i < inner_dims; ++i) { - float read_data = fabsf(ptr_in[i]); - max_val = (read_data > max_val) ? read_data : max_val; - } - } - // printf("max_val: %d \n", max_val); - scale_out[c] = max_val / scale_factor; - } - return scale_out; -} -void fp32_to_int8_basic(const TensorH& tin, TensorH& tout, int axis, std::vector scale_factor) { - int outer_size = tin.count_valid(0, axis); - int inner_size = tin.count_valid(axis, tin.dims()); - const float* din = static_cast(tin.data()); - char* dout = static_cast(tout.mutable_data()); - for (int i = 0; i < outer_size; ++i) { - float scale = 1.f / scale_factor[i]; - for (int j = 0; j < inner_size; ++j) { -#ifdef __aarch64__ - dout[j] = static_cast(round(din[j] * scale)); -#else - dout[j] = static_cast((din[j] * scale)); -#endif - } - dout += inner_size; - din += inner_size; - } -} -void fp32_to_int8_inplace_basic(const TensorH& tin, int axis, std::vector scale_factor) { - //! 
alloc memory - // int m = tin.num(); - // int k = tin.count_valid(1, tin.dims()); - Tensor tout; - tout.re_alloc(tin.valid_shape(), AK_INT8); - int outer_size = tin.count_valid(0, axis); - int inner_size = tin.count_valid(axis, tin.dims()); - // printf("inner_size: %d, outer_size: %d \n", inner_size, outer_size); - const float* din = static_cast(tin.data()); - char* dout = static_cast(tout.mutable_data()); - for (int i = 0; i < outer_size; ++i) { - float scale = 1.f / scale_factor[i]; - for (int j = 0; j < inner_size; ++j) { -#ifdef __aarch64__ - dout[j] = static_cast(round(din[j] * scale)); -#else - dout[j] = static_cast((din[j] * scale)); -#endif - } - dout += inner_size; - din += inner_size; - } - // tin.reshape(Shape(m, k, 1, 1), AK_INT8); - tin.copy_from(tout); -} -void tensor_to_int8_basic(const Tensor& tin, Tensor& tout){ - if (tin.get_dtype() != AK_FLOAT) { - return SaberInvalidValue; - } - if (tout.get_dtype() != AK_INT8) { - tout.set_dtype(AK_INT8); - } - tout.reshape(tin.valid_shape()); - //! get scale - std::vector scale = tin.get_scale(); - // const float* din = static_cast(tin.data()); - // char* dout = static_cast(tout.mutable_data()); - //! convert to int8 - fp32_to_int8_basic(tin, tout, 1, scale); -} -void tensor_to_int8_inplace_basic(const Tensor& tin){ - if (tin.get_dtype() != AK_FLOAT) { - return SaberInvalidValue; - } - //! get scale - std::vector scale = tin.get_scale(); - //! convert to int8 - fp32_to_int8_inplace_basic(tin, 1, scale); -} -bool test_get_scale(int axis, float scale_factor) { - Shape sh(get_rand(1, 100), get_rand(1, 100), get_rand(1, 512), get_rand(1, 512)); - // Shape sh(4, 32, 112, 112); - TensorH tin; - tin.re_alloc(sh, AK_FLOAT); - fill_tensor_rand(tin, -20, 20); - LOG(INFO) << "input shape num = " << sh[0]; - LOG(INFO) << "input shape channel = " << sh[1]; - LOG(INFO) << "input shape height = " << sh[2]; - LOG(INFO) << "input shape width = " << sh[3]; - std::vector scale_basic; - std::vector scale_lite; - LOG(INFO) << "get_scale_basic compute"; - scale_basic = get_scale_basic(tin, axis, scale_factor); - LOG(INFO) << "get_tensor_scale compute"; - double to = 0; - double min_time = 1000000; - SaberTimer t1; - for (int i = 0; i < iter; i++){ - t1.clear(); - t1.start(); - get_tensor_scale(tin, scale_lite, axis, scale_factor); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "get_tensor_scale running time, ave: " << to / iter << ", min time: " << min_time; - if (scale_basic.size() != scale_lite.size()) { - LOG(INFO) << "scale_basic size:" << scale_basic.size() <<", scale_lite size: " << scale_lite.size(); - return false; - } - // LOG(INFO) << "basic result"; - // for (int i = 0; i < scale_basic.size(); ++i) { - // printf("%.6f ", scale_basic[i]); - // if ((i + 1) % 10 == 0) - // printf("\n"); - // } - // printf("\n"); - // LOG(INFO) << "lite result"; - // for (int i = 0; i < scale_lite.size(); ++i) { - // printf("%.6f ", scale_lite[i]); - // if ((i + 1) % 10 == 0) - // printf("\n"); - // } - // printf("\n"); - LOG(INFO) << "diff"; - for (int i = 0; i < scale_basic.size(); ++i) { - float tmp = scale_basic[i] - scale_lite[i]; - // printf("%.6f ", tmp); - // if ((i + 1) % 10 == 0) - // printf("\n"); - // if (tmp != 0){ - // printf("i: %d, tmp: %.6f, a: %.6f, b: %.6f \n", i, tmp, scale_basic[i], scale_lite[i]); - // } - CHECK_EQ(fabsf(tmp) < 1e-5f, true) << "compute result error";//scale_basic[i] - scale_lite[i] - // return false; - } - LOG(INFO) << "get_tensor_scale 
result is right"; - return true; -} -bool test_fp32_to_int8(int axis, float scale_factor, Context ctx){ - Shape sh(get_rand(1, 10), get_rand(1, 50), get_rand(1, 512), get_rand(1, 512)); - // Shape sh(4, 32, 112, 112); - TensorH tin; - tin.re_alloc(sh, AK_FLOAT); - fill_tensor_rand(tin, -20, 20); - LOG(INFO) << "input shape num = " << sh[0]; - LOG(INFO) << "input shape channel = " << sh[1]; - LOG(INFO) << "input shape height = " << sh[2]; - LOG(INFO) << "input shape width = " << sh[3]; - std::vector scale_basic; - std::vector scale_lite; - LOG(INFO) << "get_scale_basic compute"; - scale_basic = get_scale_basic(tin, axis, scale_factor); - LOG(INFO) << "get_tensor_scale compute"; - get_tensor_scale(tin, scale_lite, axis, scale_factor); - if (scale_basic.size() != scale_lite.size()) { - return false; - } - for (int i = 0; i < scale_basic.size(); ++i) { - // float tmp = scale_basic[i] - scale_lite[i]; - // if (tmp != 0){ - // printf("i: %d, tmp: %.6f \n", i, tmp); - // } - CHECK_EQ(fabsf(scale_basic[i] - scale_lite[i]) < 1e-4f, true) << "scale compute result error"; - // return false; - // if (fabsf(scale_basic[i] - scale_lite[i]) > 1e-5f) { - // LOG(INFO) << "scale compute failed"; - // return false; - // } - } - LOG(INFO) << "scale is right"; - TensorH tout; - TensorH tout_basic; - tout.re_alloc(sh, AK_INT8); - tout_basic.re_alloc(sh, AK_INT8); - LOG(INFO) << "fp32_to_int8_basic compute"; - fp32_to_int8_basic(tin, tout_basic, axis + 1, scale_lite); - // print_tensor(tout_basic); - LOG(INFO) << "trans_fp32_weights_to_int8 compute"; - int outer_size = tin.count_valid(0, axis); - int inner_size = tin.count_valid(axis, tin.dims()); - LOG(INFO) << "outer_size: " << outer_size << ", inner_size: " << inner_size; - // fp32_to_int8((const float*)tin.data(), (char*)tout.mutable_data(), scale_lite, outer_size, inner_size); - double to = 0; - double min_time = 1000000; - SaberTimer t1; - for (int i = 0; i < iter; i++){ - t1.clear(); - t1.start(); - trans_fp32_weights_to_int8(tin, tout, scale_factor, 0, &ctx); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "trans_fp32_weights_to_int8 running time, ave: " << to / iter << ", min time: " << min_time; - // print_tensor(tout); - double max_ratio = 0; - double max_diff = 0; - const double eps = 1e-6f; - int out_size = tout.valid_size(); - char* ptr_basic = static_cast(tout_basic.data()); - char* ptr = static_cast(tout.data()); - LOG(INFO) << "trans_fp32_weights_to_int8 diff, size: " << out_size; - for (int i = 0; i < out_size; i++){ - int a = ptr[i]; - int b = ptr_basic[i]; - int diff1 = a - b; - int diff = diff1 < 0 ? 
-diff1 : diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - // if (i != 0 && i % sh[3] == 0) - // printf("\n"); - // printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - // printf("\n"); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; - LOG(INFO)<< "tensor_to_int8"; - tin.set_scale(scale_lite); - TensorH tout1; - TensorH tout_basic1; - tout1.re_alloc(sh, AK_INT8); - tout_basic1.re_alloc(sh, AK_INT8); - LOG(INFO) << "tensor_to_int8_basic compute"; - tensor_to_int8_basic(tin, tout_basic1); - LOG(INFO) << "trans_tensor_to_int8 compute"; - to = 0; - min_time = 1000000; - SaberTimer t2; - for (int i = 0; i < iter; i++){ - t2.clear(); - t2.start(); - trans_tensor_fp32_to_int8(tin, tout1, &ctx); - t2.end(); - to += t2.get_average_ms(); - if (t2.get_average_ms() < min_time) { - min_time = t2.get_average_ms(); - } - } - LOG(INFO) << "trans_tensor_to_int8 running time, ave: " << to / iter << ", min time: " << min_time; - ptr_basic = static_cast(tout_basic1.data()); - ptr = static_cast(tout1.data()); - LOG(INFO) << "trans_tensor_to_int8 diff, size: " << out_size; - for (int i = 0; i < out_size; i++){ - int a = ptr[i]; - int b = ptr_basic[i]; - int diff1 = a - b; - int diff = diff1 < 0 ? -diff1 : diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - // if (i != 0 && i % sh[3] == 0) - // printf("\n"); - // printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - return true; -} -bool test_fp32_to_int8_inplace(int axis, float scale_factor, Context ctx){ - Shape sh(get_rand(1, 10), get_rand(1, 50), get_rand(1, 512), get_rand(1, 512)); - // Shape sh(4, 32, 112, 112); - TensorH tin, tin1, tin2, tin3, tin4; - tin.re_alloc(sh, AK_FLOAT); - tin1.re_alloc(sh, AK_FLOAT); - tin2.re_alloc(sh, AK_FLOAT); - tin3.re_alloc(sh, AK_FLOAT); - tin4.re_alloc(sh, AK_FLOAT); - fill_tensor_rand(tin, -20, 20); - tin1.copy_from(tin); - tin2.copy_from(tin); - tin3.copy_from(tin); - tin4.copy_from(tin); - LOG(INFO) << "input shape num = " << sh[0]; - LOG(INFO) << "input shape channel = " << sh[1]; - LOG(INFO) << "input shape height = " << sh[2]; - LOG(INFO) << "input shape width = " << sh[3]; - std::vector scale_basic; - std::vector scale_lite; - LOG(INFO) << "get_scale_basic compute"; - scale_basic = get_scale_basic(tin, axis, scale_factor); - LOG(INFO) << "get_tensor_scale compute"; - get_tensor_scale(tin, scale_lite, axis, scale_factor); - if (scale_basic.size() != scale_lite.size()) { - return false; - } - for (int i = 0; i < scale_basic.size(); ++i) { - float tmp = scale_basic[i] - scale_lite[i]; - // if (tmp != 0){ - // printf("i: %d, tmp: %.6f \n", i, tmp); - // } - if (fabsf(scale_basic[i] - scale_lite[i]) > 1e-4f) { - LOG(INFO) << "scale compute failed"; - return false; - } - } - LOG(INFO) << "scale is right"; - TensorH tout; - TensorH tout_basic; - tout.re_alloc(sh, AK_INT8); - tout_basic.re_alloc(sh, AK_INT8); - LOG(INFO) << "fp32_to_int8_inplace_basic compute"; - fp32_to_int8_inplace_basic(tin1, axis + 1, scale_lite); - // print_tensor(tout_basic); - LOG(INFO) << "trans_fp32_weights_to_int8_inplace compute"; - // int outer_size = tin.count_valid(0, axis); - // int inner_size = tin.count_valid(axis, tin.dims()); - // LOG(INFO) << "outer_size: " << outer_size << ", inner_size: " << inner_size; - // 
fp32_to_int8((const float*)tin.data(), (char*)tout.mutable_data(), scale_lite, outer_size, inner_size); - trans_fp32_weights_to_int8_inplace(tin2, scale_factor, 0, &ctx); - // print_tensor(tout); - double max_ratio = 0; - double max_diff = 0; - const double eps = 1e-6f; - int out_size = tin2.valid_size(); - char* ptr_basic = static_cast(tin1.data()); - char* ptr = static_cast(tin2.data()); - LOG(INFO) << "trans_fp32_weights_to_int8 diff, size: " << out_size; - for (int i = 0; i < out_size; i++){ - int a = ptr[i]; - int b = ptr_basic[i]; - int diff1 = a - b; - int diff = diff1 < 0 ? -diff1 : diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - // if (i != 0 && i % sh[3] == 0) - // printf("\n"); - // printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - // printf("\n"); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; - LOG(INFO)<< "tensor_to_int8"; - tin3.set_scale(scale_lite); - tin4.set_scale(scale_lite); - LOG(INFO) << "tensor_to_int8_inplace_basic compute"; - tensor_to_int8_inplace_basic(tin3); - LOG(INFO) << "trans_tensor_to_int8 compute"; - trans_tensor_fp32_to_int8_inplace(tin4, &ctx); - ptr_basic = static_cast(tin3.data()); - ptr = static_cast(tin4.data()); - LOG(INFO) << "trans_tensor_to_int8 diff, size: " << out_size; - for (int i = 0; i < out_size; i++){ - int a = ptr[i]; - int b = ptr_basic[i]; - int diff1 = a - b; - int diff = diff1 < 0 ? -diff1 : diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - // if (i != 0 && i % sh[3] == 0) - // printf("\n"); - // printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - return true; -} -TEST(TestSaberLite, test_calibrate_lite) { - Context ctx1; - PowerMode mode = SABER_POWER_HIGH; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } -#if 1 - LOG(INFO) << "scale compute"; - for (auto& axis : {0, 1, 2, 3}) { - for (auto& scale : {63.f, 127.f}) { - if (test_get_scale(axis, scale)) { - LOG(INFO) << "test calibrate get_scale, axis=" << axis << ", scale=" << scale; - }else{ - LOG(INFO) << "test calibrate get_scale, axis=" << axis << ", scale=" << scale <<", compute error"; - return; - } - } - } -#endif - LOG(INFO) << "****************************"; -#if 1 - LOG(INFO) << "fp32_to_int8 compute"; - for (auto& axis : {0}) { - for (auto& scale : {63.f, 127.f}) { - LOG(INFO) << "test calibrate get_scale, axis=" << axis << ", scale=" << scale; - if (test_fp32_to_int8(axis, scale, ctx1)) { - LOG(INFO) << "The fp32_to_int8 result is right"; - } - } - } -#endif - LOG(INFO) << "****************************"; -#if 1 - LOG(INFO) << "fp32_to_inplace_int8 compute"; - for (auto& axis : {0}) { - for (auto& scale : {63.f, 127.f}) { - LOG(INFO) << "test calibrate get_scale, axis=" << axis << ", scale=" << scale; - if (test_fp32_to_int8_inplace(axis, scale, ctx1)) { - LOG(INFO) << "The fp32_to_inplace_int8 result is right"; - } - } - } -#endif -} -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - Env::env_init(); - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc >= 4) { - iter = 
atoi(argv[3]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} \ No newline at end of file diff --git a/test/lite/test_concat_lite.cpp b/test/lite/test_concat_lite.cpp deleted file mode 100644 index 9fd70f986..000000000 --- a/test/lite/test_concat_lite.cpp +++ /dev/null @@ -1,168 +0,0 @@ -#include "saber/lite/funcs/saber_concat.h" -#include "test_lite.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 4; - -typedef Tensor TensorH; - -template -void concat_basic(const std::vector& inputs, std::vector& outputs, ConcatParam& param){ - - int axis = param._axis; - int num = outputs[0]->num(); - int channel = outputs[0]->channel(); - int height = outputs[0]->height(); - int width = outputs[0]->width(); - - Shape out_sh = outputs[0]->valid_shape(); - int out_concat_axis = out_sh[axis]; - int num_concats = inputs[0]->count_valid(0, param._axis); - int concat_input_size = inputs[0]->count_valid(param._axis + 1, inputs[0]->dims()); - - dtype* dout = (dtype*)outputs[0]->mutable_data(); - int total_size = out_concat_axis * concat_input_size; - - for(int k = 0; k < num_concats; k++){ - dtype* dout_ptr = dout + k * total_size; - int out_size = 0; - for(int i = 0; i < inputs.size(); i++){ - Shape in_sh = inputs[i]->valid_shape(); - int size = in_sh[axis] * concat_input_size; - const dtype* din = (dtype*)inputs[i]->data(); - const dtype* din_ptr = din + k * size; - dtype* dout_ptr_axis = dout_ptr + out_size; - for(int j = 0; j < size; j++){ - dout_ptr_axis[j] = din_ptr[j]; - } - out_size += size; - } - } -} - -TEST(TestSaberLite, test_func_concat_arm) { - - Context ctx1; - PowerMode mode = SABER_POWER_HIGH; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - const int test_iter = 100; - - SaberConcat concat_lite; - for (auto& axis : {0, 1, 2, 3}) { - ConcatParam param(axis); - concat_lite.load_param(¶m); - for (auto& type : {AK_FLOAT, AK_INT8}) { - int n = get_rand(1, 10); - int c = get_rand(1, 100); - int h = get_rand(1, 100); - int w = get_rand(1, 100); - - Shape sh1 = {n, c, h, w}; - Shape sh2 = sh1; - Shape sh3 = sh1; - sh1[axis] = get_rand(1, 100); - sh2[axis] = get_rand(1, 100); - sh3[axis] = get_rand(1, 100); - - Shape shape_out = sh1; - shape_out[axis] = sh1[axis] + sh2[axis] + sh3[axis]; - LOG(INFO) << " input size, num=" << n << ", channel=" << \ - c << ", height=" << h << ", width=" << w; - LOG(INFO) << "concat axis= " << axis << ", size: " << sh1[axis] << \ - ", " << sh2[axis] << ", " << sh3[axis]; - LOG(INFO) << "compute precision: " << ((type == AK_FLOAT)? "float" : "int8"); - - //! prepare inputs and outputs - std::vector vin; - std::vector vout; - - TensorH th1, th2, th3; - th1.re_alloc(sh1, type); - th2.re_alloc(sh2, type); - th3.re_alloc(sh3, type); - fill_tensor_rand(th1, -100, 100); - fill_tensor_rand(th2, -100, 100); - fill_tensor_rand(th3, -100, 100); - vin.push_back(&th1); - vin.push_back(&th2); - vin.push_back(&th3); - - TensorH tdev_out; - vout.push_back(&tdev_out); - - concat_lite.compute_output_shape(vin, vout); - LOG(INFO) << "output shape: " << tdev_out.valid_shape()[0] << ", " \ - << tdev_out.valid_shape()[1] << ", " << tdev_out.valid_shape()[2] \ - << ", " << tdev_out.valid_shape()[3]; - - CHECK_EQ(shape_out == vout[0]->valid_shape(), true) << "compute shape error"; - tdev_out.re_alloc(shape_out, type); - - //! 
set op precision type - concat_lite.set_op_precision(type); - - concat_lite.init(vin, vout, ctx1); - - SaberTimer t1; - t1.clear(); - t1.start(); - - for (int i = 0; i < test_iter; ++i) { - concat_lite.dispatch(vin, vout); - } - - t1.end(); - float ts = t1.get_average_ms(); - LOG(INFO) << "total time : " << ts << ", avg time : " << ts / test_iter; - - std::vector vout_basic; - TensorH tout_basic; - tout_basic.re_alloc(shape_out, type); - vout_basic.push_back(&tout_basic); - - if (type == AK_FLOAT) { - concat_basic(vin, vout_basic, param); - } else if (type == AK_INT8) { - concat_basic(vin, vout_basic, param); - } else { - LOG(FATAL) << "unsupported dtype"; - } - - double max_ratio; - double max_diff; - tensor_cmp_host(*vout[0], *vout_basic[0], max_ratio, max_diff); - CHECK_EQ(fabsf(max_ratio) < 1e-6f, true) << "concat compute result error"; - LOG(INFO) << "finished compare, pass!"; - } - } -} - -int main(int argc, const char** argv) { - // initial logger - //logger::init(argv[0]); - Env::env_init(4); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_context_lite.cpp b/test/lite/test_context_lite.cpp deleted file mode 100644 index 719841a37..000000000 --- a/test/lite/test_context_lite.cpp +++ /dev/null @@ -1,125 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/core/context_lite.h" - -using namespace anakin; -using namespace anakin::saber; -using namespace anakin::saber::lite; - -TEST(TestSaberLite, test_arm_context) { - - Context ctx; - LOG(INFO) << "create runtime ctx"; - //ctx.set_power_mode(MERC_HIGH); - //ctx.set_act_cores({4, 5, 6, 7}); - LOG(INFO) << "high mode, 4 threads"; - ctx.set_run_mode(SABER_POWER_HIGH, 4); - LOG(INFO) << "set active ids"; - - LOG(INFO) << "test threads activated"; -#ifdef USE_OPENMP -#pragma omp parallel - { - int threads = omp_get_num_threads(); - printf("number of threads: %d\n", threads); - } - int th_id; -#pragma omp parallel private(th_id) - { - th_id = omp_get_thread_num(); -#pragma omp parallel - printf("thread1 core ID: %d\n", th_id); - - } - - LOG(INFO) << "high mode, 2 threads"; - ctx.set_run_mode(SABER_POWER_HIGH, 2); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { - int threads = omp_get_num_threads(); - printf("number of threads: %d\n", threads); - } -#pragma omp parallel private(th_id) - { - th_id = omp_get_thread_num(); -#pragma omp parallel - printf("thread1 core ID: %d\n", th_id); - - } - - LOG(INFO) << "high mode, 1 threads"; - ctx.set_run_mode(SABER_POWER_HIGH, 1); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { - int threads = omp_get_num_threads(); - printf("number of threads: %d\n", threads); - } -#pragma omp parallel private(th_id) - { - th_id = omp_get_thread_num(); -#pragma omp parallel - printf("thread1 core ID: %d\n", th_id); - - } - - LOG(INFO) << "low mode, 4 threads"; - ctx.set_run_mode(SABER_POWER_LOW, 4); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { - int threads = omp_get_num_threads(); - printf("number of threads: %d\n", threads); - } -#pragma omp parallel private(th_id) - { - th_id = omp_get_thread_num(); -#pragma omp parallel - printf("thread1 core ID: %d\n", th_id); - - } - - LOG(INFO) << "low mode, 2 threads"; - ctx.set_run_mode(SABER_POWER_LOW, 2); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { - int threads = omp_get_num_threads(); - printf("number of threads: %d\n", threads); - } -#pragma omp 
parallel private(th_id) - { - th_id = omp_get_thread_num(); -#pragma omp parallel - printf("thread1 core ID: %d\n", th_id); - - } - - LOG(INFO) << "low mode, 1 threads"; - ctx.set_run_mode(SABER_POWER_LOW, 1); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { - int threads = omp_get_num_threads(); - printf("number of threads: %d\n", threads); - } -#pragma omp parallel private(th_id) - { - th_id = omp_get_thread_num(); -#pragma omp parallel - printf("thread1 core ID: %d\n", th_id); - - } -#endif -} - -int main(int argc, const char** argv){ - - Env::env_init(); - - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} \ No newline at end of file diff --git a/test/lite/test_conv_act_pooling_lite.cpp b/test/lite/test_conv_act_pooling_lite.cpp deleted file mode 100644 index d77c505d8..000000000 --- a/test/lite/test_conv_act_pooling_lite.cpp +++ /dev/null @@ -1,192 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_conv_pooling.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 4; - -#define USE_COMPARE -const bool FLAG_RELU = true; - -typedef Tensor TensorHf4; -template -void tensor_diff(Tensor_t& t1, Tensor_t& t2, Tensor_t& tdiff) { - - typedef typename Tensor_t::Dtype dtype; - int size1 = t1.valid_size(); - int size2 = t2.valid_size(); - int size_out = tdiff.valid_size(); - CHECK_EQ(size1, size2) << "wrong shape"; - CHECK_EQ(size1, size_out) << "wrong shape"; - const dtype* ptr1 = t1.data(); - const dtype* ptr2 = t2.data(); - dtype* ptr_out = tdiff.mutable_data(); - for (int i = 0; i < size1; ++i) { - ptr_out[i] = ptr1[i] - ptr2[i]; - } -} - -void test_arm_conv(std::vector& tin, \ - int ch_out, int kernel, int stride, int pad, \ - int dila, int group, bool bias, int thread_num, int cluster_id) { - - int test_iter = 100; - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - SaberConvPooling2D conv; - - Context ctx1; - PowerMode mode = cluster_id == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, thread_num); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorHf4 tout_basic; - TensorHf4 tout_saber; - - TensorHf4* thin = tin[0]; - - std::vector tvout_saber; - - tvout_saber.push_back(&tout_saber); - - int num = tin[0]->num(); - int chin = tin[0]->channel(); - int hin = tin[0]->height(); - int win = tin[0]->width(); - - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << num; - LOG(INFO) << " in_channels = " << chin; - LOG(INFO) << " img_h = " << hin; - LOG(INFO) << " img_w = " << win; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad = " << pad; - LOG(INFO) << " stride = " << stride; - LOG(INFO) << " dilation = " << dila; - LOG(INFO) << " kernel = " << kernel; - LOG(INFO) << " out_channels = " << ch_out; - - int input_dim = tin[0]->height(); // P - int kernel_exten = dila * (kernel - 1) + 1; - int hout = (input_dim + 2 * pad - kernel_exten) / stride + 1; - - input_dim = tin[0]->width(); // Q - kernel_exten = dila * (kernel - 1) + 1; - int wout = (input_dim + 2 * pad - kernel_exten) / stride + 1; - - Shape shape_out{num, ch_out, 1, 1}; - - Shape shw{ch_out, chin / group, kernel, kernel}; - Shape shb{1, ch_out, 1, 1}; - TensorHf4 pweiht(shw); - TensorHf4 pbias(shb); - - fill_tensor_rand(pweiht, -1.f, 1.f); - fill_tensor_rand(pbias, -1.f, 1.f); - - //fill_tensor_host_const(pweiht, 1.f); - //fill_tensor_host_const(pbias, 1.f); - - TensorHf4* bias_ptr = nullptr; - if (bias) { - bias_ptr = &pbias; - } - - SaberConvPooling2D conv_lite; - ConvPool2DParam param(pweiht.valid_size(), ch_out, group, \ - kernel, kernel, stride, stride, pad, pad, dila, dila, bias, pweiht.data(), pbias.data(), \ - false, true, Active_relu, 0.f, 1.f, false, nullptr, \ - Pooling_average_include_padding, true, 1, 1, 1, 1, 1, 1); -// conv_lite.load_param(pweiht.valid_size(), ch_out, group, \ -// kernel, kernel, stride, stride, pad, pad, dila, dila, bias, Active_relu, true, \ -// Pooling_average_include_padding, true, 1, 1, 1, 1, 1, 1, pweiht.data(), pbias.data()); - LITE_CHECK(conv_lite.load_param(¶m)); - - conv_lite.compute_output_shape(tin, tvout_saber); - Shape sh_out_saber = tvout_saber[0]->valid_shape(); - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor - tvout_saber[0]->re_alloc(shape_out); - - LOG(INFO) << "saber conv impl init"; - CHECK_EQ(conv_lite.init(tin, tvout_saber, ctx1), SaberSuccess) << "init error"; - - //! 
compute - LOG(INFO) << "saber conv compute"; - to = 0; - - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - conv_lite.dispatch(tin, tvout_saber); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "saber conv running time, ave: " << to / test_iter << ", min time: " << min_time; - //print_tensor_host(*tvout_saber[0]); -} - -#if 1 -TEST(TestSaberLite, test_conv_act_pooling) { - - int num = 1; - int chin = 32; - int hin = 112; - int win = 112; - - int group = chin; - int pad = 1; - int stride = 2; - int dilation = 1; - int kernel = 3; - int chout = chin; - - bool bias_term = true; - - Shape shape_in(num, chin, hin, win); - - TensorHf4 tdin; - - tdin.re_alloc(shape_in); - fill_tensor_const(tdin, 1.f); - - std::vector tin; - tin.push_back(&tdin); - - test_arm_conv(tin, chout, kernel, stride, pad, dilation, group, bias_term, threads, cluster); -} -#endif -int main(int argc, const char** argv){ - Env::env_init(); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_conv_block_utils.cpp b/test/lite/test_conv_block_utils.cpp deleted file mode 100644 index 0cedc0277..000000000 --- a/test/lite/test_conv_block_utils.cpp +++ /dev/null @@ -1,408 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/neon/impl/conv_block_utils.h" -using namespace anakin::saber; -using namespace anakin::saber::lite; - -typedef Tensor TensorHf4; - -int g_cluster = 0; -int g_threads = 1; -bool g_basic_test = false; -int g_test_iter = 100; -bool g_compared_result = true; -int g_ch_n = 4; -int g_hei_n = 1; -int g_num = 4; -int g_channel = 16; -int g_height = 112; -int g_width = 112; -int g_kernel_size = 9; - -/*preprocessing weights -* input weights: [chout, chin/ group, 3, 3] --> outputs weights: [chout / n, chin/ group, 3, 3 * n] -*/ -template -void conv_trans_weights_numc_basic(const dtype* din, dtype* dout, int chout, int chin, int n, int kernel_size) { - if (n <= 0){ - LOGE("ch_n and hei_n are more than zero\n"); - return SaberInvalidValue; - } - int c_loop = chout / n; - int chout_round = (chout + n - 1) / n; - int win_stride = chin * kernel_size; - int wout_stride = n * win_stride; - int co = 0; - for (; co < c_loop; ++co) { - dtype* dout_c = dout + co * wout_stride; - const dtype *din_array[n]; - din_array[0] = din + co * wout_stride; - for (int i = 1; i < n; i++){ - din_array[i] = din_array[i - 1] + win_stride; - } - for (int ci = 0; ci < chin; ++ci) { - for (int k = 0; k < kernel_size; ++k) { - for (int i = 0; i < n; i++){ - *(dout_c++) = * (din_array[i]++); - } - } - } - } - // pad final chout - if (chout_round > c_loop) { - dtype* dout_c = dout + c_loop * wout_stride; - const dtype *din_array[n]; - din_array[0] = din + c_loop * wout_stride; - for (int i = 1; i < n; i++){ - din_array[i] = din_array[i - 1] + win_stride; - } - //deal remain - int cremain = chout_round * n - chout; - for (int i = 1; i <= cremain; i++){ - din_array[n - i] = din_array[0]; - } - for (int ci = 0; ci < chin; ++ci) { - for (int k = 0; k < kernel_size; ++k) { - for (int i = 0; i < n; i++){ - *(dout_c++) = * (din_array[i]++); - } - } - } - } -} - -/*preprocessing inputs -* input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws] -* n = he - hs -*/ -template -void prepack_input_nxw_basic(const dtype* din, dtype* dout, int n, int hs, 
int he, int ws, int we, \ - int channel, int width, int height, dtype* zero_ptr) { - - if (n <= 0){ - LOGE("hei_n is more than zero\n"); - return; - } - int w0 = ws < 0 ? 0 : ws; - int w1 = we > width ? width : we; - int h0 = hs < 0 ? 0: hs; - int h1 = he > height ? height : he; - - int size_w = we - ws; - int size_wc_len = size_w * channel; - int size_c = width * height; - - int valid_w = w1 - w0; - int valid_h = h1 - h0; - size_t valid_w_byte = valid_w * sizeof(dtype); - - dtype *out_array[n]; - out_array[0] = dout; - for (int i = 1; i < n; i++){ - out_array[i] = out_array[i - 1] + size_wc_len; - } - - dtype* ptr_zero; - memset(ptr_zero, 0, valid_w_byte); - for (int c = 0; c < channel; ++c) { - int j = 0; - //valid height - for (int i = hs; i < he; i++){ - //get address - dtype *in_array = din + i * width; - if (i < 0 || i >= height){ - in_array = ptr_zero; - } - for (int w = ws; w < w0; ++w) { - *(out_array[j]++) = 0.f; - } - memcpy(out_array[j], in_array, valid_w_byte); - out_array[j] += valid_w; - for (int w = w1; w < we; ++w) { - *(out_array[j]++) = 0.f; - } - j++; - } - //remain - // for (int i = valid_h; i < n; i++){ - // for (int w = ws; w < we; w++){ - // *(out_array[i]++) = 0.f; - // } - // } - din += size_c; - } - return SaberSuccess; -} - -/*wirte result in outputs -* input din: [n, c / n, h, w * n], output dout: [n, c, h, w] -*/ -template -void write_to_output_nxw_basic(const dtype* din, dtype* dout, int ch_n, int hei_n, int cs, int ce, int hs, int he,\ - int ws, int we, int channel, int height, int width, bool flag_relu, dtype* trash_ptr) { - - if (ch_n <= 0 || hei_n <= 0){ - LOGE("ch_n and hei_n are more than zero\n"); - return; - } - int size_c_out = width * height; - - dtype *dout_array[ch_n]; - dout_array[0] = dout + cs * size_c_out + hs * width + ws; - for (int i = 1; i < ch_n; i++){ - dout_array[i] = dout_array[i - 1] + size_c_out; - } - - const dtype* ptr_din = din; - - if (ce > channel) { - int cremain = ce - channel; - for (int i = cremain; i > 0; i--){ - dout_array[ch_n - i] = trash_ptr; - } - } - - int size_h = (he > height ? height : he) - hs; - for (int i = 0; i < hei_n; i++){ - for (int j = 0; j < width; j++){ - int size_w = i * width; - for (int c = 0; c < ch_n; c++){ - dtype *ptr = dout_array[c] + size_w; - if (flag_relu){ - *ptr = *ptr_din > 0 ? 
*ptr_din : 0; - }else{ - *ptr = *ptr_din; - } - ptr_din++; - } - } - } -} - -template -void fill_packed_bias_nxmw_basic(const dtype* bias, dtype* dout, int ch_n, int hei_n, int wround){ - if (ch_n <= 0 || hei_n <= 0){ - LOGE("ch_n and hei_n are more than zero\n"); - return; - } - for(int i = 0; i < hei_n; i++){ - for (int j = 0; j < wround; j++){ - const dtype* bias_ptr = bias; - for (int k = 0; k < ch_n; k++){ - *dout = * bias_ptr; - dout++; - bias_ptr++; - } - } - } -} - -SaberStatus test_arm_conv_block_utils(int n, int c, int h, int w, \ - int ch_n, int hei_n, int kernel_size, int thread_num, int cluster_id) { - - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - Context ctx1; - PowerMode mode = (PowerMode)cluster_id; - ctx1.set_run_mode(mode, thread_num); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorHf4 tout_basic; - TensorHf4 tout_saber; - - TensorHf4 tout_basic_int; - TensorHf4 tout_saber_int; - - Shape shin = {n, c, h, w}; - TensorHf4 thin; - TensorHf4 thin32; - - thin.re_alloc(shin, AK_FLOAT); - fill_tensor_rand(thin, -1.f, 1.f); - // fill_tensor_const(thin, 1.f); - - thin32.re_alloc(shin, AK_INT32); - fill_tensor_rand(thin32, -1.f, 1.f); - - LOG(INFO) << "conv block param: "; - LOG(INFO) << " img_num = " << n; - LOG(INFO) << " in_channels = " << c; - LOG(INFO) << " img_h = " << h; - LOG(INFO) << " img_w = " << w; - LOG(INFO) << " ch_n = " << ch_n; - LOG(INFO) << " hei_n = " << hei_n; - LOG(INFO) << " kernel_size = " << kernel_size; - - //c1 -> cn - int hout = h; - - int wout = w * ch_n; - - int chout = c / ch_n + c % ch_n; - - //cn->c1 - int hout_c = h; - - int wout_c = w / ch_n; - - int chout_c = c * ch_n; - - Shape shape_out{n, chout, hout, wout}; - LOG(INFO) << " chout = " << chout; - LOG(INFO) << " hout = " << hout; - LOG(INFO) << " wout = " << wout; - - const float* din = static_cast(thin.data()); - const int* din_int32 = static_cast(thin32.data()); - - //! 
compute - LOG(INFO) << "saber conv block compute"; - to = 0; - tout_saber.re_alloc(shape_out, AK_FLOAT); - fill_tensor_const(tout_saber, 0.f); - float* dout_f32 = static_cast(tout_saber.mutable_data()); - tout_saber_int.re_alloc(shape_out, AK_INT32); - fill_tensor_const(tout_saber_int, 0.f); - int* dout_int32 = static_cast(tout_saber_int.mutable_data()); - int* trash_ptr = static_cast(ctx1.get_work_space()); - memset(trash_ptr, 0, wout * sizeof(signed int)); - float* ptr_zero = static_cast(ctx1.get_work_space()) + wout; - memset(ptr_zero, 0, w * sizeof(float)); - for (int i = 0; i < g_test_iter; ++i) { - t1.clear(); - t1.start(); - conv_trans_weights_numc(din, dout_f32, chout, c, ch_n, kernel_size); - // prepack_input_nxw(din, dout_f32, hei_n, 0, 4, -1, 20, c, w, h, ptr_zero); - // fill_packed_bias_nxmw_f32(din, dout_f32, c, w, h); - // conv_trans_weights_numc(din_int32, dout_int32, chout, c, ch_n, kernel_size); - if (ch_n == 4){ - write_to_output_c4_int32(din_int32, dout_int32, ch_n, hei_n, 0, 4, 0, 2, 0, w * ch_n, \ - chout, hout, wout, true, trash_ptr); - } - if (ch_n == 8){ - write_to_output_c8_int32(din_int32, dout_int32, ch_n, hei_n, 0, 4, 0, 2, 0, w * ch_n, \ - chout, hout, wout, true, trash_ptr); - } - - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - // print_tensor(tout_basic); - } - LOG(INFO) << "saber conv block running time, ave: " << to / g_test_iter << ", min time: " << min_time; - // print_tensor(tout_saber); - - - if (g_compared_result) { - LOG(INFO) << "run basic conv block for precision comparation"; - tout_basic.re_alloc(shape_out, AK_FLOAT); - fill_tensor_const(tout_basic, 0.f); - float* dout = static_cast(tout_basic.mutable_data()); - - tout_basic_int.re_alloc(shape_out, AK_INT32); - fill_tensor_const(tout_basic_int, 0.f); - int* dout_32 = static_cast(tout_basic_int.mutable_data()); - conv_trans_weights_numc_basic(din, dout, chout, c, ch_n, kernel_size); - // prepack_input_nxw_basic(din, dout, hei_n, 0, 4, -1, 20, c, w, h, ptr_zero); - // fill_packed_bias_nxmw_basic(din, dout, c, w, h); - // conv_trans_weights_numc_basic(din_int32, dout_32, chout, c, ch_n, kernel_size); - write_to_output_nxw_basic(din_int32, dout_32, ch_n, hei_n, 0, 4, 0, 2, 0, w * ch_n, \ - chout, hout, wout, true, trash_ptr); - // print_tensor(tout_basic); - double max_ratio = 0; - double max_diff = 0; - // tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - tensor_cmp_host(tout_basic_int, tout_saber_int, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabsf(max_ratio) > 1e-3f) { - TensorHf4 tdiff(tout_basic_int.valid_shape()); - LOG(INFO) << "biasc result"; - print_tensor(tout_basic_int); - LOG(INFO) << "saber result"; - print_tensor(tout_saber_int); - tensor_diff(tout_basic_int, tout_saber_int, tdiff); - print_tensor(tdiff); - return SaberInvalidValue; - } - max_ratio = 0; - max_diff = 0; - // tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabsf(max_ratio) > 1e-3f) { - TensorHf4 tdiff(tout_basic.valid_shape()); - LOG(INFO) << "biasc result"; - print_tensor(tout_basic); - LOG(INFO) << "saber result"; - print_tensor(tout_saber); - tensor_diff(tout_basic, tout_saber, tdiff); - print_tensor(tdiff); - return SaberInvalidValue; - } - } - return SaberSuccess; - -} - 
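For reference, the weight re-layout exercised by conv_trans_weights_numc above is easier to follow outside the diff. Below is a minimal, self-contained sketch of the same transform — [chout, chin, k*k] row-major filters interleaved n output channels at a time into [chout/n, chin, k*k, n] — assuming float data and an output-channel count divisible by n (the deleted basic routine additionally pads the tail group). interleave_weights_nc is an illustrative name, not an Anakin symbol.

#include <cstdio>

// Interleave [chout, chin, k*k] filters n output channels at a time into
// [chout/n, chin, k*k, n]; assumes chout % n == 0.
static void interleave_weights_nc(const float* din, float* dout,
                                  int chout, int chin, int n, int kernel_size) {
    const int win_stride  = chin * kernel_size;   // elements per output channel
    const int wout_stride = n * win_stride;       // elements per group of n channels
    for (int co = 0; co < chout / n; ++co) {
        const float* src = din  + co * wout_stride;
        float*       dst = dout + co * wout_stride;
        for (int e = 0; e < win_stride; ++e) {     // same element index across the group
            for (int i = 0; i < n; ++i) {
                *dst++ = src[i * win_stride + e];
            }
        }
    }
}

int main() {
    // 2 output channels, 1 input channel, 2x2 kernel, n = 2:
    // {0,1,2,3, 4,5,6,7} -> {0,4, 1,5, 2,6, 3,7}
    float in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float out[8];
    interleave_weights_nc(in, out, 2, 1, 2, 4);
    for (float v : out) printf("%.0f ", v);
    printf("\n");
    return 0;
}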
-TEST(TestSaberLite, test_custom) { - auto flag = test_arm_conv_block_utils(g_num, g_channel, g_height, g_width, g_ch_n, g_hei_n, g_kernel_size, g_threads, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test conv block utils: batchsize: " << g_num << ", channel: " << g_channel << ", h: " << g_height << \ - ", w: " << g_width << ", ch_n: " << g_ch_n << ", hei_n" << g_hei_n <<", kernel_size: " << g_kernel_size << \ - ", threads: " << g_threads << ", cluster: " << g_cluster << " passed!!"; - } else { - LOG(FATAL) << "test conv block utils: batchsize: " << g_num << ", channel: " << g_channel << ", h: " << g_height << \ - ", w: " << g_width << ", ch_n: " << g_ch_n << ", hei_n" << g_hei_n <<", kernel_size: " << g_kernel_size << \ - ", threads: " << g_threads << ", cluster: " << g_cluster << " failed!!"; - } -} - - -int main(int argc, const char** argv){ - anakin::saber::lite::Env::env_init(); - LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [test iter] [compare result]"; - if (argc > 1) { - g_basic_test = atoi(argv[1]) > 0; - } - if (argc > 2) { - g_cluster = atoi(argv[2]); - } - if (argc > 3) { - g_threads = atoi(argv[3]); - } - if (argc > 4) { - g_test_iter = atoi(argv[4]); - } - if (argc > 5){ - g_compared_result = atoi(argv[5]); - } - if (argc > 6){ - if (argc < 13) { - LOG(FATAL) << "usage: ./" << argv[0] << " do_basic_test cluster threads test_iter " << \ - " compare_result num channel height width ch_n hei_n kernel_size"; - return -1; - } - g_num = atoi(argv[6]); - g_channel = atoi(argv[7]); - g_height = atoi(argv[8]); - g_width = atoi(argv[9]); - g_ch_n = atoi(argv[10]); //channel num - g_hei_n = atoi(argv[11]); //height num - g_kernel_size = atoi(argv[12]); - } - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_conv_lite.cpp b/test/lite/test_conv_lite.cpp deleted file mode 100644 index 1ec0e4e90..000000000 --- a/test/lite/test_conv_lite.cpp +++ /dev/null @@ -1,355 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_conv.h" -#include "saber/lite/funcs/neon/impl/conv_arm_impl.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int g_cluster = 0; -int g_threads = 1; -int g_test_iter = 2; -bool g_basic_test = false; -bool g_compare_result = true; -bool g_flag_relu = true; -bool g_flag_bias = true; - -int g_num = 1; -int g_ch_in = 32; -int g_h_in = 112; -int g_w_in = 112; - -int g_ch_out = 32; -int g_group = 32; -int g_kw = 3; -int g_pad_w = 1; -int g_stride_w = 1; -int g_dila_w = 1; -int g_kh = 3; -int g_pad_h = 1; -int g_stride_h = 1; -int g_dila_h = 1; - -typedef Tensor TensorHf4; - -SaberStatus test_arm_conv(int n, int c, int h, int w, \ - int ch_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, int pad_h, \ - int dila_w, int dila_h, int group, bool is_bias, bool is_relu, int thread_num, int cluster_id) { - - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - Context ctx1; - PowerMode mode = (PowerMode)cluster_id; - ctx1.set_run_mode(mode, thread_num); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorHf4 tout_basic; - TensorHf4 tout_saber; - - Shape shin = {n, c, h, w}; - TensorHf4 thin; - - thin.re_alloc(shin, AK_FLOAT); - - std::vector tvin; - std::vector tvout_saber; - - tvin.push_back(&thin); - tvout_saber.push_back(&tout_saber); - - LOG(INFO) << 
"conv param: "; - LOG(INFO) << " img_num = " << n; - LOG(INFO) << " in_channels = " << c; - LOG(INFO) << " img_h = " << h; - LOG(INFO) << " img_w = " << w; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad_width = " << pad_w; - LOG(INFO) << " pad_height = " << pad_h; - LOG(INFO) << " stride_width = " << stride_w; - LOG(INFO) << " stride_height = " << stride_h; - LOG(INFO) << " dilation_w = " << dila_w; - LOG(INFO) << " dilation_h = " << dila_h; - LOG(INFO) << " kernel_w = " << kernel_w; - LOG(INFO) << " kernel_h = " << kernel_h; - LOG(INFO) << " out_channels = " << ch_out; - LOG(INFO) << " bias flag = " << (is_bias? "true" : "false"); - LOG(INFO) << " relu flag = " << (is_relu? "true" : "false"); - - int kernel_exten = dila_h * (kernel_h - 1) + 1; - int hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; - - kernel_exten = dila_w * (kernel_w - 1) + 1; - int wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; - - Shape shape_out{n, ch_out, hout, wout}; - - Shape shw{ch_out, c / group, kernel_h, kernel_w}; - Shape shb{1, ch_out, 1, 1}; - TensorHf4 pweiht(shw); - TensorHf4 pbias(shb); - - fill_tensor_rand(thin, -1.f, 1.f); - fill_tensor_rand(pweiht, -1.f, 1.f); - fill_tensor_rand(pbias, -1.f, 1.f); - -// fill_tensor_const(thin, 1.f); -// fill_tensor_const(pweiht, 1.f); -// fill_tensor_const(pbias, 1.f); -// print_tensor(pweiht); -// print_tensor(pbias); - TensorHf4* bias_ptr = nullptr; - if (is_bias) { - bias_ptr = &pbias; - } - const float* din = static_cast(thin.data()); - - if (g_compare_result) { - LOG(INFO) << "run basic conv for precision comparation"; - tout_basic.re_alloc(shape_out); - fill_tensor_const(tout_basic, 0.f); - float* dout = static_cast(tout_basic.mutable_data()); - const float* wptr = static_cast(pweiht.data()); - const float* bptr = nullptr; - if (is_bias) { - bptr = static_cast(pbias.data()); - } - conv_basic(din, dout, n, ch_out, hout, wout, c, h, w, \ - wptr, bptr, group, kernel_w, kernel_h, stride_w, stride_h, \ - dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); -// print_tensor(tout_basic); - } - - SaberConv2D conv_lite; - - Conv2DParam param(pweiht.valid_size(), ch_out, group, kernel_w, kernel_h, \ - stride_w, stride_h, pad_w, pad_h, dila_w, dila_h, is_bias, pweiht.data(), pbias.data(), \ - false, is_relu, Active_relu, 0.f, 1.f, false, nullptr); - - conv_lite.load_param(¶m); - - conv_lite.compute_output_shape(tvin, tvout_saber); - - Shape sh_out_saber = tvout_saber[0]->valid_shape(); - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor - tvout_saber[0]->re_alloc(shape_out); - - LOG(INFO) << "saber conv impl init"; - auto states = conv_lite.init(tvin, tvout_saber, ctx1); - CHECK_EQ(states, SaberSuccess) << "Saber conv init failed"; - - //! 
compute - LOG(INFO) << "saber conv compute"; - to = 0; - for (int i = 0; i < g_test_iter; ++i) { - t1.clear(); - t1.start(); - conv_lite.dispatch(tvin, tvout_saber); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "saber conv running time, ave: " << to / g_test_iter << ", min time: " << min_time; -// print_tensor(*tvout_saber[0]); - - if (g_compare_result) { - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabsf(max_ratio) > 1e-3f) { - if (max_diff > 1e-4f) { - TensorHf4 tdiff(tout_basic.valid_shape()); - tensor_diff(tout_basic, tout_saber, tdiff); - print_tensor(tdiff); - return SaberInvalidValue; - } - - } -// CHECK_EQ(fabsf(max_ratio) < 5e-4f, true) << "compute result error"; - } - return SaberSuccess; - -} - -#if 1 -TEST(TestSaberLite, test_conv_depthwise) { - if (g_basic_test) { - for (auto& batch : {1, 2, 4, 8}) { - for (auto& c : {1, 8, 16, 32, 64}) { - for (auto& h : {2, 3, 15, 28, 56, 112, 128, 150, 224, 300}) { - int w = h; - for (auto& stride : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - for (auto& th : {1, 2, 4}) { - auto flag = test_arm_conv(batch, c, h, w, c, 3, 3, stride, stride, 1, 1, 1, 1, c, flag_bias, flag_relu, th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test fp32 depthwise conv: batchsize: " << batch << ", channel: " << c << ", h & w: " << h << \ - ", stride: " << stride << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << (flag_relu? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!"; - } else { - LOG(FATAL) << "test fp32 depthwise conv: batchsize: " << batch << ", channel: " << c << ", h & w: " << h << \ - ", stride: " << stride << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << (flag_relu? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!"; - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_conv_1x1s1) { - if (g_basic_test) { - for (auto& batch : {1, 2, 4, 8}) { - for (auto &c : {1, 8, 16, 32, 64}) { - for (auto& cout : {1, 16, 32, 64, 128}) { - for (auto &g_div : {1, 2, 4}) { - for (auto &h : {2, 3, 15, 28, 56, 112, 128, 150, 224, 300}) { - for (auto &flag_bias : {false, true}) { - for (auto &flag_relu : {false, true}) { - for (auto &th : {1, 2, 4}) { - - int w = h; - int g = g_div; - if (g % g_div != 0) { - g = 1; - } - auto flag = test_arm_conv(batch, c, h, w, cout, 1, 1, 1, 1, \ - 0, 0, 1, 1, g, flag_bias, flag_relu, th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test fp32 1x1s1 conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!"; - } else { - LOG(FATAL) << "test fp32 1x1s1 conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? 
"true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!"; - } - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_conv_fp32_costom_size) { - auto flag = test_arm_conv(g_num, g_ch_in, g_h_in, g_w_in, g_ch_out, g_kw, g_kh, g_stride_w, g_stride_h, \ - g_pad_w, g_pad_h, g_dila_w, g_dila_h, g_group, g_flag_bias, g_flag_relu, g_threads, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test fp32 conv: batchsize: " << g_num << ", channel: " - << g_ch_in << ", h & w: " << g_h_in << \ - ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " - << (g_flag_relu ? "true" : "false") << ", threads: " << \ - g_threads << ", cluster: " << g_cluster << " passed!!"; - } else { - LOG(INFO) << "test fp32 1x1s1 conv: batchsize: " << g_num << ", channel: " - << g_ch_in << ", h & w: " << g_h_in << \ - ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " - << (g_flag_relu ? "true" : "false") << ", threads: " << \ - g_threads << ", cluster: " << g_cluster << " failed!!"; - } -} -#endif - -int main(int argc, const char** argv){ - Env::env_init(); - LOG(ERROR) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; - - if (argc >= 2) { - g_basic_test = atoi(argv[1]) > 0; - } - - if (argc >= 3) { - g_cluster = atoi(argv[2]); - } - if (argc >= 4) { - g_threads = atoi(argv[3]); - } - if (argc >= 5) { - g_test_iter = atoi(argv[4]); - } - if (argc >= 6) { - g_compare_result = atoi(argv[5]) > 0; - } - if (argc >= 7) { - g_flag_bias = atoi(argv[6]) > 0; - } - if (argc >= 8) { - g_flag_relu = atoi(argv[7]) > 0; - } - if (argc >= 9) { - if (argc < 18) { - LOG(FATAL) << "usage: ./" << argv[0] << " cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; - return -1; - } - g_num = atoi(argv[8]); - g_ch_in = atoi(argv[9]); - g_h_in = atoi(argv[10]); - g_w_in = atoi(argv[11]); - g_ch_out = atoi(argv[12]); - g_group = atoi(argv[13]); - g_kw = atoi(argv[14]); - g_kh = g_kw; - g_pad_w = atoi(argv[15]); - g_pad_h = g_pad_w; - g_stride_w = atoi(argv[16]); - g_stride_h = g_stride_w; - g_dila_w = atoi(argv[17]); - g_dila_h = g_dila_w; - } - if (argc > 18) { - g_kh = atoi(argv[18]); - } - if (argc > 19) { - g_pad_h = atoi(argv[19]); - } - if (argc > 20) { - g_stride_h = atoi(argv[20]); - } - if (argc > 21) { - g_dila_h = atoi(argv[21]); - } - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_conv_lite_int8.cpp b/test/lite/test_conv_lite_int8.cpp deleted file mode 100644 index dabef9912..000000000 --- a/test/lite/test_conv_lite_int8.cpp +++ /dev/null @@ -1,608 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_conv.h" -#include "saber/lite/funcs/neon/impl/conv_arm_impl.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int g_cluster = 0; -int g_threads = 1; -int g_test_iter = 1; - -bool g_basic_test = false; -bool g_compare_result = true; -bool g_flag_relu = false; -bool g_flag_bias = false; - -int g_num = 1; -int g_chin = 4; -int g_h_in = 10; -int g_w_in = 10; - -int g_ch_out = 4; -int g_group = 1; -int g_kw = 1; -int g_pad_w = 0; -int g_stride_w = 1; -int g_dila_w = 1; -int g_kh = 1; -int g_pad_h = 0; -int g_stride_h = 1; -int g_dila_h = 1; - 
-typedef Tensor TensorH; - -SaberStatus test_arm_conv_int8(int n, int c, int h, int w, \ - int ch_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, int pad_h, \ - int dila_w, int dila_h, int group, bool is_bias, bool is_relu, int thread_num, int cluster_id) { - - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - Context ctx1; - PowerMode mode = static_cast(cluster_id); - ctx1.set_run_mode(mode, thread_num); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorH tout_basic_int32; - TensorH tout_basic_int8; - TensorH tout_saber_int32; - TensorH tout_saber_int8; - TensorH tout_basic_fp32; - TensorH tout_saber_fp32; - - TensorH thinf; - TensorH thinc; - Shape shin = {n, c, h, w}; - thinf.re_alloc(shin, AK_FLOAT); - thinc.re_alloc(shin, AK_INT8); - - std::vector tvin_fp32; - std::vector tvin_int8; - std::vector tvout_saber_fp32; - std::vector tvout_saber_int32; - std::vector tvout_saber_int8; - - tvin_fp32.push_back(&thinf); - tvin_int8.push_back(&thinc); - tvout_saber_fp32.push_back(&tout_saber_fp32); - tvout_saber_int32.push_back(&tout_saber_int32); - tvout_saber_int8.push_back(&tout_saber_int8); - - int num = n; - int chin = c; - int hin = h; - int win = w; - - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << num << " in_channels = " << chin << " img_h = " << hin << " img_w = " << win; - LOG(INFO) << " num_out = " << ch_out << " group = " << group << " kernel_w = " << kernel_w << " kernel_h = " << kernel_h << \ - " stride_width = " << stride_w << " stride_height = " << stride_h << \ - " pad_width = " << pad_w << " pad_height = " << pad_h << \ - " dilation_w = " << dila_w << " dilation_h = " << dila_h; - LOG(INFO) << " bias flag = " << (is_bias? "true" : "false") << ", relu flag = " << (is_relu? "true" : "false"); - - int kernel_exten = dila_h * (kernel_h - 1) + 1; - int hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; - - kernel_exten = dila_w * (kernel_w - 1) + 1; - int wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; - - Shape shape_out{num, ch_out, hout, wout}; - - Shape shw{ch_out, chin / group, kernel_h, kernel_w}; - Shape shb{1, ch_out, 1, 1}; - - TensorH pweihtf; - TensorH pbiasf; - - TensorH pweihtc; - TensorH pbiasi; - - pweihtf.re_alloc(shw, AK_FLOAT); - pbiasf.re_alloc(shb, AK_FLOAT); - - pweihtc.re_alloc(shw, AK_INT8); - pbiasi.re_alloc(shb, AK_INT32); - - fill_tensor_rand(thinf, -10, 10); - fill_tensor_rand(pweihtf, -10, 10); - fill_tensor_rand(pbiasf, -10, 10); -// fill_tensor_const(thinf, 1.f); -// fill_tensor_const(pweihtf, 1.f); -// fill_tensor_const(pbiasf, 1.f); - - //! convert input data type - get_tensor_scale_inplace(thinf, -1, 63.f); -// LOG(INFO) << "input tesnor scale at factor 63.f is " << thinf.get_scale()[0] << ", max_val: " << 63.f * thinf.get_scale()[0]; - trans_tensor_fp32_to_int8(thinf, thinc, &ctx1); - thinc.set_scale(thinf.get_scale()); -// print_tensor(thinf); -// print_tensor(thinc); - - //! 
convert weight data type - get_tensor_scale_inplace(pweihtf, 0, 63.f); - std::vector w_scale = pweihtf.get_scale(); -// LOG(INFO) << "input tesnor scale at factor 63.f is "; -// for (int j = 0; j < w_scale.size(); ++j) { -// LOG(INFO) << "|-- " << j << ": " << w_scale[j] << ", max_val: " << 63.f * w_scale[j]; -// } - trans_fp32_weights_to_int8(pweihtf, pweihtc, 63.f, 0, &ctx1); - trans_fp32_bias_to_int32(pbiasf, pbiasi, thinf.get_scale()[0], w_scale, &ctx1); - -// print_tensor(pweihtf); -// print_tensor(pweihtc); - - //! get int8 and fp32 basic result - if (g_compare_result) { - LOG(INFO) << "run basic conv for precision comparation"; - const char* dinc = static_cast(thinc.data()); - const char* weightc = static_cast(pweihtc.data()); - const int* biasi = static_cast(pbiasi.data()); - const float* dinf = static_cast(thinf.data()); - const float* weightf = static_cast(pweihtf.data()); - const float* biasf = static_cast(pbiasf.data()); - tout_basic_fp32.re_alloc(shape_out, AK_FLOAT); - tout_basic_int32.re_alloc(shape_out, AK_INT32); - tout_basic_int8.re_alloc(shape_out, AK_INT8); - - float* dout_basic_fp32 = static_cast(tout_basic_fp32.mutable_data()); - int* dout_basic_int32 = static_cast(tout_basic_int32.mutable_data()); - - LOG(INFO) << "do basic fp32 conv"; - conv_basic(dinf, dout_basic_fp32, num, ch_out, hout, wout, chin, hin, win, \ - weightf, biasf, group, kernel_w, kernel_h, stride_w, stride_h, \ - dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); - -// LOG(INFO) << "do basic int8 conv, trans basic int32 to fp32"; -// conv_basic(dinc, dout_basic_int32, num, ch_out, hout, wout, chin, hin, win, \ -// weightc, biasi, group, kernel_w, kernel_h, stride_w, stride_h, \ -// dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); - -// LOG(INFO) << "trans basic int32 to int8"; -// trans_tensor_int32_to_int8(tout_basic_int32, tout_basic_int8, thinf.get_scale()[0], w_scale, &ctx1); - -// trans_tensor_int32_to_fp32(tout_basic_int32, tout_basic_fp32, thinf.get_scale()[0], w_scale, &ctx1); - -// print_tensor(tout_basic_fp32); - // LOG(INFO) << "basic in32 result"; - // print_tensor(tout_basic_int32); - } - - SaberConv2D conv_int8; - - Conv2DParam param(pweihtf.valid_size(), ch_out, group, kernel_w, kernel_h, \ - stride_w, stride_h, pad_w, pad_h, dila_w, dila_h, is_bias, \ - static_cast(pweihtf.data()), static_cast(pbiasf.data()), \ - false, is_relu, Active_relu, 0.f, 1.f, false, nullptr); - - - conv_int8.load_param(¶m); - - conv_int8.compute_output_shape(tvin_int8, tvout_saber_fp32); - - Shape sh_out_saber = tvout_saber_fp32[0]->valid_shape(); - - - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor -// LOG(INFO) << "re-alloc output memory"; - tvout_saber_int32[0]->re_alloc(shape_out, AK_INT32); - tvout_saber_fp32[0]->re_alloc(shape_out, AK_FLOAT); - tvout_saber_int8[0]->re_alloc(shape_out, AK_INT8); - - //! set compute precision -// LOG(INFO) << "set compute precision"; - auto states = conv_int8.set_op_precision(AK_INT8); - CHECK_EQ(states, SaberSuccess) << "Saber conv op precision to int8 failed"; - - //! init the op -// LOG(INFO) << "saber conv impl init"; - states = conv_int8.init(tvin_int8, tvout_saber_fp32, ctx1); - CHECK_EQ(states, SaberSuccess) << "Saber conv init failed"; - - //! 
compute -// LOG(INFO) << "saber conv compute"; - to = 0; - for (int i = 0; i < g_test_iter; ++i) { - t1.clear(); - t1.start(); - states = conv_int8.dispatch(tvin_int8, tvout_saber_fp32); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - CHECK_EQ(states, SaberSuccess) << "Saber conv compute failed"; - } - - long long gops = n * ch_out * wout * ch_out * (chin / group) * kernel_w * kernel_h; - LOG(INFO) << "saber conv running time, ave: " << to / g_test_iter << ", min time: " << min_time << \ - ", GOPS: " << 0.000001 * gops / min_time; - -// print_tensor(tout_saber_fp32); - - if (g_compare_result) { - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic_fp32, tout_saber_fp32, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - double mean_basic = tensor_mean(tout_basic_fp32); - double mean_saber = tensor_mean(tout_saber_fp32); - LOG(INFO) << "mean_basic: " << mean_basic << ", mean_saber: " << mean_saber; - double max_ratio_thresh = 2e-1f; - long long diff_num = count_diff(static_cast(tout_basic_fp32.data()), \ - static_cast(tout_saber_fp32.data()), tout_saber_fp32.valid_size(), max_ratio_thresh, thinf.get_scale()[0]); - LOG(INFO) << "number of diff ratio > " << max_ratio_thresh << " is: " << diff_num << ", %" \ - << 100.f * diff_num / tout_basic_fp32.valid_size(); -// double mean_diff_ratio = fabs(mean_basic - mean_saber) / (fabs(mean_basic) + fabs(mean_saber)); -// LOG(INFO) << "mean val diff ratio: " << mean_diff_ratio; - if ((float)diff_num / tout_saber_fp32.valid_size() > 0.05/* || mean_diff_ratio > 0.1*/) { - TensorH tdiff; - tdiff.re_alloc(shape_out, AK_FLOAT); - tensor_diff(tout_basic_fp32, tout_saber_fp32, tdiff); - LOG(INFO) << "basic result:"; - print_tensor(tout_basic_fp32); - LOG(INFO) << "saber result:"; - print_tensor(tout_saber_fp32); - LOG(INFO) << "diff result:"; - print_tensor(tdiff); - return SaberInvalidValue; - } -// CHECK_EQ(fabsf(max_ratio) < 1e-4f, true) << "compute result error"; - } - return SaberSuccess; -} - -#if 1 -TEST(TestSaberLite, test_func_conv_depthwise_3x3_int8) { - - if (g_basic_test) { - for (auto& batch : {1, 2}) { - for (auto& c : {1, 3, 8, 16, 24}) { - for (auto& h : {8, 15, 28, 48, 49, 50, 51, 52, 53, 54, 55, 56, 112, 128, 256}) { - for (auto& w : {9, 15, 28, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 112, 128, 256}) { - for (auto &flag_bias : {false, true}) { - for (auto &flag_relu : {false, true}) { - for (auto &th : {1, 2, 4}) { - for (auto & stride : {1, 2}){ - int stride_w = stride; - int stride_h = stride; - int group = c; - int pad_w = 1; - int pad_h = 1; - int dila_w = 1; - int dila_h = 1; - int kw = 3; - int kh = 3; - int chout = c; - LOG(INFO) << "conv_depthwise_3x3_int8 OP"; - auto flag = test_arm_conv_int8(batch, c, h, w, chout, kw, kh, stride_w, stride_h, \ - pad_w, pad_h, dila_w, dila_h, group, flag_bias, flag_relu, \ - th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 3x3s2_dw conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << chout << ", group: " << group << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? 
"true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!\n"; - } else { - LOG(FATAL) << "test int8 3x3s2_dw conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << chout << ", group: " << group << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!\n"; - } - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_func_conv_3x3s1_direct_int8) { - - if (g_basic_test) { - for (auto& batch : {1, 2}) { - for (auto& c : {1, 3, 8, 16, 32, 64}) { - for (auto& h : {5, 15, 16, 28, 56, 112, 128, 256}) { - for (auto& w : {6, 15, 28, 29, 30, 31, 32, 33, 34, 35, 36, 56, 112, 128, 255, 256}) { - for (auto &flag_bias : {false, true}) { - for (auto &flag_relu : {false, true}) { - for (auto &th : {1, 2, 4}) { - for (auto & chout : {3, 8, 9, 10, 11, 12}){ - int stride_w = 1; - int stride_h = 1; - int group = 1; - int pad_w = 1; - int pad_h = 1; - int dila_w = 1; - int dila_h = 1; - int kw = 3; - int kh = 3; - LOG(INFO) << "conv_3x3s1_direct_int8 OP"; - auto flag = test_arm_conv_int8(batch, c, h, w, chout, kw, kh, stride_w, stride_h, \ - pad_w, pad_h, dila_w, dila_h, group, flag_bias, flag_relu, \ - th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 3x3s1_direct conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << chout << ", group: " << group << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!\n"; - } else { - LOG(FATAL) << "test int8 3x3s1_direct conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << chout << ", group: " << group << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!\n"; - } - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_func_conv_3x3s2_direct_int8) { - - if (g_basic_test) { - for (auto& batch : {1, 2}) { - for (auto& c : {1, 3, 8, 15}) { - for (auto& h : {15, 28, 56, 112, 128, 224}) { - for (auto& w : {15, 28, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 112, 128, 224}) { - for (auto &flag_bias : {false, true}) { - for (auto &flag_relu : {false, true}) { - for (auto &th : {1, 2, 4}) { - for (auto & chout : {2, 3, 8, 15, 16, 17, 18, 32}){ - int stride_w = 2; - int stride_h = 2; - int group = 1; - int pad_w = 1; - int pad_h = 1; - int dila_w = 1; - int dila_h = 1; - int kw = 3; - int kh = 3; - LOG(INFO) << "conv_3x3s1_direct_int8 OP"; - auto flag = test_arm_conv_int8(batch, c, h, w, chout, kw, kh, stride_w, stride_h, \ - pad_w, pad_h, dila_w, dila_h, group, flag_bias, flag_relu, \ - th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 3x3s2_direct conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << chout << ", group: " << group << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!\n"; - } else { - LOG(FATAL) << "test int8 3x3s2_direct conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << chout << ", group: " << group << \ - ", bias: " << (flag_bias ? 
"true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!\n"; - } - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_func_conv_1x1s1_int8) { - - if (g_basic_test) { - for (auto& batch : {1, 2}) { - for (auto& c : {1, 3, 8, 16}) { - for (auto& cout : {1, 5, 16, 32}) { - for (auto& g_div : {1, 2}) { - for (auto& h : {15, 28, 56, 112, 128, 150}) { - for (auto &flag_bias : {false, true}) { - for (auto &flag_relu : {false, true}) { - for (auto &th : {1, 2, 4}) { - - int w = h; - int g = g_div; - if ((c % g_div != 0) || (cout % g_div != 0)) { - g = 1; - } - auto flag = test_arm_conv_int8(batch, c, h, w, cout, 1, 1, 1, 1, \ - 0, 0, 1, 1, g, flag_bias, flag_relu, th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 1x1s1 conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!\n"; - } else { - LOG(FATAL) << "test int8 1x1s1 conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!\n"; - } - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_func_conv_gemm_int8) { - if (g_basic_test) { - for (auto& batch : {1, 2}) { - for (auto& c : {1, 3, 8, 16}) { - for (auto& cout : {1, 5, 16}) { - for (auto& g_div : {1, 2}) { - for (auto& h : {15, 28, 56, 112, 128, 150, 224, 300}) { - for (auto& kw : {1, 2, 3, 5}) { - for (auto& kh : {1, 2, 3, 5}) { - for (auto& pad : {1, 2}) { - for (auto& stride : {1, 2}) { - for (auto& dila : {1, 2}) { - for (auto &flag_bias : {false, true}) { - for (auto &flag_relu : {false, true}) { - for (auto &th : {1, 2, 4}) { - int w = h; - int g = g_div; - if ((c % g_div != 0) || (cout % g_div != 0)) { - g = 1; - } - auto flag = test_arm_conv_int8(batch, c, h, w, cout, kw, kh, stride, stride, \ - pad, pad, dila, dila, g, flag_bias, flag_relu, th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!\n"; - } else { - LOG(FATAL) << "test int8 conv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!\n"; - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_conv_int8_costom_size) { - for (int i = 0; i < 100; i++) { - auto flag = test_arm_conv_int8(g_num, g_chin, g_h_in, g_w_in, g_ch_out, g_kw, g_kh, g_stride_w, g_stride_h, \ - g_pad_w, g_pad_h, g_dila_w, g_dila_h, g_group, g_flag_bias, g_flag_relu, g_threads, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 conv: batchsize: " << g_num << ", channel: " - << g_chin << ", h & w: " << g_h_in << \ - ", bias: " << (g_flag_bias ? 
"true" : "false") << ", relu: " - << (g_flag_relu ? "true" : "false") << ", threads: " << \ - g_threads << ", cluster: " << g_cluster << " passed!!"; - } else { - LOG(FATAL) << "test int8 conv: batchsize: " << g_num << ", channel: " - << g_chin << ", h & w: " << g_h_in << \ - ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " - << (g_flag_relu ? "true" : "false") << ", threads: " << \ - g_threads << ", cluster: " << g_cluster << " failed!!"; - } - } -} -#endif - -int main(int argc, const char** argv){ - Env::env_init(); - LOG(ERROR) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; - - if (argc >= 2) { - g_basic_test = atoi(argv[1]) > 0; - } - - if (argc >= 3) { - g_cluster = atoi(argv[2]); - } - if (argc >= 4) { - g_threads = atoi(argv[3]); - } - if (argc >= 5) { - g_test_iter = atoi(argv[4]); - } - if (argc >= 6) { - g_compare_result = atoi(argv[5]) > 0; - } - if (argc >= 7) { - g_flag_bias = atoi(argv[6]) > 0; - } - if (argc >= 8) { - g_flag_relu = atoi(argv[7]) > 0; - } - if (argc >= 9) { - if (argc < 18) { - LOG(FATAL) << "usage: ./" << argv[0] << " cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; - return -1; - } - g_num = atoi(argv[8]); - g_chin = atoi(argv[9]); - g_h_in = atoi(argv[10]); - g_w_in = atoi(argv[11]); - g_ch_out = atoi(argv[12]); - g_group = atoi(argv[13]); - g_kw = atoi(argv[14]); - g_kh = g_kw; - g_pad_w = atoi(argv[15]); - g_pad_h = g_pad_w; - g_stride_w = atoi(argv[16]); - g_stride_h = g_stride_w; - g_dila_w = atoi(argv[17]); - g_dila_h = g_dila_w; - } - if (argc > 18) { - g_kh = atoi(argv[18]); - } - if (argc > 19) { - g_pad_h = atoi(argv[19]); - } - if (argc > 20) { - g_stride_h = atoi(argv[20]); - } - if (argc > 21) { - g_dila_h = atoi(argv[21]); - } - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_deconv_lite.cpp b/test/lite/test_deconv_lite.cpp deleted file mode 100644 index c849a3847..000000000 --- a/test/lite/test_deconv_lite.cpp +++ /dev/null @@ -1,296 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_deconv.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int g_cluster = 0; -int g_threads = 1; -int g_test_iter = 10; - -bool g_basic_test = false; - -bool g_compare_result = true; -bool g_flag_bias = true; -bool g_flag_relu = false; - -int g_num = 1; -int g_ch_in = 128; -int g_h_in = 10; -int g_w_in = 10; - -int g_ch_out = 128; -int g_group = 128; -int g_kernel = 4; -int g_pad = 1; -int g_stride = 2; -int g_dila = 1; - -typedef Tensor TensorHf4; - -SaberStatus test_arm_deconv(int n, int c, int h, int w, \ - int ch_out, int kernel, int stride, int pad, \ - int dila, int group, bool flag_bias, bool flag_relu, \ - int thread_num, int cluster_id) { - - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - Context ctx1; - ctx1.set_run_mode(PowerMode(cluster_id), thread_num); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorHf4 tout_basic; - TensorHf4 tout_saber; - - TensorHf4 thin; - thin.re_alloc(Shape(n, c, h, w), AK_FLOAT); - - std::vector tin; - std::vector tvout_saber; - - 
tin.push_back(&thin); - tvout_saber.push_back(&tout_saber); - - int num = n; - int chin = c; - int hin = h; - int win = w; - - LOG(INFO) << "deconv param: "; - LOG(INFO) << " img_num = " << num; - LOG(INFO) << " in_channels = " << chin; - LOG(INFO) << " img_h = " << hin; - LOG(INFO) << " img_w = " << win; - LOG(INFO) << " group = " << group; - LOG(INFO) << " pad = " << pad; - LOG(INFO) << " stride = " << stride; - LOG(INFO) << " dilation = " << dila; - LOG(INFO) << " kernel = " << kernel; - LOG(INFO) << " out_channels = " << ch_out; - LOG(INFO) << " bias flag = " << (flag_bias? "true" : "false"); - - int kernel_exten = dila * (kernel - 1) + 1; - int hout = (h - 1) * stride + kernel_exten - 2 * pad; - - kernel_exten = dila * (kernel - 1) + 1; - int wout = (w - 1) * stride + kernel_exten - 2 * pad; - - Shape shape_out{num, ch_out, hout, wout}; - - Shape shw{ch_out, chin / group, kernel, kernel}; - Shape shb{1, ch_out, 1, 1}; - TensorHf4 pweiht(shw); - TensorHf4 pbias(shb); - - fill_tensor_rand(thin, -1.f, 1.f); - fill_tensor_rand(pweiht, -1.f, 1.f); - fill_tensor_rand(pbias, -1.f, 1.f); - -// fill_tensor_const(pweiht, 1.f); -// fill_tensor_const(pbias, 1.f); - - TensorHf4* bias_ptr = nullptr; - if (flag_bias) { - bias_ptr = &pbias; - } - - const float* din = static_cast(thin.data()); - - if (g_compare_result) { - LOG(INFO) << "run basic deconv for precision comparation"; - tout_basic.re_alloc(shape_out); - float* dout = static_cast(tout_basic.mutable_data()); - deconv_basic(din, dout, num, ch_out, hout, wout, chin, hin, win, \ - static_cast(pweiht.data()), static_cast(pbias.data()), \ - group, kernel, kernel, stride, stride, \ - dila, dila, pad, pad, flag_bias, flag_relu); -// print_tensor(tout_basic); - } - - SaberDeconv2D deconv_lite; - - Conv2DParam param(pweiht.valid_size(), ch_out, group, kernel, kernel, \ - stride, stride, pad, pad, dila, dila, flag_bias, pweiht.data(), pbias.data(), false, flag_relu, Active_relu); - - deconv_lite.load_param(¶m); - deconv_lite.compute_output_shape(tin, tvout_saber); - - Shape sh_out_saber = tvout_saber[0]->valid_shape(); - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor - tvout_saber[0]->re_alloc(shape_out); - - LOG(INFO) << "saber deconv impl init"; - CHECK_EQ(deconv_lite.init(tin, tvout_saber, ctx1), SaberSuccess) << "Saber deconv init failed"; - - //! 
compute - LOG(INFO) << "saber conv compute"; - to = 0; - - for (int i = 0; i < g_test_iter; ++i) { - t1.clear(); - t1.start(); - deconv_lite.dispatch(tin, tvout_saber); - //tvout_saber[0]->record_event(ctx1.get_compute_stream()); - //tvout_saber[0]->sync(); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "saber deconv running time, ave: " << to / g_test_iter << ", min time: " << min_time; -// print_tensor(*tvout_saber[0]); - - if (g_compare_result) { - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabsf(max_ratio) > 1e-4f) { - TensorHf4 tdiff(tout_basic.valid_shape()); - tensor_diff(tout_basic, tout_saber, tdiff); - LOG(INFO) << "bias:"; - print_tensor(pbias); - LOG(INFO) << "basic result:"; - print_tensor(tout_basic); - LOG(INFO) << "saber result:"; - print_tensor(tout_saber); - LOG(INFO) << "diff:"; - print_tensor(tdiff); - return SaberInvalidValue; - } -// CHECK_EQ(fabsf(max_ratio) < 1e-4f, true) << "compute result error"; - } -// printf("out mean: %.5f\n", tensor_mean(tout_saber)); - return SaberSuccess; -} - -TEST(TestSaberLite, test_deconv_custom_size) { - - int num = g_num; - int chin = g_ch_in; - int hin = g_h_in; - int win = g_w_in; - - int dilation = g_dila; - int chout = g_ch_out; - - test_arm_deconv(num, chin, hin, win, chout, g_kernel, g_stride, g_pad, \ - dilation, g_group, g_flag_bias, g_flag_relu, g_threads, g_cluster); -} - -TEST(TestSaberLite, fp32_deconv_basic_test) { - - if (g_basic_test) { - for (auto& n : {1, 2}) { - for (auto& c : {1, 3, 8, 16}) { - for (auto& h : {3, 8, 15, 32}) { - int w = h; - for (auto& kh : {1, 2, 3, 4}) { - for (auto& cout : {1, 3, 8, 16}) { - for (auto& stride : {1, 2}) { - int pad = kh / 2; - for (auto &dila : {1, 2}) { - for (auto &g : {1, 2}) { - int group = g; - if (c % g != 0 || cout % g != 0) { - group = 1; - } - for (auto &bias : {false, true}) { - for (auto &relu : {false, true}) { - for (auto &threads : {1, 2, 4}) { - auto flag = test_arm_deconv(n, c, h, w, cout, kh, stride, pad, dila, group, bias, relu, threads, 0); - if (flag == SaberSuccess) { - LOG(INFO) << "test fp32 depthwise conv: batchsize: " << n << ", channel: " << c << ", h & w: " << h << \ - "num_out: " << cout << ", group:" << group << ", kernel: " << kh << ", stride: " << stride << \ - ", pad: " << pad << ", dila: " << dila << \ - ", bias: " << (bias? "true" : "false") << ", relu: " << (relu? "true" : "false") << ", threads: " << \ - threads << ", cluster: " << g_cluster << " passed!!"; - } else { - LOG(FATAL) << "test fp32 depthwise conv: batchsize: " << n << ", channel: " << c << ", h & w: " << h << \ - "num_out: " << cout << ", group:" << group << ", kernel: " << kh << ", stride: " << stride << \ - ", pad: " << pad << ", dila: " << dila << \ - ", bias: " << (bias? "true" : "false") << ", relu: " << (relu? 
"true" : "false") << ", threads: " << \ - threads << ", cluster: " << g_cluster << " failed!!"; - } - - } - } - } - } - } - } - } - } - } - } - } - } -} - - -int main(int argc, const char** argv){ - Env::env_init(); - LOG(INFO) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila"; - if (argc >= 2) { - g_basic_test = atoi(argv[1]) > 0; - } - if (argc >= 3) { - g_cluster = atoi(argv[2]); - } - if (argc >= 4) { - g_threads = atoi(argv[3]); - } - if (argc >= 5) { - g_test_iter = atoi(argv[4]); - } - if (argc >= 6) { - g_compare_result = atoi(argv[5]) > 0; - } - if (argc >= 7) { - g_flag_bias = atoi(argv[6]) > 0; - } - if (argc >= 8) { - g_flag_relu = atoi(argv[7]) > 0; - } - if (argc >= 9) { - if (argc < 18) { - LOG(ERROR) << "usage: ./" << argv[0] << " cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila"; - return 0; - } - g_num = atoi(argv[8]); - g_ch_in = atoi(argv[9]); - g_h_in = atoi(argv[10]); - g_w_in = atoi(argv[11]); - g_ch_out = atoi(argv[12]); - g_group = atoi(argv[13]); - g_kernel = atoi(argv[14]); - g_pad = atoi(argv[15]); - g_stride = atoi(argv[16]); - g_dila = atoi(argv[17]); - } - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_deconv_lite_int8.cpp b/test/lite/test_deconv_lite_int8.cpp deleted file mode 100644 index 5e61c57de..000000000 --- a/test/lite/test_deconv_lite_int8.cpp +++ /dev/null @@ -1,411 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_deconv.h" -#include "saber/lite/funcs/calibrate_lite.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int g_cluster = 0; -int g_threads = 1; -int g_test_iter = 10; - -bool g_basic_test = false; -bool g_compare_result = true; -bool g_flag_relu = false; -bool g_flag_bias = false; - -int g_num = 1; -int g_chin = 32; -int g_h_in = 112; -int g_w_in = 112; - -int g_ch_out = 32; -int g_group = 32; -int g_kw = 3; -int g_pad_w = 1; -int g_stride_w = 1; -int g_dila_w = 1; -int g_kh = 3; -int g_pad_h = 1; -int g_stride_h = 1; -int g_dila_h = 1; - -typedef Tensor TensorH; - -SaberStatus test_arm_deconv_int8(int n, int c, int h, int w, \ - int ch_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, int pad_h, \ - int dila_w, int dila_h, int group, bool is_bias, bool is_relu, int thread_num, int cluster_id) { - - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - Context ctx1; - PowerMode mode = static_cast(cluster_id); - ctx1.set_run_mode(mode, thread_num); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorH tout_basic_int32; - TensorH tout_basic_int8; - TensorH tout_saber_int32; - TensorH tout_saber_int8; - TensorH tout_basic_fp32; - TensorH tout_saber_fp32; - - TensorH thinf; - TensorH thinc; - Shape shin = {n, c, h, w}; - thinf.re_alloc(shin, AK_FLOAT); - thinc.re_alloc(shin, AK_INT8); - - std::vector tvin_fp32; - std::vector tvin_int8; - std::vector tvout_saber_fp32; - std::vector tvout_saber_int32; - std::vector tvout_saber_int8; - - tvin_fp32.push_back(&thinf); - tvin_int8.push_back(&thinc); - tvout_saber_fp32.push_back(&tout_saber_fp32); - tvout_saber_int32.push_back(&tout_saber_int32); - 
tvout_saber_int8.push_back(&tout_saber_int8); - - int num = n; - int chin = c; - int hin = h; - int win = w; - - LOG(INFO) << "conv param: "; - LOG(INFO) << " img_num = " << num << " in_channels = " << chin << " img_h = " << hin << " img_w = " << win; - LOG(INFO) << " num_out = " << ch_out << " group = " << group << " kernel_w = " << kernel_w << " kernel_h = " << kernel_h << \ - " stride_width = " << stride_w << " stride_height = " << stride_h << \ - " pad_width = " << pad_w << " pad_height = " << pad_h << \ - " dilation_w = " << dila_w << " dilation_h = " << dila_h; - LOG(INFO) << " bias flag = " << (is_bias? "true" : "false") << ", relu flag = " << (is_relu? "true" : "false"); - - int kernel_exten = dila_h * (kernel_h - 1) + 1; - int hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; - - kernel_exten = dila_w * (kernel_w - 1) + 1; - int wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; - - Shape shape_out{num, ch_out, hout, wout}; - - Shape shw{ch_out, chin / group, kernel_h, kernel_w}; - Shape shb{1, ch_out, 1, 1}; - - TensorH pweihtf; - TensorH pbiasf; - - TensorH pweihtc; - TensorH pbiasi; - - pweihtf.re_alloc(shw, AK_FLOAT); - pbiasf.re_alloc(shb, AK_FLOAT); - - pweihtc.re_alloc(shw, AK_INT8); - pbiasi.re_alloc(shb, AK_INT32); - - fill_tensor_rand(thinf, -20, 20); - fill_tensor_rand(pweihtf, -10, 10); - fill_tensor_rand(pbiasf, -10, 10); -// fill_tensor_const(thinf, 1.f); -// fill_tensor_const(pweihtf, 1.f); -// fill_tensor_const(pbiasf, 1.f); - - //! convert input data type - get_tensor_scale_inplace(thinf, 0, 63.f); -// LOG(INFO) << "input tesnor scale at factor 63.f is " << thinf.get_scale()[0] << ", max_val: " << 63.f * thinf.get_scale()[0]; - trans_tensor_fp32_to_int8(thinf, thinc, &ctx1); - thinc.set_scale(thinf.get_scale()); -// print_tensor(thinc); - - //! convert weight data type - Tensor tmp_w; - Shape act_shape = pweihtf.valid_shape(); - int tmp_c = act_shape[1]; - act_shape[1] = act_shape[0]; - act_shape[0] = tmp_c; - tmp_w.set_shape(act_shape); - tmp_w.share_from(pweihtf); - get_tensor_scale_inplace(tmp_w, 1, 63.f); - std::vector w_scale = tmp_w.get_scale(); -// LOG(INFO) << "input tesnor scale at factor 63.f is "; -// for (int j = 0; j < w_scale.size(); ++j) { -// LOG(INFO) << "|-- " << j << ": " << w_scale[j] << ", max_val: " << 63.f * w_scale[j]; -// } - trans_fp32_weights_to_int8(tmp_w, pweihtc, 63.f, 1, &ctx1); - trans_fp32_bias_to_int32(pbiasf, pbiasi, thinf.get_scale()[0], w_scale, &ctx1); - -// print_tensor(pweihtc); -// print_tensor(pbiasi); - - //! 
get int8 and fp32 basic result - if (g_compare_result) { - LOG(INFO) << "run basic conv for precision comparation"; - const char* dinc = static_cast(thinc.data()); - const char* weightc = static_cast(pweihtc.data()); - const int* biasi = static_cast(pbiasi.data()); - const float* dinf = static_cast(thinf.data()); - const float* weightf = static_cast(pweihtf.data()); - const float* biasf = static_cast(pbiasf.data()); - tout_basic_fp32.re_alloc(shape_out, AK_FLOAT); - tout_basic_int32.re_alloc(shape_out, AK_INT32); - tout_basic_int8.re_alloc(shape_out, AK_INT8); - - float* dout_basic_fp32 = static_cast(tout_basic_fp32.mutable_data()); - int* dout_basic_int32 = static_cast(tout_basic_int32.mutable_data()); - -// LOG(INFO) << "do basic fp32 conv"; -// conv_arm_basic(dinf, dout_basic_fp32, num, ch_out, hout, wout, chin, hin, win, \ -// weightf, biasf, group, kernel_w, kernel_h, stride_w, stride_h, \ -// dila_w, dila_h, pad_w, pad_h, is_bias, is_relu, &ctx1, nullptr, nullptr); - - LOG(INFO) << "do basic int8 conv, trans basic int32 to fp32"; - deconv_basic(dinc, dout_basic_int32, num, ch_out, hout, wout, chin, hin, win, \ - weightc, biasi, group, kernel_w, kernel_h, stride_w, stride_h, \ - dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); - -// LOG(INFO) << "trans basic int32 to int8"; -// trans_tensor_int32_to_int8(tout_basic_int32, tout_basic_int8, thinf.get_scale()[0], w_scale, &ctx1); - - trans_tensor_int32_to_fp32(tout_basic_int32, tout_basic_fp32, thinf.get_scale()[0], w_scale, &ctx1); - -// print_tensor(tout_basic_fp32); -// print_tensor(tout_basic_int32); - } - - SaberDeconv2D deconv_int8; - - Conv2DParam param(pweihtf.valid_size(), ch_out, group, kernel_w, kernel_h, \ - stride_w, stride_h, pad_w, pad_h, dila_w, dila_h, is_bias, \ - static_cast(pweihtf.data()), static_cast(pbiasf.data()), \ - false, is_relu, Active_relu, 0.f, 1.f, false, nullptr); - - - deconv_int8.load_param(¶m); - -// deconv_int8.compute_output_shape(tvin_int8, tvout_saber_int32); -// Shape sh_out_saber = tvout_saber_int32[0]->valid_shape(); - deconv_int8.compute_output_shape(tvin_int8, tvout_saber_fp32); - Shape sh_out_saber = tvout_saber_fp32[0]->valid_shape(); - - - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor -// LOG(INFO) << "re-alloc output memory"; - tvout_saber_int32[0]->re_alloc(shape_out, AK_INT32); - tvout_saber_fp32[0]->re_alloc(shape_out, AK_FLOAT); - tvout_saber_int8[0]->re_alloc(shape_out, AK_INT8); - - //! set compute precision -// LOG(INFO) << "set compute precision"; - auto states = deconv_int8.set_op_precision(AK_INT8); - CHECK_EQ(states, SaberSuccess) << "Saber conv op precision to int8 failed"; - - //! init the op -// LOG(INFO) << "saber conv impl init"; -// states = deconv_int8.init(tvin_int8, tvout_saber_int32, ctx1); - states = deconv_int8.init(tvin_int8, tvout_saber_fp32, ctx1); - CHECK_EQ(states, SaberSuccess) << "Saber conv init failed"; - - //! 
compute -// LOG(INFO) << "saber conv compute"; - to = 0; - for (int i = 0; i < g_test_iter; ++i) { - t1.clear(); - t1.start(); -// states = deconv_int8.dispatch(tvin_int8, tvout_saber_int32); - states = deconv_int8.dispatch(tvin_int8, tvout_saber_fp32); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - CHECK_EQ(states, SaberSuccess) << "Saber conv compute failed"; - } - long long gops = n * ch_out * wout * ch_out * (chin / group) * kernel_w * kernel_h; - LOG(INFO) << "saber conv running time, ave: " << to / g_test_iter << ", min time: " << min_time << \ - ", GOPS: " << 0.000001 * gops / min_time; - -// print_tensor(tout_saber_fp32); - - if (g_compare_result) { - double max_ratio = 0; - double max_diff = 0; -// tensor_cmp_host(tout_basic_int32, tout_saber_int32, max_ratio, max_diff); - tensor_cmp_host(tout_basic_fp32, tout_saber_fp32, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabsf(max_ratio) > 5e-2f) { - TensorH tdiff; - tdiff.re_alloc(shape_out, AK_INT32); -// tensor_diff(tout_basic_int32, tout_saber_int32, tdiff); - tensor_diff(tout_basic_fp32, tout_saber_fp32, tdiff); - LOG(INFO) << "basic result:"; -// print_tensor(tout_basic_int32); - print_tensor(tout_basic_fp32); - LOG(INFO) << "saber result:"; -// print_tensor(tout_saber_int32); - print_tensor(tout_saber_fp32); - LOG(INFO) << "diff result:"; - print_tensor(tdiff); - return SaberInvalidValue; - } -// CHECK_EQ(fabsf(max_ratio) < 1e-4f, true) << "compute result error"; - } - return SaberSuccess; -} - -#if 1 -TEST(TestSaberLite, test_func_conv_gemm_int8) { - if (g_basic_test) { - for (auto& batch : {1, 2}) { - for (auto& c : {1, 3, 8, 16}) { - for (auto& cout : {1, 5, 16}) { - for (auto& g_div : {1, 2}) { - for (auto& h : {2, 3, 15, 28, 56, 112, 128, 150, 224, 300}) { - for (auto& kw : {1, 2, 3, 5}) { - for (auto& kh : {1, 2, 3, 5}) { - for (auto& pad : {1, 2}) { - for (auto& stride : {1, 2}) { - for (auto& dila : {1, 2}) { - for (auto &flag_bias : {false, true}) { - for (auto &flag_relu : {false, true}) { - for (auto &th : {1, 2, 4}) { - int w = h; - int g = g_div; - if ((c % g_div != 0) || (cout % g_div != 0)) { - g = 1; - } - auto flag = test_arm_deconv_int8(batch, c, h, w, cout, 1, 1, 1, 1, \ - 0, 0, 1, 1, g, flag_bias, flag_relu, th, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 deconv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? "true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " passed!!\n"; - } else { - LOG(FATAL) << "test int8 deconv: batchsize: " << batch << ", channel: " - << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " - << (flag_relu ? 
"true" : "false") << ", threads: " << \ - th << ", cluster: " << g_cluster << " failed!!\n"; - } - } - } - } - } - } - } - } - } - } - } - } - } - } - } -} -#endif - -#if 1 -TEST(TestSaberLite, test_conv_int8_costom_size) { - auto flag = test_arm_deconv_int8(g_num, g_chin, g_h_in, g_w_in, g_ch_out, g_kw, g_kh, g_stride_w, g_stride_h, \ - g_pad_w, g_pad_h, g_dila_w, g_dila_h, g_group, g_flag_bias, g_flag_relu, g_threads, g_cluster); - if (flag == SaberSuccess) { - LOG(INFO) << "test int8 deconv: batchsize: " << g_num << ", channel: " - << g_chin << ", h & w: " << g_h_in << ", num_out: " << g_ch_out << ", group: " << g_group << \ - ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " - << (g_flag_relu ? "true" : "false") << ", threads: " << \ - g_threads << ", cluster: " << g_cluster << " passed!!\n"; - } else { - LOG(INFO) << "test int8 deconv: batchsize: " << g_num << ", channel: " - << g_chin << ", h & w: " << g_h_in << ", num_out: " << g_ch_out << ", group: " << g_group << \ - ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " - << (g_flag_relu ? "true" : "false") << ", threads: " << \ - g_threads << ", cluster: " << g_cluster << " failed!!\n"; - } -} -#endif - -int main(int argc, const char** argv){ - Env::env_init(); - LOG(ERROR) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; - - if (argc >= 2) { - g_basic_test = atoi(argv[1]) > 0; - } - - if (argc >= 3) { - g_cluster = atoi(argv[2]); - } - if (argc >= 4) { - g_threads = atoi(argv[3]); - } - if (argc >= 5) { - g_test_iter = atoi(argv[4]); - } - if (argc >= 6) { - g_compare_result = atoi(argv[5]) > 0; - } - if (argc >= 7) { - g_flag_bias = atoi(argv[6]) > 0; - } - if (argc >= 8) { - g_flag_relu = atoi(argv[7]) > 0; - } - if (argc >= 9) { - if (argc < 18) { - LOG(FATAL) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ - " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ - " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; - return -1; - } - g_num = atoi(argv[8]); - g_chin = atoi(argv[9]); - g_h_in = atoi(argv[10]); - g_w_in = atoi(argv[11]); - g_ch_out = atoi(argv[12]); - g_group = atoi(argv[13]); - g_kw = atoi(argv[14]); - g_kh = g_kw; - g_pad_w = atoi(argv[15]); - g_pad_h = g_pad_w; - g_stride_w = atoi(argv[16]); - g_stride_h = g_stride_w; - g_dila_w = atoi(argv[17]); - g_dila_h = g_dila_w; - } - if (argc > 18) { - g_kh = atoi(argv[18]); - } - if (argc > 19) { - g_pad_h = atoi(argv[19]); - } - if (argc > 20) { - g_stride_h = atoi(argv[20]); - } - if (argc > 21) { - g_dila_h = atoi(argv[21]); - } - - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_eltwise_act_lite.cpp b/test/lite/test_eltwise_act_lite.cpp deleted file mode 100644 index e738c092f..000000000 --- a/test/lite/test_eltwise_act_lite.cpp +++ /dev/null @@ -1,451 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_eltwise_act.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; -int test_iter = 10; - -int num_in = 9; -int ch_in = 9; -int w_in = 9; -int h_in = 9; -int cluster = 0; -int threads = 4; -int act_type = 2; -int elt_type = 1; - -typedef Tensor TensorHf4; - -#define COMPARE_RESULT 1 - -void eltwise_active_basic(const Context &ctx, TensorHf4& tensor_out, \ - std::vector &tensor_in, int op_type, 
std::vector coeffs_ptr, int num_coeff, \ - int act_type, bool channel_shared, float* slope_ptr) { - CHECK_GT(tensor_out.size(), 0) << "output tensor is empty"; - CHECK_GT(tensor_in.size(), 1) << "input tensor is empty"; - - int w_in = tensor_in[0]->width(); - int h_in = tensor_in[0]->height(); - int ch_in = tensor_in[0]->channel(); - int num = tensor_in[0]->num(); - int size_in = w_in * h_in; - - float* data_out = tensor_out.mutable_data(); - const float* data_in0 = tensor_in[0]->data(); - const float* data_in1 = tensor_in[1]->data(); - - if (op_type == 1){ //Operation_PROD - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in0_batch = data_in0 + n * ch_in * size_in; - const float* data_in1_batch = data_in1 + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in0_channel = data_in0_batch + c * size_in; - const float* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_in0_channel[i] * data_in1_channel[i]; - if (act_type == 2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_out_channel[i] * data_in_channel[i]; - if (act_type == 2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - } - } - if (op_type == 2){ //Operation_SUM - if (num_coeff == 0){ - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in0_batch = data_in0 + n * ch_in * size_in; - const float* data_in1_batch = data_in1 + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in0_channel = data_in0_batch + c * size_in; - const float* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_in0_channel[i] + data_in1_channel[i]; - if (act_type == 2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? 
data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_out_channel[i] + data_in_channel[i]; - if (act_type ==2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - } - }else{ - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in0_batch = data_in0 + n * ch_in * size_in; - const float* data_in1_batch = data_in1 + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in0_channel = data_in0_batch + c * size_in; - const float* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_in0_channel[i]*coeffs_ptr[0] + \ - data_in1_channel[i]*coeffs_ptr[1]; - if (act_type == 2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_out_channel[i] + \ - data_in_channel[i] * coeffs_ptr[b]; - if (act_type == 2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - } - } - } - if (op_type == 3){ //Operation_MAX - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in0_batch = data_in0 + n * ch_in * size_in; - const float* data_in1_batch = data_in1 + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in0_channel = data_in0_batch + c * size_in; - const float* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = std::max(data_in0_channel[i], data_in1_channel[i]); - if (act_type == 2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? 
data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - float* data_out_batch = data_out + n * ch_in * size_in; - const float* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - float* data_out_channel = data_out_batch + c * size_in; - const float* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = std::max(data_out_channel[i], data_in_channel[i]); - if (act_type == 2)data_out_channel[i] = data_out_channel[i] > 0 ? data_out_channel[i] : 0.f; - if (act_type == 10){ - data_out_channel[i] = data_out_channel[i] < 0 ? \ - (channel_shared ? data_out_channel[i] * slope_ptr[0] : data_out_channel[i] * slope_ptr[c]) : data_out_channel[i]; - } - } - } - } - } - } - -} - -void test_eltwise_act(std::vector& tin, int operation, \ - std::vector coeffs_ptr, int num_coeff, int threads, int cluster_id, int act_type) { - - // int test_iter = 100; - double to = 0; - double min_time = 1000000; - SaberTimer t1; - SaberTimer t2; - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } -TensorHf4 tout_basic; - TensorHf4 tout_saber; - - //TensorHf4* thin = tin[0]; - - std::vector tvout_saber; - std::vector tvout_basic; - - tvout_saber.push_back(&tout_saber); - tvout_basic.push_back(&tout_basic); - - int numin = tin[0]->num(); - int chin = tin[0]->channel(); - int hin = tin[0]->height(); - int win = tin[0]->width(); - int pad = 0; - - LOG(INFO) << "eltwise active param: "; - LOG(INFO) << " img_num = " << numin; - LOG(INFO) << " in_channels = " << chin; - LOG(INFO) << " img_h = " << hin; - LOG(INFO) << " img_w = " << win; - // enum { Eltwise_prod = 1, Eltwise_sum = 2, Eltwise_max = 3 }; - // LOG(INFO) << "operation: " << operation; - if (operation == 1) - LOG(INFO) << " operation = " << Eltwise_prod; - if (operation == 2) - LOG(INFO) << " operation = " << Eltwise_sum; - if (operation == 3) - LOG(INFO) << " operation = " << Eltwise_max; - LOG(INFO) << "active = " << act_type; - - int input_dim = 1; - Shape shape_out = tin[0]->valid_shape(); - for (int i = 0; i < 4; i++){ - shape_out[i] = tin[0]->valid_shape()[i]; - } - //Shape shape_out{num, ch_out, h_out, w_out} - - TensorHf4 tslop; - Shape shape{numin, chin, 1, 1}; - tslop.re_alloc(shape); - fill_tensor_rand(tslop, -1.f, 1.f); - -#ifdef USE_COMPARE - -/* - LOG(INFO) << "initial input tensor data 0:"; - print_tensor_host(*tin[0]); - LOG(INFO) << "initial input tensor data 1:"; - print_tensor_host(*tin[1]); -*/ - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - - LOG(INFO) << "run basic eltwise active for precision comparation"; - tout_basic.re_alloc(shape_out); - - to = 0; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - if (act_type == 2) - eltwise_active_basic(ctx1, tout_basic, tin, operation, coeffs_ptr, num_coeff, act_type, false, nullptr); - if (act_type == 10){ - eltwise_active_basic(ctx1, tout_basic, tin, operation, coeffs_ptr, num_coeff, act_type, false, tslop.data()); - } - - //tvout_basic[0] 
->record_event(ctx1.get_compute_stream()); - //tvout_basic[0] ->sync(); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "basic eltwise running time, ave: " << to / test_iter << ", min time: " << min_time; - // print_tensor_host(tout_basic); -#endif - - SaberEltwiseAct eltwise_act_saber; - EltwiseActParam eltwise_act_param((EltwiseType)operation, coeffs_ptr, (ActiveType)act_type, 0.f, 1.f, false, tslop.data()); - // ParamBase* base =new EltwiseActParam(operation, coeffs_ptr, act_type, 0.f, 1.f, false, tslop.data()); - LOG(INFO) << "saber eltwise act load param"; - eltwise_act_saber.load_param(&eltwise_act_param); - //LITE_CHECK(eltwise_act_saber.load_param(&eltwise_act_param)); - - LOG(INFO) << "saber eltwise act compute output shape"; - eltwise_act_saber.compute_output_shape(tin, tvout_saber); - - Shape sh_out_saber = tvout_saber[0]->valid_shape(); - LOG(INFO) << "output shape_1: " << sh_out_saber[0] << ", " << sh_out_saber[1] << ", " \ - << sh_out_saber[2] << ", " << sh_out_saber[3]; - //LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor - tvout_saber[0]->re_alloc(shape_out); - - LOG(INFO) << "saber eltwise act impl init"; - CHECK_EQ(eltwise_act_saber.init(tin, tvout_saber, ctx1), SaberSuccess) << "init error"; - //SABER_CHECK(eltwise_act_saber.init(tin, tvout_saber, eltwise_act_param, SPECIFY, SABER_IMPL, ctx1)); - - //! compute - LOG(INFO) << "saber eltwise act compute"; - to = 0; - min_time = 1000000; - for (int i = 0; i < test_iter; ++i) { - t2.clear(); - t2.start(); - //eltwise_arm(ctx2, tout_saber, tin, operation, coeffs_ptr, num_coeff); - //eltwise_act_saber(tin, tvout_saber, eltwise_act_param, ctx1); - eltwise_act_saber.dispatch(tin, tvout_saber); - // tvout_saber[0]->record_event(ctx1.get_compute_stream()); - // tvout_saber[0]->sync(); - t2.end(); - //printf("i: %d \n",i); - to += t2.get_average_ms(); - if (t2.get_average_ms() < min_time) { - min_time = t2.get_average_ms(); - } - } - LOG(INFO) << "saber eltwise active running time, ave: " << to / test_iter << ", min time: " << min_time; - // print_tensor_host(tout_saber); - //print_tensor_host(*tvout_saber[0]); - -#ifdef USE_COMPARE - double max_ratio = 0; - double max_diff = 0; - //TensorHf4 tdiff(tout_basic.valid_shape()); - //tensor_diff(tout_basic, tout_saber, tdiff); - //print_tensor_host(tdiff); - // tensor_cmp_host(tout_basic.data(), tout_saber.data(), tout_basic.valid_size(), max_ratio, max_diff); - // LOG(INFO) << "tout_basic"; - // print_tensor_host(tout_basic); - // LOG(INFO) << "tout_saber"; - // print_tensor_host(tout_saber); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif -} - -#if 1 -TEST(TestSaberLite, test_func_eltwise_act_lite) { - - int num = num_in; - int chin = ch_in; - int hin = h_in; - int win = w_in; - - // bool bias_term = false; - // bool global = true; - // PoolingType type = 1; - - Shape shape_in(num, chin, hin, win); - - //fill_tensor_host_const(tdin, 1.f); - - std::vector tin; - TensorHf4 tdin; - tdin.re_alloc(shape_in); - fill_tensor_rand(tdin, -1.f, 1.f); - TensorHf4 tdin1; - tdin1.re_alloc(shape_in); - fill_tensor_rand(tdin1, -1.f, 1.f); - - tin.push_back(&tdin); - tin.push_back(&tdin1); - - - std::vector 
coeffs_ptr; - - coeffs_ptr.push_back(1.0f); - coeffs_ptr.push_back(1.0f); - //printf("test_arm_eltwise: GLB_operation: %d \n", GLB_operation); - // LOG(INFO) << "elt_type: " << elt_type; - test_eltwise_act(tin, elt_type, coeffs_ptr, 0, threads, cluster, act_type); - //LOG(WARNING) << "pooling not support yet"; -} -#endif - -int main(int argc, const char** argv){ - - Env::env_init(); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc >= 4){ - test_iter = atoi(argv[3]); - } - if (argc >= 5 ) { - elt_type = atoi(argv[4]); - } - if (argc >= 6 ) { - act_type = atoi(argv[5]); - } - if (argc >= 7) { - if (argc < 10) { - LOG(ERROR) << "usage: ./" << argv[0] << " cluster threads test_iter " << \ - " elt_type act_type num ch_in h_in w_in"; - return 0; - } - num_in = atoi(argv[6]); - ch_in = atoi(argv[7]); - h_in = atoi(argv[8]); - w_in = atoi(argv[9]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_eltwise_lite.cpp b/test/lite/test_eltwise_lite.cpp deleted file mode 100644 index 00b3a7f8e..000000000 --- a/test/lite/test_eltwise_lite.cpp +++ /dev/null @@ -1,410 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_eltwise.h" -#include "saber/saber_types.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; -int test_iter = 10; - -int num_in = 9; -int ch_in = 9; -int w_in = 9; -int h_in = 9; -int cluster = 0; -int threads = 4; -int elt_type = 2; -DataType Dtype = AK_FLOAT; -typedef Tensor TensorHf4; - -#define COMPARE_RESULT 1 - - -template -void eltwise_basic(const Context &ctx, TensorHf4& tensor_out, \ - std::vector &tensor_in, int op_type, std::vector coeffs_ptr, int num_coeff) { - CHECK_GT(tensor_out.size(), 0) << "output tensor is empty"; - CHECK_GT(tensor_in.size(), 1) << "input tensor is empty"; - - int w_in = tensor_in[0]->width(); - int h_in = tensor_in[0]->height(); - int ch_in = tensor_in[0]->channel(); - int num = tensor_in[0]->num(); - int size_in = w_in * h_in; - - dtype* data_out = tensor_out.mutable_data(); - const dtype* data_in0 = tensor_in[0]->data(); - const dtype* data_in1 = tensor_in[1]->data(); - - if (op_type == 1){ //Operation_PROD - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in0_batch = data_in0 + n * ch_in * size_in; - const dtype* data_in1_batch = data_in1 + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in0_channel = data_in0_batch + c * size_in; - const dtype* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_in0_channel[i] * data_in1_channel[i]; - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_out_channel[i] * data_in_channel[i]; - } - } - } - } - } - if (op_type == 2){ //Operation_SUM - if (num_coeff == 0){ - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in0_batch = data_in0 + n * ch_in * size_in; - const dtype* data_in1_batch = data_in1 + n * ch_in 
* size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in0_channel = data_in0_batch + c * size_in; - const dtype* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_in0_channel[i] + data_in1_channel[i]; - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_out_channel[i] + data_in_channel[i]; - } - } - } - } - }else{ - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in0_batch = data_in0 + n * ch_in * size_in; - const dtype* data_in1_batch = data_in1 + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in0_channel = data_in0_batch + c * size_in; - const dtype* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_in0_channel[i] * coeffs_ptr[0] + \ - data_in1_channel[i] * coeffs_ptr[1]; - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = data_out_channel[i] + \ - data_in_channel[i] * coeffs_ptr[b]; - } - } - } - } - } - } - if (op_type == 3){ //Operation_MAX - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in0_batch = data_in0 + n * ch_in * size_in; - const dtype* data_in1_batch = data_in1 + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in0_channel = data_in0_batch + c * size_in; - const dtype* data_in1_channel = data_in1_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = std::max(data_in0_channel[i], data_in1_channel[i]); - } - } - } - for (int b = 2; b data(); - for (int n = 0; n < num; n++){ - dtype* data_out_batch = data_out + n * ch_in * size_in; - const dtype* data_in_batch = data_in + n * ch_in * size_in; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++){ - dtype* data_out_channel = data_out_batch + c * size_in; - const dtype* data_in_channel = data_in_batch + c * size_in; - for (int i = 0; i < size_in; i++){ - data_out_channel[i] = std::max(data_out_channel[i], data_in_channel[i]); - } - } - } - } - } -} - -void test_eltwise(DataType datatype, std::vector& tin, int operation, \ - std::vector coeffs_ptr, int num_coeff, int threads, int cluster_id) { - - // int test_iter = 100; - double to = 0; - double min_time = 1000000; - SaberTimer t1; - SaberTimer t2; - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtime context"; - PowerMode mode = cluster == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - TensorHf4 tout_basic; - TensorHf4 tout_saber; - - //TensorHf4* thin = tin[0]; - - std::vector tvout_saber; - std::vector tvout_basic; - - tvout_saber.push_back(&tout_saber); - tvout_basic.push_back(&tout_basic); - - int numin = tin[0]->num(); - int chin = tin[0]->channel(); - int hin = tin[0]->height(); - int win = tin[0]->width(); - - LOG(INFO) << "eltwise param: "; - LOG(INFO) << " img_num = " << numin; - LOG(INFO) << " in_channels = " << chin; - LOG(INFO) << " img_h = " << hin; - LOG(INFO) << " img_w = " << win; - // enum { Eltwise_prod = 1, Eltwise_sum = 2, Eltwise_max = 3 }; - // LOG(INFO) << "operation: " << operation; - if (operation == 1) - LOG(INFO) << " operation = " << Eltwise_prod; - if (operation == 2) - LOG(INFO) << " operation = " << Eltwise_sum; - if (operation == 3) - LOG(INFO) << " operation = " << Eltwise_max; - - int input_dim = 1; - Shape shape_out = tin[0]->valid_shape(); - for (int i = 0; i < 4; i++){ - shape_out[i] = tin[0]->valid_shape()[i]; - } - //Shape shape_out{num, ch_out, h_out, w_out} - -#ifdef COMPARE_RESULT - -/* - LOG(INFO) << "initial input tensor data 0:"; - print_tensor_host(*tin[0]); - LOG(INFO) << "initial input tensor data 1:"; - print_tensor_host(*tin[1]); -*/ - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - - LOG(INFO) << "run basic eltwise for precision comparation"; - tout_basic.re_alloc(shape_out); - - to = 0; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - if (datatype == AK_FLOAT){ - eltwise_basic(ctx1, tout_basic, tin, operation, coeffs_ptr, num_coeff); - } - else if (datatype == AK_INT8){ - eltwise_basic(ctx1, tout_basic, tin, operation, coeffs_ptr, num_coeff); - } - //tvout_basic[0] ->record_event(ctx1.get_compute_stream()); - //tvout_basic[0] ->sync(); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "basic eltwise running time, ave: " << to / test_iter << ", min time: " << min_time; - // print_tensor_host(tout_basic); -#endif - - SaberEltwise eltwise_saber; - EltwiseParam eltwise_param((EltwiseType)operation, coeffs_ptr); - // ParamBase* base =new EltwiseActParam(operation, coeffs_ptr, act_type, 0.f, 1.f, false, tslop.data()); - LOG(INFO) << "saber eltwise load param"; - eltwise_saber.load_param(&eltwise_param); - //LITE_CHECK(eltwise_act_saber.load_param(&eltwise_act_param)); - LOG(INFO) << "saber eltwise compute output shape"; - eltwise_saber.compute_output_shape(tin, tvout_saber); - - Shape sh_out_saber = tvout_saber[0]->valid_shape(); - LOG(INFO) << "output shape_1: " << sh_out_saber[0] << ", " << sh_out_saber[1] << ", " \ - << sh_out_saber[2] << ", " << sh_out_saber[3]; - //LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor - tvout_saber[0]->re_alloc(shape_out); - - LOG(INFO) << "saber eltwise act impl init"; - CHECK_EQ(eltwise_saber.init(tin, tvout_saber, ctx1), SaberSuccess) << "init error"; - //SABER_CHECK(eltwise_act_saber.init(tin, tvout_saber, eltwise_act_param, SPECIFY, SABER_IMPL, ctx1)); - - //! 
compute - LOG(INFO) << "saber eltwise compute"; - to = 0; - min_time = 1000000; - for (int i = 0; i < test_iter; ++i) { - t2.clear(); - t2.start(); - //eltwise_arm(ctx2, tout_saber, tin, operation, coeffs_ptr, num_coeff); - //eltwise_act_saber(tin, tvout_saber, eltwise_act_param, ctx1); - eltwise_saber.dispatch(tin, tvout_saber); - // tvout_saber[0]->record_event(ctx1.get_compute_stream()); - // tvout_saber[0]->sync(); - t2.end(); - //printf("i: %d \n",i); - to += t2.get_average_ms(); - if (t2.get_average_ms() < min_time) { - min_time = t2.get_average_ms(); - } - } - LOG(INFO) << "saber eltwise running time, ave: " << to / test_iter << ", min time: " << min_time; - // print_tensor_host(tout_saber); - //print_tensor_host(*tvout_saber[0]); - -#ifdef COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - //TensorHf4 tdiff(tout_basic.valid_shape()); - //tensor_diff(tout_basic, tout_saber, tdiff); - //print_tensor_host(tdiff); - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - // LOG(INFO) << "tout_basic"; - // print_tensor_host(tout_basic); - // LOG(INFO) << "tout_saber"; - // print_tensor_host(tout_saber); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif -} - -#if 1 -TEST(TestSaberLite, test_func_eltwise_lite) { - - int num = num_in; - int chin = ch_in; - int hin = h_in; - int win = w_in; - - // bool bias_term = false; - // bool global = true; - // PoolingType type = 1; - - Shape shape_in(num, chin, hin, win); - - //fill_tensor_host_const(tdin, 1.f); - - std::vector tin; - TensorHf4 tdin; - tdin.re_alloc(shape_in, Dtype); - TensorHf4 tdin1; - tdin1.re_alloc(shape_in, Dtype); - if (Dtype == AK_FLOAT){ - fill_tensor_rand(tdin, -1.f, 1.f); - fill_tensor_rand(tdin1, -1.f, 1.f); - } else if (Dtype == AK_INT8){ - for (int i = 0; i < tdin.valid_size(); ++i){ - static_cast(tdin.mutable_data())[i] = i % 126 - 63; - static_cast(tdin1.mutable_data())[i] = i % 126 - 63; - } - } - - tin.push_back(&tdin); - tin.push_back(&tdin1); - - - std::vector coeffs_ptr; - - coeffs_ptr.push_back(1.0f); - coeffs_ptr.push_back(1.0f); - //printf("test_arm_eltwise: GLB_operation: %d \n", GLB_operation); - // LOG(INFO) << "elt_type: " << elt_type; - test_eltwise(Dtype, tin, elt_type, coeffs_ptr, 0, threads, cluster); - //LOG(WARNING) << "pooling not support yet"; -} -#endif - -int main(int argc, const char** argv){ - - Env::env_init(); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc >= 4){ - test_iter = atoi(argv[3]); - } - if (argc >= 5 ) { - elt_type = atoi(argv[4]); - } - if (argc >= 6){ - Dtype = atoi(argv[5]); - } - if (argc >= 7) { - if (argc < 10) { - LOG(ERROR) << "usage: ./" << argv[0] << " cluster threads test_iter " << \ - " elt_type datatype num ch_in h_in w_in"; - return 0; - } - num_in = atoi(argv[6]); - ch_in = atoi(argv[7]); - h_in = atoi(argv[8]); - w_in = atoi(argv[9]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite.h b/test/lite/test_lite.h deleted file mode 100644 index 3da03cb02..000000000 --- a/test/lite/test_lite.h +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2016 Anakin Authors All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifndef ANAKIN2_TEST_SABER_TEST_SABER_FUNC_TEST_ARM_H -#define ANAKIN2_TEST_SABER_TEST_SABER_FUNC_TEST_ARM_H - -#include "utils/unit_test/aktest.h" -#include "utils/logger/logger.h" -#include -#include - -#include "saber/lite/core/context_lite.h" -#include "saber/lite/core/tensor_op_lite.h" -#include "saber/lite/funcs/timer_lite.h" - -using namespace anakin::test; - -int read_file(std::vector &results, const char* file_name) { - - std::ifstream infile(file_name); - if (!infile.good()) { - LOG(ERROR) << "Cannot open " << file_name; - return false; - } - LOG(INFO) << "found filename: " << file_name; - std::string line; - while (std::getline(infile, line)) { - results.push_back((float)atof(line.c_str())); - } - return 0; -} - -static int get_rand(int start, int end) { - int i = rand(); - i = (i % (end - start)) + start; - return i; -} - -template -static void basic_gemm(int m, int n, int k, const type* a, const type* b, const type2* bias, type2* c, \ - type2 alpha, type2 beta, \ - bool trans_a = false, bool trans_b = false, bool flag_bias = false, bool flag_relu = false) { -//#pragma omp parallel for - for (int i = 0; i < m; ++i) { - type2 bias_data = (type2)0; - if (flag_bias) { - bias_data = bias[i]; - } - for (int j = 0; j < n; ++j) { - type2 sum = static_cast(0); - for (int l = 0; l < k; ++l) { - type av; - type bv; - if (trans_a) { - av = a[l * m + i]; - } else{ - av = a[i * k + l]; - } - if (trans_b) { - bv = b[j * k + l]; - } else { - bv = b[l * n + j]; - } - sum += av * bv; - } - type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data; - if (flag_relu) { - c[i * n + j] = tmp > (type2)0? tmp : (type2)0; - } else { - c[i * n + j] = tmp; - } - } - } -} - -template -static void fill_bias_relu(Dtype* tensor, const Dtype* bias, int channel, int channel_size, \ - bool flag_bias, bool flag_relu) { - Dtype* data = tensor; - for (int j = 0; j < channel; ++j) { - Dtype bias_c = flag_bias? bias[j] : 0; - for (int i = 0; i < channel_size; i++) { - data[i] += bias_c; - if (flag_relu) { - data[i] = data[i] > 0 ? data[i] : 0.f; - } - } - data += channel_size; - } -} - -template -static void do_relu(Dtype* tensor, int size) { - for (int j = 0; j < size; ++j) { - tensor[j] = tensor[j] > 0 ? 
tensor[j] : (Dtype)0; - } -} - -inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); -} - -template -static void col2im(const Dtype* data_col, const int channels, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - Dtype* data_im) { - - memset(data_im, 0, height * width * channels * sizeof(Dtype)); - const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - const int channel_size = height * width; - - for (int channel = channels; channel--; data_im += channel_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; - - for (int output_rows = output_h; output_rows; output_rows--) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - data_col += output_w; - } else { - int input_col = -pad_w + kernel_col * dilation_w; - - for (int output_col = output_w; output_col; output_col--) { - if (is_a_ge_zero_and_a_lt_b(input_col, width)) { - data_im[input_row * width + input_col] += *data_col; - } - data_col++; - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} - -//! for float, dtype1 and type2 is float -//! for int8, dytpe1 is char, dtype2 is int -template -void deconv_basic(const Dtype1* din, Dtype2* dout, \ - int num, int chout, int hout, int wout, \ - int chin, int hin, int win, \ - const Dtype1* weights, const Dtype2* bias, \ - int group, int kernel_w, int kernel_h, int stride_w, \ - int stride_h, int dila_w, int dila_h, \ - int pad_w, int pad_h, bool flag_bias, bool flag_relu) { - - - int m = chout * kernel_w * kernel_h / group; - int n = hin * win; - int k = chin / group; - - if (chin != chout || group != chin) { - CHECK_EQ(chin % group, 0) << "input channel or group size error"; - CHECK_EQ(chout % group, 0) << "output channel or group size error"; - } - - anakin::saber::lite::Tensor workspace_tensor; - anakin::saber::lite::Shape workspace_shape(1, 1, 1, group * m * n); - workspace_tensor.re_alloc(workspace_shape, anakin::saber::AK_FLOAT); - - int group_size_in = win * hin * chin / group; - int group_size_out = wout * hout * chout / group; - int group_size_coldata = m * n; - int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); - bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && \ - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && \ - (dila_w == 1) && (dila_h == 1); - - Dtype2* workspace_ptr = static_cast(workspace_tensor.mutable_data()); - - for (int i = 0; i < num; ++i) { - const Dtype1* din_batch = din + i * chin * hin * win; - Dtype2* dout_batch = dout + i * chout * hout * wout; - - Dtype2* col_data = workspace_ptr; - if (flag_1x1s1p1) { - col_data = dout_batch; - } - for (int g = 0; g < group; ++g) { - const Dtype1* din_group = din_batch + g * group_size_in; - const Dtype1* weights_group = weights + g * group_size_weights; - Dtype2* coldata_group = col_data + g * group_size_coldata; - basic_gemm(m, n, k, weights_group, din_group, nullptr, coldata_group, \ - (Dtype2)1, (Dtype2)0, true, false, false, (!flag_bias && flag_relu)); - } - - if (!flag_1x1s1p1) { - col2im(col_data, chout, hout, wout, kernel_h, kernel_w, pad_h, pad_w, \ - stride_h, stride_w, dila_h, 
dila_w, dout_batch); - } - //! add bias - if (flag_bias) { - fill_bias_relu(dout_batch, bias, chout, wout * hout, flag_bias, flag_relu); - } - } -} - -/** - * \brief basic direct convolution function - */ -//! for float, dtype1 and type2 is float -//! for int8, dytpe1 is char, dtype2 is int -template -static void conv_basic(const Dtype1* din, Dtype2* dout, \ - int num, int chout, int hout, int wout, \ - int chin, int hin, int win, \ - const Dtype1* weights, const Dtype2* bias, \ - int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ - int pad_w, int pad_h, bool flag_bias, bool flag_relu) { - - Dtype2 beta = 0; - auto src_data = din; - auto dst_data_ref = dout; - auto weights_data = weights; - auto with_bias = flag_bias; - auto bias_data = bias; - - int in_num = num; - int out_channels = chout; - int out_h = hout; - int out_w = wout; - - int in_channel = chin; - int in_h = hin; - int in_w = win; - int out_c_group = out_channels / group; - int in_c_group = in_channel / group; - - for (int n = 0; n < in_num; ++n) { -#pragma omp parallel for collapse(4) - for (int g = 0; g < group; ++g) { - for (int oc = 0; oc < out_c_group; ++oc) { - for (int oh = 0; oh < out_h; ++oh) { - for (int ow = 0; ow < out_w; ++ow) { - int out_idx = n * group * out_c_group * out_h * out_w + g * out_c_group * out_h * out_w - + oc * out_h * out_w + oh * out_w + ow; - Dtype2 bias_d = with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0; - dst_data_ref[out_idx] = bias_d;// + dst_data_ref[out_idx] * beta; - for (int ic = 0; ic < in_c_group; ++ic) { - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - int iw = ow * stride_w - pad_w + kw * (dila_w); - int ih = oh * stride_h - pad_h + kh * (dila_h); - if (iw < 0 || iw >= in_w) continue; - if (ih < 0 || ih >= in_h) continue; - - int iidx = n * in_channel * in_h * in_w - + g * in_c_group * in_h * in_w - + ic * in_h * in_w - + ih * in_w - + iw; - int widx = g * out_c_group * in_c_group * kernel_h * kernel_w - + oc * in_c_group * kernel_h * kernel_w - + ic * kernel_h * kernel_w - + kh * kernel_w - + kw; - - dst_data_ref[out_idx] - += src_data[iidx] - * weights_data[widx]; - } - } - } - if (flag_relu) { - dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 ? 
dst_data_ref[out_idx] : (Dtype2)0; - } - } - } - } - } - } -} - -template -int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio, float tensor_scale) { - double sum_abs1 = 0.0; - double sum_abs2 = 0.0; - for (int i = 0; i < size; ++i) { - sum_abs1 += fabs(src1[i]); - sum_abs2 += fabs(src2[i]); - } - double mean_abs1 = sum_abs1 / size; - double mean_abs2 = sum_abs2 / size; - double mean_val = (mean_abs1 + mean_abs2) / 2.0; - if (max_ratio <= 0) { - max_ratio = 0.1; - } - int count = 0; - for (int i = 0; i < size; ++i) { - double abs_diff = fabs(src1[i] - src2[i]); - double ratio = abs_diff / (fabs(src1[i] + src2[i]) + 1e-12); - if (ratio > max_ratio && abs_diff > (tensor_scale + 1e-5f) && abs_diff > mean_val * 0.1f) { - ++count; - } - } - return count; -} - -class TestSaberLite : public Test { -public: - TestSaberLite() {} - ~TestSaberLite() {} - -protected: - virtual void setup() {} - virtual void teardown() {} - -}; - -#endif //ANAKIN2_TEST_SABER_TEST_SABER_FUNC_TEST_ARM_H diff --git a/test/lite/test_lite_aot_model.cpp b/test/lite/test_lite_aot_model.cpp deleted file mode 100644 index 429bb8189..000000000 --- a/test/lite/test_lite_aot_model.cpp +++ /dev/null @@ -1,186 +0,0 @@ -#include "test_lite.h" -//!change here according to your own model -//#include "mobilenet.h" -#include - -using namespace anakin::saber; -using namespace anakin::saber::lite; -typedef Tensor TensorHf; - -std::string model_file_name; -int FLAGS_num = 1; -int FLAGS_warmup_iter = 1; -int FLAGS_epoch = 1; -int FLAGS_threads = 1; -int FLAGS_cluster = 0; - -TEST(TestSaberLite, test_lite_model) { - - //! create runtime context - LOG(INFO) << "create runtime context"; - Context* ctx1 = new Context; - ctx1->set_run_mode((PowerMode)FLAGS_cluster, FLAGS_threads); - //! test threads - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - //! change here according to your own model - //bool load_flag = mobilenet_load_param(model_file_name.c_str()); - //CHECK_EQ(load_flag, true) << "load model: " << model_file_name << " failed"; - LOG(INFO) << "load model: " << model_file_name << " successed"; - -//! load model from memory -// std::fstream fp(model_file_name, std::ios::in | std::ios::binary); -// std::stringstream str_str; -// str_str << fp.rdbuf(); -// std::string str(str_str.str()); -// LOG(INFO) << "get fstream"; -// const char* w_ptr = str.c_str(); -// bool load_flag = mobilenet_load_weights(w_ptr); -// LOG(WARNING) << "load anakin model file from " << model_file_name << " ..."; -// CHECK_EQ(load_flag, true) << "load model: " << model_file_name << " failed"; -// LOG(INFO) << "load model: " << model_file_name << " successed"; - - //! init net - //! change here according to your own model - //bool init_flag = mobilenet_init(*ctx1); - //CHECK_EQ(init_flag, true) << "init failed"; - LOG(INFO) << "init successed"; - - //! 
change here according to your own model - std::vector vtin_mobilenet;// = mobilenet_get_in(); - LOG(INFO) << "number of input tensor: " << vtin_mobilenet.size(); - for (int i = 0; i < vtin_mobilenet.size(); ++i) { - TensorHf* tin_mobilenet = vtin_mobilenet[i]; - - //!input shape can be changed at each prediction, after reshape input, call xx_init() api; - //tin_mobilenet->reshape(Shape(1, 3, 224, 224)); - - LOG(INFO) << "input tensor size: "; - Shape shin_mobilenet = tin_mobilenet->valid_shape(); - for (int j = 0; j < tin_mobilenet->dims(); ++j) { - LOG(INFO) << "|---: " << shin_mobilenet[j]; - } - //! feed data to input - //! feed input image to input tensor - fill_tensor_const(*tin_mobilenet, 1.f); - } - - //! call init api after reshape input - //mobilenet_init(*ctx1); - - //! change here according to your own model - std::vector vtout_mobilenet;// = mobilenet_get_out(); - LOG(INFO) << "number of output tensor: " << vtout_mobilenet.size(); - for (int i = 0; i < vtout_mobilenet.size(); i++) { - TensorHf* tout = vtout_mobilenet[i]; - LOG(INFO) << "output tensor size: "; - Shape shout = tout->valid_shape(); - for (int j = 0; j < tout->dims(); ++j) { - LOG(INFO) << "|---: " << shout[j]; - } - } - - SaberTimer my_time; - double to = 0; - double tmin = 1000000; - double tmax = 0; - my_time.start(); - SaberTimer t1; - for (int i = 0; i < FLAGS_epoch; i++) { - - for (int j = 0; j < vtin_mobilenet.size(); ++j) { - fill_tensor_const(*vtin_mobilenet[j], 1.f); - printf("input mean val: %.6f\n", tensor_mean(*vtin_mobilenet[j])); - } - t1.clear(); - t1.start(); - //! change here according to your own model - //mobilenet_prediction(); - t1.end(); - float tdiff = t1.get_average_ms(); - if (tdiff > tmax) { - tmax = tdiff; - } - if (tdiff < tmin) { - tmin = tdiff; - } - to += tdiff; - LOG(INFO) << "mobilenet iter: " << i << ", time: " << tdiff << "ms"; - for (int i = 0; i < vtout_mobilenet.size(); ++i) { - double mean_val = tensor_mean(*vtout_mobilenet[i]); - LOG(INFO) << "mobilenet output mean: " << mean_val; - } - } - my_time.end(); - - LOG(INFO) << model_file_name << " batch_size " << FLAGS_num << " average time " << to/ FLAGS_epoch << \ - ", min time: " << tmin << "ms, max time: " << tmax << " ms"; - - for (int i = 0; i < vtout_mobilenet.size(); ++i) { - double mean_val = tensor_mean(*vtout_mobilenet[i]); - LOG(INFO) << "mobilenet output mean: " << mean_val; - } - - -#ifdef ENABLE_OP_TIMER - OpTimer::print_timer(); -#endif //ENABLE_OP_TIMER - - //! 
change here according to your own model - //mobilenet_release_resource(); - delete ctx1; -} -int main(int argc, const char** argv){ - - Env::env_init(); - // initial logger - logger::init(argv[0]); - - LOG(INFO)<< "usage:"; - LOG(INFO)<< argv[0] << " "; - LOG(INFO)<< " model_file: path to model"; - LOG(INFO)<< " num: batchSize default to 1"; - LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; - LOG(INFO)<< " epoch: time statistic epoch default to 10"; - LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores"; - LOG(INFO)<< " threads: set openmp threads"; - if (argc < 2) { - LOG(ERROR) << "You should fill in the variable model_dir and model_file at least."; - return 0; - } - if (argc > 1) { - model_file_name = argv[1]; - } - - if (argc > 2) { - FLAGS_num = atoi(argv[2]); - } - if (argc > 3) { - FLAGS_warmup_iter = atoi(argv[3]); - } - if (argc > 4) { - FLAGS_epoch = atoi(argv[4]); - } - if (argc > 5) { - FLAGS_cluster = atoi(argv[5]); - if (FLAGS_cluster < 0) { - FLAGS_cluster = 0; - } - if (FLAGS_cluster > 1) { - FLAGS_cluster = 1; - } - } - if (argc > 6) { - FLAGS_threads = atoi(argv[6]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite_merged_model.cpp b/test/lite/test_lite_merged_model.cpp deleted file mode 100644 index 9f17896f4..000000000 --- a/test/lite/test_lite_merged_model.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/net/net_lite.h" -#include "saber/lite/net/saber_factory_lite.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; -typedef Tensor TensorHf; - -std::string lite_model; -int FLAGS_num = 1; -int FLAGS_warmup_iter = 1; -int FLAGS_epoch = 1; -int FLAGS_threads = 1; -int FLAGS_cluster = 0; -bool FLAGS_set_archs = false; -ARMArch FLAGS_arch = A73; - -TEST(TestSaberLite, test_lite_model) { - //! create net, with power mode and threads - Net net((PowerMode)FLAGS_cluster, FLAGS_threads); - //! you can also set net param according to your device - net.set_run_mode((PowerMode)FLAGS_cluster, FLAGS_threads); - if (FLAGS_set_archs) { - net.set_device_arch(FLAGS_arch); - LOG(INFO) << "arm arc: " << FLAGS_arch; - } - net.set_device_cache(32 * 1024, 512* 1024); - //! load merged model - SaberStatus flag = net.load_model(lite_model.c_str()); - CHECK_EQ(flag, SaberSuccess) << "load model: " << lite_model << " failed"; - LOG(INFO) << "load model: " << lite_model << " successed"; - - std::vector vtin = net.get_input(); - LOG(INFO) << "number of input tensor: " << vtin.size(); - for (int i = 0; i < vtin.size(); ++i) { - TensorHf* tin = vtin[i]; - //! reshape input before prediction - Shape shin = tin->valid_shape(); - shin[0] = FLAGS_num; - tin->reshape(shin); - //tin->reshape(Shape(1, 3, 224, 224)); - LOG(INFO) << "input tensor size: "; - //Shape shin = tin->valid_shape(); - for (int j = 0; j < tin->dims(); ++j) { - LOG(INFO) << "|---: " << shin[j]; - } - //! feed data to input - //! feed input image to input tensor - fill_tensor_const(*tin, 1.f); - } - - //! 
change here according to your own model - std::vector vtout = net.get_output(); - LOG(INFO) << "number of output tensor: " << vtout.size(); - for (int i = 0; i < vtout.size(); i++) { - TensorHf* tout = vtout[i]; - LOG(INFO) << "output tensor size: "; - Shape shout = tout->valid_shape(); - for (int j = 0; j < tout->dims(); ++j) { - LOG(INFO) << "|---: " << shout[j]; - } - } - - for (int i = 0; i < FLAGS_warmup_iter; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - net.prediction(); - } - SaberTimer my_time; - double to = 0; - double tmin = 1000000; - double tmax = 0; - my_time.start(); - SaberTimer t1; - for (int i = 0; i < FLAGS_epoch; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - t1.clear(); - t1.start(); - net.prediction(); - t1.end(); - float tdiff = t1.get_average_ms(); - if (tdiff > tmax) { - tmax = tdiff; - } - if (tdiff < tmin) { - tmin = tdiff; - } - to += tdiff; - LOG(INFO) << "iter: " << i << ", time: " << tdiff << "ms"; - } - for (int i = 0; i < vtout.size(); ++i) { -#ifdef ENABLE_DEBUG - const float* ptr = vtout[i]->data(); - for (int j = 0; j < vtout[i]->valid_size(); ++j) { - printf("%f ", ptr[j]); - if ((j + 1) % 10 == 0) { - printf("\n"); - } - } - printf("\n"); -#endif - double mean_val = tensor_mean(*vtout[i]); - LOG(INFO) << "output mean: " << mean_val; - } - my_time.end(); - LOG(INFO) << lite_model << " batch_size " << FLAGS_num << " average time " << to / FLAGS_epoch << \ - ", min time: " << tmin << "ms, max time: " << tmax << " ms"; -#ifdef ENABLE_OP_TIMER - OpTimer::print_timer(); -#endif //ENABLE_OP_TIMER -} -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - - Env::env_init(); - - LOG(INFO)<< "usage:"; - LOG(INFO)<< argv[0] << " "; - LOG(INFO)<< " lite_model: path to anakin lite model"; - LOG(INFO)<< " num: batchSize default to 1"; - LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; - LOG(INFO)<< " epoch: time statistic epoch default to 10"; - LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores, 2: all cores, 3: threads not bind to specify cores"; - LOG(INFO)<< " threads: set openmp threads"; - if(argc < 2) { - LOG(ERROR) << "You should fill in the variable lite model at least."; - return 0; - } - lite_model = argv[1]; - - if (argc > 2) { - FLAGS_num = atoi(argv[2]); - } - if (argc > 3) { - FLAGS_warmup_iter = atoi(argv[3]); - } - if (argc > 4) { - FLAGS_epoch = atoi(argv[4]); - } - if (argc > 5) { - FLAGS_cluster = atoi(argv[5]); - if (FLAGS_cluster < 0) { - FLAGS_cluster = 0; - } - if (FLAGS_cluster > 3) { - FLAGS_cluster = 3; - } - } - if (argc > 6) { - FLAGS_threads = atoi(argv[6]); - } - if (argc > 7) { - FLAGS_set_archs = true; - if (atoi(argv[7]) > 0) { - FLAGS_arch = (ARMArch)atoi(argv[7]); - } else { - FLAGS_arch = ARM_UNKOWN; - } - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite_merged_model_from_mem.cpp b/test/lite/test_lite_merged_model_from_mem.cpp deleted file mode 100644 index 742e484eb..000000000 --- a/test/lite/test_lite_merged_model_from_mem.cpp +++ /dev/null @@ -1,155 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/net/net_lite.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; -typedef Tensor TensorHf; - -std::string lite_model; -int FLAGS_num = 1; -int FLAGS_warmup_iter = 1; -int FLAGS_epoch = 1; -int FLAGS_threads = 1; -int FLAGS_cluster = 0; - -TEST(TestSaberLite, test_lite_model) { - - //! 
create net, with power mode and threads - Net net((PowerMode)FLAGS_cluster, FLAGS_threads); - //! you can also set net param according to your device - //net.set_run_mode((PowerMode)FLAGS_cluster, FLAGS_threads); - //net.set_device_cache(32000, 2000000); - - //! load model from memory - std::fstream fp_merge(lite_model, std::ios::in | std::ios::binary); - - fp_merge.seekg (0, std::ios::end); - long long len_merge = fp_merge.tellg(); - fp_merge.seekg (0, std::ios::beg); - - char* merge_ptr = static_cast(fast_malloc(len_merge)); - - fp_merge.read(merge_ptr, len_merge); - - //SaberStatus flag = net.load_model(lite_info.c_str(), lite_weights.c_str()); - SaberStatus flag = net.load_model(merge_ptr, len_merge); - - CHECK_EQ(flag, SaberSuccess) << "load model: " << lite_model << " failed"; - LOG(INFO) << "load model: " << lite_model << " successed"; - - fast_free(fp_merge); - - std::vector vtin = net.get_input(); - LOG(INFO) << "number of input tensor: " << vtin.size(); - for (int i = 0; i < vtin.size(); ++i) { - TensorHf* tin = vtin[i]; - //! reshape input before prediction - //tin->reshape(Shape(1, 3, 224, 224)); - LOG(INFO) << "input tensor size: "; - Shape shin = tin->valid_shape(); - for (int j = 0; j < tin->dims(); ++j) { - LOG(INFO) << "|---: " << shin[j]; - } - //! feed data to input - //! feed input image to input tensor - fill_tensor_const(*tin, 1.f); - } - - //! change here according to your own model - std::vector vtout = net.get_output(); - LOG(INFO) << "number of output tensor: " << vtout.size(); - for (int i = 0; i < vtout.size(); i++) { - TensorHf* tout = vtout[i]; - LOG(INFO) << "output tensor size: "; - Shape shout = tout->valid_shape(); - for (int j = 0; j < tout->dims(); ++j) { - LOG(INFO) << "|---: " << shout[j]; - } - } - - for (int i = 0; i < FLAGS_warmup_iter; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - net.prediction(); - } - SaberTimer my_time; - double to = 0; - double tmin = 1000000; - double tmax = 0; - my_time.start(); - SaberTimer t1; - for (int i = 0; i < FLAGS_epoch; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - t1.clear(); - t1.start(); - net.prediction(); - t1.end(); - float tdiff = t1.get_average_ms(); - if (tdiff > tmax) { - tmax = tdiff; - } - if (tdiff < tmin) { - tmin = tdiff; - } - to += tdiff; - LOG(INFO) << "iter: " << i << ", time: " << tdiff << "ms"; - for (int i = 0; i < vtout.size(); ++i) { - double mean_val = tensor_mean(*vtout[i]); - LOG(INFO) << "output mean: " << mean_val; - } - } - my_time.end(); - LOG(INFO) << lite_model << ", batch_size " << FLAGS_num << " average time " << to / FLAGS_epoch << \ - ", min time: " << tmin << "ms, max time: " << tmax << " ms"; -#ifdef ENABLE_OP_TIMER - OpTimer::print_timer(); -#endif //ENABLE_OP_TIMER -} -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - - Env::env_init(); - - LOG(INFO)<< "usage:"; - LOG(INFO)<< argv[0] << " "; - LOG(INFO)<< " lite_model: path to anakin lite model"; - LOG(INFO)<< " num: batchSize default to 1"; - LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; - LOG(INFO)<< " epoch: time statistic epoch default to 10"; - LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores"; - LOG(INFO)<< " threads: set openmp threads"; - if (argc < 2) { - LOG(ERROR) << "You should fill in the variable lite model and lite weights at least."; - return 0; - } - lite_model = argv[1]; - - if (argc > 2) { - FLAGS_num = atoi(argv[2]); - } - if (argc > 
3) { - FLAGS_warmup_iter = atoi(argv[3]); - } - if (argc > 4) { - FLAGS_epoch = atoi(argv[4]); - } - if (argc > 5) { - FLAGS_cluster = atoi(argv[5]); - if (FLAGS_cluster < 0) { - FLAGS_cluster = 0; - } - if (FLAGS_cluster > 1) { - FLAGS_cluster = 1; - } - } - if (argc > 6) { - FLAGS_threads = atoi(argv[6]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite_model.cpp b/test/lite/test_lite_model.cpp deleted file mode 100644 index 1d7b182b4..000000000 --- a/test/lite/test_lite_model.cpp +++ /dev/null @@ -1,144 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/net/net_lite.h" -#include "saber/lite/net/saber_factory_lite.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; -typedef Tensor TensorHf; - -std::string lite_info; -std::string lite_weights; -int FLAGS_num = 1; -int FLAGS_warmup_iter = 1; -int FLAGS_epoch = 1; -int FLAGS_threads = 1; -int FLAGS_cluster = 0; - -TEST(TestSaberLite, test_lite_model) { - - //! create net, with power mode and threads - Net net((PowerMode)FLAGS_cluster, FLAGS_threads); - //! you can also set net param according to your device - //net.set_run_mode((PowerMode)FLAGS_cluster, FLAGS_threads); - //net.set_device_cache(32000, 2000000); - //! load model - SaberStatus flag = net.load_model(lite_info.c_str(), lite_weights.c_str()); - CHECK_EQ(flag, SaberSuccess) << "load model: " << lite_info << ", " << lite_weights << " failed"; - LOG(INFO) << "load model: " << lite_info << ", " << lite_weights << " successed"; - - std::vector vtin = net.get_input(); - LOG(INFO) << "number of input tensor: " << vtin.size(); - for (int i = 0; i < vtin.size(); ++i) { - TensorHf* tin = vtin[i]; - //! reshape input before prediction - //tin->reshape(Shape(1, 3, 224, 224)); - LOG(INFO) << "input tensor size: "; - Shape shin = tin->valid_shape(); - for (int j = 0; j < tin->dims(); ++j) { - LOG(INFO) << "|---: " << shin[j]; - } - //! feed data to input - //! feed input image to input tensor - fill_tensor_const(*tin, 1.f); - } - - //! 
change here according to your own model - std::vector vtout = net.get_output(); - LOG(INFO) << "number of output tensor: " << vtout.size(); - for (int i = 0; i < vtout.size(); i++) { - TensorHf* tout = vtout[i]; - LOG(INFO) << "output tensor size: "; - Shape shout = tout->valid_shape(); - for (int j = 0; j < tout->dims(); ++j) { - LOG(INFO) << "|---: " << shout[j]; - } - } - - for (int i = 0; i < FLAGS_warmup_iter; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - net.prediction(); - } - SaberTimer my_time; - double to = 0; - double tmin = 1000000; - double tmax = 0; - my_time.start(); - SaberTimer t1; - for (int i = 0; i < FLAGS_epoch; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - t1.clear(); - t1.start(); - net.prediction(); - t1.end(); - float tdiff = t1.get_average_ms(); - if (tdiff > tmax) { - tmax = tdiff; - } - if (tdiff < tmin) { - tmin = tdiff; - } - to += tdiff; - LOG(INFO) << "iter: " << i << ", time: " << tdiff << "ms"; - for (int i = 0; i < vtout.size(); ++i) { - double mean_val = tensor_mean(*vtout[i]); - LOG(INFO) << "output mean: " << mean_val; - } - } - my_time.end(); - LOG(INFO) << lite_info << ", " << lite_weights << " batch_size " << FLAGS_num << " average time " << to / FLAGS_epoch << \ - ", min time: " << tmin << "ms, max time: " << tmax << " ms"; -#ifdef ENABLE_OP_TIMER - OpTimer::print_timer(); -#endif //ENABLE_OP_TIMER -} -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - - Env::env_init(); - - LOG(INFO)<< "usage:"; - LOG(INFO)<< argv[0] << " "; - LOG(INFO)<< " lite_info: path to anakin lite model"; - LOG(INFO)<< " lite_weights: path to anakin lite model"; - LOG(INFO)<< " num: batchSize default to 1"; - LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; - LOG(INFO)<< " epoch: time statistic epoch default to 10"; - LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores"; - LOG(INFO)<< " threads: set openmp threads"; - if (argc < 2) { - LOG(ERROR) << "You should fill in the variable lite model and lite weights at least."; - return 0; - } - lite_info = argv[1]; - lite_weights = argv[2]; - - if (argc > 3) { - FLAGS_num = atoi(argv[3]); - } - if (argc > 4) { - FLAGS_warmup_iter = atoi(argv[4]); - } - if (argc > 5) { - FLAGS_epoch = atoi(argv[5]); - } - if (argc > 6) { - FLAGS_cluster = atoi(argv[6]); - if (FLAGS_cluster < 0) { - FLAGS_cluster = 0; - } - if (FLAGS_cluster > 1) { - FLAGS_cluster = 1; - } - } - if (argc > 7) { - FLAGS_threads = atoi(argv[7]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite_model_from_mem.cpp b/test/lite/test_lite_model_from_mem.cpp deleted file mode 100644 index 2642aab1b..000000000 --- a/test/lite/test_lite_model_from_mem.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/net/net_lite.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; -typedef Tensor TensorHf; - -std::string lite_info; -std::string lite_weights; -int FLAGS_num = 1; -int FLAGS_warmup_iter = 1; -int FLAGS_epoch = 1; -int FLAGS_threads = 1; -int FLAGS_cluster = 0; - -TEST(TestSaberLite, test_lite_model) { - - //! create net, with power mode and threads - Net net((PowerMode)FLAGS_cluster, FLAGS_threads); - //! you can also set net param according to your device - //net.set_run_mode((PowerMode)FLAGS_cluster, FLAGS_threads); - //net.set_device_cache(32000, 2000000); - - //! 
load model from memory - std::fstream fp_info(lite_info, std::ios::in | std::ios::binary); - std::fstream fp_w(lite_weights, std::ios::in | std::ios::binary); - - fp_w.seekg (0, std::ios::end); - long long len_w = fp_w.tellg(); - fp_w.seekg (0, std::ios::beg); - - fp_info.seekg (0, std::ios::end); - long long len_info = fp_info.tellg(); - fp_info.seekg (0, std::ios::beg); - - - char* w_ptr = static_cast(fast_malloc(len_w)); - char* info_ptr = static_cast(fast_malloc(len_info)); - - fp_w.read(w_ptr, len_w); - fp_info.read(info_ptr, len_info); - - //SaberStatus flag = net.load_model(lite_info.c_str(), lite_weights.c_str()); - SaberStatus flag = net.load_model(info_ptr, len_info, w_ptr, len_w); - - CHECK_EQ(flag, SaberSuccess) << "load model: " << lite_info << ", " << lite_weights << " failed"; - LOG(INFO) << "load model: " << lite_info << ", " << lite_weights << " successed"; - - fast_free(w_ptr); - fast_free(info_ptr); - - std::vector vtin = net.get_input(); - LOG(INFO) << "number of input tensor: " << vtin.size(); - for (int i = 0; i < vtin.size(); ++i) { - TensorHf* tin = vtin[i]; - //! reshape input before prediction - //tin->reshape(Shape(1, 3, 224, 224)); - LOG(INFO) << "input tensor size: "; - Shape shin = tin->valid_shape(); - for (int j = 0; j < tin->dims(); ++j) { - LOG(INFO) << "|---: " << shin[j]; - } - //! feed data to input - //! feed input image to input tensor - fill_tensor_const(*tin, 1.f); - } - - //! change here according to your own model - std::vector vtout = net.get_output(); - LOG(INFO) << "number of output tensor: " << vtout.size(); - for (int i = 0; i < vtout.size(); i++) { - TensorHf* tout = vtout[i]; - LOG(INFO) << "output tensor size: "; - Shape shout = tout->valid_shape(); - for (int j = 0; j < tout->dims(); ++j) { - LOG(INFO) << "|---: " << shout[j]; - } - } - - for (int i = 0; i < FLAGS_warmup_iter; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - net.prediction(); - } - SaberTimer my_time; - double to = 0; - double tmin = 1000000; - double tmax = 0; - my_time.start(); - SaberTimer t1; - for (int i = 0; i < FLAGS_epoch; ++i) { - for (int i = 0; i < vtin.size(); ++i) { - fill_tensor_const(*vtin[i], 1.f); - } - t1.clear(); - t1.start(); - net.prediction(); - t1.end(); - float tdiff = t1.get_average_ms(); - if (tdiff > tmax) { - tmax = tdiff; - } - if (tdiff < tmin) { - tmin = tdiff; - } - to += tdiff; - LOG(INFO) << "iter: " << i << ", time: " << tdiff << "ms"; - for (int i = 0; i < vtout.size(); ++i) { - double mean_val = tensor_mean(*vtout[i]); - LOG(INFO) << "output mean: " << mean_val; - } - } - my_time.end(); - LOG(INFO) << lite_info << ", " << lite_weights << " batch_size " << FLAGS_num << " average time " << to / FLAGS_epoch << \ - ", min time: " << tmin << "ms, max time: " << tmax << " ms"; -#ifdef ENABLE_OP_TIMER - OpTimer::print_timer(); -#endif //ENABLE_OP_TIMER -} -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - - Env::env_init(); - - LOG(INFO)<< "usage:"; - LOG(INFO)<< argv[0] << " "; - LOG(INFO)<< " lite_info: path to anakin lite model"; - LOG(INFO)<< " lite_weights: path to anakin lite model"; - LOG(INFO)<< " num: batchSize default to 1"; - LOG(INFO)<< " warmup_iter: warm up iterations default to 10"; - LOG(INFO)<< " epoch: time statistic epoch default to 10"; - LOG(INFO)<< " cluster: choose which cluster to run, 0: big cores, 1: small cores"; - LOG(INFO)<< " threads: set openmp threads"; - if (argc < 2) { - LOG(ERROR) << "You should fill in the variable lite model and 
lite weights at least."; - return 0; - } - lite_info = argv[1]; - lite_weights = argv[2]; - - if (argc > 3) { - FLAGS_num = atoi(argv[3]); - } - if (argc > 4) { - FLAGS_warmup_iter = atoi(argv[4]); - } - if (argc > 5) { - FLAGS_epoch = atoi(argv[5]); - } - if (argc > 6) { - FLAGS_cluster = atoi(argv[6]); - if (FLAGS_cluster < 0) { - FLAGS_cluster = 0; - } - if (FLAGS_cluster > 1) { - FLAGS_cluster = 1; - } - } - if (argc > 7) { - FLAGS_threads = atoi(argv[7]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite_seg_precision.cpp b/test/lite/test_lite_seg_precision.cpp deleted file mode 100644 index 9eb0c81b3..000000000 --- a/test/lite/test_lite_seg_precision.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/net/net_lite.h" -#include "saber/lite/net/saber_factory_lite.h" - -#ifdef USE_OPENCV -#include "opencv2/opencv.hpp" -using namespace cv; -using namespace anakin::saber; -using namespace anakin::saber::lite; -typedef Tensor TensorHf; - -std::string g_lite_model; -std::string g_img_list; -std::string g_gt_list; -int FLAGS_threads = 1; -int FLAGS_cluster = 0; -bool FLAGS_set_archs = false; -ARMArch FLAGS_arch = A73; - -void fill_tensor_with_cvmat(const Mat& img_in, TensorHf& tout, const int num, \ - const int width, const int height, const float* mean, const float* scale) { - cv::Mat im; - cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f); - float* ptr_data_in = static_cast(tout.mutable_data()); - int stride = width * height; - for (int i = 0; i < num; i++) { - float* ptr_in = ptr_data_in + i * tout.channel() * tout.height() * tout.width(); - for (int r = 0; r < height; r++) { - for (int c = 0; c < width; c++) { - ptr_in[r * width + c] = (im.at(r, c)[0] - mean[0]) * scale[0]; - ptr_in[stride + r * width + c] = (im.at(r, c)[1] - mean[1]) * scale[1]; - ptr_in[2 * stride + r * width + c] = (im.at(r, c)[2] - mean[2]) * scale[2]; - } - } - } -} - -void cmp_seg_result(const Mat& gt_img, const TensorHf& tin, long long& diff_count, double& accuracy) { - int height = tin.height(); - int width = tin.width(); - diff_count = 0; - const float* din = static_cast(tin.data()); - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int gt = gt_img.at(h, w); - int test = *(din++) > 0.5; - if (gt != test) { - diff_count++; - } - } - } - accuracy = (double)diff_count / (height * width); -} - -TEST(TestSaberLite, test_seg_precision) { - - std::vector img_list; - std::vector gt_list; - //! load test image list and ground truth image list - std::fstream fp_img(g_img_list); - std::string line; - while (getline(fp_img, line)) { - img_list.push_back(line); - } - LOG(INFO) << "total test image number: " << img_list.size(); - fp_img.close(); - - std::fstream fp_gt(g_gt_list); - while (getline(fp_gt, line)) { - gt_list.push_back(line); - } - LOG(INFO) << "total ground truth image number: " << gt_list.size(); - CHECK_EQ(gt_list.size(), img_list.size()) << "test image number must = ground truth image number"; - - LOG(INFO) << "finish load test image list"; - - //! create net, with power mode and threads - Net net((PowerMode)FLAGS_cluster, FLAGS_threads); - //! you can also set net param according to your device - net.set_run_mode((PowerMode)FLAGS_cluster, FLAGS_threads); - if (FLAGS_set_archs) { - net.set_device_arch(FLAGS_arch); - LOG(INFO) << "arm arc: " << FLAGS_arch; - } - net.set_device_cache(32 * 1024, 512* 1024); - //! 
load merged model - SaberStatus flag = net.load_model(g_lite_model.c_str()); - CHECK_EQ(flag, SaberSuccess) << "load model: " << g_lite_model << " failed"; - LOG(INFO) << "load model: " << g_lite_model << " successed"; - - std::vector vtin = net.get_input(); - LOG(INFO) << "number of input tensor: " << vtin.size(); - for (int i = 0; i < vtin.size(); ++i) { - TensorHf* tin = vtin[i]; - //! reshape input before prediction - Shape shin = tin->valid_shape(); - LOG(INFO) << "input tensor size: "; - for (int j = 0; j < tin->dims(); ++j) { - LOG(INFO) << "|---: " << shin[j]; - } - } - - int hin = vtin[0]->height(); - int win = vtin[0]->width(); - - //! change here according to your own model - std::vector vtout = net.get_output(); - LOG(INFO) << "number of output tensor: " << vtout.size(); - for (int i = 0; i < vtout.size(); i++) { - TensorHf* tout = vtout[i]; - LOG(INFO) << "output tensor size: "; - Shape shout = tout->valid_shape(); - for (int j = 0; j < tout->dims(); ++j) { - LOG(INFO) << "|---: " << shout[j]; - } - } - - float mean_val[3] = {104.008f, 116.669f, 122.675f}; - float scale_val[3] = {1.f, 1.f, 1.f}; - - double acc = 0.0; - - for (int k = 0; k < img_list.size(); ++k) { - //! pre-processing - Mat img = imread(img_list[k], CV_LOAD_IMAGE_COLOR); - fill_tensor_with_cvmat(img, *vtin[0], 1, win, hin, mean_val, scale_val); - LOG(INFO) << "test image name: " << img_list[k] << ", gt image name: " << gt_list[k]; - Mat img_gt = imread(gt_list[k], CV_LOAD_IMAGE_UNCHANGED); - if (img.empty() || img_gt.empty()) { - LOG(FATAL) << "load image failed"; - } - Mat img_gt_resize; - cv::resize(img_gt, img_gt_resize, cv::Size(192, 192)); - double to = 0; - SaberTimer t1; - t1.start(); - net.prediction(); - t1.end(); - to = t1.get_average_ms(); - LOG(INFO) << "time consumption: " << to << " ms"; - for (int i = 0; i < vtout.size(); ++i) { - double mean = tensor_mean(*vtout[i]); - LOG(INFO) << "output mean: " << mean; - } - - //! 
post processing - long long diff_count = 0; - double acc_curr = 0.0; - cmp_seg_result(img_gt_resize, *vtout[0], diff_count, acc_curr); - acc += acc_curr; - LOG(INFO) << "image : " << img_list[k] << ", diff count: " << diff_count << ", accuracy: " << acc_curr; - } - LOG(INFO) << "test accuracy is: " << acc / img_list.size(); -} - -int main(int argc, const char** argv) { - // initial logger - logger::init(argv[0]); - - Env::env_init(); - - LOG(INFO)<< "usage:"; - LOG(INFO)<< argv[0] << " "; - LOG(INFO)<< " lite_model: path to anakin lite model"; - LOG(INFO)<< " image_list: path to test image list"; - LOG(INFO)<< " gt_image_list: path to test image ground truth list"; - LOG(INFO)<< " threads: set openmp threads"; - if(argc < 4) { - LOG(ERROR)<< argv[0] << " "; - return 0; - } - g_lite_model = argv[1]; - g_img_list = argv[2]; - g_gt_list = argv[3]; - - if (argc > 4) { - FLAGS_threads = atoi(argv[4]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} -#else -int main(int argc, const char** argv) { - LOG(ERROR)<< "turn on opencv"; - return 0; -} -#endif //USE_OPENCV \ No newline at end of file diff --git a/test/lite/test_lite_sgemm.cpp b/test/lite/test_lite_sgemm.cpp deleted file mode 100644 index 90477ef88..000000000 --- a/test/lite/test_lite_sgemm.cpp +++ /dev/null @@ -1,225 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/neon/impl/sgemm_arm.h" -#include "saber/lite/funcs/neon/impl/sgemm_conv.h" -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 4; - -bool Basic_test = false; - -int M = 512; -int N = 512; -int K = 512; -bool traA = false; -bool traB = false; -bool flag_relu = false; -bool flag_bias = false; - -int test_iter = 1; - -bool COMPARE_RESULT = false; - -typedef Tensor TensorHf4; - -SaberStatus test_arm_sgemm(int M, int N, int K, bool tra, bool trb, bool flag_bias, bool flag_relu) { - - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - Context ctx1; - PowerMode mode = (PowerMode)cluster; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - Shape sha(1, 1, M, K); - Shape shb(1, 1, N, K); - Shape shc(1, 1, M, N); - - TensorHf4 ta; - TensorHf4 tb; - - TensorHf4 tbias; - - ta.reshape(sha); - tb.reshape(shb); - tbias.reshape(Shape(M)); - - fill_tensor_rand(ta, -1.f, 1.f); - fill_tensor_rand(tb, -1.f, 1.f); - - TensorHf4 tout_basic; - TensorHf4 tout_saber; - - tout_saber.reshape(shc); - - int m = M; - int n = N; - int k = K; - - LOG(INFO) << "sgemm M: " << m << ", N: " << n << ", K: " << k; - LOG(INFO) << "transA: " << (tra? "true" : "false") << ", transB: " << (trb? "true" : "false"); - LOG(INFO) << "relu: " << (flag_relu? "true" : "false") << ", bias: " << (flag_bias? "true" : "false"); - LOG(INFO) << "test iter: " << test_iter; - LOG(INFO) << "compare result with basic sgemm: " << (COMPARE_RESULT? "true" : "false"); - - const float* da = static_cast(ta.data()); - const float* db = static_cast(tb.data()); - - if(COMPARE_RESULT) { - LOG(INFO) << "run basic conv for precision comparation"; - tout_basic.reshape(shc); - float* dc_basic = static_cast(tout_basic.mutable_data()); - basic_gemm(m, n, k, da, db, static_cast(tbias.data()), dc_basic, 1.f, 0.f, traA, traB, flag_relu, flag_bias); - //print_tensor(tout_basic); - } - //! sgemm init - int l1_cache = Env::cur_env()._L1_cache; - int l2_cache = Env::cur_env()._L2_cache; - //! 
if L1 cache size is not provided, set to 32K - l1_cache = l1_cache > 0? l1_cache : 32 * 1024; - //! if L2 cache size is not provided, set to 2M - l2_cache = l2_cache > 0? l2_cache : 512 * 1024; - Sgemm gemmer; - gemmer.init(l1_cache, l2_cache, m, n, k, traA, traB, threads); - //! compute - LOG(INFO) << "saber sgemm compute"; - to = 0; - int lda, ldb, ldc; - if (traA) { - lda = m; - } else { - lda = k; - } - if (traB) { - ldb = k; - } else { - ldb = n; - } - ldc = n; - - long long ops = m * n * k; - - float* dc_saber = static_cast(tout_saber.mutable_data()); - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - gemmer(da, lda, db, ldb, dc_saber, ldc, 1.f, 0.f); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "saber gemm running time, ave: " << to / test_iter << ", min time: " << min_time; - LOG(WARNING) << "mean gops: " << 0.000001f * ops * test_iter / to << " GFLOPS, max gops: " \ - << 0.000001f * ops / min_time << " GFLOPS"; - //print_tensor(tout_saber); - - if (COMPARE_RESULT) { - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - if (fabs(max_ratio) > 1e-4f) { - TensorHf4 tdiff(tout_basic.valid_shape()); - tensor_diff(tout_basic, tout_saber, tdiff); - print_tensor(tdiff); - } - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabs(max_ratio) > 1e-4f) { - return SaberInvalidValue; - } - } - return SaberSuccess; -} - -TEST(TestSaberLite, test_func_sgemm_arm) { - if (Basic_test) { - LOG(INFO) << "run basic sgemm test"; - for (auto& m : {1, 8, 16, 111, 256, 397, 512, 777, 1024}) { - for (auto& n : {1, 3, 13, 141, 256, 345, 512, 789, 1024}) { - for (auto& k : {1, 4, 15, 59, 128, 234, 512, 678, 1024}) { - for (auto& tra : {false, true}) { - for (auto& trb : {false, true}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - SaberStatus flag = test_arm_sgemm(m, n, k, traA, traB, flag_bias, flag_relu); - if (flag == SaberSuccess) { - LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << \ - (flag_relu? "true" : "false") << ", trans A: " << (tra? "true" : "false") << \ - ", trans B: " << (trb? "true" : "false") << " passed"; - } else { - LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << \ - (flag_relu? "true" : "false") << ", trans A: " << (tra? "true" : "false") << \ - ", trans B: " << (trb? 
"true" : "false") << " failed"; - } - } - } - } - } - } - } - } - } - -} - -TEST(TestSaberLite, test_func_sgemm_arm_custom) { - - test_arm_sgemm(M, N, K, traA, traB, flag_bias, flag_relu); - LOG(INFO) << "test m = " << M << ", n=" << N << ", k=" << K << "passed"; - -} - -int main(int argc, const char** argv){ - anakin::saber::lite::Env::env_init(); - - LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [m] [n] [k] [transA] [transB] [relu] [bias] [test iter] [compare result]"; - - if (argc > 1) { - Basic_test = atoi(argv[1]) > 0; - } - - if (argc > 2) { - cluster = atoi(argv[2]); - } - if (argc > 3) { - threads = atoi(argv[3]); - } - if(argc > 4) { - if (argc < 10) { - LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [m] [n] [k] [transA] [transB] [relu] [bias] [test iter] [compare result]"; - return 0; - } - M = atoi(argv[4]); - N = atoi(argv[5]); - K = atoi(argv[6]); - traA = atoi(argv[7]) > 0; - traB = atoi(argv[8]) > 0; - flag_relu = atoi(argv[9]) > 0; - flag_bias = atoi(argv[10]) > 0; - } - if (argc > 11) { - test_iter = atoi(argv[11]); - } - if (argc > 12) { - COMPARE_RESULT = atoi(argv[12]) > 0; - } - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_lite_sgemm_prepacked_int8.cpp b/test/lite/test_lite_sgemm_prepacked_int8.cpp deleted file mode 100644 index 48dbd74eb..000000000 --- a/test/lite/test_lite_sgemm_prepacked_int8.cpp +++ /dev/null @@ -1,217 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/neon/impl/sgemm_prepacked_int8.h" -using namespace anakin::saber; -using namespace anakin::saber::lite; -int cluster = 0; -int threads = 1; - -bool Basic_test = false; - -int M = 1024; -int N = 1024; -int K = 1024; -bool traA = false; -bool traB = false; -bool flag_relu = false; -bool flag_bias = false; -ARMArch flag_arch = A73; -int test_iter = 1; -bool COMPARE_RESULT = false; -typedef Tensor TensorHf4; - -SaberStatus test_arm_sgemm(int M, int N, int K, bool tra, bool trb, bool flag_bias, bool flag_relu, int in_th) { - double to = 0; - double min_time = 1000000; - SaberTimer t1; - Context ctx1; - PowerMode mode = (PowerMode)cluster; - ctx1.set_run_mode(mode, in_th); - //ctx1.set_arch(flag_arch); - //LOG(INFO) << "CPU ARCH: A" << flag_arch; - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << in_th; -#endif - } - Shape sha(M, K); - Shape shb(N, K); - Shape shc(M, N); - TensorHf4 ta; - TensorHf4 tb; - TensorHf4 tbias; - ta.re_alloc(sha, AK_INT8); - tb.re_alloc(shb, AK_INT8); - tbias.re_alloc(Shape(M), AK_INT32); - fill_tensor_rand(ta, -64, 63); - //fill_tensor_const(ta, 1); - fill_tensor_rand(tb, -64, 63); - //fill_tensor_const(tb, 1); - fill_tensor_rand(tbias, -65536, 65535); - //print_tensor(ta); - //print_tensor(tb); - //print_tensor(tbias); - TensorHf4 tout_basic; - TensorHf4 tout_saber; - tout_saber.re_alloc(shc, AK_INT32); - int m = M; - int n = N; - int k = K; - LOG(INFO) << "sgemm M: " << m << ", N: " << n << ", K: " << k; - LOG(INFO) << "transA: " << (tra? "true" : "false") << ", transB: " << (trb? "true" : "false"); - LOG(INFO) << "relu: " << (flag_relu? "true" : "false") << ", bias: " << (flag_bias? "true" : "false"); - LOG(INFO) << "test iter: " << test_iter; - LOG(INFO) << "compare result with basic sgemm: " << (COMPARE_RESULT? 
"true" : "false"); - const char* da = static_cast(ta.data()); - const char* db = static_cast(tb.data()); - if (COMPARE_RESULT) { - LOG(INFO) << "run basic conv for precision comparation"; - tout_basic.re_alloc(shc, AK_INT32); - int* dc_basic = static_cast(tout_basic.mutable_data()); - basic_gemm(m, n, k, da, db, static_cast(tbias.data()), \ - dc_basic, 1, 0, tra, trb, flag_bias, flag_relu); -// LOG(WARNING) << "basic result"; -// print_tensor(tout_basic); - } - long long ops = m * n * k; - int* dc_saber = static_cast(tout_saber.mutable_data()); - to = 0; - min_time = 1000000; - int hblock = get_hblock_int8(ctx1.get_arch()); - int round_up_a = ((hblock + m - 1) / hblock) * hblock; - TensorHf4 tpackedA(Shape(K, round_up_a), AK_INT8); - //fill_tensor_const(tpackedA, 1); - int lda = k; - if (tra) { - lda = m; - } - prepackA_int8(static_cast(tpackedA.mutable_data()), da, lda, 0, m, 0, k, tra, &ctx1); - //! compute - LOG(INFO) << "saber sgemm compute"; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - sgemm_prepack_int8(static_cast(tpackedA.data()), db, \ - static_cast(tbias.data()), dc_saber, m, n, k, flag_bias, flag_relu, trb, &ctx1); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "saber packed gemm running time, ave: " << to / test_iter << ", min time: " << min_time; - LOG(WARNING) << "mean gops: " << 0.000001f * ops * test_iter / to \ - << " GFLOPS, max gops: " << 0.000001f * ops / min_time << " GFLOPS"; - if (COMPARE_RESULT) { - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - if (fabs(max_ratio) > 1e-4f) { - TensorHf4 tdiff(tout_basic.valid_shape(), AK_INT32); - tensor_diff(tout_basic, tout_saber, tdiff); - LOG(WARNING) << "basic result"; - print_tensor(tout_basic); - LOG(WARNING) << "saber result"; - print_tensor(tout_saber); - LOG(WARNING) << "diff tensor"; - print_tensor(tdiff); - } - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabs(max_ratio) > 1e-4f) { - return SaberInvalidValue; - } - } - return SaberSuccess; -} -TEST(TestSaberLite, test_func_sgemm_prepacked) { - if (Basic_test) { - LOG(INFO) << "run basic sgemm test"; - for (auto& m : {1, 8, 16, 111, 256, 397, 512, 777, 1024}) { - for (auto& n : {1, 3, 13, 141, 256, 345, 512, 789, 1024}) { - for (auto& k : {1, 4, 15, 59, 128, 234, 512, 678, 1024}) { - for (auto& tra : {false, true}) { - for (auto& trb : {false, true}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - for (auto& th : {1, 2, 4}) { - SaberStatus flag = test_arm_sgemm(m, n, k, tra, trb, flag_bias, flag_relu, th); - if (flag == SaberSuccess) { - LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << \ - (flag_relu? "true" : "false") << ", trans A: " << (tra? "true" : "false") << \ - ", trans B: " << (trb? "true" : "false") << " passed"; - } else { - LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << \ - (flag_relu? "true" : "false") << ", trans A: " << (tra? "true" : "false") << \ - ", trans B: " << (trb? 
"true" : "false") << " failed"; - } - } - } - } - } - } - } - } - } - } -} -TEST(TestSaberLite, test_func_sgemm_prepacked_custom) { - if (test_arm_sgemm(M, N, K, traA, traB, flag_bias, flag_relu, threads) == SaberSuccess) { - LOG (INFO) << "test m = " << M << ", n=" << N << ", k=" << K << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << \ - (flag_relu ? "true" : "false") << ", trans A: " << (traA ? "true" : "false") << \ - ", trans B: " << (traB ? "true" : "false") << " passed"; - } else { - LOG (FATAL) << "test m = " << M << ", n=" << N << ", k=" << K << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << \ - (flag_relu ? "true" : "false") << ", trans A: " << (traA ? "true" : "false") << \ - ", trans B: " << (traB ? "true" : "false") << " failed"; - } -} -int main(int argc, const char** argv){ - anakin::saber::lite::Env::env_init(); - LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [m] [n] [k] [transA] [transB] [relu] [bias] [test iter] [compare result]"; - if (argc > 1) { - Basic_test = atoi(argv[1]) > 0; - } - if (argc > 2) { - cluster = atoi(argv[2]); - } - if (argc > 3) { - threads = atoi(argv[3]); - } - if (argc > 4) { - if (argc < 10) { - LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [m] [n] [k] [transA] [transB] [relu] [bias] [test iter] [compare result]"; - return 0; - } - M = atoi(argv[4]); - N = atoi(argv[5]); - K = atoi(argv[6]); - traA = atoi(argv[7]) > 0; - traB = atoi(argv[8]) > 0; - flag_relu = atoi(argv[9]) > 0; - flag_bias = atoi(argv[10]) > 0; - } - if (argc > 11) { - test_iter = atoi(argv[11]); - } - if (argc > 12) { - COMPARE_RESULT = atoi(argv[12]) > 0; - } - if (argc > 13) { - if (atoi(argv[13]) > 0) { - flag_arch = A72; - } else { - flag_arch = A73; - } - } - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite_sgemv_int8.cpp b/test/lite/test_lite_sgemv_int8.cpp deleted file mode 100644 index 7e5a4b102..000000000 --- a/test/lite/test_lite_sgemv_int8.cpp +++ /dev/null @@ -1,233 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/neon/impl/sgemv_arm_int8.h" -using namespace anakin::saber; -using namespace anakin::saber::lite; -int cluster = 0; -int threads = 1; - -bool Basic_test = false; - -int M = 1024; -int N = 1024; -int K = 1024; -bool traA = false; -bool traB = false; -bool flag_relu = false; -bool flag_bias = false; -ARMArch flag_arch = A73; -int test_iter = 2; -bool COMPARE_RESULT = false; -typedef Tensor TensorHf4; -void basic_sgemv(int m, int n, const signed char* a, const signed char* b, const int* bias, int* c, \ - bool trans_b = false, bool flag_bias = false, bool flag_relu = false) { -//#pragma omp parallel for - for (int i = 0; i < m; i++){ - int sum = 0; - if (flag_bias)sum = bias[i]; - const signed char* ptr_din = b; - const signed char* ptr_wei = a + i * n; - for (int j = 0; j < n; j++){ - sum += (int)(ptr_din[j] * ptr_wei[j]); - } - if (flag_relu) sum = sum > 0 ? 
sum : 0; - *c++ = sum; - } -} -SaberStatus test_arm_sgemv(int M, int N, bool flag_bias, bool flag_relu, int in_th) { - double to = 0; - double min_time = 1000000; - SaberTimer t1; - Context ctx1; - PowerMode mode = (PowerMode)cluster; - ctx1.set_run_mode(mode, in_th); - //ctx1.set_arch(flag_arch); - //LOG(INFO) << "CPU ARCH: A" << flag_arch; - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << in_th; -#endif - } - Shape sha(M, N); - Shape shin(N); - Shape shout(M); - TensorHf4 ta; - TensorHf4 tb; - TensorHf4 tbias; - ta.re_alloc(sha, AK_INT8); //weights - tb.re_alloc(shin, AK_INT8); //x - tbias.re_alloc(shout, AK_INT32);//y - fill_tensor_rand(ta, -64, 63); - // fill_tensor_const(ta, 1); - fill_tensor_rand(tb, -64, 63); - // fill_tensor_const(tb, 1); - fill_tensor_rand(tbias, -65536, 65535); - // print_tensor(ta); - // print_tensor(tb); - //print_tensor(tbias); - TensorHf4 tout_basic; - TensorHf4 tout_saber; - tout_saber.re_alloc(shout, AK_INT32); - int m = M; - int n = N; - LOG(INFO) << "sgemv M: " << m << ", N: " << n; - LOG(INFO) << "relu: " << (flag_relu? "true" : "false") << ", bias: " << (flag_bias? "true" : "false"); - LOG(INFO) << "test iter: " << test_iter; - LOG(INFO) << "compare result with basic sgemv: " << (COMPARE_RESULT? "true" : "false"); - const signed char* da = static_cast(ta.data()); - const signed char* db = static_cast(tb.data()); - if (COMPARE_RESULT) { - LOG(INFO) << "run basic conv for precision comparation"; - tout_basic.re_alloc(shout, AK_INT32); - int* dc_basic = static_cast(tout_basic.mutable_data()); - basic_sgemv(m, n, da, db, static_cast(tbias.data()), dc_basic, \ - false, flag_bias, flag_relu); - // LOG(WARNING) << "basic result"; - // print_tensor(tout_basic); - } - long long ops = m * n; - //! 
compute - int* dc_saber = static_cast(tout_saber.mutable_data()); - LOG(INFO) << "saber sgemm compute"; - for (int i = 0; i < test_iter; ++i) { - // t1.clear(); - // t1.start(); - if (flag_bias){ - if (flag_relu){ - t1.clear(); - t1.start(); - sgemv_bias_relu_int8(false, m, n, da, db, dc_saber, static_cast(tbias.data())); - t1.end(); - }else{ - t1.clear(); - t1.start(); - sgemv_bias_int8(false, m, n, da, db, dc_saber, static_cast(tbias.data())); - t1.end(); - } - }else{ - if (flag_relu){ - t1.clear(); - t1.start(); - sgemv_relu_int8(false, m, n, da, db, dc_saber); - t1.end(); - }else{ - t1.clear(); - t1.start(); - sgemv_int8(false, m, n, da, db, dc_saber); - t1.end(); - } - } - // sgemv_bias_relu_int8(false, m, n, da, db, dc_saber, static_cast(tbias.data())); - // sgemv_relu_int8(false, m, n, da, db, dc_saber); - // t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - // LOG(WARNING) << "saber result"; - // print_tensor(tout_saber); - - LOG(INFO) << "saber sgemv running time, ave: " << to / test_iter << ", min time: " << min_time; - LOG(WARNING) << "mean gops: " << 0.000001f * ops * test_iter / to \ - << " GFLOPS, max gops: " << 0.000001f * ops / min_time << " GFLOPS"; - if (COMPARE_RESULT) { - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - if (fabs(max_ratio) > 1e-4f) { - TensorHf4 tdiff(tout_basic.valid_shape(), AK_INT32); - tensor_diff(tout_basic, tout_saber, tdiff); - LOG(WARNING) << "basic result"; - print_tensor(tout_basic); - LOG(WARNING) << "saber result"; - print_tensor(tout_saber); - LOG(WARNING) << "diff tensor"; - print_tensor(tdiff); - } - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabs(max_ratio) > 1e-4f) { - return SaberInvalidValue; - } - } - return SaberSuccess; -} -TEST(TestSaberLite, test_func_sgemm_prepacked) { - if (Basic_test) { - LOG(INFO) << "run basic sgemm test"; - for (auto& m : {1, 8, 16, 111, 256, 397, 512, 777, 1024}) { - for (auto& n : {1, 3, 13, 141, 256, 345, 512, 789, 1024}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - for (auto& th : {1, 2, 4}) { - SaberStatus flag = test_arm_sgemv(m, n, flag_bias, flag_relu, th); - if (flag == SaberSuccess) { - LOG(INFO) << "test m = " << m << ", n=" << n << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << \ - (flag_relu? "true" : "false") << " passed"; - } else { - LOG(FATAL) << "test m = " << m << ", n=" << n << \ - ", bias: " << (flag_bias? "true" : "false") << ", relu: " << \ - (flag_relu? "true" : "false") << " failed"; - } - } - } - } - } - } - } -} -TEST(TestSaberLite, test_func_sgemm_prepacked_custom) { - if (test_arm_sgemv(M, N, flag_bias, flag_relu, threads) == SaberSuccess) { - LOG (INFO) << "test m = " << M << ", n=" << N << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << \ - (flag_relu ? "true" : "false") << " passed"; - } else { - LOG (FATAL) << "test m = " << M << ", n=" << N << \ - ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << \ - (flag_relu ? 
"true" : "false") << " failed"; - } -} -int main(int argc, const char** argv){ - anakin::saber::lite::Env::env_init(); - LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [m] [n] [relu] [bias] [test iter] [compare result]"; - if (argc > 1) { - Basic_test = atoi(argv[1]) > 0; - } - if (argc > 2) { - cluster = atoi(argv[2]); - } - if (argc > 3) { - threads = atoi(argv[3]); - } - if (argc > 4) { - if (argc < 7) { - LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [m] [n] [relu] [bias] [test iter] [compare result]"; - return 0; - } - M = atoi(argv[4]); - N = atoi(argv[5]); - flag_relu = atoi(argv[6]) > 0; - flag_bias = atoi(argv[7]) > 0; - } - if (argc > 8) { - test_iter = atoi(argv[8]); - } - if (argc > 9) { - COMPARE_RESULT = atoi(argv[9]) > 0; - } - if (argc > 10) { - if (atoi(argv[10]) > 0) { - flag_arch = A72; - } else { - flag_arch = A73; - } - } - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/lite/test_lite_utils.cpp b/test/lite/test_lite_utils.cpp deleted file mode 100644 index 5a47dd05e..000000000 --- a/test/lite/test_lite_utils.cpp +++ /dev/null @@ -1,1330 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/utils/cv_utils.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 1; -int h = 1920; -int w = 720; -int ww = 112; -int hh = 288; -int angle = 90; -int flip_num = 1; -typedef Tensor TensorHf4; - -#define COMPARE_RESULT 1 - -void resize_uv_basic(const unsigned char* in_data, int count, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out, float width_scale, float height_scale) { - - const int resize_coef_bits = 11; - const int resize_coef_scale = 1 << resize_coef_bits; - // LOG(INFO) << "input w, h:" << w_in << ", " << h_in; - // LOG(INFO) << "output w, h:" << w_out << ", " << h_out; - - int spatial_in = h_in * w_in; - int spatial_out = h_out * w_out; - - int* buf = new int[w_out * 2 + h_out * 2]; - int* xofs = buf;//new int[w]; - int* yofs = buf + w_out;//new int[h]; - - float* ialpha = new float[w_out * 2];//new short[w * 2]; - float* ibeta = new float[h_out * 2];//new short[h * 2]; - - float fx = 0.f; - float fy = 0.f; - int sx = 0; - int sy = 0; - - for (int dx = 0; dx < w_out / 2; dx++){ - fx = (float)((dx + 0.5) * width_scale - 0.5); - sx = floor(fx); - //printf("%.2f, %d, %d\n", fx, dx, sx); - fx -= sx; - - if (sx < 0){ - sx = 0; - fx = 0.f; - } - if (sx >= w_in - 1){ - sx = w_in - 2; - fx = 1.f; - } - - xofs[dx] = sx; - - float a0 = (1.f - fx); - float a1 = fx; - - ialpha[dx * 2] = a0; - ialpha[dx * 2 + 1] = a1; - } - - for (int dy = 0; dy < h_out; dy++) { - fy = (float)((dy + 0.5) * height_scale - 0.5); - sy = floor(fy); - fy -= sy; - - if (sy < 0){ - sy = 0; - fy = 0.f; - } - if (sy >= h_in - 1){ - sy = h_in - 2; - fy = 1.f; - } - - yofs[dy] = sy; - - float b0 = (1.f - fy); - float b1 = fy; - - ibeta[dy * 2] = b0; - ibeta[dy * 2 + 1] = b1; - } - // for (int i = 0; i < w_out; i++) - // printf("%.2f ", ialpha[i]); - // printf("\n"); - // for (int i = 0; i < h_out * 2; i++) - // printf("%.2f ", ibeta[i]); - // printf("\n"); - // for (int i = 0; i < w_out / 2; i++) - // printf("%d ", xofs[i]); - // printf("\n"); - // for (int i = 0; i < h_out; i++) - // printf("%d ", yofs[i]); - // printf("\n"); - -#pragma omp parallel for - for (int i = 0; i < count; ++i){ - for (int dy = 0; dy < h_out; dy++){ - unsigned char* out_ptr = out_data + dy * w_out; - int y_in_start = yofs[dy]; - 
int y_in_end = y_in_start + 1; - float b0 = ibeta[dy * 2]; - float b1 = ibeta[dy * 2 + 1]; - for (int dx = 0; dx < w_out; dx += 2){ - int tmp = dx / 2; - int x_in_start = xofs[tmp] * 2; //0 - int x_in_end = x_in_start + 2; //2 - // printf("x_in: %d, y_in: %d \n", x_in_start, y_in_start); - float a0 = ialpha[tmp * 2]; - float a1 = ialpha[tmp * 2 + 1]; - - int tl_index = y_in_start * w_in + x_in_start; //0 - int tr_index = y_in_start * w_in + x_in_end; //2 - int bl_index = y_in_end * w_in + x_in_start; - int br_index = y_in_end * w_in + x_in_end; - - int tl = in_data[tl_index + i * spatial_in]; - int tr = in_data[tr_index + i * spatial_in]; - int bl = in_data[bl_index + i * spatial_in]; - int br = in_data[br_index + i * spatial_in]; - - float outval = (tl * a0 + tr * a1) * b0 + (bl * a0 + br * a1) * b1; - - out_ptr[dx] = outval; - - tl_index++; - tr_index++; - bl_index++; - br_index++; - - tl = in_data[tl_index + i * spatial_in]; - tr = in_data[tr_index + i * spatial_in]; - bl = in_data[bl_index + i * spatial_in]; - br = in_data[br_index + i * spatial_in]; - - outval = (tl * a0 + tr * a1) * b0 + (bl * a0 + br * a1) * b1; - - out_ptr[dx + 1] = outval; - - } - } - } - delete[] ialpha; - delete[] ibeta; - delete[] buf; -} - -void resize_y_basic(const unsigned char* in_data, int count, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out, float width_scale, float height_scale) { - - // LOG(INFO) << "input w, h:" << w_in << ", " << h_in; - // LOG(INFO) << "output w, h:" << w_out << ", " << h_out; - - int spatial_in = h_in * w_in; - int spatial_out = h_out * w_out; - - int* buf = new int[w_out * 2 + h_out * 2]; - int* xofs = buf;//new int[w]; - int* yofs = buf + w_out;//new int[h]; - - float* ialpha = new float[w_out * 2];//new short[w * 2]; - float* ibeta = new float[h_out * 2];//new short[h * 2]; - - float fx = 0.f; - float fy = 0.f; - int sx = 0; - int sy = 0; - - for (int dx = 0; dx < w_out; dx++){ - fx = (float)((dx + 0.5) * width_scale - 0.5); - sx = floor(fx); - fx -= sx; - - if (sx < 0){ - sx = 0; - fx = 0.f; - } - if (sx >= w_in - 1){ - sx = w_in - 2; - fx = 1.f; - } - - xofs[dx] = sx; - - float a0 = (1.f - fx); - float a1 = fx; - - ialpha[dx * 2] = a0; - ialpha[dx * 2 + 1] = a1; - } - - for (int dy = 0; dy < h_out; dy++) { - fy = (float)((dy + 0.5) * height_scale - 0.5); - sy = floor(fy); - fy -= sy; - - if (sy < 0){ - sy = 0; - fy = 0.f; - } - if (sy >= h_in - 1){ - sy = h_in - 2; - fy = 1.f; - } - - yofs[dy] = sy; - - float b0 = (1.f - fy); - float b1 = fy; - - ibeta[dy * 2] = b0; - ibeta[dy * 2 + 1] = b1; - } - -#pragma omp parallel for - for (int i = 0; i < count; ++i){ - for (int s = 0; s < spatial_out; ++s){ - int x_out = s % w_out; - int y_out = s / w_out; - - int x_in_start = xofs[x_out]; //(int)x_in; - int y_in_start = yofs[y_out]; - - int x_in_end = x_in_start + 1; - int y_in_end = y_in_start + 1; - - float a0 = ialpha[x_out * 2]; - float a1 = ialpha[x_out * 2 + 1]; - float b0 = ibeta[y_out * 2]; - float b1 = ibeta[y_out * 2 + 1]; - - int tl_index = y_in_start * w_in + x_in_start; - int tr_index = y_in_start * w_in + x_in_end; - int bl_index = y_in_end * w_in + x_in_start; - int br_index = y_in_end * w_in + x_in_end; - - int tl = in_data[tl_index + i * spatial_in]; - int tr = in_data[tr_index + i * spatial_in]; - int bl = in_data[bl_index + i * spatial_in]; - int br = in_data[br_index + i * spatial_in]; - - float outval = (tl * a0 + tr * a1) * b0 + (bl * a0 + br * a1) * b1; - - out_data[s + i * spatial_out] = outval; - } - } - delete[] ialpha; - delete[] ibeta; 
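// Two reading notes on this reference code: resize_y_basic never frees `buf`
// (the delete[] just below is commented out), so each call appears to leak
// (w_out*2 + h_out*2) ints, whereas resize_uv_basic above does free its
// buffer. The resize_basic wrapper that follows splits the semi-planar frame
// by rows: for a buffer of h_in rows, the luma plane is the top h_in*2/3 rows
// and the interleaved UV plane is the remaining h_in/3 rows, matching the
// 3/2-height NV12/NV21 layout (e.g. a 720x1920 buffer holds a 720x1280 image
// plus 640 rows of chroma). Its early-exit memcpy copies w_in*w_in bytes
// rather than w_in*h_in, which looks like a typo in the original.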
- // delete[] buf; - -} - -void resize_basic(const unsigned char* in_data, int count, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out, float width_scale, float height_scale) { - if (w_out == w_in && h_out == h_in) - { - memcpy(out_data, in_data, sizeof(char) * w_in * w_in); - return; - } - // dst = new unsigned char[h_out * w_out]; - //if (dst == nullptr) - // return; - int y_h = h_in * 2 / 3; - int uv_h = h_in - y_h; - const unsigned char* y_ptr = in_data; - const unsigned char* uv_ptr = in_data + y_h * w_in; - //out - int dst_y_h = h_out * 2 / 3; - int dst_uv_h = h_out - dst_y_h; - unsigned char* dst_ptr = out_data + dst_y_h * w_out; - - //resize_y_basic(in_data, 1, h_in, w_in, out_data, h_out, w_out, width_scale, height_scale); - //y - resize_y_basic(y_ptr, 1, y_h, w_in, out_data, dst_y_h, w_out, width_scale, height_scale); - //uv - resize_uv_basic(uv_ptr, 1, uv_h, w_in, dst_ptr, dst_uv_h, w_out, width_scale, height_scale); -} - -void nv21_to_tensor_basic(const unsigned char* nv21, TensorHf& output, int width, int height, \ - float* means, float* scales) { - - LCHECK_EQ(width, output.width(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(height, output.height(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(3, output.channel(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(1, output.num(), "sizes of two valid shapes must be the same"); - int size = width * height; - float* ptr0 = output.mutable_data(); - float* ptr1 = output.mutable_data() + size; - float* ptr2 = output.mutable_data() + size * 2; - float r_means = means[0]; - float g_means = means[1]; - float b_means = means[2]; - float r_scales = scales[0]; - float g_scales = scales[1]; - float b_scales = scales[2]; - const unsigned char* uv_start = nv21 + size; - - for (int h = 0; h < height; h++){ - int y = 0; - int u = 0; - int v = 0; - int size_h = h * width; - int u_size_h = (h / 2) * width; - for (int i = 0; i < width; i++){ - y = nv21[size_h + i]; - if (i % 2 == 0){ - v = uv_start[u_size_h + i]; - u = uv_start[u_size_h + i + 1]; - } - //printf("y0: %d, u: %d, v: %d\n", y, u, v); - *ptr0 = ((y + 0.14 * (v - 128)) - r_means) * r_scales; - *ptr1 = ((y - (0.34 * (u - 128)) - (0.71 * (v - 128)))- g_means) * g_scales; - *ptr2 = ((y + (1.77 * (u - 128))) - b_means) * b_scales; - - ptr0++; - ptr1++; - ptr2++; - } - } -} - -void nv12_to_tensor_basic(const unsigned char* nv12, TensorHf& output, int width, int height, \ - float* means, float* scales) { - - LCHECK_EQ(width, output.width(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(height, output.height(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(3, output.channel(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(1, output.num(), "sizes of two valid shapes must be the same"); - int size = width * height; - float* ptr0 = output.mutable_data(); - float* ptr1 = output.mutable_data() + size; - float* ptr2 = output.mutable_data() + size * 2; - float r_means = means[0]; - float g_means = means[1]; - float b_means = means[2]; - float r_scales = scales[0]; - float g_scales = scales[1]; - float b_scales = scales[2]; - const unsigned char* uv_start = nv12 + size; - - float r_meanxscale = r_means * r_scales; - float g_meanxscale = g_means * g_scales; - float b_meanxscale = b_means * b_scales; - - for (int h = 0; h < height; h++){ - int y = 0; - int u = 0; - int v = 0; - int size_h = h * width; - int u_size_h = (h / 2) * width; - for (int i = 0; i < width; i++){ - y = nv12[size_h + i]; - if (i % 2 
== 0){ - u = uv_start[u_size_h + i]; - v = uv_start[u_size_h + i + 1]; - } - //printf("y0: %d, u: %d, v: %d\n", y, u, v); - *ptr0 = ((y + 0.14 * (v - 128)) - r_means) * r_scales; - *ptr1 = ((y - (0.34 * (u - 128)) - (0.71 * (v - 128)))- g_means) * g_scales; - *ptr2 = ((y + (1.77 * (u - 128))) - b_means) * b_scales; - - ptr0++; - ptr1++; - ptr2++; - } - } -} - -void rotate90_basic(const unsigned char* in_data, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out){ - for (int x = 0; x < h_in; x++){ - for (int y = 0; y < w_in; y++){ - out_data[y * w_out + x] = in_data[x * w_in + y]; //(y,x) = in(x,y) - } - } -} - -void rotate180_basic(const unsigned char* in_data, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out){ - int w = w_in - 1; - for (int x = 0; x < h_in; x++){ - for (int y = 0; y < w_in; y++){ - out_data[x * w_out + w - y] = in_data[x * w_in + y]; //(y,x) = in(x,y) - } - } -} -void rotate270_basic(const unsigned char* in_data, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out){ - int h = h_out - 1; - for (int x = 0; x < h_in; x++){ - for (int y = 0; y < w_in; y++){ - out_data[(h - y) * w_out + x] = in_data[x * w_in + y]; //(y,x) = in(x,y) - } - } -} - -void rotate_basic(const unsigned char* in_data, int count, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out, int angle){ - if (angle == 90){ - LOG(INFO) << "90"; - rotate90_basic(in_data, h_in, w_in, out_data, h_out, w_out); - } - if (angle == 180){ - LOG(INFO) << "180"; - rotate180_basic(in_data, h_in, w_in, out_data, h_in, w_in); - } - if (angle == 270){ - LOG(INFO) << "270"; - rotate270_basic(in_data, h_in, w_in, out_data, h_out, w_out); - } - //LOG(INFO) << "end"; - -} -void flipx_basic(const unsigned char* in_data, int h_in, int w_in, unsigned char* out_data){ - int h = h_in - 1; - for (int x = 0; x < h_in; x++){ - for (int y = 0; y < w_in; y++){ - out_data[(h - x) * w_in + y] = in_data[x * w_in + y]; //(y,x) = in(x,y) - } - } -} - -void flipy_basic(const unsigned char* in_data, int h_in, int w_in, unsigned char* out_data){ - int w = w_in - 1; - for (int x = 0; x < h_in; x++){ - for (int y = 0; y < w_in; y++){ - out_data[x * w_in + w - y] = in_data[x * w_in + y]; //(y,x) = in(x,y) - } - } -} -void flipxy_basic(const unsigned char* in_data, int h_in, int w_in, unsigned char* out_data){ - int w = w_in - 1; - int h = h_in - 1; - for (int x = 0; x < h_in; x++){ - for (int y = 0; y < w_in; y++){ - out_data[(h - x) * w_in + w - y] = in_data[x * w_in + y]; //(h-y,w-x) = in(x,y) - } - } -} - -void flip_basic(const unsigned char* in_data, int count, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out, int flip_num){ - if (flip_num == 1){ //x - LOG(INFO) << "x"; - flipx_basic(in_data, h_in, w_in, out_data); - } - if (flip_num == -1){ - LOG(INFO) << "y"; - flipy_basic(in_data, h_in, w_in, out_data); - } - if (flip_num == 0){ - LOG(INFO) << "xy"; - flipxy_basic(in_data, h_in, w_in, out_data); - } - //LOG(INFO) << "end"; - -} - -void nv12_bgr_basic(const unsigned char* in_data, int count, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out){ - int y_h = h_in * 2 / 3; - const unsigned char* y = in_data; - const unsigned char* vu = in_data + y_h * w_in; - for (int i = 0; i < y_h; i++){ - const unsigned char* ptr_y1 = y + i * w_in; - const unsigned char* ptr_vu = vu + (i / 2) * w_in; - unsigned char* ptr_bgr1 = out_data + (i * 3) * w_out; - unsigned char* ptr_bgr2 = ptr_bgr1 + w_out; - unsigned char* ptr_bgr3 = ptr_bgr2 + w_out; - int 
j = 0; - for (; j < w_in; j += 2){ - unsigned char _y0 = ptr_y1[0]; - unsigned char _y1 = ptr_y1[1]; - unsigned char _v = ptr_vu[1]; - unsigned char _u = ptr_vu[0]; - - int ra = floor((179 * (_v - 128)) >> 7); - int ga = floor((44 * (_u - 128) + 91 * (_v-128)) >> 7); - int ba = floor((227 * (_u - 128)) >> 7); - - int r = _y0 + ra; - int g = _y0 - ga; - int b = _y0 + ba; - - int r1 = _y1 + ra; - int g1 = _y1 - ga; - int b1 = _y1 + ba; - - r = r < 0 ? 0 : (r > 255) ? 255 : r; - g = g < 0 ? 0 : (g > 255) ? 255 : g; - b = b < 0 ? 0 : (b > 255) ? 255 : b; - - r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1; - g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; - b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; - - *ptr_bgr1++ = b; - *ptr_bgr2++ = g; - *ptr_bgr3++ = r; - - *ptr_bgr1++ = b1; - *ptr_bgr2++ = g1; - *ptr_bgr3++ = r1; - - ptr_y1 += 2; - ptr_vu += 2; - - } - if (j < w_in) { - unsigned char _y = ptr_y1[0]; - unsigned char _v = ptr_vu[1]; - unsigned char _u = ptr_vu[0]; - - int r = _y + ((179 * (_v - 128)) >> 7); - int g = _y - ((44 * (_u - 128) - 91 * (_v-128)) >> 7); - int b = _y + ((227 * (_u - 128)) >> 7); - - r = r < 0 ? 0 : (r > 255) ? 255 : r; - g = g < 0 ? 0 : (g > 255) ? 255 : g; - b = b < 0 ? 0 : (b > 255) ? 255 : b; - - ptr_bgr1[0] = b; - ptr_bgr1[1] = g; - ptr_bgr1[2] = r; - } - } -} - -void nv21_bgr_basic(const unsigned char* in_data, int count, int h_in, int w_in, \ - unsigned char* out_data, int h_out, int w_out){ - int y_h = h_in * 2 / 3; - const unsigned char* y = in_data; - const unsigned char* vu = in_data + y_h * w_in; - for (int i = 0; i < y_h; i++){ - const unsigned char* ptr_y1 = y + i * w_in; - const unsigned char* ptr_vu = vu + (i / 2) * w_in; - unsigned char* ptr_bgr1 = out_data + (i * 3) * w_out; - unsigned char* ptr_bgr2 = ptr_bgr1 + w_out; - unsigned char* ptr_bgr3 = ptr_bgr2 + w_out; - int j = 0; - for (; j < w_in; j += 2){ - unsigned char _y0 = ptr_y1[0]; - unsigned char _y1 = ptr_y1[1]; - unsigned char _v = ptr_vu[0]; - unsigned char _u = ptr_vu[1]; - - int ra = floor((179 * (_v - 128)) >> 7); - int ga = floor((44 * (_u - 128) + 91 * (_v-128)) >> 7); - int ba = floor((227 * (_u - 128)) >> 7); - - // float ra_1 = ((179 * (_v - 128)) / 128.0); - // float ga_1 = ((44 * (_u - 128) + 91 * (_v-128)) / 128.0); - // float ba_1 = ((227 * (_u - 128)) / 128.0); - - // int ra = ra_1 < 0 ? ceil(ra_1) : floor(ra_1); - // int ga = ga_1 < 0 ? ceil(ga_1) : floor(ga_1); - // int ba = ba_1 < 0 ? ceil(ba_1) : floor(ba_1); - - // printf("ga_1, ra, ga, ba: %.3f, %d, %d, %d \n", ga_1, ra, ga, ba); - - int r = _y0 + ra; - int g = _y0 - ga; - int b = _y0 + ba; - - int r1 = _y1 + ra; - int g1 = _y1 - ga; - int b1 = _y1 + ba; - - r = r < 0 ? 0 : (r > 255) ? 255 : r; - g = g < 0 ? 0 : (g > 255) ? 255 : g; - b = b < 0 ? 0 : (b > 255) ? 255 : b; - - r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1; - g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; - b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; - - *ptr_bgr1++ = b; - *ptr_bgr2++ = g; - *ptr_bgr3++ = r; - - *ptr_bgr1++ = b1; - *ptr_bgr2++ = g1; - *ptr_bgr3++ = r1; - - ptr_y1 += 2; - ptr_vu += 2; - - } - if (j < w_in) { - unsigned char _y = ptr_y1[0]; - unsigned char _v = ptr_vu[0]; - unsigned char _u = ptr_vu[1]; - - int r = _y + ((179 * (_v - 128)) >> 7); - int g = _y - ((44 * (_u - 128) - 91 * (_v-128)) >> 7); - int b = _y + ((227 * (_u - 128)) >> 7); - - r = r < 0 ? 0 : (r > 255) ? 255 : r; - g = g < 0 ? 0 : (g > 255) ? 255 : g; - b = b < 0 ? 0 : (b > 255) ? 
255 : b; - - ptr_bgr1[0] = b; - ptr_bgr1[1] = g; - ptr_bgr1[2] = r; - } - } -} - -void bgr_to_tensor_basic(const unsigned char* bgr, TensorHf& output, int width, int height, \ - float* means, float* scales) { - - LCHECK_EQ(width, output.width(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(height, output.height() * 3, "sizes of two valid shapes must be the same"); - LCHECK_EQ(3, output.channel(), "sizes of two valid shapes must be the same"); - LCHECK_EQ(1, output.num(), "sizes of two valid shapes must be the same"); - int size = width * height / 3; - float* ptr0 = output.mutable_data(); - float r_means = means[0]; - float g_means = means[1]; - float b_means = means[2]; - float r_scales = scales[0]; - float g_scales = scales[1]; - float b_scales = scales[2]; - - for (int h = 0; h < height; h += 3){ - const unsigned char* ptr_b = bgr + (h * 3) * width; - const unsigned char* ptr_g = ptr_b + width; - const unsigned char* ptr_r = ptr_g + width; - float* ptr0_b = ptr0 + (h / 3)* width; - float* ptr1_g = ptr0_b + size; - float* ptr2_r = ptr1_g + size; - for (int i = 0; i < width; i++){ - *ptr0_b++ = (*ptr_b - b_means) * b_scales; - *ptr1_g++ = (*ptr_g - g_means) * g_scales; - *ptr2_r++ = (*ptr_r - r_means) * r_scales; - - *ptr_b++; - *ptr_g++; - *ptr_r++; - } - } -} -#if 0 -TEST(TestSaberLite, test_func_cv_bgr_tensor) { - LOG(INFO) << "test_func_cv_bgr_tensor start"; - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 1; - - int w_in = w; - int h_in = h; - - Shape shape_in(1, 1, h_in, w_in); - Shape shape_out(1, 3, h_in / 3, w_in); - - LOG(INFO) << " input tensor size, num=" << 1 << ", channel=" << \ - 1 << ", height=" << h_in << ", width=" << w_in; - - //Tensor thin(shape_in); - int size = h_in * w_in ; - unsigned char* bgr = new unsigned char[size]; - for (int i = 0; i < size; ++i) { - bgr[i] = (unsigned char)i; - } - - TensorHf4 tout(shape_out); - TensorHf4 tout_basic(shape_out); - - float means[3] = {127.5f, 127.5f, 127.5f}; - float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; - -#if COMPARE_RESULT - // nv21_to_tensor_basic(nv21, tout_basic, w_in, h_in, means, scales); - bgr_to_tensor_basic(bgr, tout_basic, w_in, h_in, means, scales); - //print_tensor(tout_basic); -#endif - - SaberTimer t1; - - LOG(INFO) << "saber cv bgrtoTensor compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - //nv21_to_tensor(nv21, tout, w_in, h_in, means, scales); - bgr_to_tensor(bgr, tout, w_in, h_in, means, scales); - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber bgrtoTensor total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); - //print_tensor(tout); - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout, tout_basic, max_ratio, max_diff); - - TensorHf4 diff(shape_out); - tensor_diff(tout_basic, tout, diff); - if (fabsf(max_ratio) > 1e-3f) { - LOG(INFO) << "diff: "; - print_tensor(diff); - } - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; 
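// NOTE (editorial annotation, not part of the sources being removed by this patch):
// the nv12_bgr_basic / nv21_bgr_basic references earlier in this file convert
// semi-planar YUV to BGR with Q7 fixed-point coefficients (">> 7" divides by 128),
// approximating the usual full-range BT.601 equations and clamping to [0, 255]:
//   R = Y + (179 * (V - 128)) / 128                  // ~ Y + 1.402 * (V - 128)
//   G = Y - (44 * (U - 128) + 91 * (V - 128)) / 128  // ~ Y - 0.344*(U-128) - 0.714*(V-128)
//   B = Y + (227 * (U - 128)) / 128                  // ~ Y + 1.772 * (U - 128)
// NV12 and NV21 differ only in the byte order of the interleaved chroma plane:
// NV12 stores U then V, NV21 stores V then U.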
-#endif -} -#endif -#if 0 -TEST(TestSaberLite, test_func_cv_nv21_bgr) { - LOG(INFO) << "test_func_cv_nv21_bgr start"; - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 1; - - int w_in = w; - int h_in = h; - // int w_out = ww; - // int h_out = hh; - int w_out = w_in; - int h_out = h_in * 2; - - LOG(INFO) << " input tensor size, num=" << 1 << ", channel=" << \ - 1 << ", height=" << h_in << ", width=" << w_in; - LOG(INFO) << " flip_num = " << flip_num; - - //Tensor thin(shape_in); - int size = h_in * w_in; - unsigned char* nv21 = new unsigned char[size]; - for (int i = 0; i < size; ++i) { - nv21[i] = (unsigned char)(i + 10); - } - unsigned char* out = new unsigned char[size * 3]; - unsigned char* tv_out = new unsigned char[size * 3]; - -#if COMPARE_RESULT - //nv21_bgr_basic(nv21, 1, h_in, w_in, out, h_out, w_out); - nv12_bgr_basic(nv21, 1, h_in, w_in, out, h_out, w_out); -#endif - - SaberTimer t1; - - LOG(INFO) << "saber cv flip compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - //nv21_to_bgr(nv21, tv_out, w_in, h_in, w_out, h_out); - nv12_to_bgr(nv21, tv_out, w_in, h_in, w_out, h_out); - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber flip total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); - //print_tensor(tout); - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - const double eps = 1e-6f; - LOG(INFO) << "diff: " ; - size = w_out * h_out; - for (int i = 0; i < size; i++){ - int a = out[i]; - int b = tv_out[i]; - int diff1 = a - b; - int diff = diff1 >= 0 ? diff1 : -1 * diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - // if (i != 0 && i % w_out == 0) - // printf("\n"); - // printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - printf("\n"); - if (fabsf(max_ratio) > 1e-5f){ - LOG(INFO) << "in"; - for (int i = 0; i < h_in; i++){ - for (int j = 0; j < w_in; j++){ - printf("%d ", nv21[i*w_in+j]); - } - printf("\n"); - } - LOG(INFO) << "out"; - for (int i = 0; i < h_out; i++){ - for (int j = 0; j < w_out; j++){ - printf("%d ", out[i*w_out+j]); - } - printf("\n"); - } - LOG(INFO) << "tv_out"; - for (int i = 0; i < h_out; i++){ - for (int j = 0; j < w_out; j++){ - printf("%d ", tv_out[i*w_out+j]); - } - printf("\n"); - } - - } - - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif - delete[] out; - delete[] tv_out; -} -#endif -#if 0 -TEST(TestSaberLite, test_func_cv_flip) { - LOG(INFO) << "test_func_cv_flip start"; - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 1; - - int w_in = w; - int h_in = h; - int w_out = ww; - int h_out = hh; - - LOG(INFO) << " input tensor size, num=" << 1 << ", channel=" << \ - 1 << ", height=" << h_in << ", width=" << w_in; - LOG(INFO) <<" flip_num = "<< flip_num; - - //Tensor thin(shape_in); - int size = h_in * w_in; - unsigned char* nv21 = new unsigned char[size]; - for (int i = 0; i < size; ++i) { - nv21[i] = (unsigned char)i; - } - unsigned char* out = new unsigned char[size]; - unsigned char* tv_out = new unsigned char[size]; - - -#if COMPARE_RESULT - // nv21_to_tensor_basic(nv21, tout_basic, w_in, h_in, means, scales); - flip_basic(nv21, 1, h_in, w_in, out, h_out, w_out, flip_num); - //print_tensor(tout_basic); -#endif - - SaberTimer t1; - - LOG(INFO) << "saber cv flip compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - //nv21_to_tensor(nv21, tout, w_in, h_in, means, scales); - flip(nv21, tv_out, w_in, h_in, w_out, h_out, flip_num); - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber flip total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); - //print_tensor(tout); - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - const double eps = 1e-6f; - LOG(INFO) << "diff: " ; - for (int i = 0; i < size; i++){ - int a = out[i]; - int b = tv_out[i]; - int diff1 = a - b; - int diff = diff1 >= 0 ? diff1 : -1 * diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - if (i != 0 && i % w_out == 0) - printf("\n"); - printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - printf("\n"); - if (fabsf(max_ratio) > 1e-5f){ - LOG(INFO) << "in"; - for (int i = 0; i < h_in; i++){ - for (int j = 0; j < w_in; j++){ - printf("%d ", nv21[i*w_in+j]); - } - printf("\n"); - } - LOG(INFO) << "out"; - for (int i = 0; i < h_out; i++){ - for (int j = 0; j < w_out; j++){ - printf("%d ", out[i*w_out+j]); - } - printf("\n"); - } - LOG(INFO) << "tv_out"; - for (int i = 0; i < h_out; i++){ - for (int j = 0; j < w_out; j++){ - printf("%d ", tv_out[i*w_out+j]); - } - printf("\n"); - } - } - - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif - delete[] out; - delete[] tv_out; -} -#endif -#if 0 -TEST(TestSaberLite, test_func_cv_rotate) { - LOG(INFO) << "test_func_cv_rotate start"; - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 1; - - int w_in = w; - int h_in = h; - int w_out = ww; - int h_out = hh; - - LOG(INFO) << " input tensor size, num=" << 1 << ", channel=" << \ - 1 << ", height=" << h_in << ", width=" << w_in; - LOG(INFO) <<" angle = "<< angle; - - //Tensor thin(shape_in); - int size = h_in * w_in; - unsigned char* nv21 = new unsigned char[size]; - for (int i = 0; i < size; ++i) { - nv21[i] = (unsigned char)i; - } - unsigned char* out = new unsigned char[size]; - unsigned char* tv_out = new unsigned char[size]; - - -#if COMPARE_RESULT - // nv21_to_tensor_basic(nv21, tout_basic, w_in, h_in, means, scales); - rotate_basic(nv21, 1, h_in, w_in, out, h_out, w_out, angle); - //print_tensor(tout_basic); -#endif - - SaberTimer t1; - - LOG(INFO) << "saber cv rotate compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - //nv21_to_tensor(nv21, tout, w_in, h_in, means, scales); - rotate(nv21, tv_out, w_in, h_in, w_out, h_out, angle); - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber rotate total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); - //print_tensor(tout); - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - const double eps = 1e-6f; - LOG(INFO) << "diff: " ; - for (int i = 0; i < size; i++){ - int a = out[i]; - int b = tv_out[i]; - int diff1 = a - b; - int diff = diff1 >= 0 ? diff1 : -1 * diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - if (i != 0 && i % w_out == 0) - printf("\n"); - printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - printf("\n"); - if (fabsf(max_ratio) > 1e-5f){ - LOG(INFO) << "in"; - for (int i = 0; i < h_in; i++){ - for (int j = 0; j < w_in; j++){ - printf("%d ", nv21[i*w_in+j]); - } - printf("\n"); - } - LOG(INFO) << "out"; - for (int i = 0; i < h_out; i++){ - for (int j = 0; j < w_out; j++){ - printf("%d ", out[i*w_out+j]); - } - printf("\n"); - } - LOG(INFO) << "tv_out"; - for (int i = 0; i < h_out; i++){ - for (int j = 0; j < w_out; j++){ - printf("%d ", tv_out[i*w_out+j]); - } - printf("\n"); - } - } - - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif - delete[] out; - delete[] tv_out; -} -#endif -#if 0 -TEST(TestSaberLite, test_func_cv_resize) { - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 1; - - int w_in = w; - int h_in = h; - int w_out = ww; - int h_out = hh; - - LOG(INFO) << " input tensor size, num=" << 1 << ", channel=" << \ - 1 << ", height=" << h_in << ", width=" << w_in; - LOG(INFO) << " output tensor size, num=" << 1 << ", channel=" << \ - 1 << ", height=" << h_out << ", width=" << w_out; - - //Tensor thin(shape_in); - int size = h_in * w_in; - unsigned char* nv21 = new unsigned char[size]; - for (int i = 0; i < size; ++i) { - nv21[i] = (unsigned char)i; - } - - int out_size = h_out * w_out; - unsigned char* tout = new unsigned char[out_size]; - unsigned char* tout_basic = new unsigned char[out_size]; - - float width_scale = (float)w_in / w_out; - float height_scale = (float)h_in / h_out; - -#if COMPARE_RESULT - LOG(INFO) << "saber cv basic resize compute"; - resize_basic(nv21, 1, h_in, w_in, tout_basic, h_out, w_out, width_scale, height_scale); - //print_tensor(tout_basic); -#endif - - SaberTimer t1; - - LOG(INFO) << "saber cv resize compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - // LOG(INFO) << "resize"; - resize(nv21, tout, w_in, h_in, w_out, h_out); - - LOG(INFO) << "nv21"; - Shape shape_out = {1, 3, w_out, h_out * 2/3}; - TensorHf4 tout_tensor(shape_out); - float means[3] = {127.5f, 127.5f, 127.5f}; - float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; - nv12_to_tensor(tout, tout_tensor, w_out, h_out * 2/3, means, scales); - - LOG(INFO) << "end"; - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber resize total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); - //print_tensor(tout); - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - // LOG(INFO) << "basic result, size: " << out_size; - // for (int i = 0; i < out_size; i++){ - // if (i != 0 && i % w_out == 0) - // printf("\n"); - // printf("%d ", tout_basic[i]); - // } - // printf("\n"); - // LOG(INFO) << "resize result, size: " << out_size; - // for (int i = 0; i < out_size; i++){ - // if (i != 0 && i % w_out == 0) - // printf("\n"); - // printf("%d ", tout[i]); - // } - // printf("\n"); - //tensor_cmp_host(tout_basic, tout, out_size, max_ratio, max_diff); - const double eps = 1e-6f; - LOG(INFO) << "diff, size: " << out_size; - for (int i = 0; i < out_size; i++){ - int a = tout[i]; - int b = tout_basic[i]; - int diff1 = a - b; - int diff = diff1 >= 0 ? diff1 : -1 * diff1; - if (max_diff < diff) { - max_diff = diff; - max_ratio = 2.0 * max_diff / (a + b + eps); - } - // if (i != 0 && i % w_out == 0) - // printf("\n"); - // printf("%d ", diff); - // if (diff1 != 0) - // printf("i: %d, out: %d, a: %d, b: %d \n", i, diff, a, b); - } - printf("\n"); - // LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - // CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif - delete[] tout; - delete[] tout_basic; - // LOG(INFO) << "resize end"; -} -#endif - -#if 0 -TEST(TestSaberLite, test_func_cv_nv21_tensor) { - LOG(INFO) << "test_func_cv_nv21_tensor start"; - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 1; - - int w_in = w; - int h_in = h; - - Shape shape_in(1, 1, h_in, w_in); - Shape shape_out(1, 3, h_in, w_in); - - LOG(INFO) << " input tensor size, num=" << 1 << ", channel=" << \ - 1 << ", height=" << h_in << ", width=" << w_in; - - //Tensor thin(shape_in); - int size = h_in * w_in * 3; - size = size >> 1; - unsigned char* nv21 = new unsigned char[size]; - for (int i = 0; i < size; ++i) { - nv21[i] = (unsigned char)i; - } - - TensorHf4 tout(shape_out); - TensorHf4 tout_basic(shape_out); - - float means[3] = {127.5f, 127.5f, 127.5f}; - float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; - -#if COMPARE_RESULT - // nv21_to_tensor_basic(nv21, tout_basic, w_in, h_in, means, scales); - nv12_to_tensor_basic(nv21, tout_basic, w_in, h_in, means, scales); - //print_tensor(tout_basic); -#endif - - SaberTimer t1; - - LOG(INFO) << "saber cv nv21toTensor compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - //nv21_to_tensor(nv21, tout, w_in, h_in, means, scales); - nv12_to_tensor(nv21, tout, w_in, h_in, means, scales); - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber nv21toTensor total time : %.4f, avg time : %.4f\n", to, to / test_iter, min_time); - //print_tensor(tout); - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic.data(), tout.data(), tout_basic.valid_size(), max_ratio, max_diff); - TensorHf4 diff(shape_out); - tensor_diff(tout_basic, tout, diff); - if (fabsf(max_ratio) > 1e-3f) { - LOG(INFO) << "diff: "; - print_tensor(diff); - } - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif -} -#endif -int main(int argc, const char** argv){ - // initial logger - //logger::init(argv[0]); - // Env::env_init(4); - Env::env_init(); - LOG(ERROR) << "usage: ./" << argv[0] << " cluster threads h " << \ - " w hh ww angle"; - if (argc >= 2) { - cluster = atoi(argv[1]); - } - - if (argc >= 3) { - threads = atoi(argv[2]); - } - - if (argc >= 4) { - h = atoi(argv[3]); - } - if (argc >= 5) { - w = atoi(argv[4]); - } - if (argc >= 6) { - hh = atoi(argv[5]); - } - if (argc >= 7) { - ww = atoi(argv[6]); - } - if (argc >= 8){ - flip_num = atoi(argv[7]); - } - if (argc >= 9){ - angle = atoi(argv[8]); - } - - InitTest(); - //RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_pooling_lite.cpp b/test/lite/test_pooling_lite.cpp deleted file mode 100755 index 7347db157..000000000 --- a/test/lite/test_pooling_lite.cpp +++ /dev/null @@ -1,413 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_pooling.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 4; -int test_iter = 10; - -bool compare_result = false; -bool global_pool = false; - -int num = 1; -int ch_in = 32; -int h_in = 112; -int w_in = 112; - -int kernel = 2; -int pad = 0; -int stride = 2; - -PoolingType type = Pooling_max; - -typedef Tensor TensorHf4; - -#define COMPARE_RESULT 1 -void pooling_basic(const float* din, float* dout, \ - int num, int chout, int hout, int wout, \ - int chin, 
int hin, int win, \ - PoolingType type, bool global, int kernel_w, int kernel_h, \ - int stride_w, int stride_h, int pad_w, int pad_h) { - //no need to pad input tensor, border is zero pad inside this function - int size_channel_in = win * hin; - int size_channel_out = wout * hout; - - float* data_out = dout; - const float* data_in = din; - - if (global) { - switch (type) { - case Pooling_max: - for (int n = 0; n < num; ++n) { - float* data_out_batch = data_out + n * chout * size_channel_out; - const float* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* data_in_channel = data_in_batch + c * size_channel_in;//in address - data_out_batch[c] = data_in_channel[0]; - for (int i = 0; i < size_channel_in; ++i) { - data_out_batch[c] = data_out_batch[c] > data_in_channel[i] ? \ - data_out_batch[c] : data_in_channel[i]; - } - } - } - break; - - case Pooling_average_include_padding: - - case Pooling_average_exclude_padding: - for (int n = 0; n < num; ++n) { - float* data_out_batch = data_out + n * chout * size_channel_out; - const float* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* data_in_channel = data_in_batch + c * size_channel_in;//in address - float sum = 0.f; - for (int i = 0; i < size_channel_in; ++i) { - sum += data_in_channel[i]; - } - data_out_batch[c] = sum / size_channel_in; - } - } - break; - default: - printf("not support\n"); - } - return; - } - - switch (type) { - case Pooling_max: - for (int n = 0; n < num; ++n) { - float* data_out_channel = data_out + n * chout * size_channel_out; - const float* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int q = 0; q < chout; q++) { - - float* data_out_row = data_out_channel + q * size_channel_out; - const float* data_in_channel = data_in_batch + q * size_channel_in; - - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - - data_out_row[j] = data_in_channel[hstart * win + wstart]; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - data_out_row[j] = data_out_row[j] > \ - data_in_channel[h * win + w] ? 
\ - data_out_row[j] : data_in_channel[h * win + w]; - } - } - } - data_out_row += wout; - } - } - } - break; - - case Pooling_average_include_padding: - for (int n = 0; n < num; ++n) { - int pool_size = kernel_w * kernel_h;//(hend - hstart) * (wend - wstart);//problem - float* data_out_channel = data_out + n * chout * size_channel_out; - const float* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int q = 0; q < chout; q++) { - - float* data_out_row = data_out_channel + q * size_channel_out; - const float* data_in_channel = data_in_batch + q * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - - data_out_row[j] = data_in_channel[hstart * win + wstart]; - float sum = 0.f; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += data_in_channel[h * win + w]; - } - } - data_out_row[j] = sum / pool_size; - } - data_out_row += wout; - } - } - } - break; - case Pooling_average_exclude_padding: - for (int n = 0; n < num; ++n) { - float* data_out_channel = data_out + n * chout * size_channel_out; - const float* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int q = 0; q < chout; q++) { - - float* data_out_row = data_out_channel + q * size_channel_out; - const float* data_in_channel = data_in_batch + q * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - - data_out_row[j] = data_in_channel[hstart * win + wstart]; - float sum = 0.f; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += data_in_channel[h * win + w]; - } - } - int pool_size = (hend - hstart) * (wend - wstart); - data_out_row[j] = sum / pool_size; - } - data_out_row += wout; - } - } - } - break; - default: - printf("not support\n"); - } -} - -void test_arm_pooling(std::vector& tin, \ - int kernel, int stride, int pad, \ - PoolingType type, bool global, int threads, int cluster_id) { - - //int test_iter = 1000; - double to = 0; - double min_time = 1000000; - SaberTimer t1; - SaberTimer t2; - - Context ctx1; - PowerMode mode = cluster_id == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorHf4 tout_basic; - TensorHf4 tout_saber; - - TensorHf4* thin = tin[0]; - std::vector vin; - std::vector tvout_saber; - std::vector tvout_basic; - //vin.push_back(&thin); - tvout_saber.push_back(&tout_saber); - tvout_basic.push_back(&tout_basic); - - int num = tin[0]->num(); - int chin = tin[0]->channel(); - int hin = tin[0]->height(); - int win = tin[0]->width(); - - LOG(INFO) << "pooling param: "; - LOG(INFO) << " img_num = " << num; - LOG(INFO) << " in_channels = " << chin; - LOG(INFO) << " img_h = " << hin; - LOG(INFO) << " img_w = " << win; - LOG(INFO) << "kernel size = " << kernel; - LOG(INFO) << "stride = " << stride; - LOG(INFO) << "pad = " << pad; - LOG(INFO) << "type = " << type; - int wout = 1; - int hout = 1; - if (!global) { - int hin = tin[0]->height(); // P - hout = static_cast(std::max(0.f,ceilf(static_cast( - hin + 2 * pad - kernel) / stride))) + 1; - int win = tin[0]->width(); // Q - wout = static_cast(std::max(0.f,ceilf(static_cast( - win + 2 * pad - kernel) / stride))) + 1; - } - Shape shape_out{num, chin, hout, wout}; - PoolParam pooling_param(type,global,kernel,kernel, stride,stride,pad, pad); - //LOG(INFO) << "input tensor"; - //print_tensor_host(*tin[0]); - - if (compare_result) { - LOG(INFO) << "run basic pooling for precision comparation"; - tout_basic.re_alloc(shape_out); - //pooling_basic(tout_basic, *thin, type,global, kernel, \ - kernel, stride, stride, pad, pad); - //print_tensor_host(tout_basic); - LOG(INFO) << "basic pooling compute"; - to = 0; - min_time = 1000000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - const float* in=thin->data(); - float* out =tout_basic.mutable_data(); - - pooling_basic(in,out, num, chin,hout,wout,chin,hin,win,type,global, kernel, \ - kernel, stride, stride, pad, pad); - - //float* out1 =tout_saber.mutable_data(); - - // pooling_basic(in,out1, num, chin,hout,wout,chin,hin,win,3,global, kernel, \ - kernel, stride, stride, pad, pad); - //tvout_basic[0]->record_event(ctx1.get_compute_stream()); - //tvout_basic[0]->sync(); - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "basic pooling running time, ave: " << to / test_iter << ", min time: " << min_time; - // print_tensor_host(tout_basic); - - } - - SaberPooling pooling_saber; - pooling_saber.load_param(&pooling_param); - pooling_saber.compute_output_shape(tin, tvout_saber); - Shape sh_out_saber = tvout_saber[0]->valid_shape(); - LOG(INFO) << "output shape_1: " << sh_out_saber[0] << ", " << sh_out_saber[1] << ", " \ - << sh_out_saber[2] << ", " << sh_out_saber[3]; - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor - tvout_saber[0]->re_alloc(shape_out); - - LOG(INFO) << "saber pooling impl init"; - pooling_saber.init(tin, tvout_saber, ctx1); - - //print_tensor_host(*thin); - - //! 
compute - LOG(INFO) << "saber pooling compute"; - to = 0; - min_time = 1000000; - for (int i = 0; i < test_iter; ++i) { - t2.clear(); - t2.start(); - //const float* in=thin->data(); - //float* out1 =tout_saber.mutable_data(); - - //pooling_basic(in,out1, num, chin,hout,wout,chin,hin,win,3,global, kernel, \ - kernel, stride, stride, pad, pad); - pooling_saber.dispatch(tin,tvout_saber); - //pooling3x3s2_max(tout_saber,*thin,type,global,kernel, \ - kernel, stride, stride, pad, pad); - //tvout_saber[0]->record_event(ctx1.get_compute_stream()); - //tvout_saber[0]->sync(); - //pooling_basic() - t2.end(); - to += t2.get_average_ms(); - if (t2.get_average_ms() < min_time) { - min_time = t2.get_average_ms(); - } - } - LOG(INFO) << "saber pooling running time, ave: " << to / test_iter << ", min time: " << min_time; - //print_tensor_host(tout_saber); - - if (compare_result) { - double max_ratio = 0; - double max_diff = 0; - TensorHf4 tdiff(tout_basic.valid_shape()); - tensor_cmp_host(tout_saber, tout_basic, max_ratio, max_diff); - // LOG(INFO) << "tout_basic"; - // print_tensor_host(tout_basic); - // LOG(INFO) << "tout_saber"; - // print_tensor_host(tout_saber); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; - } -} - -#if 1 -TEST(TestSaberLite, test_func_pooling_global_arm) { - - Shape shape_in(num, ch_in, h_in, w_in); - - TensorHf4 tdin; - - tdin.re_alloc(shape_in); - float* in = tdin.mutable_data(); - for (int i = 0; i < tdin.size(); i++){ - *in = -1.0f - i; - in++; - } - //fill_tensor_rand(tdin, -1.f, 1.f); - //fill_tensor_host_const(tdin, 1.f); - - std::vector tin; - tin.push_back(&tdin); - - test_arm_pooling(tin, kernel, stride, pad, type, global_pool, threads, cluster); -} -#endif - - - -int main(int argc, const char** argv){ - // initial logger - //logger::init(argv[0]); - Env::env_init(); - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc >= 4) { - test_iter = atoi(argv[3]); - } - if (argc >= 5) { - compare_result = atoi(argv[4]) > 0; - } - if (argc >= 6) { - global_pool = atoi(argv[5]) > 0; - } - if (argc >= 7) { - if (argc < 14) { - LOG(ERROR) << "usage: ./" << argv[0] << " cluster threads test_iter " << \ - " compare_result global_pool num ch_in h_in w_in kernel pad stride pool_type"; - return 0; - } - num = atoi(argv[6]); - ch_in = atoi(argv[7]); - h_in = atoi(argv[8]); - w_in = atoi(argv[9]); - kernel = atoi(argv[10]); - pad = atoi(argv[11]); - stride = atoi(argv[12]); - type = (PoolingType)atoi(argv[13]); - } - - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_pooling_lite_int8.cpp b/test/lite/test_pooling_lite_int8.cpp deleted file mode 100644 index b3b56df9a..000000000 --- a/test/lite/test_pooling_lite_int8.cpp +++ /dev/null @@ -1,422 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/neon/impl/pooling_arm_impl.h" -#include "saber/lite/funcs/saber_pooling.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 4; -int test_iter = 10; -bool compare_result = false; -int num = 1; -int ch_in = 32; -int h_in = 112; -int w_in = 112; -int pool_case = 0; - -typedef void (*POOL_FUNC)(const void*, void*, int, int, int, int, \ - int, int, int, PoolingType, bool, int, int, int, int, int, int); -typedef Tensor TensorH; - -void pooling_basic_test(const void* din, void* dout, \ - int num, int chout, int hout, int wout, \ - int chin, 
int hin, int win, \ - PoolingType type, bool global, int kernel_w, int kernel_h, \ - int stride_w, int stride_h, int pad_w, int pad_h) { - //no need to pad input tensor, border is zero pad inside this function - - int size_channel_in = win * hin; - int size_channel_out = wout * hout; - - signed char* data_out = static_cast(dout); - const signed char* data_in = static_cast(din); - - if (global) { - switch (type) { - case Pooling_max: - for (int n = 0; n < num; ++n) { - signed char* data_out_batch = data_out + n * chout * size_channel_out; - const signed char* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const signed char* data_in_channel = data_in_batch + c * size_channel_in;//in address - signed char max_val = std::numeric_limits::min(); - for (int i = 0; i < size_channel_in; ++i) { - if (max_val < data_in_channel[i]){ - max_val = data_in_channel[i]; - } - data_out_batch[c] = max_val; - } - } - } - break; - - case Pooling_average_include_padding: - - case Pooling_average_exclude_padding: - for (int n = 0; n < num; ++n) { - signed char* data_out_batch = data_out + n * chout * size_channel_out; - const signed char* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const signed char* data_in_channel = data_in_batch + c * size_channel_in;//in address - int sum = 0; - for (int i = 0; i < size_channel_in; ++i) { - sum += int(data_in_channel[i]); - } - data_out_batch[c] = (signed char)(sum / size_channel_in); - } - } - break; - default: - //printf("not support\n"); - LOGE("not support\n"); - } - return; - } - - switch (type) { - case Pooling_max: - for (int n = 0; n < num; ++n) { - signed char* data_out_channel = data_out + n * chout * size_channel_out; - const signed char* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int q = 0; q < chout; q++) { - - signed char* data_out_row = data_out_channel + q * size_channel_out; - const signed char* data_in_channel = data_in_batch + q * size_channel_in; - - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - - signed char max_val = std::numeric_limits::min(); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (data_in_channel[h * win + w] > max_val){ - max_val = data_in_channel[h * win + w]; - } - } - } - data_out_row[j] = max_val; - } - data_out_row += wout; - } - } - } - break; - - case Pooling_average_include_padding: - for (int n = 0; n < num; ++n) { - int pool_size = kernel_w * kernel_h; - signed char* data_out_channel = data_out + n * chout * size_channel_out; - const signed char* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int q = 0; q < chout; q++) { - - signed char* data_out_row = data_out_channel + q * size_channel_out; - const signed char* data_in_channel = data_in_batch + q * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 
0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - - int sum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += int(data_in_channel[h * win + w]); - } - } - data_out_row[j] = (signed char)(sum / pool_size); - } - data_out_row += wout; - } - } - } - break; - case Pooling_average_exclude_padding: - for (int n = 0; n < num; ++n) { - signed char* data_out_channel = data_out + n * chout * size_channel_out; - const signed char* data_in_batch = data_in + n * chin * size_channel_in; -#pragma omp parallel for - for (int q = 0; q < chout; q++) { - - signed char* data_out_row = data_out_channel + q * size_channel_out; - const signed char* data_in_channel = data_in_batch + q * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - - int sum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += int(data_in_channel[h * win + w]); - } - } - int pool_size = (hend - hstart) * (wend - wstart); - data_out_row[j] = (signed char)(sum / pool_size); - } - data_out_row += wout; - } - } - } - break; - default: - //printf("not support\n"); - LOGE("not support\n"); - } -} -void test_arm_pooling_int8(TensorH& tin, int threads, int cluster_id, int pool_case) { - -#ifdef __aarch64__ - LOG(INFO) << "using arm64"; -#else - LOG(INFO) << "using armv7"; -#endif - double to = 0; - double min_time = 1000000; - SaberTimer t1; - SaberTimer t2; - - Context ctx1; - PowerMode mode = cluster_id == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - TensorH tout_basic; - TensorH tout_saber; - - - int num = tin.num(); - int chin = tin.channel(); - int hin = tin.height(); - int win = tin.width(); - - LOG(INFO) << "pooling param: "; - LOG(INFO) << "img_num = " << num; - LOG(INFO) << "in_channels = " << chin; - LOG(INFO) << "img_h = " << hin; - LOG(INFO) << "img_w = " << win; - - int kernel = 2; - int stride = 2; - int pad = 0; - bool global = false; - POOL_FUNC pool_func = nullptr; - PoolingType type = Pooling_max; - - switch (pool_case){ - case 0: //global - global = true; - pool_func = pooling_global_int8; - type = Pooling_max; - LOG(INFO) << "pool case: global pooling"; - break; - case 1: //2x2s2 max - kernel = 2; - stride = 2; - pad = 0; - global = false; - pool_func = pooling2x2s2_max_int8; - type = Pooling_max; - LOG(INFO) << "pool case: pooling2x2s2_max"; - break; - case 2: //3x3s1p1 max - kernel = 3; - stride = 1; - pad = 1; - global = false; - pool_func = pooling3x3s1p1_max_int8; - type = Pooling_max; - LOG(INFO) << "pool case: pooling3x3s1p1_max"; - break; - case 3: //3x3s2p1 max - kernel = 3; - stride = 2; - pad = 1; - global = false; - pool_func = pooling3x3s2p1_max_int8; - type = Pooling_max; - LOG(INFO) << "pool case: pooling3x3s2p1_max"; - break; - case 4: //3x3s2p0 max - kernel = 3; - stride = 2; - pad = 0; - global = false; - pool_func = pooling3x3s2p0_max_int8; - type = Pooling_max; - LOG(INFO) << "pool case: pooling3x3s2p0_max"; - break; - case 5: //2x2s2 ave - kernel = 2; - stride = 2; - pad = 0; - global = false; - pool_func = pooling2x2s2_ave_int8; - type = Pooling_average_exclude_padding; - LOG(INFO) << "pool case: pooling2x2s2_ave"; - break; - default: - LOG(FATAL) << "kernel: " << kernel << ", stride: " << stride << ", pad: " \ - << pad << ", no implement"; - break; - } - int wout = 1; - int hout = 1; - if (!global) { - int hin = tin.height(); // P - hout = static_cast(std::max(0.f, ceilf(static_cast( - hin + 2 * pad - kernel) / stride))) + 1; - int win = tin.width(); // Q - wout = static_cast(std::max(0.f, ceilf(static_cast( - win + 2 * pad - kernel) / stride))) + 1; - } - Shape shape_out(num, chin, hout, wout); - if (compare_result) { - tout_basic.re_alloc(shape_out, AK_INT8); - LOG(INFO) << "basic pooling compute"; - to = 0; - min_time = 1000000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - const void* in = (const void*)tin.data(); - void* out = (void*)tout_basic.mutable_data(); - - pooling_basic_test(in, out, num, chin, hout, wout, chin, hin, win, type, global, kernel, \ - kernel, stride, stride, pad, pad); - - t1.end(); - to += t1.get_average_ms(); - if (t1.get_average_ms() < min_time) { - min_time = t1.get_average_ms(); - } - } - LOG(INFO) << "basic pooling running time, ave: " << to / test_iter << ", min time: " << min_time; - } - - tout_saber.re_alloc(shape_out, AK_INT8); - LOG(INFO) << "saber pooling compute"; - to = 0; - min_time = 1000000; - for (int i = 0; i < test_iter; ++i) { - t2.clear(); - t2.start(); - const void* in = (const void*)tin.data(); - void* out = (void*)tout_saber.mutable_data(); - //pooling_global_int8(in, out, num, chin, hout, wout, chin, hin, win, type, global, kernel, \ - kernel, stride, stride, pad, pad); - //pooling2x2s2_max_int8(in, out, num, chin, hout, wout, chin, hin, win, type, global, kernel, \ 
- kernel, stride, stride, pad, pad); - //pooling3x3s1p1_max_int8(in, out, num, chin, hout, wout, chin, hin, win, type, global, kernel, \ - kernel, stride, stride, pad, pad); - //pooling3x3s2p1_max_int8(in, out, num, chin, hout, wout, chin, hin, win, type, global, kernel, \ - kernel, stride, stride, pad, pad); - //pooling3x3s2p0_max_int8(in, out, num, chin, hout, wout, chin, hin, win, type, global, kernel, \ - kernel, stride, stride, pad, pad); - pool_func(in, out, num, chin, hout, wout, chin, hin, win, type, global, kernel, \ - kernel, stride, stride, pad, pad); - t2.end(); - to += t2.get_average_ms(); - if (t2.get_average_ms() < min_time) { - min_time = t2.get_average_ms(); - } - LOG(INFO) << "saber pooling running time, ave: " << to / test_iter << ", min time: " << min_time; - } - - if (compare_result) { - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - print_tensor(tin); - print_tensor(tout_basic); - print_tensor(tout_saber); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-4f, true) << "compute result error"; - } -} - -#if 1 -TEST(TestSaberLite, test_func_pooling_global_arm) { - - Shape shape_in(num, ch_in, h_in, w_in); - - TensorH tdin; - tdin.re_alloc(shape_in, AK_INT8); - signed char* in = (signed char*)tdin.mutable_data(); - srand(time(NULL)); - for (int i = 0; i < tdin.size(); i++){ - *in = char(rand() % 256 - 128); - in++; - } - - test_arm_pooling_int8(tdin, threads, cluster, pool_case); -} -#endif - - - -int main(int argc, const char** argv){ - // initial logger - //logger::init(argv[0]); - Env::env_init(); - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc >= 4) { - test_iter = atoi(argv[3]); - } - if (argc >= 5) { - compare_result = atoi(argv[4]) > 0; - } - if (argc >= 6) { - if (argc < 10) { - LOG(ERROR) << "usage: ./" << argv[0] << " cluster threads test_iter " << \ - " compare_result num ch_in h_in w_in"; - return 0; - } - num = atoi(argv[5]); - ch_in = atoi(argv[6]); - h_in = atoi(argv[7]); - w_in = atoi(argv[8]); - pool_case = atoi(argv[9]); - } - - - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_priorbox_lite.cpp b/test/lite/test_priorbox_lite.cpp deleted file mode 100644 index eaf6fa6e1..000000000 --- a/test/lite/test_priorbox_lite.cpp +++ /dev/null @@ -1,154 +0,0 @@ -#include "saber/lite/funcs/saber_priorbox.h" -#include "test_lite.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 1; - -const bool FLAG_RELU = false; - -typedef Tensor TensorHf4; - -void test_arm_priorbox(std::vector& tin, \ - int thread_num, int cluster_id) { - - double to = 0; - double min_time = 1000000; - SaberTimer t1; - - Context ctx1; - PowerMode mode = SABER_POWER_HIGH; - ctx1.set_run_mode(mode, 1); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - const int test_iter = 100; - - TensorHf4 tout_saber; - std::vector tvout_saber; - tvout_saber.push_back(&tout_saber); - - LOG(INFO) << "create priorbox param"; - std::vector min_size{60.f}; - std::vector max_size; - std::vector aspect_ratio{2}; - std::vector fixed_size{256.f}; - std::vector density{1.0f}; - std::vector fixed_ratio{1.0f}; - std::vector variance{0.1f, 0.1f, 0.2f, 0.2f}; - bool flip = true; - bool clip = false; - 
float step_h = 0; - float step_w = 0; - int img_w = 0; - int img_h = 0; - float offset = 0.5; - - std::vector order; - - order.push_back(PRIOR_MIN); - order.push_back(PRIOR_MAX); - order.push_back(PRIOR_COM); - - SaberPriorBox priorbox_saber; - - //PriorBoxParam param(variance, flip, clip, img_w, img_h, step_w, step_h, offset, order, \ - min_size, max_size, aspect_ratio); - PriorBoxParam param(variance, flip, clip, img_w, img_h, step_w, step_h, offset, order, \ - std::vector(), std::vector(), std::vector(), \ - fixed_size, fixed_ratio, density); - - - - LOG(INFO) << "saber priorbox impl init"; - priorbox_saber.load_param(¶m); - - priorbox_saber.compute_output_shape(tin, tvout_saber); - Shape sh_out_saber = tvout_saber[0]->valid_shape(); - Shape shape_out{1, 2, tin[0]->width() * tin[0]->height() * 4 * param._prior_num}; - - LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ - << shape_out[2] << ", " << shape_out[3]; - CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; - - //! re_alloc mem for output tensor - tvout_saber[0]->re_alloc(shape_out); - - // SABER_CHECK(priorbox_saber.init(tin, tvout_saber, param, SPECIFY, SABER_IMPL, ctx1)); - LOG(INFO) << "PriorBox initialization"; - priorbox_saber.init(tin, tvout_saber, ctx1); - - //! compute - LOG(INFO) << "saber priorbox compute"; - to = 0; - t1.clear(); - t1.start(); - - for (int i = 0; i < test_iter; ++i) { - priorbox_saber.dispatch(tin, tvout_saber); - } - - t1.end(); - float ts = t1.get_average_ms(); - printf("total time : %.4f, avg time : %.4f\n", ts, ts / test_iter); - print_tensor(*tvout_saber[0]); - -} - - -TEST(TestSaberLite, test_func_priorbox_arm) { - - int width = 300; - int height = 300; - int channel = 3; - int num = 1; - int w_fea = 19; - int h_fea = 19; - int c_fea = 512; - - LOG(INFO) << " input data size, num=" << num << ", channel=" << \ - channel << ", height=" << height << ", width=" << width; - - LOG(INFO) << " input feature tensor size, num=" << num << ", channel=" << \ - c_fea << ", height=" << h_fea << ", width=" << w_fea; - //! 
create input output tensor - Shape sh_fea{num, c_fea, h_fea, w_fea}; - Shape sh_data{num, channel, height, width}; - TensorHf4 tfea(sh_fea); - TensorHf4 tdata(sh_data); - - std::vector tin; - - tin.push_back(&tfea); - tin.push_back(&tdata); - - test_arm_priorbox(tin, threads, cluster); -} - -int main(int argc, const char** argv){ - - Env::env_init(); - - // initial logger - //logger::init(argv[0]); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_resize_lite.cpp b/test/lite/test_resize_lite.cpp deleted file mode 100644 index 464dd191e..000000000 --- a/test/lite/test_resize_lite.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_resize.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 4; -int w_in = 128; -int h_in = 128; -int num_in = 1; -int ch_in = 3; -float width_scale = 2.0f; -float height_scale = 2.0f; -int log_flag = 0; -typedef Tensor TensorHf4; -#define COMPARE_RESULT 1 - -void resize_basic(const float* in_data, int count, int h_in, int w_in, \ - float* out_data, int h_out, int w_out, float width_scale, float height_scale) { - - int spatial_in = h_in * w_in; - int spatial_out = h_out * w_out; -#pragma omp parallel for - for (int i = 0; i < count; ++i){ - for (int s = 0; s < spatial_out; ++s){ - int x_out = s % w_out; - int y_out = s / w_out; - float x_in = x_out * width_scale; - float y_in = y_out * height_scale; - int x_in_start = (int)x_in; - int y_in_start = (int)y_in; - x_in -= x_in_start; - y_in -= y_in_start; - - int x_in_end = x_in_start + 1; - int y_in_end = y_in_start + 1; - - const float w00 = (1.0f - y_in) * (1.0f - x_in); - const float w01 = x_in * (1.0 - y_in); - const float w10 = y_in * (1.0 - x_in); - const float w11 = x_in * y_in; - - int tl_index = y_in_start * w_in + x_in_start; - int tr_index = y_in_start * w_in + x_in_end; - int bl_index = y_in_end * w_in + x_in_start; - int br_index = y_in_end * w_in + x_in_end; - - float tl = in_data[tl_index + i * spatial_in]; - float tr = (x_in_end >= w_in) ? 0 : in_data[tr_index + i * spatial_in]; - float bl = (y_in_end >= h_in) ? 0 : in_data[bl_index + i * spatial_in]; - float br = ((x_in_end >= w_in) || (y_in_end >= h_in)) ? 0 : in_data[br_index + i * spatial_in]; - out_data[s + i * spatial_out] = w00 * tl + w01 * tr + w10 * bl + w11 * br; - } - } -} - -TEST(TestSaberLite, test_func_resize_arm) { - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? 
SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 10; - - Shape shape_in(num_in, ch_in, h_in, w_in); - Shape shape_out(num_in, ch_in, int(h_in * height_scale), int(w_in * width_scale)); - - LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << ch_in << ", height=" << h_in << ", width=" << w_in; - - std::vector vin; - std::vector vout; - - Tensor thin(shape_in); - - fill_tensor_rand(thin, -1.0, 1.0); - TensorHf4 tout(shape_out); - TensorHf4 tout_basic(shape_out); - vin.push_back(&thin); - - - SaberTimer timer; - timer.clear(); - timer.start(); - resize_basic((const float*)thin.data(),shape_out[0] * shape_out[1], shape_in[2], shape_in[3], \ - (float*)tout_basic.mutable_data(), shape_out[2], shape_out[3], 1.0f / width_scale, 1.0f / height_scale); - timer.end(); - double basic_tdiff = timer.get_average_ms(); - - - SaberResize resize_lite; - ResizeParam param(width_scale, height_scale); - resize_lite.load_param(¶m); - - LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ - shape_out[2] << ", " << shape_out[3]; - - vout.push_back(&tout); - resize_lite.compute_output_shape(vin, vout); - CHECK_EQ(shape_out == vout[0]->valid_shape(), true) << "compute shape error"; - - LOG(INFO) << "re-alloc tensor buffer"; - vout[0]->re_alloc(vout[0]->valid_shape()); - - LOG(INFO) << "resize initialized to saber impl"; - resize_lite.init(vin, vout, ctx1); - - SaberTimer t1; - - LOG(INFO) << "saber resize compute"; - double sum = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - resize_lite.dispatch(vin, vout); - t1.end(); - double tdiff = t1.get_average_ms(); - sum += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - tensor_cmp_host(tout_basic, tout, max_ratio, max_diff); - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error" \ - << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio;; -#endif - printf("basic resize time: %.4fms\n", basic_tdiff); - printf("saber resize total time : %.4fms, avg time : %.4fms\n", sum, sum / test_iter, min_time); - //print_tensor(*vin[0]); - //print_tensor(tout_basic); - //print_tensor(*vout[0]); -} - -int main(int argc, const char** argv){ - // initial logger - //logger::init(argv[0]); - Env::env_init(4); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc >= 4){ - num_in = atoi(argv[3]); - } - if (argc >= 5){ - ch_in = atoi(argv[4]); - } - if (argc >= 6){ - h_in = atoi(argv[5]); - } - if (argc >= 7){ - w_in = atoi(argv[6]); - } - if (argc >= 8){ - width_scale = atof(argv[7]); - } - if (argc >= 9){ - height_scale = atof(argv[8]); - } - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_softmax_lite.cpp b/test/lite/test_softmax_lite.cpp deleted file mode 100644 index 62dfd66e2..000000000 --- a/test/lite/test_softmax_lite.cpp +++ /dev/null @@ -1,192 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/funcs/saber_softmax.h" - -using namespace anakin::saber; -using namespace anakin::saber::lite; - -int cluster = 0; -int threads = 4; -int num = 1; -int ch = 1971; -int h = 21; -int w = 1; -int axis = 2; -typedef Tensor TensorHf4; - -#define COMPARE_RESULT 1 - 
-void softmax_basic(TensorHf4& tin, int axis, TensorHf4& tout) { - Shape shin = tin.valid_shape(); - Shape shtmp = shin; - int axis_size = shin[axis]; - shtmp[axis] = 1; - - int cnt = shtmp.count(); - int inner_num = tin.count(axis + 1, tin.dims()); - int outer_num = tin.count(0, axis); - - //TensorHf4 tmax(shtmp); - - const float* din = tin.data(); - float* dout = tout.mutable_data(); - //float* dtmp = tmax.mutable_data(); - - for (int i = 0; i < cnt; ++i) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - //! get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data? din[real_index] : max_data; - } - //printf("max data: %.2f\n", max_data); - - real_index = idx_outer * inner_num + idx_inner; - //! sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - //printf("sum exp data: %.2f\n", sum_data); - - float sum_inv = 1.f / sum_data; - - real_index = idx_outer * inner_num + idx_inner; - //! get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -TEST(TestSaberLite, test_func_softmax_arm) { - // start Reshape & doInfer - Context ctx1; - LOG(INFO) << "set runtine context"; - PowerMode mode = cluster == 0? SABER_POWER_HIGH : SABER_POWER_LOW; - ctx1.set_run_mode(mode, threads); - LOG(INFO) << "test threads activated"; -#pragma omp parallel - { -#ifdef USE_OPENMP - int thread = omp_get_num_threads(); - LOG(INFO) << "number of threads: " << thread; -#endif - } - - int test_iter = 1; - - int softmax_axis = axis; // channel - int w_in = w; - int h_in = h; - int ch_in = ch; - int num_in = num; - - Shape shape_in(num_in, ch_in, h_in, w_in); - Shape shape_out = shape_in; - - LOG(INFO) << " input tensor size, num=" << num_in << ", channel=" << \ - ch_in << ", height=" << h_in << ", width=" << w_in; - - LOG(INFO) << "softmax axis= " << softmax_axis; - - std::vector vin; - std::vector vout; - - Tensor thin(shape_in); - float* din = static_cast(thin.mutable_data()); - for (int i = 0; i < thin.size(); ++i) { - din[i] = i % 4; - } - TensorHf4 tout; - TensorHf4 tout_basic(shape_out); - vin.push_back(&thin); - -#if COMPARE_RESULT - softmax_basic(thin, softmax_axis, tout_basic); - //print_tensor(tout_basic); -#endif - - SaberSoftmax softmax_lite; - SoftmaxParam param(softmax_axis); - softmax_lite.load_param(¶m); - - LOG(INFO) << "shape out 4d: " << shape_out[0] << ", " << shape_out[1] << ", " << \ - shape_out[2] << ", " << shape_out[3]; - - vout.push_back(&tout); - softmax_lite.compute_output_shape(vin, vout); - CHECK_EQ(shape_out == vout[0]->valid_shape(), true) << "compute shape error"; - - LOG(INFO) << "re-alloc tensor buffer"; - vout[0]->re_alloc(vout[0]->valid_shape()); - - LOG(INFO) << "softmax initialized to saber impl"; - softmax_lite.init(vin, vout, ctx1); - - SaberTimer t1; - - LOG(INFO) << "saber softmax compute"; - double to = 0; - double min_time = 100000; - for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); - softmax_lite.dispatch(vin, vout); - t1.end(); - double tdiff = t1.get_average_ms(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } - } - - printf("saber softmax total time : %.4f, avg time : 
%.4f\n", to, to / test_iter, min_time); - //print_tensor(*vout[0]); - -#if COMPARE_RESULT - double max_ratio = 0; - double max_diff = 0; - //TensorHf4 tdiff(tout_basic.valid_shape()); - //tensor_diff(tout_basic, tout_saber, tdiff); - //print_tensor_host(tdiff); - tensor_cmp_host(tout_basic, tout, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - CHECK_EQ(fabsf(max_ratio) < 1e-5f, true) << "compute result error"; -#endif -} - -int main(int argc, const char** argv){ - // initial logger - //logger::init(argv[0]); - Env::env_init(4); - - if (argc >= 2) { - cluster = atoi(argv[1]); - } - if (argc >= 3) { - threads = atoi(argv[2]); - } - if (argc >= 4) { - axis = atoi(argv[3]); - } - if (argc >= 5 && argc <= 8) { - num = atoi(argv[4]); - ch = atoi(argv[5]); - h = atoi(argv[6]); - w = atoi(argv[7]); - } - - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} - diff --git a/test/lite/test_tensor_lite.cpp b/test/lite/test_tensor_lite.cpp deleted file mode 100644 index 650f40caa..000000000 --- a/test/lite/test_tensor_lite.cpp +++ /dev/null @@ -1,228 +0,0 @@ -#include "test_lite.h" -#include "saber/lite/core/tensor_op_lite.h" -using namespace anakin::saber; -using namespace anakin::saber::lite; - -typedef Tensor Tensor4f; -//typedef Tensor Tensor2f; - -TEST(TestSaberLite, test_tensor_constructor) { - -//! test empty constructor - LOG(INFO) << "test default (empty) constructor"; - Tensor4f thost0; - -//! test tensor re_alloc function empty constructor - Shape sh0(2, 3, 10, 10); - LOG(INFO) << "|--test tensor re_alloc function on empty tensor"; - thost0.re_alloc(sh0); - LOG(INFO) << "|--tensor size of host: " << thost0.size(); - CHECK_EQ(thost0.size(), 600) << "error with tensor size"; - -//! test tensor re_alloc function on tensor with data - LOG(INFO) << "|--test tensor re_alloc function on tensor with data"; - Shape sh1(1, 3, 10, 10); - thost0.re_alloc(sh1); - LOG(INFO) << "|--tensor size of host: " << thost0.size(); - CHECK_EQ(thost0.size(), 300) << "error with tensor size"; - - -//! test tensor shape() function - LOG(INFO) << "|--test tensor shape() function"; - Shape sho = thost0.shape(); - LOG(INFO) << "|--shape of tensor: " << sho[0] << ", " << sho[1] << "," << sho[2] << "," << sho[3]; - LOG(INFO) << "|--test get tensor n, c, h, w function, num = " \ - << thost0.num() << ", channel = " << thost0.channel() << ", height = " \ - << thost0.height() << ", width = " << thost0.width(); - -//! test tensor mutable_data() function - LOG(INFO) << "|--test tensor mutable_data() function, write tensor data buffer with 1.f"; - fill_tensor_const(thost0, 1.f); - LOG(INFO) << "|--test tensor data() function, show the const data, 1.f"; - print_tensor(thost0); - -//! test tensor constructor with shape - LOG(INFO) << "test tensor constructor with shape"; - Tensor4f thost1(sh1); - -//! test tensor copy_from() function - LOG(INFO) << "test copy_from() function, input tensor could be any target"; - thost1.copy_from(thost0); - print_tensor(thost1); - -//! 
test tensor constructor with data, if target is different, create buffer, and copy the data - LOG(INFO) << "test tensor constructor with data, if target is different, create buffer, and copy the data"; - float* host_data_ptr; - void* tmp_ptr; - tmp_ptr = fast_malloc(sizeof(float) * sh1.count()); - host_data_ptr = static_cast(tmp_ptr); - for (int i = 0; i < sh1.count(); ++i) { - host_data_ptr[i] = i; - } - LOG(INFO) << "|--construct host tensor from host data ptr"; - Tensor4f thost3(host_data_ptr, sh1); - print_tensor(thost3); - -//! test tensor copy constructor - LOG(INFO) << "test tensor copy constructor"; - LOG(INFO) << "|--normal copy constructor"; - Tensor4f thost4(thost3); - - LOG(INFO) << "|--push back to vector"; - std::vector vthost; - vthost.push_back(thost0); - vthost.push_back(thost1); - vthost.push_back(thost3); - vthost.push_back(thost4); - print_tensor(vthost[3]); - -//! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied - LOG(INFO) << "test share_from function"; - Tensor4f thost5; - Shape sh2(1, 3, 5, 5); - thost5.set_shape(sh2); - thost5.share_from(thost3); - print_tensor(thost5); - -//! test share_from function, if targets are the same, buffer is shared, otherwise, buffer is copied - LOG(INFO) << "test share_sub_buffer function"; - Tensor4f thost6; - Shape offset(0, 0, 5, 5); - LOG(INFO) << "|--share sub buffer"; - //thost5.set_shape(sh2, thost3.shape(), offset); - thost6.share_sub_buffer(thost3, sh2, offset); - print_tensor(thost6); - //thost5.share_from(thost3); - - LOG(INFO) << "|--change data in shared tensor"; - Shape sh_real = thost6.shape(); - Shape sh_act = thost6.valid_shape(); - Shape offset_act = thost6.offset(); -// int start_w = offset_act[3]; -// int start_h = offset_act[2]; -// int start_c = offset_act[1]; -// int start_n = offset_act[0]; - int stride_h = sh_real.count(3); - int stride_c = sh_real.count(2); - int stride_n = sh_real.count(1); -//int stride_n = sh_real.count(0); - int w = thost6.width(); - int h = thost6.height(); - int c = thost6.channel(); - int n = thost6.num(); - float* ptr_host = thost6.mutable_data(); - for (int in = 0; in < n; ++in) { - float* ptr_batch = ptr_host + in * stride_n; - for (int ic = 0; ic < c; ++ic) { - float* ptr_channel = ptr_batch + ic * stride_c; - for (int ih = 0; ih < h; ++ih) { - float* ptr_row = ptr_channel + ih * stride_h; - for (int iw = 0; iw < w; ++iw) { - ptr_row[iw] = 1.f; - } - } - } - } - - LOG(INFO) << "|--show root tensor while data is changed by shared tensor"; - print_tensor(thost3); - print_tensor(thost6); - //print_tensor_valid(thost6); -} -#if 0 -TEST(TestSaberTensorARM, test_tensor_deepcopy) { - //! tensor constructor with alloc data, if target is different, create buffer, and copy the data - LOG(INFO) << "tensor constructor with data, if target is different, create buffer, and copy the data"; - - Shape sh0(2, 4, 8, 8); - Shape va_sh0(2, 4, 4, 4); - Shape off_sh0(0, 0, 2, 2); - Shape sh1(2, 4, 10, 4); - Shape va_sh1(va_sh0); - Shape off_sh1(0, 0, 4, 0); - Shape sh2(4, 64); - Shape va_sh2(2, 64); - Shape off_sh2(1, 0); - - LOG(INFO) << "|--construct host tensor from host data ptr"; - //! create thost0, thost1, thost01 are source tensor - Tensor4f thost0(sh0); - for (int i = 0; i < sh0.count(); ++i) { - thost0.mutable_data()[i] = i; - } - print_tensor_host(thost0); - //! create shared tensor, with valid shape and offset - Tensor4f thost01; - thost01.set_shape(va_sh0, sh0, off_sh0); - thost01.share_from(thost0); - //! 
create tensor with entire shape, valid shape and offset - Tensor4f thost1(va_sh0); - for (int i = 0; i < va_sh0.count(); ++i) { - thost1.mutable_data()[i] = i; - } - - //! create thost2, thost3, thost21 as dst tensor, same layout with src - Tensor4f thost2(sh1); - fill_tensor_host_const(thost2, 0.f); - Tensor4f thost21; - thost21.set_shape(va_sh1, sh1, off_sh1); - thost21.share_from(thost2); - Tensor4f thost3(va_sh1); - - //! create thost4, thost5, thost41 as dst tensor, different layout with src - Tensor2f thost4(sh2); - fill_tensor_host_const(thost4, 0.f); - Tensor2f thost41; - thost41.set_shape(va_sh2, sh2, off_sh2); - thost41.share_from(thost4); - Tensor2f thost5(va_sh2); - - //! test tensor deep copy, entire buffer copy - LOG(INFO) << "test tensor deep copy, entire buffer copy"; - thost3.copy_from(thost1); - print_tensor_host(thost3); - - //! test tensor deep copy, src with roi - LOG(INFO) << "test tensor deep copy, src with roi"; - thost3.copy_from(thost01); - print_tensor_host(thost3); - - //! test tensor deep copy, dst with roi - LOG(INFO) << "test tensor deep copy, dst with roi"; - thost21.copy_from(thost1); - print_tensor_host(thost21); - - //! test tensor deep copy, src and dst are with roi - LOG(INFO) << "test tensor deep copy, src and dst are with roi"; - thost21.copy_from(thost01); - print_tensor_host(thost21); - - //! test tensor deep copy, entire buffer copy - LOG(INFO) << "test tensor deep copy, entire buffer copy, different layout"; - thost5.copy_from(thost1); - print_tensor_host(thost5); - - //! test tensor deep copy, src with roi - LOG(INFO) << "test tensor deep copy, src with roi, different layout"; - thost5.copy_from(thost01); - print_tensor_host(thost5); - - //! test tensor deep copy, dst with roi - LOG(INFO) << "test tensor deep copy, dst with roi, different layout"; - thost41.copy_from(thost1); - print_tensor_host(thost41); - - //! test tensor deep copy, src and dst are with roi - LOG(INFO) << "test tensor deep copy, src and dst are with roi, different layout"; - thost41.copy_from(thost01); - print_tensor_host(thost41); -} -#endif - -int main(int argc, const char** argv){ - // initial logger - logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/saber/conv_func_helper.h b/test/saber/conv_func_helper.h index 714ef43e6..3fd4bb6cf 100644 --- a/test/saber/conv_func_helper.h +++ b/test/saber/conv_func_helper.h @@ -13,30 +13,194 @@ limitations under the License. 
*/ -#ifndef ANAKIN_CONV_FUNC_HELPER_H -#define ANAKIN_CONV_FUNC_HELPER_H - +#ifndef ANAKIN_TEST_SABER_CONV_FUNC_HELPER_H +#define ANAKIN_TEST_SABER_CONV_FUNC_HELPER_H +#include #include "saber/core/context.h" #include "saber/core/tensor.h" #include "saber/saber_funcs_param.h" #include "saber/funcs/conv.h" #include "saber/saber_types.h" -#include +#include "saber/funcs/saber_util.h" namespace anakin { namespace saber { template +void pool_basic_check_int8(Tensor &tensor_in,Tensor &tensor_out, + int kernel_w, int kernel_h, int stride_w, int stride_h, + int pad_w, int pad_h, PoolingType pooling_type, round_mode rm = nearest) { + CHECK(tensor_in.get_dtype()==AK_UINT8||tensor_in.get_dtype()==AK_INT8)<<"only support int8 in"; + CHECK(tensor_out.get_dtype()==AK_UINT8||tensor_out.get_dtype()==AK_INT8)<<"only support int8 out"; + auto src_ptr = static_cast(tensor_in.data()); + auto dst_ptr = static_cast(tensor_out.mutable_data()); + + int in_n = tensor_in.num(); + int in_c = tensor_in.channel(); + int in_h = tensor_in.height(); + int in_w = tensor_in.width(); + int size_in_n = in_c * in_h * in_w; + int size_in_c = 1; + + int out_h = tensor_out.height(); + int out_w = tensor_out.width(); + int size_out_n = in_c * out_h * out_w; + int size_out_c = 1; + + for (int ind_n = 0; ind_n < in_n; ++ind_n) { + for (int ind_h = 0; ind_h < out_h; ++ind_h) { + int sh = ind_h * stride_h; + int eh = sh + kernel_h; + if (pad_h > 0) { + sh = (sh-pad_h) < 0 ? 0 : sh-pad_h; + eh = (eh-pad_h) > in_h ? in_h : eh-pad_h; + } + for (int ind_w = 0; ind_w < out_w; ++ind_w) { + int sw = ind_w * stride_w; + int ew = sw + kernel_w; + if (pad_w > 0) { + sw = (sw - pad_w) < 0 ? 0 : sw-pad_w; + ew = (ew - pad_w) > in_w ? in_w:ew-pad_w; + } + + float result = 0; + for (int ind_c = 0; ind_c < in_c; ++ind_c) { + int dst_ind = ind_n * size_out_n + ind_h * out_w * in_c + ind_w * in_c + ind_c; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_ind = ind_n * size_in_n + kh * in_w * in_c + kw * in_c + ind_c; + if (kh == sh && kw == sw) { + result = src_ptr[src_ind]; + } else { + if (pooling_type == Pooling_max) { + result = result >= src_ptr[src_ind] ? 
result : src_ptr[src_ind]; + } + if (pooling_type == Pooling_average_include_padding) { + result += src_ptr[src_ind]; + } + if (pooling_type == Pooling_average_exclude_padding) { + result += src_ptr[src_ind]; + } + } + } + } + if (pooling_type == Pooling_average_include_padding) { + result /= kernel_h * kernel_w; + } + if (pooling_type == Pooling_average_exclude_padding) { + result /= (ew-sw) * (eh-sh); + } + + dst_ptr[dst_ind] = (unsigned char)nearbyintf(result); + } + } + } + } + +} + +template +void conv_basic_check_int8(Tensor &tensor_in,Tensor &tensor_out, + const char *weights, const int *bias, int group, + int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, + int pad_w, int pad_h, bool flag_bias, bool flag_relu, std::vector &scale, EltwiseParam *elt_param = NULL, + float beta = 0.f, round_mode rm = nearest) { + auto src_data_uint8 = reinterpret_cast(tensor_in.data()); + auto src_data_int8 = reinterpret_cast(tensor_in.data()); + auto dst_data_ref = reinterpret_cast(tensor_out.mutable_data()); + auto weights_data = weights; + bool with_bias = flag_bias; + auto bias_data = bias; + + int in_num = tensor_out.num(); + int out_channels = tensor_out.channel(); + int out_h = tensor_out.height(); + int out_w = tensor_out.width(); + + int in_channel = tensor_in.channel(); + int in_h = tensor_in.height(); + int in_w = tensor_in.width(); + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + + float sum_scale = 1.f; + if (elt_param && (elt_param->operation == Eltwise_sum)) { + sum_scale = elt_param->coeff[1]; + } + + if (tensor_in.get_layout() == Layout_NHWC && tensor_out.get_layout() == Layout_NHWC) { +#pragma omp parallel for num_threads(8) collapse(5) schedule(static) + for (int n = 0; n < in_num; ++n) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + int out_idx = n * out_h * out_w * group * out_c_group + + oh * out_w * group * out_c_group + ow * group * out_c_group + g * out_c_group + oc; + float bias_d = with_bias ? (float)(bias_data[g * out_c_group + oc]) : 0.f; + float computing_v = bias_d + dst_data_ref[out_idx] * beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dilation_w); + int ih = oh * stride_h - pad_h + kh * (dilation_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + + int iidx = n * in_h * in_w * in_channel + + ih * in_w * group * in_c_group + + iw * group * in_c_group + + g * in_c_group + + ic; + int widx = g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + + kh * kernel_w + + kw; + + if (tensor_in.get_dtype() == AK_INT8) { + computing_v += (float)src_data_int8[iidx] * weights_data[widx]; + } + else { + computing_v += (float)src_data_uint8[iidx] * weights_data[widx]; + } + } + } + } + computing_v = computing_v * scale[g * out_c_group + oc]; + + if (elt_param && (elt_param->operation == Eltwise_sum)) { + computing_v += dst_data_ref[out_idx] * sum_scale; + } + + if (flag_relu) { + computing_v = computing_v > 0.f ? 
computing_v : 0.f; + } + + switch (rm) { + case nearest: dst_data_ref[out_idx] = saturate((int32_t)nearbyintf(computing_v)); break; + case down: dst_data_ref[out_idx] = saturate((int32_t)floorf(computing_v)); break ; + } + // LOG(INFO) << "computing_v:" << computing_v << " scale[g*out_c_group + oc]" << scale[g*out_c_group + oc] << " out_idx:" << out_idx; + // LOG(INFO) << "out_idx:" << out_idx << " dst_data_ref[out_idx]:" << (int)dst_data_ref[out_idx]; + } + } + } + } + } + } +} + + + +template void conv_basic_check(Tensor &tensor_in,Tensor &tensor_out, - const float *weights, const float *bias, int group, + const in_dtype *weights, const out_dtype *bias, int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, - int pad_w, int pad_h, bool flag_bias, bool flag_relu, float beta = 0.f) { + int pad_w, int pad_h, bool flag_bias, bool flag_relu, float beta = 0.f, float alpha = 1.f) { - auto src_data = reinterpret_cast(tensor_in.data()); - auto dst_data_ref = reinterpret_cast(tensor_out.mutable_data()); - Tensor bk; - bk.re_alloc(tensor_out.valid_shape(), AK_FLOAT); - bk.copy_from(tensor_out); + auto src_data = reinterpret_cast(tensor_in.data()); + auto dst_data_ref = reinterpret_cast(tensor_out.mutable_data()); auto weights_data = weights; bool with_bias = flag_bias; auto bias_data = bias; @@ -60,7 +224,7 @@ void conv_basic_check(Tensor &tensor_in,Tensor &tensor_o int out_idx = n * group * out_c_group * out_h * out_w + g * out_c_group * out_h * out_w + oc * out_h * out_w + oh * out_w + ow; float bias_d = with_bias ? (float)(bias_data[g * out_c_group + oc]) : 0.f; - dst_data_ref[out_idx] = bias_d + dst_data_ref[out_idx] * beta; + dst_data_ref[out_idx] = dst_data_ref[out_idx] * beta; for (int ic = 0; ic < in_c_group; ++ic) { for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { @@ -81,11 +245,14 @@ void conv_basic_check(Tensor &tensor_in,Tensor &tensor_o + kw; dst_data_ref[out_idx] - += src_data[iidx] - * weights_data[widx]; + += (out_dtype)src_data[iidx] + * (out_dtype)weights_data[widx]; +// LOG(INFO) << "out_idx = " << out_idx << " iidx = " << iidx << " res = " << dst_data_ref[out_idx]; } } } + dst_data_ref[out_idx] *= alpha; + dst_data_ref[out_idx] += bias_d; if (flag_relu) { dst_data_ref[out_idx] = dst_data_ref[out_idx] > 0.f ? 
dst_data_ref[out_idx] : 0.f; } diff --git a/test/saber/test_direct_conv_int8.cpp b/test/saber/test_direct_conv_int8.cpp new file mode 100644 index 000000000..429fd8133 --- /dev/null +++ b/test/saber/test_direct_conv_int8.cpp @@ -0,0 +1,753 @@ +#include "anakin_config.h" +#include "core/context.h" +#include "test_saber_func.h" +#include "saber/core/tensor.h" +#include "saber/funcs/debug.h" +#include "saber/funcs/calibrate.h" +#include "tensor_op.h" +#include "saber_types.h" +#include "conv_func_helper.h" +#include +#ifdef USE_CUDA +#include "saber/funcs/impl/cuda/saber_conv_eltwise.h" +#include "saber/funcs/impl/cuda/saber_conv.h" +#include "saber/funcs/impl/cuda/saber_conv_direct.h" +#include "saber/funcs/impl/cuda/saber_conv_gemmlike.h" +#endif + +using namespace anakin::saber; +template +void transpose_filter_KCRS_2_CRSKC4(const Dtype *input, Dtype *temp, Dtype *output, \ + int K, int C, int R, int S) { + const int CRS = C * R * S; + for (int var_k = 0; var_k < K; var_k++) { + for (int var_crs = 0; var_crs < CRS; var_crs++) { + temp[var_crs * K + var_k] = input[var_k * CRS + var_crs]; + } + } + int read_in = 0; + int write_out = 0; + int out_loop = C / 4; + int inner_loop = K * R * S * 4; + for (int i = 0; i < out_loop; ++i) { + for (int j = 0; j < inner_loop; ++j) { + write_out = i * inner_loop + j; + read_in = ((i * 4) + (j % 4)) * (inner_loop / 4) + j / 4; + output[write_out] = temp[read_in]; + } + } +} + +template +void transpose_img_NCHW_2_NCHWC4(const Dtype* input, Dtype *output, + int N, int C, int H, int W) { + int read_in = 0; + int write_out = 0; + int out_loop = N * C / 4; + int inner_loop = H * W * 4; + for (int i = 0; i < out_loop; ++i) { + for (int j = 0; j < inner_loop; ++j) { + write_out = i * inner_loop + j; + read_in = ((i * 4) + (j % 4)) * (inner_loop / 4) + j / 4; + output[write_out] = input[read_in]; + } + } +} + +#ifdef USE_CUDA +TEST(TestSaberFunc, test_saber_conv_int8_results) { + + Env::env_init(); + Env::env_init(); + + bool with_relu = true; + float alpha = 1.0f; + int input_num = 1; + int in_channels = 128; + int out_channels = 256; + int height = 64; + int width = 64; + + int kernel_h = 3; + int kernel_w = 3; + int pad_h = 1; + int pad_w = 1; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + int group = 1; + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); + Shape output_s({input_num, out_channels, height, width}, Layout_NCHW); + // trans to input_num, in_channels/4, height, width, inner_channels(4) + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + // trans to in_channels/4, kernel_h, kernel_w, out_channels, inner_channels(4); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + + Tensor input_dev; + Tensor weights_dev; + Tensor bias_dev; + Tensor output_dev; + + Tensor input_host; + Tensor weights_host; + Tensor bias_host; + Tensor output_host; + Tensor check_output; + + input_dev.re_alloc(input_s, AK_INT8); + input_host.re_alloc(input_s, AK_INT8); + + weights_dev.re_alloc(weights_s, AK_INT8); + weights_host.re_alloc(weights_s, AK_INT8); + + output_dev.re_alloc(output_s, AK_FLOAT); + output_host.re_alloc(output_s, AK_FLOAT); + check_output.re_alloc(output_s, AK_FLOAT); + + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + + fill_tensor_rand(input_host, -10, 10); + fill_tensor_rand(weights_host, -10, 10); + fill_tensor_rand(bias_dev, -10, 10); + bias_host.copy_from(bias_dev); + + Context ctx(0, 0, 1); + int generate_arch = 
Env::cur_env()[ctx.get_device_id()]._info._generate_arch; + // only support 61 arch for now. + bool arch_check = (generate_arch == 61); + if (!arch_check) { + LOG(INFO) << "device not support int8 op!!"; + return; + } + auto stream = ctx.get_compute_stream(); + { + Tensor input_temp; + input_temp.re_alloc(input_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) input_host.data(), + (char *) input_temp.mutable_data(), + input_host.num(), + input_host.channel(), + input_host.height(), + input_host.width()); + input_dev.copy_from(input_temp); + } + bool use_1x1 = true; + use_1x1 = use_1x1 && (kernel_h == 1); + use_1x1 = use_1x1 && (kernel_w == 1); + use_1x1 = use_1x1 && (dilation_h == 1); + use_1x1 = use_1x1 && (dilation_w == 1); + use_1x1 = use_1x1 && (stride_h == 1); + use_1x1 = use_1x1 && (stride_w == 1); + use_1x1 = use_1x1 && (pad_h == 0); + use_1x1 = use_1x1 && (pad_w == 0); + use_1x1 = use_1x1 && (group == 1); + + if (!use_1x1) { + { + Tensor weight_temp; + Tensor weight_temp2; + weight_temp.re_alloc(weights_host.valid_shape(), AK_INT8); + weight_temp2.re_alloc(weights_host.valid_shape(), AK_INT8); + transpose_filter_KCRS_2_CRSKC4( + (const char *) weights_host.data(), + (char *) weight_temp.mutable_data(), + (char *) weight_temp2.mutable_data(), + weights_host.num(), + weights_host.channel(), + weights_host.height(), + weights_host.width()); + weights_dev.copy_from(weight_temp2); + } + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + param.activation_param.has_active = with_relu; + param.alpha = alpha; + SaberDirectConv conv_direct; + std::vector*> inputs; + std::vector*> outputs; + inputs.push_back(&input_dev); + outputs.push_back(&output_dev); + conv_direct.init(inputs, outputs, param, ctx); + conv_direct.dispatch(inputs, outputs, param); + + } else { + { + Tensor weight_temp; + weight_temp.re_alloc(weights_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) weights_host.data(), + (char *) weight_temp.mutable_data(), + weights_host.num(), + weights_host.channel(), + weights_host.height(), + weights_host.width()); + + weights_dev.copy_from(weight_temp); + } + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + param.activation_param.has_active = with_relu; + param.alpha = alpha; + SaberGemmLikeConv conv_gemm; + std::vector*> inputs; + std::vector*> outputs; + inputs.push_back(&input_dev); + outputs.push_back(&output_dev); + conv_gemm.init(inputs, outputs, param, ctx); + conv_gemm.dispatch(inputs, outputs, param); + } + cudaDeviceSynchronize(); + output_host.copy_from(output_dev); + cudaDeviceSynchronize(); + conv_basic_check(input_host, check_output, + (const char*)weights_host.data(), (const float*)bias_host.data(), group, + kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, + pad_w, pad_h, true, with_relu, 0.f, alpha); + + write_tensorfile(output_dev, "int8_output.txt"); + write_tensorfile(check_output, "fp32_output.txt"); + + double max_ratio = 0.0; + double max_diff = 0.0; + tensor_cmp_host((const float*)output_host.data(), (const float*)check_output.data(), + check_output.valid_size(), max_ratio, max_diff); + LOG(INFO) << "ratio = " << max_ratio << " max_diff = " << max_diff; +} + +TEST(TestSaberFunc, test_weights_calibrate) { + Tensor weights_host; + Tensor weights_temp; + + Shape weight_s({4, 4, 3, 3}, Layout_NCHW); + Shape weight_t_s({4, 4, 3, 3}, Layout_NCHW); + 
weights_host.re_alloc(weight_s, AK_FLOAT); + weights_temp.re_alloc(weight_t_s, AK_INT8); + Context ctx(0, 0, 1); + fill_tensor_rand(weights_host, -10, 10); + convert_weights_to_direct (weights_temp, weights_host, ctx); +// print_tensor_valid(weights_host); +// print_tensor_valid(weights_temp); +// write_tensorfile(weights_host, "int8_output.txt"); +// write_tensorfile(weights_temp, "fp32_output.txt"); +} +#if 0 +TEST(TestSaberFunc, test_saber_conv_eltwise_int8_results) { + + Env::env_init(); + Env::env_init(); + + bool with_relu = false; + float alpha = 1.f; + float beta = 1.f; + int input_num = 1; + int in_channels = 32; + int out_channels = 16; + int height = 24; + int width = 24; + + int kernel_h = 1; + int kernel_w = 1; + int pad_h = 0; + int pad_w = 0; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + int group = 1; + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); + Shape output_s({input_num, out_channels, height, width}, Layout_NCHW); + // trans to input_num, in_channels/4, height, width, inner_channels(4) + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + // trans to in_channels/4, kernel_h, kernel_w, out_channels, inner_channels(4); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + + Tensor input_dev; + Tensor weights_dev; + Tensor bias_dev; + Tensor output_dev; + + Tensor input_host; + Tensor weights_host; + Tensor bias_host; + Tensor output_host; + Tensor check_output; + + input_dev.re_alloc(input_s, AK_INT8); + input_host.re_alloc(input_s, AK_INT8); + + weights_dev.re_alloc(weights_s, AK_INT8); + weights_host.re_alloc(weights_s, AK_INT8); + + output_dev.re_alloc(output_s, AK_FLOAT); + output_host.re_alloc(output_s, AK_FLOAT); + check_output.re_alloc(output_s, AK_FLOAT); + + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + + fill_tensor_rand(input_host, -10, 10); + fill_tensor_rand(weights_host, -10, 10); + fill_tensor_rand(bias_dev, -10, 10); + fill_tensor_const(output_dev, 2); + output_host.copy_from(output_dev); + check_output.copy_from(output_dev); + bias_host.copy_from(bias_dev); + + Context ctx(0, 0, 1); + int generate_arch = Env::cur_env()[ctx.get_device_id()]._info._generate_arch; + // only support 61 arch for now. 
+ bool arch_check = (generate_arch == 61); + if (!arch_check) { + LOG(INFO) << "device not support int8 op!!"; + return; + } + auto stream = ctx.get_compute_stream(); + { + Tensor input_temp; + input_temp.re_alloc(input_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) input_host.data(), + (char *) input_temp.mutable_data(), + input_host.num(), + input_host.channel(), + input_host.height(), + input_host.width()); + input_dev.copy_from(input_temp); + } + bool use_1x1 = true; + use_1x1 = use_1x1 && (kernel_h == 1); + use_1x1 = use_1x1 && (kernel_w == 1); + use_1x1 = use_1x1 && (dilation_h == 1); + use_1x1 = use_1x1 && (dilation_w == 1); + use_1x1 = use_1x1 && (stride_h == 1); + use_1x1 = use_1x1 && (stride_w == 1); + use_1x1 = use_1x1 && (pad_h == 0); + use_1x1 = use_1x1 && (pad_w == 0); + use_1x1 = use_1x1 && (group == 1); + + { + Tensor weight_temp; + weight_temp.re_alloc(weights_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) weights_host.data(), + (char *) weight_temp.mutable_data(), + weights_host.num(), + weights_host.channel(), + weights_host.height(), + weights_host.width()); + + weights_dev.copy_from(weight_temp); + } + ConvParam conv_param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); +// conv_param.activation_param.has_active = with_relu; +// conv_param.activation_param.active=Active_relu; + conv_param.alpha = alpha; + conv_param.beta = beta; + EltwiseParam elt_param(Eltwise_sum); + ConvEltwiseParam param(conv_param, elt_param); + + SaberConvEltwise conv_eltwise; + std::vector*> inputs; + std::vector*> outputs; + inputs.push_back(&input_dev); + outputs.push_back(&output_dev); + conv_eltwise.init(inputs, outputs, param, ctx); + conv_eltwise.dispatch(inputs, outputs, param); + + cudaDeviceSynchronize(); + output_host.copy_from(output_dev); + cudaDeviceSynchronize(); + conv_basic_check(input_host, check_output, + (const char*)weights_host.data(), (const float*)bias_host.data(), group, + kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, + pad_w, pad_h, true, with_relu, 1.f, conv_param.alpha); + + write_tensorfile(output_dev, "int8_output.txt"); + write_tensorfile(check_output, "fp32_output.txt"); + + double max_ratio = 0.0; + double max_diff = 0.0; + tensor_cmp_host((const float*)output_host.data(), (const float*)check_output.data(), + check_output.valid_size(), max_ratio, max_diff); + LOG(INFO) << "ratio = " << max_ratio << " max_diff = " << max_diff; +} +#endif + +void test_saber_cudnn_speed(int input_num, + int in_channels, + int out_channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group) { + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); + Shape output_s({input_num, out_channels, height, width}, Layout_NCHW); + // trans to input_num, in_channels/4, height, width, inner_channels(4) + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + // trans to in_channels/4, kernel_h, kernel_w, out_channels, inner_channels(4); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + + Tensor input_dev; + Tensor weights_dev; + Tensor bias_dev; + Tensor output_dev; + + Tensor input_host; + Tensor weights_host; + Tensor bias_host; + Tensor output_host; + Tensor check_output; + + input_dev.re_alloc(input_s, AK_INT8); + input_host.re_alloc(input_s, AK_INT8); + + weights_dev.re_alloc(weights_s, AK_INT8); + 
weights_host.re_alloc(weights_s, AK_INT8); + + output_dev.re_alloc(output_s, AK_FLOAT); + output_host.re_alloc(output_s, AK_FLOAT); + check_output.re_alloc(output_s, AK_FLOAT); + + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + + fill_tensor_rand(input_host, -10, 10); + fill_tensor_rand(weights_host, -10, 10); + fill_tensor_rand(bias_dev, -10, 10); + bias_host.copy_from(bias_dev); + + Context ctx(0, 0, 1); + auto stream = ctx.get_compute_stream(); + { + Tensor input_temp; + input_temp.re_alloc(input_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) input_host.data(), + (char *) input_temp.mutable_data(), + input_host.num(), + input_host.channel(), + input_host.height(), + input_host.width()); + + input_dev.copy_from(input_temp); + } + bool use_1x1 = true; + use_1x1 = use_1x1 && (kernel_h == 1); + use_1x1 = use_1x1 && (kernel_w == 1); + use_1x1 = use_1x1 && (dilation_h == 1); + use_1x1 = use_1x1 && (dilation_w == 1); + use_1x1 = use_1x1 && (stride_h == 1); + use_1x1 = use_1x1 && (stride_w == 1); + use_1x1 = use_1x1 && (pad_h == 0); + use_1x1 = use_1x1 && (pad_w == 0); + use_1x1 = use_1x1 && (group == 1); + + int ts = 100; + SaberTimer timer; + { + { + Tensor weight_temp; + weight_temp.re_alloc(weights_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) weights_host.data(), + (char *) weight_temp.mutable_data(), + weights_host.num(), + weights_host.channel(), + weights_host.height(), + weights_host.width()); + + weights_dev.copy_from(weight_temp); + } + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + VenderConv2D conv_vender; + std::vector*> inputs; + std::vector*> outputs; + inputs.push_back(&input_dev); + outputs.push_back(&output_dev); + conv_vender.init(inputs, outputs, param, ctx); + conv_vender.dispatch(inputs, outputs, param); + + cudaDeviceSynchronize(); + for (int i = 0; i < ts; ++i) { + timer.start(ctx); + conv_vender.dispatch(inputs, outputs, param); + output_dev.record_event(ctx.get_compute_stream()); + output_dev.sync(); + timer.end(ctx); + } + printf("cudnn,%lf\n", timer.get_average_ms()); + } + cudaDeviceSynchronize(); +} + +void test_saber_direct_speed(int input_num, int in_channels, + int out_channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group) { + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); + Shape output_s({input_num, out_channels, height, width}, Layout_NCHW); + // trans to input_num, in_channels/4, height, width, inner_channels(4) + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + // trans to in_channels/4, kernel_h, kernel_w, out_channels, inner_channels(4); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + + Tensor input_dev; + Tensor weights_dev; + Tensor bias_dev; + Tensor output_dev; + + Tensor input_host; + Tensor weights_host; + Tensor bias_host; + Tensor output_host; + Tensor check_output; + + input_dev.re_alloc(input_s, AK_INT8); + input_host.re_alloc(input_s, AK_INT8); + + weights_dev.re_alloc(weights_s, AK_INT8); + weights_host.re_alloc(weights_s, AK_INT8); + + output_dev.re_alloc(output_s, AK_FLOAT); + output_host.re_alloc(output_s, AK_FLOAT); + check_output.re_alloc(output_s, AK_FLOAT); + + bias_dev.re_alloc(bias_s, AK_FLOAT); + 
bias_host.re_alloc(bias_s, AK_FLOAT); + + fill_tensor_rand(input_host, -10, 10); + fill_tensor_rand(weights_host, -10, 10); + fill_tensor_rand(bias_dev, -10, 10); + bias_host.copy_from(bias_dev); + + Context ctx(0, 0, 1); + auto stream = ctx.get_compute_stream(); + { + Tensor input_temp; + input_temp.re_alloc(input_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) input_host.data(), + (char *) input_temp.mutable_data(), + input_host.num(), + input_host.channel(), + input_host.height(), + input_host.width()); + + input_dev.copy_from(input_temp); + } + bool use_1x1 = true; + use_1x1 = use_1x1 && (kernel_h == 1); + use_1x1 = use_1x1 && (kernel_w == 1); + use_1x1 = use_1x1 && (dilation_h == 1); + use_1x1 = use_1x1 && (dilation_w == 1); + use_1x1 = use_1x1 && (stride_h == 1); + use_1x1 = use_1x1 && (stride_w == 1); + use_1x1 = use_1x1 && (pad_h == 0); + use_1x1 = use_1x1 && (pad_w == 0); + use_1x1 = use_1x1 && (group == 1); + int ts = 100; + SaberTimer timer; + if (!use_1x1) { + { + Tensor weight_temp; + Tensor weight_temp2; + weight_temp.re_alloc(weights_host.valid_shape(), AK_INT8); + weight_temp2.re_alloc(weights_host.valid_shape(), AK_INT8); + transpose_filter_KCRS_2_CRSKC4( + (const char *) weights_host.data(), + (char *) weight_temp.mutable_data(), + (char *) weight_temp2.mutable_data(), + weights_host.num(), + weights_host.channel(), + weights_host.height(), + weights_host.width()); + weights_dev.copy_from(weight_temp2); + } + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + SaberDirectConv conv_direct; + std::vector*> inputs; + std::vector*> outputs; + inputs.push_back(&input_dev); + outputs.push_back(&output_dev); + conv_direct.init(inputs, outputs, param, ctx); + conv_direct.dispatch(inputs, outputs, param); + + cudaDeviceSynchronize(); + for (int i = 0; i < ts; ++i) { + timer.start(ctx); + conv_direct.dispatch(inputs, outputs, param); + output_dev.record_event(ctx.get_compute_stream()); + output_dev.sync(); + timer.end(ctx); + } + printf("direct,%lf\n", timer.get_average_ms()); + + } else { + { + Tensor weight_temp; + weight_temp.re_alloc(weights_host.valid_shape(), AK_INT8); + transpose_img_NCHW_2_NCHWC4((const char *) weights_host.data(), + (char *) weight_temp.mutable_data(), + weights_host.num(), + weights_host.channel(), + weights_host.height(), + weights_host.width()); + + weights_dev.copy_from(weight_temp); + } + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + + SaberGemmLikeConv conv_gemm; + std::vector*> inputs; + std::vector*> outputs; + inputs.push_back(&input_dev); + outputs.push_back(&output_dev); + conv_gemm.init(inputs, outputs, param, ctx); + conv_gemm.dispatch(inputs, outputs, param); + + cudaDeviceSynchronize(); + for (int i = 0; i < ts; ++i) { + timer.start(ctx); + conv_gemm.dispatch(inputs, outputs, param); + output_dev.record_event(ctx.get_compute_stream()); + output_dev.sync(); + timer.end(ctx); + } + printf("gemm,%lf\n", timer.get_average_ms()); + } + cudaDeviceSynchronize(); + output_host.copy_from(output_dev); + cudaDeviceSynchronize(); +} +#if 1 +TEST(TestSaberFunc, test_saber_speed) { + Env::env_init(); + Env::env_init(); + + std::vector input_num_v{1}; + std::vector in_channels_v{512}; + std::vector out_channels_v{2048}; + std::vector height_v{7}; + std::vector width_v{7}; + std::vector kernel_h_v{1}; + std::vector 
kernel_w_v{1}; + std::vector pad_h_v{0}; + std::vector pad_w_v{0}; + std::vector stride_h_v{1}; + std::vector stride_w_v{1}; + std::vector dilation_h_v{1}; + std::vector dilation_w_v{1}; + std::vector group_v{1}; + printf("input_num,in_channels,out_channels," + "height,width,kernel_h,kernel_w," + "pad_h,pad_w," + "stride_h,stride_w," + "dilation_h,dilation_w," + "group,type,latency,\n"); + + for (auto input_num : input_num_v) + for (auto in_channels : in_channels_v) + for (auto out_channels : out_channels_v) + for (auto height : height_v) + for (auto width : width_v) + for (auto kernel_h: kernel_h_v) + for (auto kernel_w: kernel_w_v) + for (auto pad_h: pad_h_v) + for (auto pad_w: pad_w_v) + for (auto stride_h: stride_h_v) + for (auto stride_w: stride_w_v) + for (auto dilation_h: dilation_h_v) + for (auto dilation_w: dilation_w_v) + for (auto group: group_v) { + printf("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,", + input_num, in_channels, out_channels, + height, width, + kernel_h, kernel_w, + pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, group); + + test_saber_direct_speed(input_num, + in_channels, + out_channels, + height, + width, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + group); + + printf("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,", + input_num, in_channels, out_channels, + height, width, + kernel_h, kernel_w, + pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, group); + + test_saber_cudnn_speed(input_num, + in_channels, + out_channels, + height, + width, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + group); + } +} +#endif +#endif + +int main(int argc, char* argv[]) { + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} \ No newline at end of file diff --git a/test/saber/test_saber_activation.cpp b/test/saber/test_saber_activation.cpp index 83f06a5cb..4061d8c8c 100644 --- a/test/saber/test_saber_activation.cpp +++ b/test/saber/test_saber_activation.cpp @@ -44,6 +44,15 @@ void activation_basic(const std::vector*>& inputs, } break; + + // swish: x/(1 + exp(-(b * x))) + case Active_swish: + for (size_t i = 0; i < count; i++) { + const dtype beta = param.coef; + dout[i] = din[i] / (1.0f + exp(-(din[i] * beta))); + } + + break; // tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) case Active_tanh: @@ -80,6 +89,14 @@ void activation_basic(const std::vector*>& inputs, break; + //gelu: y = x * 0.5 * (erf(x/sqrt(2)) + 1) + case Active_gelu: + for (size_t i = 0; i < count; i++) { + dtype x = din[i]; + dtype coeff = 0.5 * (erf(x/sqrt(2)) + 1); + dout[i] = x * coeff; + } + break; //prelu: x > 0 ? 
x : slope[c] * x case Active_prelu: @@ -134,7 +151,13 @@ void test_model() { //test example for (auto shape : {input_shape, input_shape2}) { - for (auto act : {1, 2, 3, 4, 5, 9, 10, active}) { +#ifdef USE_ARM_PLACE + for (auto act : {Active_sigmoid,Active_relu, Active_tanh, Active_clipped_relu, Active_prelu}) { +#else + for (auto act : {Active_sigmoid, Active_relu, Active_tanh, Active_clipped_relu, Active_prelu, Active_elu, Active_stanh, + Active_gelu, Active_swish}) { +#endif + LOG(INFO) << "================ active: " << act; for (auto neg_slope : {-1.0, 0.5}) { @@ -149,7 +172,7 @@ void test_model() { PreluParam prelu(shared, &slope_tensor); ActivationParam param(act, neg_slope, coef, prelu, has); testbase.set_param(param);//set param - testbase.set_input_shape(shape); + testbase.set_input_shape(shape, SPECIAL); testbase.run_test(activation_basic);//run test // LOG(INFO) << "NV run end"; } @@ -164,7 +187,7 @@ void test_model() { ActivationParam param(act, neg_slope, coef, prelu, has); //LOG(INFO) << "neg_slope: " << neg_slope << ", coef: " << coef << ", has: " << has; testbase.set_param(param);//set param - testbase.set_input_shape(shape); + testbase.set_input_shape(shape, SPECIAL); testbase.run_test(activation_basic);//run test // LOG(INFO) << "NV run end"; } @@ -184,6 +207,7 @@ TEST(TestSaberFunc, test_func_activation) { test_model(); #endif #ifdef USE_ARM_PLACE + Env::env_init(); test_model(); #endif #ifdef AMD_GPU @@ -198,11 +222,10 @@ TEST(TestSaberFunc, test_func_activation) { int main(int argc, const char** argv) { // initial logger - //logger::init(argv[0]); + logger::init(argv[0]); if (argc >= 2) { active = atoi(argv[1]); } - if (argc >= 3) { if (argc < 6) { LOG(ERROR) << "usage: ./" << argv[0] << "axis " << \ @@ -215,7 +238,6 @@ int main(int argc, const char** argv) { h_in = atoi(argv[4]); w_in = atoi(argv[5]); } - InitTest(); RUN_ALL_TESTS(argv[0]); return 0; diff --git a/test/saber/test_saber_affine_channel.cpp b/test/saber/test_saber_affine_channel.cpp index 43fd9d183..1c5bdc00c 100644 --- a/test/saber/test_saber_affine_channel.cpp +++ b/test/saber/test_saber_affine_channel.cpp @@ -14,13 +14,19 @@ void affine_channel_cpu_base(const std::vector* >& inputs, std::vector* >& outputs, AffineChannelParam& param) { const dtype* src = (const dtype*)inputs[0]->data(); - const dtype* scale = (const dtype*)inputs[1]->data(); - const dtype* bias = (const dtype*)inputs[2]->data(); + Tensor weight_tensor(param.weight()->valid_shape()); + Tensor bias_tensor(param.bias()->valid_shape()); + weight_tensor.copy_from(*param.weight()); + bias_tensor.copy_from(*param.bias()); + AffineChannelParam param_h(&weight_tensor, &bias_tensor); + + const dtype* scale = (const dtype*)param_h.weight()->data(); + const dtype* bias = (const dtype*)param_h.bias()->data(); dtype* dst = (dtype*)outputs[0]->mutable_data(); int channel_idx = inputs[0]->channel_index(); int channel = inputs[0]->channel(); - CHECK_EQ(inputs[1]->valid_size(), channel) << "affine channel input scale dims are not valid"; - CHECK_EQ(inputs[2]->valid_size(), channel) << "affine channel input bias dims are not valid"; + CHECK_EQ(param.weight()->valid_size(), channel) << "affine channel input scale dims are not valid"; + CHECK_EQ(param.bias()->valid_size(), channel) << "affine channel input bias dims are not valid"; int outer_num = inputs[0]->count_valid(0, channel_idx); int inner_num = inputs[0]->count_valid(channel_idx+1, inputs[0]->dims()); int id = 0; @@ -28,56 +34,56 @@ void affine_channel_cpu_base(const std::vector* >& inputs, for (int 
j = 0; j < channel; j++) { for (int k = 0; k < inner_num; k++) { dst[id] = src[id] * scale[j] + bias[j]; + //LOG(INFO) << "id" << id; + //LOG(INFO) << "j" << j; + //LOG(INFO) << "outer_num" << outer_num; + //LOG(INFO) << "inner_num" << inner_num; id++; } } } } - -TEST(TestSaberFunc, test_op_affine_channel) { - -#ifdef USE_CUDA - TestSaberBase testbase(3, 1); +template +void test_affine_channel() { + TestSaberBase testbase(1, 1); for (int w_in : {8, 8, 16}) { for (int h_in : {2, 8, 32}) { for (int ch_in : {2, 3, 8, 64}) { for (int num_in : {1, 21, 32}) { +// for (int w_in : {8}) { +// for (int h_in : {2}) { +// for (int ch_in : {2}) { +// for (int num_in : {2}) { Shape shape({num_in, ch_in, h_in, w_in}); - Shape scale_shape({1, ch_in, 1, 1}); - Shape bias_shape({1, ch_in, 1, 1}); - std::vector shape_vec = {shape, scale_shape, bias_shape}; - AffineChannelParam param; + Shape scale_shape({1, ch_in, 1, 1}, Layout_NCHW); + Shape bias_shape({1, ch_in, 1, 1}, Layout_NCHW); + Tensor scale(scale_shape, AK_FLOAT); + Tensor bias(bias_shape, AK_FLOAT); + std::vector shape_vec = {shape}; + fill_tensor_rand(scale, -1.0f, 1.0f); + fill_tensor_rand(bias, -1.0f, 1.0f); + AffineChannelParam param(&scale, &bias); testbase.set_param(param); testbase.set_rand_limit(-5.0, 5.0); testbase.add_inputs_shape(shape_vec); - testbase.run_test(affine_channel_cpu_base, 2.1e-5f); + testbase.run_test(affine_channel_cpu_base, 2.1e-5f); } } } } +} + +TEST(TestSaberFunc, test_op_affine_channel) { + +#ifdef USE_CUDA + Env::env_init(); + test_affine_channel(); #endif #ifdef USE_X86_PLACE - TestSaberBase testbase_x86(3, 1); - - for (int w_in : {8, 8, 16}) { - for (int h_in : {2, 8, 32}) { - for (int ch_in : {2, 3, 8, 64}) { - for (int num_in : {1, 21, 32}) { - Shape shape({num_in, ch_in, h_in, w_in}); - Shape scale_shape({1, ch_in, 1, 1}); - Shape bias_shape({1, ch_in, 1, 1}); - std::vector shape_vec = {shape, scale_shape, bias_shape}; - AffineChannelParam param_x86; - testbase_x86.set_param(param_x86); - testbase_x86.set_rand_limit(-5.0, 5.0); - testbase_x86.add_inputs_shape(shape_vec); - testbase_x86.run_test(affine_channel_cpu_base); - } - } - } - } +// Env::env_init(); +// test_affine_channel(); #endif } diff --git a/test/saber/test_saber_aligned_mat_mul.cpp b/test/saber/test_saber_aligned_mat_mul.cpp new file mode 100644 index 000000000..9352897f9 --- /dev/null +++ b/test/saber/test_saber_aligned_mat_mul.cpp @@ -0,0 +1,150 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/aligned_mat_mul.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include +using namespace anakin::saber; + +template +void gemm(const dtype* data_A, const dtype* data_B, int M, int N, int K, + bool trans_A, bool trans_B, dtype alpha, dtype beta, dtype* data_C) { + if (trans_A && trans_B) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + dtype result = (dtype) 0; + for (int k = 0; k < K; k++) { + result += data_A[k * M + m] * data_B[n * K + k]; + } + data_C[m * N + n] = alpha * result + beta * data_C[m * N + n]; + } + } + } else if (!trans_A && trans_B) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + dtype result = (dtype) 0; + for (int k = 0; k < K; k++) { + result += data_A[m * K + k] * data_B[n * K + k]; + } + data_C[m * N + n] = alpha * result + beta * data_C[m * N + n]; + } + } + } +} + +template +void aligned_mat_mul_basic(const std::vector*>& inputs, + std::vector*>& outputs, + AlignedMatMulParam& param) { + 
float alpha = param.scale; + float beta = 0.f; + bool trans_A = param.is_transpose_X; + bool trans_B = param.is_transpose_Y; + const dtype* src0 = (dtype*)inputs[0]->data(); + const dtype* src1 = (dtype*)inputs[1]->data(); + dtype* dst = (dtype*)outputs[0]->mutable_data(); + auto seq_offset_0 = inputs[0]->get_seq_offset()[0]; + auto seq_offset_1 = inputs[1]->get_seq_offset()[0]; + int inner_A = inputs[0]->count_valid(1, inputs[0]->dims()); + int inner_B = inputs[1]->count_valid(1, inputs[1]->dims()); + int batch_A = seq_offset_0[1]; + int batch_B = seq_offset_1[1]; + int M = param.is_transpose_X ? inner_A : batch_A; + int N = param.is_transpose_Y ? batch_B: inner_B; + int K_A = param.is_transpose_X ? batch_A : inner_A; + int K_B = param.is_transpose_Y ? inner_B : batch_B; + CHECK_EQ(K_A, K_B) << "mat mul two inputs K is not equal"; + int K = K_A; + int seq_num = seq_offset_0.size() - 1; + for (int i = 0; i < seq_num; i++) { + gemm(src0 + i * batch_A * inner_A, src1 + i * batch_B * inner_B, M, N, K, + trans_A, trans_B, alpha, beta, dst + i * M * N); + } +} + +void generate_equal_step_offset(int seq_num, int max_seq_len, std::vector& offset) { + offset.clear(); + offset.push_back(0); + for (int i = 0; i < seq_num; i++){ + offset.push_back((i+1)* max_seq_len); + } +} + + + +template +void test_model() { + //test example + TestSaberBase testbase(2, 1); + float scale = 0.8; + for (auto seq_num : {1}) { + for (auto left_seq_len: {2}) { + for (auto right_seq_len: {3}) { + for (auto trans_a : {false}) { + for (auto trans_b: {true}) { + for (auto emb_size: {5}) { + std::vector*> inputs; + std::vector seq_offset_0; + std::vector seq_offset_1; + generate_equal_step_offset(seq_num, left_seq_len, seq_offset_0); + generate_equal_step_offset(seq_num, right_seq_len, seq_offset_1); + int word_num_0 = seq_offset_0.back(); + int word_num_1 = seq_offset_1.back(); + Tensor* input_0 = new Tensor(Shape({word_num_0, emb_size, 1, 1}), AK_FLOAT); + Tensor* input_1 = new Tensor(Shape({word_num_1, emb_size, 1, 1}), AK_FLOAT); + fill_tensor_rand(*input_0, -1.f, 1.f); + fill_tensor_rand(*input_1, -1.f, 1.f); + std::vector> vseq_offset_0 = {seq_offset_0}; + input_0->set_seq_offset(vseq_offset_0); + std::vector> vseq_offset_1 = {seq_offset_1}; + input_1->set_seq_offset(vseq_offset_1); + inputs.push_back(input_0); + inputs.push_back(input_1); + testbase.add_custom_input(inputs); + AlignedMatMulParam param(trans_a, trans_b, scale); + testbase.set_param(param); + testbase.run_test(aligned_mat_mul_basic, 0.00001, true, true); + for (auto input: inputs) { + delete input; + } + } + } + } + } + } + } +} + +TEST(TestSaberFunc, test_func_aligned_mat_mul) { + +#ifdef USE_CUDA + //Init the test_base + test_model(); +#endif +#ifdef USE_X86_PLACE + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_anchor_generator.cpp b/test/saber/test_saber_anchor_generator.cpp new file mode 100644 index 000000000..2c97077b8 --- /dev/null +++ b/test/saber/test_saber_anchor_generator.cpp @@ -0,0 +1,107 @@ +#include "saber/core/context.h" +#include "saber/funcs/anchor_generator.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include 
"test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; +template +void anchor_generator_cpu_base(const std::vector* >& inputs, + std::vector* >& outputs, + AnchorGeneratorParam& param) { + const dtype* src = (const dtype*)inputs[0]->data(); + dtype* dst = (dtype*)outputs[0]->mutable_data(); + dtype* var = (dtype*)outputs[1]->mutable_data(); + auto anchor_sizes = param.anchor_sizes; + auto aspect_ratios = param.aspect_ratios; + auto stride = param.stride; + auto variances = param.variances; + auto offset = param.offset; + int height = inputs[0]->height(); + int width = inputs[0]->width(); + int stride_w = stride[0]; + int stride_h = stride[1]; + auto anchor_tmp = dst; + auto var_tmp = var; + for (int h_idx = 0; h_idx < height; h_idx++) { + for (int w_idx = 0; w_idx < width; w_idx++) { + dtype x_ctr = (w_idx * stride_w) + offset * (stride_w - 1); + dtype y_ctr = (h_idx * stride_h) + offset * (stride_h - 1); + for (size_t r = 0; r < aspect_ratios.size(); r++) { + auto ar = aspect_ratios[r]; + for (size_t s = 0; s < anchor_sizes.size(); s++) { + auto anchor_size = anchor_sizes[s]; + dtype area = stride_w * stride_h; + dtype area_ratios = area / ar; + dtype base_w = round(sqrt(area_ratios)); + dtype base_h = round(base_w * ar); + dtype scale_w = anchor_size / stride_w; + dtype scale_h = anchor_size / stride_h; + dtype half_width = 0.5 * (scale_w * base_w - 1); + dtype half_height = 0.5 * (scale_h * base_h - 1); + anchor_tmp[0] = x_ctr - half_width; + anchor_tmp[1] = y_ctr - half_height; + anchor_tmp[2] = x_ctr + half_width; + anchor_tmp[3] = y_ctr + half_height; + var_tmp[0] = variances[0]; + var_tmp[1] = variances[1]; + var_tmp[2] = variances[2]; + var_tmp[3] = variances[3]; + anchor_tmp += 4; + var_tmp += 4; + } + } + } + } + +} + +template +void test_anchor_generator() { + std::vector anchor_sizes = {16, 32, 64, 128}; + std::vector aspect_ratios = {0.5, 1, 2}; + std::vector stride = {4, 4}; + std::vector variances = {0.1, 0.2, 0.3, 0.4}; + auto offset = 0.5; + TestSaberBase testbase(1, 2); + for (int w_in : {16, 32}) { + for (int h_in : {16, 32}) { + for (int ch_in : {1, 5, 7}) { + for (int num_in : {1, 2, 5}) { + Shape shape({num_in, ch_in, h_in, w_in}); + AnchorGeneratorParam param(anchor_sizes, + aspect_ratios, + variances, + stride, + offset); + testbase.set_param(param); + testbase.set_rand_limit(-5.0, 5.0); + testbase.set_input_shape(shape); + testbase.run_test(anchor_generator_cpu_base, 2.1e-5f, true, false); + } + } + } + } +} + +TEST(TestSaberFunc, test_op_anchor_generator) { +#ifdef USE_CUDA +test_anchor_generator(); +#endif +#ifdef USE_X86_PLACE +test_anchor_generator(); +#endif + +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_argmax.cpp b/test/saber/test_saber_argmax.cpp index 497066302..fc8ae5750 100644 --- a/test/saber/test_saber_argmax.cpp +++ b/test/saber/test_saber_argmax.cpp @@ -57,7 +57,7 @@ void argmax_nv_basic(const std::vector*>& tensor_in,std::ve int size = shape[ax]; if(size < top){ LOG(INFO) << "input data size less than topk"; - return; + return; } for (int n = 0; n < num * out_stride; n++){ for(int k = 0; k < stride; k ++){ @@ -79,10 +79,10 @@ void argmax_nv_basic(const std::vector*>& tensor_in,std::ve } } } - }else{//all + }else{//all if(in_channel < top){ LOG(INFO) << "input data size less than topk"; - return; + return; } for (int n = 0; n < num; n++){ const dtype* din_ch = din + 
n * in_channel; @@ -116,7 +116,7 @@ void argmax_nv_basic(const std::vector*>& tensor_in,std::ve } template void test_model(){ - + int num = num_in; int channel = ch_in; int height = h_in; @@ -125,8 +125,8 @@ void test_model(){ int topk = top_k; bool has = has_axis; int ax = axis; - - TestSaberBase testbase; + + TestSaberBase testbase; Shape input_shape({num, channel, height, width}, Layout_NCHW); Shape input_shape2({1, 32, 17, 32}, Layout_NCHW); // typename NV TargetD; @@ -143,7 +143,7 @@ void test_model(){ testbase.set_param(param);//set param testbase.set_input_shape(shape);//add some input shape testbase.run_test(argmax_nv_basic);//run test - + } } @@ -160,6 +160,10 @@ TEST(TestSaberFunc, test_func_argmax) { //Env::env_init(); test_model(); #endif +#ifdef USE_ARM_PLACE + //Env::env_init(); + test_model(); +#endif } int main(int argc, const char** argv) { diff --git a/test/saber/test_saber_arithmetic.cpp b/test/saber/test_saber_arithmetic.cpp new file mode 100644 index 000000000..f5ec68198 --- /dev/null +++ b/test/saber/test_saber_arithmetic.cpp @@ -0,0 +1,186 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/arithmetic.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; +int active = 1; +int num_in = 1; +int ch_in = 2; +int h_in = 3; +int w_in = 5; +template +void arithmetic_basic(const std::vector*>& inputs, + std::vector*>& outputs, ArithmeticParam& param) { + const dtype *input_data_0 = (const dtype*)inputs[0]->data(); + const dtype *input_data_1 = (const dtype*)inputs[1]->data(); + dtype *output_data = (dtype*)outputs[0]->mutable_data(); + auto seq_offset_0 = inputs[0]->get_seq_offset()[0]; + auto seq_offset_1 = inputs[1]->get_seq_offset()[0]; + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + int inner_size = inputs[0]->count_valid(1, inputs[0]->dims()); + + + // out[j] = input_0[j] + input_1[j] if j < count_0 && j < count_1; + // out[j] = input_0[j] if j < count_0 && j >= count_1; + if (param.op_type == SUM) { + size_t len = inputs[0]->valid_size(); + for (int i = 0; i < seq_num; i++) { + int len_0 = (seq_offset_0[i+1] - seq_offset_0[i]) * inner_size; + int len_1 = (seq_offset_1[i+1] - seq_offset_1[i]) * inner_size; + auto input_0 = input_data_0 + seq_offset_0[i] * inner_size; + auto input_1 = input_data_1 + seq_offset_1[i] * inner_size; + auto out = output_data + seq_offset_0[i] * inner_size; + if (len_0 > len_1) { + for (int j = 0; j < len_1; j++) { + out[j] = input_0[j] + input_1[j]; + } + for (int j = len_1; j < len_0; j++) { + out[j] = input_0[j]; + } + } else { + for (int j = 0; j < len_0; j++) { + out[j] = input_0[j] + input_1[j]; + } + } + + } + } + + // out[j] = input_0[j] - input_1[j] if j < count_0 && j < count_1; + // out[j] = input_0[j] if j < count_0 && j >= count_1; + if (param.op_type == SUB) { + size_t len = inputs[0]->valid_size(); + for (int i = 0; i < seq_num; i++) { + int len_0 = (seq_offset_0[i+1] - seq_offset_0[i]) * inner_size; + int len_1 = (seq_offset_1[i+1] - seq_offset_1[i]) * inner_size; + auto input_0 = input_data_0 + seq_offset_0[i] * inner_size; + auto input_1 = input_data_1 + seq_offset_1[i] * inner_size; + auto out = output_data + seq_offset_0[i] * inner_size; + if (len_0 > len_1) { + for (int j = 0; j < len_1; j++) { + out[j] = input_0[j] - input_1[j]; + } + for (int j = len_1; j < len_0; j++) { + out[j] = input_0[j]; + } + } else { + for (int j = 0; j < len_0; j++) { + out[j] = input_0[j] - 
input_1[j]; + } + } + } + } + // out[j] = input_0[j] * input_1[j] if j < count_0 && j < count_1; + // out[j] = input_0[j] if j < count_0 && j >= count_1; + if (param.op_type == MUL) { + size_t len = inputs[0]->valid_size(); + for (int i = 0; i < seq_num; i++) { + int len_0 = (seq_offset_0[i+1] - seq_offset_0[i]) * inner_size; + int len_1 = (seq_offset_1[i+1] - seq_offset_1[i]) * inner_size; + auto input_0 = input_data_0 + seq_offset_0[i] * inner_size; + auto input_1 = input_data_1 + seq_offset_1[i] * inner_size; + auto out = output_data + seq_offset_0[i] * inner_size; + if (len_0 > len_1) { + for (int j = 0; j < len_1; j++) { + out[j] = input_0[j] * input_1[j]; + } + for (int j = len_1; j < len_0; j++) { + out[j] = input_0[j]; + } + } else { + for (int j = 0; j < len_0; j++) { + out[j] = input_0[j] * input_1[j]; + } + } + } + } + + outputs[0]->set_seq_offset(inputs[0]->get_seq_offset()); +} + +std::vector generate_sequence_offset(int seq_num, int max_seq_len) { + std::vector offset; + int cumsum = 0; + offset.push_back(cumsum); + for (int i = 0; i < seq_num; i++){ + int cur_len = rand() % max_seq_len + 1; + cumsum += cur_len; + offset.push_back(cumsum); + } + return offset; +} + + + +template +void test_model() { + TestSaberBase testbase(2, 1); + //test example + for (auto seq_num : {1, 2, 8}) { + for (auto max_seq_len: {10, 16, 30}) { + for (auto emb_size: {32, 128, 61}) { + for (auto op_type : {SUM, SUB, MUL}) { + std::vector seq_offset_0 = generate_sequence_offset(seq_num, max_seq_len); + std::vector seq_offset_1 = generate_sequence_offset(seq_num, max_seq_len); + int word_num_0 = seq_offset_0.back(); + int word_num_1 = seq_offset_1.back(); + Tensor input_0; + Tensor input_1; + input_0.re_alloc(Shape({word_num_0, emb_size, 1, 1}), AK_FLOAT); + input_1.re_alloc(Shape({word_num_1, emb_size, 1, 1}), AK_FLOAT); + fill_tensor_rand(input_0, -1.f, 1.f); + fill_tensor_rand(input_1, -1.f, 1.f); + + std::vector> vseq_offset_0 = {seq_offset_0}; + std::vector> vseq_offset_1 = {seq_offset_1}; + input_0.set_seq_offset(vseq_offset_0); + input_1.set_seq_offset(vseq_offset_1); + std::vector*> inputs; + inputs.push_back(&input_0); + inputs.push_back(&input_1); + testbase.add_custom_input(inputs); + ArithmeticParam param(op_type); + testbase.set_param(param); + testbase.run_test(arithmetic_basic, 0.00001, true, true); + } + } + } + } +} + +TEST(TestSaberFunc, test_func_arithmetic) { + +#ifdef USE_CUDA + //Init the test_base + test_model(); +#endif +#ifdef USE_X86_PLACE + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_attention_padding_mask.cpp b/test/saber/test_saber_attention_padding_mask.cpp new file mode 100644 index 000000000..245ab39e3 --- /dev/null +++ b/test/saber/test_saber_attention_padding_mask.cpp @@ -0,0 +1,148 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/attention_padding_mask.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; +template +void attention_padding_mask_basic(const std::vector*>& inputs, + std::vector*>& outputs, + AttentionPaddingMaskParam& param) { + + 
auto src_offset = inputs[1]->get_seq_offset()[0];
+    auto attn_offset = inputs[0]->get_seq_offset()[0];
+    int src_len = inputs[1]->count_valid(1, inputs[1]->dims());
+    int attn_seq_num = attn_offset.size() - 1;
+    int src_seq_num = src_offset.size() - 1;
+    int attn_seq_len = attn_offset[1];
+    int src_seq_len = src_offset[1];
+    CHECK_EQ(attn_seq_num % src_seq_num, 0) << "Mismatch batch size";
+
+    size_t count = inputs[0]->valid_size();
+    dtype *attn_data = (dtype*)inputs[0]->mutable_data();
+    dtype *output_data = (dtype*)outputs[0]->mutable_data();
+    memcpy(output_data, attn_data, count * sizeof(dtype));
+    for (int i = 0; i < attn_seq_num; ++i) {
+        for (int j = 0; j < attn_seq_len; ++j) {
+            auto tmp_output_data = output_data + src_seq_len * (attn_seq_len * i + j);
+            int src_seq_idx = i % src_seq_num;
+            int cur_len = src_offset[src_seq_idx+1]-src_offset[src_seq_idx];
+            for (int k = cur_len; k < src_seq_len; k++) {
+                tmp_output_data[k] = param.mask;
+            }
+        }
+    }
+    //print_tensor(*inputs[0]);
+    //print_tensor(*outputs[0]);
+}
+
+void generate_equal_step_offset(int seq_num, int max_seq_len, std::vector& offset) {
+    offset.clear();
+    offset.push_back(0);
+    for (int i = 0; i < seq_num; i++){
+        offset.push_back((i+1)* max_seq_len);
+    }
+}
+void generate_sequence_offset(int seq_num, int max_seq_len,
+        std::vector& offset) {
+    offset.clear();
+    int cumsum = 0;
+    offset.push_back(cumsum);
+    for (int i = 0; i < seq_num; i++){
+        int cur_len = rand() % max_seq_len + 1;
+        cumsum += cur_len;
+        offset.push_back(cumsum);
+        //printf("offset:%d, %d\n", i, cumsum);
+    }
+}
+
+int get_max_len(std::vector& offset) {
+    int max_len = 0;
+    for (int i = 0; i < offset.size() - 1; i++) {
+        int cur_len = offset[i+1] - offset[i];
+        max_len = max_len < cur_len ? cur_len : max_len;
+    }
+    return max_len;
+}
+
+
+
+template
+void test_model() {
+    //test example
+    TestSaberBase testbase(2, 1);
+    float scale = 0.8;
+    for (auto seq_num : {1, 3}) {
+        for (auto left_seq_len: {2}) {
+            for (auto right_seq_len: {3}) {
+                for (auto trans_a : {false}) {
+                    for (auto trans_b: {true}) {
+                        for (auto emb_size: {5}) {
+                            std::vector*> inputs;
+                            std::vector seq_offset_0;
+                            std::vector seq_offset_1;
+                            generate_sequence_offset(seq_num, left_seq_len, seq_offset_1);
+                            int max_len = get_max_len(seq_offset_1);
+                            generate_equal_step_offset(seq_num, right_seq_len, seq_offset_0);
+                            int word_num_0 = seq_offset_0.back();
+                            int word_num_1 = seq_offset_1.back();
+                            Tensor* input_0 = new Tensor(Shape({word_num_0, max_len, 1, 1}), AK_FLOAT);
+                            Tensor* input_1 = new Tensor(Shape({word_num_1, emb_size, 1, 1}), AK_FLOAT);
+                            fill_tensor_rand(*input_0, -1.f, 1.f);
+                            fill_tensor_rand(*input_1, -1.f, 1.f);
+                            std::vector> vseq_offset_0 = {seq_offset_0};
+                            input_0->set_seq_offset(vseq_offset_0);
+                            std::vector> vseq_offset_1 = {seq_offset_1};
+                            input_1->set_seq_offset(vseq_offset_1);
+                            inputs.push_back(input_0);
+                            inputs.push_back(input_1);
+                            testbase.add_custom_input(inputs);
+                            AttentionPaddingMaskParam param(-900000000.f, 12800001);
+                            testbase.set_param(param);
+                            testbase.run_test(attention_padding_mask_basic, 0.00001, true, true);
+                            for (auto input: inputs) {
+                                delete input;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+TEST(TestSaberFunc, test_func_attention_padding_mask) {
+
+#ifdef USE_CUDA
+    //Init the test_base
+    test_model();
+#endif
+#ifdef USE_X86_PLACE
+    test_model();
+#endif
+#ifdef USE_ARM_PLACE
+    //test_model();
+#endif
+#ifdef AMD_GPU
+    // Env::env_init();
+    // test_model();
+#endif
+#ifdef USE_BM_PLACE
+    // Env::env_init();
+    // test_accuracy(num, channel, height,
width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_axpy.cpp b/test/saber/test_saber_axpy.cpp index e61092a7f..9f3fb6ce7 100644 --- a/test/saber/test_saber_axpy.cpp +++ b/test/saber/test_saber_axpy.cpp @@ -32,29 +32,6 @@ void axpy_nv_basic(const std::vector*>& inputs,std::vector< const dtype* bias =(const dtype*)bias_in->data(); int in_channel = channel * height * width; int size = height * width; -/* - for (int i = 0; i < num; i++){ - const dtype* din_ptr = din + i * in_channel; - const dtype* bias_ptr = bias + i * in_channel; - const dtype* scale_ptr = scale + i * channel; - dtype* dout_ptr = dout + i * in_channel; - for(int j = 0; j < channel; j++){ - LOG(INFO) << "scale: "; - LOG(INFO) << scale_ptr[j]; - const dtype* din_ch_ptr = din_ptr + j * size; - dtype* dout_ch_ptr = dout_ptr + j * size; - const dtype* bias_ch_ptr = bias_ptr + j * size; - LOG(INFO) << "din :"; - for (int k = 0; k < size; k++){ - LOG(INFO) << din_ch_ptr[k]; - } - LOG(INFO) << "bias :"; - for (int k = 0; k < size; k++){ - LOG(INFO) << bias_ch_ptr[k]; - } - } - } -*/ for (int i = 0; i < num; i++){ const dtype* din_ptr = din + i * in_channel; const dtype* bias_ptr = bias + i * in_channel; @@ -102,7 +79,7 @@ void test_model(){ } TEST(TestSaberFunc, test_func_axpy) { - + #ifdef USE_CUDA //Init the test_base test_model(); @@ -110,6 +87,9 @@ TEST(TestSaberFunc, test_func_axpy) { #ifdef USE_X86_PLACE test_model(); #endif +#ifdef USE_ARM_PLACE + test_model(); +#endif } diff --git a/test/saber/test_saber_base.h b/test/saber/test_saber_base.h index 3cd55f3f1..7524e2332 100644 --- a/test/saber/test_saber_base.h +++ b/test/saber/test_saber_base.h @@ -3,12 +3,12 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. 
*/ #ifndef ANAKIN_TEST_SABER_BASE_H @@ -28,12 +28,12 @@ #include using namespace anakin :: test; -namespace anakin{ -namespace saber{ - template class Op, - template class Param> -class TestSaberBase{ +namespace anakin { +namespace saber { +template class Op, + template class Param> +class TestSaberBase { public: typedef Param Param_t; typedef Op Op_t; @@ -44,173 +44,268 @@ class TestSaberBase{ typedef std::vector Input_ht; typedef std::vector Output_ht; typedef typename DataTrait::Dtype OpDataType; - typedef void (*CpuFunc_t) (const Input_ht&, Output_ht&, Param_t& param); - - TestSaberBase (int in_num = 1, int out_num=1) : _op_input_num(in_num), _op_output_num(out_num){ + typedef void (*CpuFunc_t)(const Input_ht&, Output_ht&, Param_t& param); + + TestSaberBase(int in_num = 1, int out_num = 1) : _op_input_num(in_num), _op_output_num(out_num) { + Env :: env_init(); + Env :: env_init(); } - - void add_param (Param_t& param){ + ~TestSaberBase(){ + clear_datas(); + } + void add_param(Param_t& param) { _params.push_back(param); } - void set_param (Param_t& param){ + void set_param(Param_t& param) { _params.clear(); _params.push_back(param); } - - void add_inputs_shape(Shape new_shape){ - + + void add_inputs_shape(Shape new_shape,std::vector in_tensor_scale={}, + std::vector out_tensor_scale={}) { + std :: vector in_d; std :: vector in_h; std :: vector out_d; std :: vector out_h; std :: vector out_hd; - - for(int i = 0; i < _op_input_num; ++i){ - TensorD *d_id = new TensorD(new_shape); - TensorH *d_ih = new TensorH(new_shape); + + for (int i = 0; i < _op_input_num; ++i) { + TensorD* d_id = new TensorD(new_shape,_in_data_type); + TensorH* d_ih = new TensorH(new_shape,_in_data_type); + d_id->set_scale(in_tensor_scale); + d_ih->set_scale(in_tensor_scale); in_d.push_back(d_id); in_h.push_back(d_ih); } - - for(int i = 0; i < _op_output_num; ++i){ - TensorD *d_od = new TensorD(new_shape); - TensorH *d_oh = new TensorH(new_shape); - TensorH *d_ohd = new TensorH(new_shape); + + for (int i = 0; i < _op_output_num; ++i) { + TensorD* d_od = new TensorD(new_shape); + TensorH* d_oh = new TensorH(new_shape); + TensorH* d_ohd = new TensorH(new_shape); + d_od->set_scale(out_tensor_scale); + d_oh->set_scale(out_tensor_scale); + d_ohd->set_scale(out_tensor_scale); out_d.push_back(d_od); out_h.push_back(d_oh); out_hd.push_back(d_ohd); } + clear_datas(); _inputs_dev.push_back(in_d); _inputs_host.push_back(in_h); _outputs_dev.push_back(out_d); _outputs_host.push_back(out_h); _outputs_hd.push_back(out_hd); - _input_shapes.push_back(std::vector{new_shape}); - - + _input_shapes.push_back(std::vector {new_shape}); + + } - - void add_inputs_shape(std::vector new_shape_v){ - + + void add_inputs_shape(std::vector new_shape_v) { + CHECK_GE(new_shape_v.size(), _op_input_num) << "unvaliable shape vector"; - + std :: vector in_d; std :: vector in_h; std :: vector out_d; std :: vector out_h; std :: vector out_hd; - - for(int i = 0; i < _op_input_num; ++i){ - TensorD *d_id = new TensorD(new_shape_v[i]); - TensorH *d_ih = new TensorH(new_shape_v[i]); + + for (int i = 0; i < _op_input_num; ++i) { + TensorD* d_id = new TensorD(new_shape_v[i],_in_data_type); + TensorH* d_ih = new TensorH(new_shape_v[i],_in_data_type); in_d.push_back(d_id); in_h.push_back(d_ih); + } - for(int i = 0; i < _op_output_num; ++i){ - TensorD *d_od = new TensorD(); - TensorH *d_oh = new TensorH(); - TensorH *d_ohd = new TensorH(); + + for (int i = 0; i < _op_output_num; ++i) { + TensorD* d_od = new TensorD(); + TensorH* d_oh = new TensorH(); + TensorH* d_ohd 
= new TensorH(); out_d.push_back(d_od); out_h.push_back(d_oh); out_hd.push_back(d_ohd); } + clear_datas(); + _inputs_dev.push_back(in_d); _inputs_host.push_back(in_h); _outputs_dev.push_back(out_d); _outputs_host.push_back(out_h); _outputs_hd.push_back(out_hd); _input_shapes.push_back(new_shape_v); - - + _input_type = RANDOM; } - void set_input_shape (Shape new_shape, TestDataType type = RANDOM, OpDataType value = 1){ + + void add_inputs_shape(std::vector new_shape_v,std::vector> in_tensor_scale, + std::vector> out_tensor_scale) { + + CHECK_GE(new_shape_v.size(), _op_input_num) << "unvaliable shape vector"; + CHECK_EQ(in_tensor_scale.size(),new_shape_v.size()); + CHECK_EQ(out_tensor_scale.size(),new_shape_v.size()); + std :: vector in_d; + std :: vector in_h; + std :: vector out_d; + std :: vector out_h; + std :: vector out_hd; + + for (int i = 0; i < _op_input_num; ++i) { + TensorD* d_id = new TensorD(new_shape_v[i],_in_data_type); + TensorH* d_ih = new TensorH(new_shape_v[i],_in_data_type); + d_id->set_scale(in_tensor_scale[i]); + d_ih->set_scale(in_tensor_scale[i]); + in_d.push_back(d_id); + in_h.push_back(d_ih); + + } + + for (int i = 0; i < _op_output_num; ++i) { + TensorD* d_od = new TensorD(); + TensorH* d_oh = new TensorH(); + TensorH* d_ohd = new TensorH(); + d_od->set_scale(out_tensor_scale[i]); + d_oh->set_scale(out_tensor_scale[i]); + d_ohd->set_scale(out_tensor_scale[i]); + out_d.push_back(d_od); + out_h.push_back(d_oh); + out_hd.push_back(d_ohd); + } clear_datas(); - + + _inputs_dev.push_back(in_d); + _inputs_host.push_back(in_h); + _outputs_dev.push_back(out_d); + _outputs_host.push_back(out_h); + _outputs_hd.push_back(out_hd); + _input_shapes.push_back(new_shape_v); + _input_type = RANDOM; + } + + void set_input_shape(Shape new_shape, std::vector scale_in, std::vector scale_out,TestDataType type = RANDOM, OpDataType value = 1) { + //clear_datas(); + + add_inputs_shape(new_shape,scale_in,scale_out); + _input_type = type; + _special_value = value; + } + + void set_input_shape(Shape new_shape, TestDataType type = RANDOM, OpDataType value = 1) { + //clear_datas(); + add_inputs_shape(new_shape); _input_type = type; _special_value = value; } - void set_input_shape (std::vector new_shape_v, TestDataType type = RANDOM, OpDataType value = 1){ - clear_datas(); - + void set_input_shape(std::vector new_shape_v, TestDataType type = RANDOM, + OpDataType value = 1) { + //clear_datas(); + add_inputs_shape(new_shape_v); _input_type = type; _special_value = value; } - void auto_gen_inputs (){ + void auto_gen_inputs() { CHECK_EQ(_op_input_num, 1) << "only support input_num == 1"; - for(int n : {1, 2}){ - for(int c : {32, 64}){ - for(int h : {64, 256}){ - for(int w : {64, 256}){ - add_inputs_shape (Shape({n, c, h, w})); + + for (int n : { + 1, 2 + }) { + for (int c : { + 32, 64 + }) { + for (int h : { + 64, 256 + }) { + for (int w : { + 64, 256 + }) { + add_inputs_shape(Shape({n, c, h, w})); } } } } } - void fill_inputs (float minv, float maxv){ + void fill_inputs(float minv, float maxv) { int input_size = _inputs_dev.size(); CHECK_EQ(input_size, _inputs_host.size()) << "dev and host inputs num must be equal"; - if(_input_type == RANDOM){ - for(int i=0; i<_inputs_dev.size(); ++i){ - for(int j=0; j<_op_input_num; ++j){ + + if (_input_type == RANDOM) { + for (int i = 0; i < _inputs_dev.size(); ++i) { + for (int j = 0; j < _op_input_num; ++j) { fill_tensor_rand(*_inputs_dev[i][j], minv, maxv); - // LOG(INFO) << "_op_input_num: " << _op_input_num; + // LOG(INFO) << "_op_input_num: " << 
_op_input_num; _inputs_host[i][j] -> copy_from(*_inputs_dev[i][j]); } } } else { CHECK_EQ(input_size, 1) << "special input num must be 1"; - for(int i = 0; i < _inputs_dev.size(); ++i){ - for(int j = 0; j < _op_input_num; ++j){ + + for (int i = 0; i < _inputs_dev.size(); ++i) { + for (int j = 0; j < _op_input_num; ++j) { fill_tensor_const(*_inputs_dev[i][j], _special_value); _inputs_host[i][j] -> copy_from(*_inputs_dev[i][j]); } } } } - void add_custom_input (Input_dt& input){ + void add_custom_input(Input_dt& input) { CHECK_EQ(input.size(), _op_input_num) << "input must equal op_input_num"; - clear_datas(); + //clear_datas(); std::vector shape_v; - for (int i=0; i<_op_input_num; ++i){ + + for (int i = 0; i < _op_input_num; ++i) { shape_v.push_back(input[i] -> valid_shape()); } + add_inputs_shape(shape_v); - for(int i = 0; i < _op_input_num; ++i) - { + + for (int i = 0; i < _op_input_num; ++i) { SaberStatus status = _inputs_dev[0][i]->set_dtype(input[i]->get_dtype()); - status &= _inputs_host[0][i]->set_dtype(input[i]->get_dtype()); - if(!status) + SaberStatus status2 = _inputs_host[0][i]->set_dtype(input[i]->get_dtype()); + + if (status != SaberSuccess || status2 != SaberSuccess) { LOG(INFO) << "ERROR"; + } + _inputs_dev[0][i] -> copy_from(*input[i]); _inputs_host[0][i] -> copy_from(*input[i]); - if(input[i]->get_seq_offset().size() > 0){ - _inputs_dev[0][i] -> set_seq_offset(input[i]->get_seq_offset()); + + if (input[i]->get_seq_offset().size() > 0) { + _inputs_dev[0][i] -> set_seq_offset(input[i]->get_seq_offset()); _inputs_host[0][i] -> set_seq_offset(input[i]->get_seq_offset()); } } + _input_type = CUSTOM; - } + + void set_input_datatype(DataType dtype_in = AK_FLOAT) { + _in_data_type = dtype_in; + } + void set_ouput_datatype(DataType dtype_out = AK_FLOAT) { + _out_data_type = dtype_out; + } + void compute_outputs_shape (int param_index = 0){ CHECK_GT(_params.size(), 0) << "no available param"; CHECK_GT(_inputs_dev.size(), 0) << "no available inputs"; CHECK_GE(param_index, 0) << "param index must be positive"; CHECK_EQ(_inputs_dev.size(), _outputs_dev.size()) << "inputs and outputs must have same num"; CHECK_LT(param_index, _params.size()) << "param_index out of range"; - for(int i = 0; i < _inputs_dev.size(); ++i){ + + for (int i = 0; i < _inputs_dev.size(); ++i) { SABER_CHECK(_base_op.compute_output_shape(_inputs_dev[i], - _outputs_dev[i], _params[param_index])); + _outputs_dev[i], _params[param_index])); } - for(int i = 0; i < _outputs_dev.size(); ++i) { - for(int j = 0; j < _op_output_num; ++j) { + + for (int i = 0; i < _outputs_dev.size(); ++i) { + for (int j = 0; j < _op_output_num; ++j) { Shape sh = _outputs_dev[i][j] -> valid_shape(); - _outputs_dev[i][j] -> re_alloc(sh, Dtype); - _outputs_host[i][j] -> re_alloc(sh, Dtype); - _outputs_hd[i][j] -> re_alloc(sh, Dtype); + _outputs_dev[i][j] -> re_alloc(sh, _out_data_type); + _outputs_host[i][j] -> re_alloc(sh, _out_data_type); + _outputs_hd[i][j] -> re_alloc(sh, _out_data_type); if (!_use_random_output) { fill_tensor_const(*_outputs_dev[i][j], 0); fill_tensor_const(*_outputs_host[i][j], 0); @@ -222,20 +317,20 @@ class TestSaberBase{ } } } - + template - void clear_vv(std::vector>& data_vec){ - for (auto vec : data_vec){ - for (auto tensor_p : vec){ - if (nullptr != tensor_p){ + void clear_vv(std::vector>& data_vec) { + for (auto vec : data_vec) { + for (auto tensor_p : vec) { + if (nullptr != tensor_p) { delete tensor_p; } } } + data_vec.clear(); } - void clear_datas() - { + void clear_datas() { clear_vv(_inputs_dev); 
clear_vv(_outputs_dev); clear_vv(_inputs_host); @@ -243,120 +338,176 @@ class TestSaberBase{ clear_vv(_outputs_hd); _input_shapes.clear(); } - SaberStatus get_op_result (SaberImplStrategy strategy, ImplEnum implenum, int param_index = 0,bool test_speed=false){ + SaberStatus get_op_result(SaberImplStrategy strategy, ImplEnum implenum, int param_index = 0, + bool test_speed = false) { CHECK_GE(param_index, 0) << "param index must be positive"; CHECK_LT(param_index, _params.size()) << "param index out of range"; - + Context ctx(0, 1, 1); - SaberStatus status; + SaberStatus status = SaberSuccess; SaberTimer t; - int iter_num=test_speed?100:10; + int iter_num = test_speed ? 100 : 1; t.clear(); t.start(ctx); - for(int input_index = 0; input_index < _inputs_dev.size(); ++input_index){ + + for (int input_index = 0; input_index < _inputs_dev.size(); ++input_index) { _base_op.init(_inputs_dev[input_index], _outputs_dev[input_index], _params[param_index], strategy, implenum, ctx); - for(int iter=0; itercopy_from(*_outputs_host[input_index][0]); - status= _base_op(_inputs_dev[input_index], _outputs_dev[input_index], - _params[param_index], ctx); - if(status == SaberUnImplError){ + auto out_num = _outputs_dev[input_index].size(); + + for (int iter = 0; iter < iter_num; ++iter) { + for (int out_id = 0; out_id < out_num; out_id++) { + _outputs_dev[input_index][out_id]->copy_from(*_outputs_host[input_index][out_id]); + } + + status = _base_op(_inputs_dev[input_index], _outputs_dev[input_index], + _params[param_index], ctx); + + if (status == SaberUnImplError) { return status; } + typename TensorD :: API :: stream_t stream = ctx.get_compute_stream(); - _outputs_dev[input_index][0] -> record_event(stream); - _outputs_dev[input_index][0] -> sync(); - + + for (int out_id = 0; out_id < out_num; out_id++) { + _outputs_dev[input_index][out_id] -> record_event(stream); + _outputs_dev[input_index][out_id] -> sync(); + } + } } + t.end(ctx); float ts = t.get_average_ms(); - if(test_speed) { + + if (test_speed) { LOG(INFO) << "avg run time:" << ts / _inputs_dev.size() / 100 << "ms"; } - for(int input_index = 0; input_index < _inputs_dev.size(); ++input_index){ - for(int j = 0; j < _op_output_num; ++j){ + + for (int input_index = 0; input_index < _inputs_dev.size(); ++input_index) { + for (int j = 0; j < _op_output_num; ++j) { + _outputs_hd[input_index][j]->reshape(_outputs_dev[input_index][j]->valid_shape()); + _outputs_hd[input_index][j] -> copy_from(*_outputs_dev[input_index][j]); - // LOG(INFO) << "input_index: " << input_index << ", j: " << j; } } + return status; } - void get_cpu_result (CpuFunc_t CpuFunc, int param_index=0){ + void get_cpu_result(CpuFunc_t CpuFunc, int param_index = 0) { CHECK_EQ(_inputs_host.size(), _outputs_dev.size()) << "input and output number must be equal"; - CHECK_EQ(_outputs_host.size(),_outputs_dev.size()) << "input and output number must be equal"; - for(int i = 0; i < _inputs_dev.size(); ++i){ + CHECK_EQ(_outputs_host.size(), _outputs_dev.size()) << "input and output number must be equal"; + + for (int i = 0; i < _inputs_dev.size(); ++i) { CpuFunc(_inputs_host[i], _outputs_host[i], _params[param_index]); } } - void result_check_accuracy (double succ_ratio = 0.00001,bool write_error_tensor=false){ + void result_check_accuracy(double succ_ratio = 0.00001, bool write_error_tensor = false) { CHECK_EQ(_outputs_host.size(), _outputs_hd.size()) << "output size in dev and cpu must be equal"; int check_size = _outputs_host.size(); std::vector max_diff(check_size, 0); std::vector 
max_ratio(check_size, 0); Shape sh = _inputs_host[0][0] -> valid_shape(); LayoutType lo = _inputs_host[0][0] -> get_layout(); - for(int i = 0; i < _outputs_host.size(); ++i){ - for(int j = 0; j<_op_output_num; ++j){ + + for (int i = 0; i < _outputs_host.size(); ++i) { + for (int j = 0; j < _op_output_num; ++j) { tensor_cmp_host(static_cast(_outputs_hd[i][j] -> data()), - static_cast(_outputs_host[i][j] -> data()), - _outputs_hd[i][j] -> valid_size(), max_ratio[i], max_diff[i]); - LOG(INFO) << "input_shape: (" << sh.num() << "," << sh.channel() << "," << sh.height() << "," << sh.width() << ")"; + static_cast(_outputs_host[i][j] -> data()), + _outputs_hd[i][j] -> valid_size(), max_ratio[i], max_diff[i]); + LOG(INFO) << "input_shape: (" << sh.num() << "," << sh.channel() << "," << sh.height() << "," << + sh.width() << ")"; LOG(INFO) << "input_layout = " << lo; - LOG(INFO) << "max_ratio: " << max_ratio[i]<<", max diff: "<valid_shape(); - LOG(INFO) << " output layout: "<<_outputs_hd[i][j]->get_layout(); - if ((max_diff[i]< 0.0001 || max_ratio[i] <= succ_ratio) && (_outputs_hd[i][0]->valid_shape() == _outputs_host[i][0]->valid_shape()) \ - && _outputs_hd[i][0]->get_layout() == _outputs_host[i][0]->get_layout()){ + LOG(INFO) << "max_ratio: " << max_ratio[i] << ", max diff: " << max_diff[i]; + LOG(INFO) << " mean_value: " << tensor_mean_value(*_outputs_hd[i][j]) << "," << tensor_mean_value( + *_outputs_host[i][j]); + LOG(INFO) << " output shape: " << _outputs_hd[i][j]->valid_shape(); + LOG(INFO) << " output layout: " << _outputs_hd[i][j]->get_layout(); + + if ((max_diff[i] < 0.0001 || max_ratio[i] <= succ_ratio) + && (_outputs_hd[i][0]->valid_shape() == _outputs_host[i][0]->valid_shape()) \ + && _outputs_hd[i][0]->get_layout() == _outputs_host[i][0]->get_layout()) { LOG(INFO) << "Test Passed!"; } else { - if(write_error_tensor) { - write_tensorfile(*_outputs_hd[i][j], "error_record_target"); - write_tensorfile(*_outputs_host[i][j], "error_record_host"); + LOG(INFO) << "max_ratio: " << max_ratio[i] << ", max diff: " << max_diff[i]; + + if (write_error_tensor) { + char target_file_name[100]; + char host_file_name[100]; + sprintf(target_file_name, "error_target_output_%d", j); + sprintf(host_file_name, "error_host_output_%d", j); + write_tensorfile(*_outputs_hd[i][j], target_file_name); + write_tensorfile(*_outputs_host[i][j], host_file_name); } + print_tensor(*_inputs_host[0][0]); + //print_tensor(*_inputs_host[0][1]); print_tensor(*_outputs_host[0][0]); print_tensor(*_outputs_hd[0][0]); - LOG(FATAL) << "Test Failed!!"<< "output:(" << i << "-" << j << ")"; + LOG(FATAL) << "Test Failed!!" 
<< "output:(" << i << "-" << j << ")"; } } } } - void set_rand_limit (float minv, float maxv){ + void set_rand_limit(float minv, float maxv) { _max_value = maxv; _min_value = minv; } - void run_test (CpuFunc_t CpuFunc, double succ_ratio=0.00001, bool write_error_tensor= false,bool test_speed=false){ - if(_input_type == SPECIAL){ + void run_test(CpuFunc_t CpuFunc, double succ_ratio = 0.00001, bool write_error_tensor = false, + bool test_speed = false) { + if (_input_type == SPECIAL) { fill_inputs(_special_value, _special_value); } - if(_input_type == RANDOM){ + + if (_input_type == RANDOM) { fill_inputs(_min_value, _max_value); } - // LOG(INFO) << "_input_type" << _input_type; + + // LOG(INFO) << "_input_type" << _input_type; compute_outputs_shape(); - Env :: env_init(); - Env :: env_init(); - + std :: vector runtype{"STATIC", "RUNTIME", "SPECIFY"}; std :: vector impltype{"VENDER", "SABER"}; - for(auto strate : {SPECIFY, RUNTIME, STATIC}){ - for(auto implenum : {VENDER_IMPL, SABER_IMPL}){ - LOG(INFO) << "TESTING: strategy:" << runtype[strate-1] << ",impltype:" << impltype[(int)implenum]; - if(get_op_result(strate, implenum,test_speed) == SaberUnImplError){ + for (auto strate : { SPECIFY, RUNTIME, STATIC}) { + for (auto implenum : {VENDER_IMPL, SABER_IMPL}) { + LOG(INFO) << "TESTING: strategy:" << runtype[strate - 1] << ",impltype:" << impltype[(int)implenum]; +#ifdef USE_ARM_PLACE + for (auto th: {1, 2, 4}){ + Context ctx; + LOG(INFO) << "create runtime ctx"; + ctx.set_run_mode(SABER_POWER_HIGH, th); + LOG(INFO) << "test threads activated"; + LOG(INFO) << "number of threads: " << th; +#ifdef USE_OPENMP + #pragma omp parallel + { + int thread = omp_get_num_threads(); + LOG(INFO) << "number of omp threads: " << thread; + } + #endif + if (get_op_result(strate, implenum, 0, test_speed) == SaberUnImplError) { + LOG(INFO) << "Unimpl!!"; + continue; + } + get_cpu_result(CpuFunc); + result_check_accuracy(succ_ratio, write_error_tensor); + } +#else + if (get_op_result(strate, implenum, 0, test_speed) == SaberUnImplError) { LOG(INFO) << "Unimpl!!"; continue; } + get_cpu_result(CpuFunc); - result_check_accuracy(succ_ratio,write_error_tensor); + result_check_accuracy(succ_ratio, write_error_tensor); +#endif } } } - void result_check_speed(){ + void result_check_speed() { } void set_random_output(bool random_output) { _use_random_output = random_output; @@ -367,8 +518,11 @@ class TestSaberBase{ Op_t _base_op; TestDataType _input_type; OpDataType _special_value; - float _max_value{1.0}; - float _min_value{-1.0}; + DataType _out_data_type = AK_FLOAT; + DataType _in_data_type = AK_FLOAT; + float _max_value{100.0}; + float _min_value{-100.0}; + std :: vector _inputs_host; std :: vector _inputs_dev; std :: vector _outputs_dev; diff --git a/test/saber/test_saber_box_clip.cpp b/test/saber/test_saber_box_clip.cpp new file mode 100644 index 000000000..1f52f64b4 --- /dev/null +++ b/test/saber/test_saber_box_clip.cpp @@ -0,0 +1,105 @@ +#include "core/context.h" +#include "saber/funcs/box_clip.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test/saber/test_saber_base.h" +#include + +using namespace anakin::saber; + +template +void box_clip_basic(const std::vector*>& inputs, + std::vector*>& outputs, EmptyParam& param) { + static constexpr int im_info_size = 3; + static constexpr int box_info_size = 4; + auto seq_offset = inputs[1]->get_seq_offset(); + CHECK_EQ(inputs.size(), 2) << "need two input"; + CHECK_EQ(seq_offset.size(), 1) << "need offset to cal 
batch";
+    CHECK_GT(seq_offset[0].size(), 1) << "need offset to cal batch";
+    auto offset = seq_offset[0];
+    auto img = inputs[1];
+    auto im_info = inputs[0];
+    const float* im_info_ptr = static_cast(im_info->data());
+    const float* box_ptr_in = static_cast(img->data());
+    float* box_ptr_out = static_cast(outputs[0]->data());
+    int batch_size = offset.size() - 1;
+    CHECK_EQ(batch_size * im_info_size, im_info->valid_size()) << "im_info should be valid";
+
+    for (int batch_id = 0; batch_id < batch_size; batch_id++) {
+        const float img_h = im_info_ptr[batch_id * im_info_size + 0];
+        const float img_w = im_info_ptr[batch_id * im_info_size + 1];
+        const float scale = im_info_ptr[batch_id * im_info_size + 2];
+        const float img_h_scale = round(img_h / scale) - 1;
+        const float img_w_scale = round(img_w / scale) - 1;
+        const int start_in_batch = offset[batch_id];
+        const int end_in_batch = offset[batch_id + 1];
+
+        for (int im_id = start_in_batch; im_id < end_in_batch; im_id++) {
+            const float* batch_box_ptr_in = &box_ptr_in[im_id * box_info_size];
+            float* batch_box_ptr_out = &box_ptr_out[im_id * box_info_size];
+            batch_box_ptr_out[0] = std::max(std::min(batch_box_ptr_in[0], img_w_scale), 0.f);
+            batch_box_ptr_out[1] = std::max(std::min(batch_box_ptr_in[1], img_h_scale), 0.f);
+            batch_box_ptr_out[2] = std::max(std::min(batch_box_ptr_in[2], img_w_scale), 0.f);
+            batch_box_ptr_out[3] = std::max(std::min(batch_box_ptr_in[3], img_h_scale), 0.f);
+        }
+    }
+}
+
+template
+void test_model() {
+
+    int batch = 2;
+    int box_per_batch = 2;
+    int num = box_per_batch * batch;
+    int channel = 4;
+    int height = 1;
+    int width = 1;
+
+    TestSaberBase testbase(2, 1);
+
+    EmptyParam param;
+
+    Shape input_shape({num, channel, height, width}, Layout_NCHW);
+    Shape im_info_shape({batch, 3, 1, 1}, Layout_NCHW);
+    Tensor input_box_host(input_shape);
+    Tensor im_info_host(im_info_shape);
+    fill_tensor_rand(input_box_host, 0, 100);
+    fill_tensor_rand(im_info_host, 0, 100);
+    std::vector> seq_offset({{0}});
+
+    for (int i = 1; i <= batch; i++) {
+        seq_offset[0].push_back(seq_offset[0][i - 1] + box_per_batch);
+    }
+
+    input_box_host.set_seq_offset(seq_offset);
+    std::vector*> input_vec;
+    input_vec.push_back(&im_info_host);
+    input_vec.push_back(&input_box_host);
+    testbase.set_param(param);//set param
+    testbase.add_custom_input(input_vec);
+    testbase.run_test(box_clip_basic);//run test
+
+
+}
+
+TEST(TestSaberFunc, test_func_box_clip) {
+
+#ifdef USE_CUDA
+    //Init the test_base
+    test_model();
+#endif
+#ifdef USE_X86_PLACE
+    test_model();
+#endif
+}
+
+
+int main(int argc, const char** argv) {
+
+    InitTest();
+    RUN_ALL_TESTS(argv[0]);
+
+    return 0;
+}
+
diff --git a/test/saber/test_saber_box_coder.cpp b/test/saber/test_saber_box_coder.cpp
new file mode 100644
index 000000000..ab38e8031
--- /dev/null
+++ b/test/saber/test_saber_box_coder.cpp
@@ -0,0 +1,171 @@
+#include "core/context.h"
+#include "funcs/box_coder.h"
+#include "test_saber_func.h"
+#include "test_saber_base.h"
+#include "tensor_op.h"
+#include "saber_types.h"
+#include
+
+using namespace anakin::saber;
+enum BOX_CODER_VAR {
+    FIX_SIZE_VAR = 0,
+    NO_VAR = 1,
+    FROM_INPUT_VAR = 2
+};
+template
+static inline void box_coder(Tensor* proposals,
+                             const Tensor* anchors,
+                             const Tensor* bbox_deltas,
+                             const Tensor* variances,
+                             BoxCoderParam& param
+                            ) {
+    const size_t row = bbox_deltas->num();
+    const size_t col = bbox_deltas->channel();
+    const size_t anchor_nums = row * col;
+    const size_t anchor_len = anchors->valid_shape()[1];
+    CHECK_EQ(anchor_len, 5) << "anchor
length is 5"; + int out_len = 4; + int var_len = 4; + int delta_len = 4; + const Dtype* anchor_data = (const Dtype*) anchors->data(); + const Dtype* bbox_deltas_data = (const Dtype*) bbox_deltas->data(); + Dtype* proposals_data = (Dtype*) proposals->data(); + const Dtype* variances_data = nullptr; + float normalized = !param.box_normalized ? 1.f : 0; + + if (variances) { + variances_data = (const Dtype*)variances->data(); + } + + for (int64_t row_id = 0; row_id < row; ++row_id) { + for (int64_t col_id = 0; col_id < col; ++col_id) { + size_t delta_offset = row_id * col * delta_len + col_id * delta_len; + size_t out_offset = row_id * col * out_len + col_id * out_len; + int prior_box_offset = param.axis == 0 ? col_id * anchor_len : row_id * anchor_len; + int var_offset = param.axis == 0 ? col_id * var_len : row_id * var_len; + auto anchor_data_tmp = anchor_data + prior_box_offset + 1; + auto bbox_deltas_data_tmp = bbox_deltas_data + delta_offset; + auto proposals_data_tmp = proposals_data + out_offset; + auto anchor_width = anchor_data_tmp[2] - anchor_data_tmp[0] + normalized; + auto anchor_height = anchor_data_tmp[3] - anchor_data_tmp[1] + normalized; + auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; + auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; + Dtype bbox_center_x = 0, bbox_center_y = 0; + Dtype bbox_width = 0, bbox_height = 0; + + if (fix_size_var == FROM_INPUT_VAR) { + auto variances_data_tmp = variances_data + var_offset; + bbox_center_x = + variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data_tmp[1] * + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(variances_data_tmp[2] * + bbox_deltas_data_tmp[2]) * anchor_width; + bbox_height = std::exp(variances_data_tmp[3] * + bbox_deltas_data_tmp[3]) * anchor_height; + } + + if (fix_size_var == FIX_SIZE_VAR) { + bbox_center_x = + variances_data[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[1] * + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(variances_data[2] * + bbox_deltas_data_tmp[2]) * anchor_width; + bbox_height = std::exp(variances_data[3] * + bbox_deltas_data_tmp[3]) * anchor_height; + + } else if (fix_size_var == NO_VAR) { + bbox_center_x = + bbox_deltas_data_tmp[0] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(bbox_deltas_data_tmp[2]) * anchor_width; + bbox_height = std::exp(bbox_deltas_data_tmp[3]) * anchor_height; + } + + proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; + proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; + proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized; + proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized; + } + } +} + +template +void boxcoder_basic(const std::vector*>& inputs, + std::vector*>& outputs, BoxCoderParam& param) { + Tensor* anchor = inputs[0]; + Tensor* delta = inputs[1]; + Tensor* variances = nullptr; + Tensor* proposal = outputs[0]; + + if (param.variance() != nullptr && param.variance()->valid_size() > 0) { + Tensor host_tenosr(param.variance()->valid_shape()); + host_tenosr.copy_from(*param.variance()); + variances = &host_tenosr; + CHECK(variances->valid_size() == 4); + box_coder(proposal, anchor, delta, variances, + param); + } else if (inputs.size() >= 3) { + variances = inputs[2]; + box_coder(proposal, anchor, delta, variances, + 
param); + } else { + box_coder(proposal, anchor, delta, variances, param); + } +}; + +template +void test_model() { + + TestSaberBase testbase(2, 1); + int box_num = 10; + int class_num = 11; + Shape prior_box_shape({box_num, 5, 1, 1}, Layout_NCHW); + Shape delta_shape({class_num, box_num, 1, 4}, Layout_NCHW); + Shape var_shape({1, 1, 1, 4}, Layout_NCHW); + Tensor var_tensor(var_shape); + fill_tensor_rand(var_tensor, 0, 1); + BoxCoderParam param(&var_tensor, false, 0); + + + + testbase.set_param(param);//set param + std::vector shape_v; + shape_v.push_back(prior_box_shape);//scale + shape_v.push_back(delta_shape);//x + testbase.set_input_shape(shape_v);//add some input shape + testbase.set_rand_limit(-1.f, 1.f); + testbase.run_test(boxcoder_basic, 0.00001, true, false);//run test + + +} + +TEST(TestSaberFunc, test_func_axpy) { + +#ifdef USE_CUDA + //Init the test_base + test_model(); +#endif +#ifdef USE_X86_PLACE + test_model(); +#endif +#ifdef USE_ARM_PLACE + test_model(); +#endif +} + + +int main(int argc, const char** argv) { + + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + + return 0; +} + diff --git a/test/saber/test_saber_buffer.cpp b/test/saber/test_saber_buffer.cpp index 338fd88b3..5d03e791b 100644 --- a/test/saber/test_saber_buffer.cpp +++ b/test/saber/test_saber_buffer.cpp @@ -2,7 +2,7 @@ #include "saber/core/buffer.h" #include "saber/core/env.h" #include "saber/core/data_traits.h" - +#include using namespace anakin::saber; diff --git a/test/saber/test_saber_cast.cpp b/test/saber/test_saber_cast.cpp index e56a33e75..9b020ff77 100644 --- a/test/saber/test_saber_cast.cpp +++ b/test/saber/test_saber_cast.cpp @@ -47,7 +47,7 @@ void cast_basic(const std::vector*>& inputs,std::vectorget_dtype() == 5){//AK_INT32 const int* in_data = (const int*)tensor_in->data(); float* out_data = (float*)tensor_out->mutable_data(); @@ -100,12 +100,14 @@ void test_model(){ TestSaberBase testbase(1,1); testbase.set_param(param); testbase.add_custom_input(input_dt); + testbase.set_ouput_datatype(AK_FLOAT); testbase.run_test(cast_basic, 2.1e-5f); } if (b == 5){ TestSaberBase testbase(1,1); testbase.set_param(param); testbase.add_custom_input(input_dt); + testbase.set_ouput_datatype(AK_INT32); testbase.run_test(cast_basic, 2.1e-5f); } } @@ -145,7 +147,7 @@ void test_model(){ testbase.add_custom_input(input_dt); testbase.run_test(cast_nv_basic);//run test } - + } } } @@ -156,15 +158,19 @@ TEST(TestSaberFunc, test_func_cast) { int channel = ch_in; int height = h_in; int width = w_in; - + #ifdef USE_CUDA //Init the test_base test_model(); #endif #ifdef USE_X86_PLACE - //Env::env_init(); + Env::env_init(); test_model(); #endif +#ifdef USE_ARM_PLACE + Env::env_init(); + test_model(); +#endif } diff --git a/test/saber/test_saber_concat.cpp b/test/saber/test_saber_concat.cpp index 0e3fc7eba..3b19bc993 100644 --- a/test/saber/test_saber_concat.cpp +++ b/test/saber/test_saber_concat.cpp @@ -92,7 +92,7 @@ void test_model(){ } TEST(TestSaberFunc, test_func_concat) { - + #ifdef USE_CUDA //Init the test_base test_model(); diff --git a/test/saber/test_saber_concat_int8.cpp b/test/saber/test_saber_concat_int8.cpp new file mode 100644 index 000000000..f0b4a6cb9 --- /dev/null +++ b/test/saber/test_saber_concat_int8.cpp @@ -0,0 +1,8 @@ + + +int main(int argc, const char** argv) { + + + return 0; +} + diff --git a/test/saber/test_saber_context.cpp b/test/saber/test_saber_context.cpp index 7c6cfcb3c..831e01b8a 100644 --- a/test/saber/test_saber_context.cpp +++ 
b/test/saber/test_saber_context.cpp @@ -42,6 +42,7 @@ TEST(TestSaberFunc, test_arm_context) { LOG(INFO) << "set active ids"; LOG(INFO) << "test threads activated"; +#ifdef USE_OPENMP #pragma omp parallel { int threads = omp_get_num_threads(); @@ -54,6 +55,7 @@ TEST(TestSaberFunc, test_arm_context) { #pragma omp parallel printf("thread1 core ID: %d\n", th_id); } +#endif } #endif //USE_ARM_PLACE diff --git a/test/saber/test_saber_conv.cpp b/test/saber/test_saber_conv.cpp index b1685886d..658b020c2 100644 --- a/test/saber/test_saber_conv.cpp +++ b/test/saber/test_saber_conv.cpp @@ -6,17 +6,19 @@ #include "test_saber_base.h" #include "conv_func_helper.h" #include +#include "saber/funcs/impl/x86/x86_utils.h" using namespace anakin::saber; #define CHECK_RESULT //#define CHECK_SPEED #define RUN_BASIC_TEST false +#define RUN_BASIC_TEST_ARM true #if 0 #ifdef USE_BM_PLACE TEST(TestSaberFunc, test_saber_conv_results_bm) { Env::env_init(); Env::env_init(); - TestSaberBase testbase_bm; + TestSaberBase testbase_bm; std::vector kernel{1, 3}; std::vector pad{0, 1}; std::vector stride_h_v{1}; @@ -30,59 +32,70 @@ TEST(TestSaberFunc, test_saber_conv_results_bm) { std::vector bias_term_v{true, false}; std::vector with_relu_v{false}; - for (int input_num :{1,2}) - for (int out_channels :{1,2,5}) - for (int in_channels :{1,2,5}) - for (auto kernel_h_w : kernel) - for (auto pad_h_w : pad) - for (auto stride_h : stride_h_v) - for (auto stride_w : stride_h_v) - for (auto height : in_h_v) - for (auto width : in_w_v) - for (auto dilation : dilation_h_w) - for (auto bias_term : bias_term_v) - for (auto with_relu : with_relu_v) - for (auto group : group_v) { - LOG(INFO)<<"info :"< weights_dev; - Tensor bias_dev; + for (int input_num : { + 1, 2 + }) - weights_dev.re_alloc(weights_s, AK_FLOAT); - fill_tensor_rand(weights_dev, -5.f, 5.0f); - if (bias_term) { - bias_dev.re_alloc(bias_s, AK_FLOAT); - fill_tensor_rand(bias_dev, -5.0f, 5.0f); - } - ConvParam param_bm(group, pad_h_w, pad_h_w, - stride_h, stride_w, - dilation, dilation, - &weights_dev, &bias_dev); - testbase_bm.set_param(param_bm);//set param - testbase_bm.set_input_shape(Shape({input_num,in_channels,height,width}, - Layout_NCHW));//add some input shape - testbase_bm.run_test(conv_cpu_func, 1e-3);//run test + for (int out_channels : { + 1, 2, 5 + }) - } + for (int in_channels : { + 1, 2, 5 + }) + + for (auto kernel_h_w : kernel) + for (auto pad_h_w : pad) + for (auto stride_h : stride_h_v) + for (auto stride_w : stride_h_v) + for (auto height : in_h_v) + for (auto width : in_w_v) + for (auto dilation : dilation_h_w) + for (auto bias_term : bias_term_v) + for (auto with_relu : with_relu_v) + for (auto group : group_v) { + LOG(INFO) << "info :" << input_num << "," << in_channels << "," << + height << "," << width << "," << out_channels << "," << kernel_h_w << "," << + kernel_h_w << "," << stride_h << "," << stride_w << "," << dilation << "," << dilation << "," << + pad_h_w << "," << pad_h_w << "," << bias_term; + Shape weights_s({out_channels, in_channels, kernel_h_w, kernel_h_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + Tensor weights_dev; + Tensor bias_dev; + + weights_dev.re_alloc(weights_s, AK_FLOAT); + fill_tensor_rand(weights_dev, -5.f, 5.0f); + + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_FLOAT); + fill_tensor_rand(bias_dev, -5.0f, 5.0f); + } + + ConvParam param_bm(group, pad_h_w, pad_h_w, + stride_h, stride_w, + dilation, dilation, + &weights_dev, &bias_dev); + testbase_bm.set_param(param_bm);//set param + 
testbase_bm.set_input_shape(Shape({input_num, in_channels, height, width}, + Layout_NCHW));//add some input shape + testbase_bm.run_test(conv_cpu_func, 1e-3);//run test + + } } #endif #endif TEST(TestSaberFunc, test_saber_conv_results) { #ifdef USE_CUDA -// Env::env_init(); -// Env::env_init(); -// TestSaberBase testbase_nv; + // Env::env_init(); + // Env::env_init(); + // TestSaberBase testbase_nv; #endif #ifdef USE_X86_PLACE Env::env_init(); TestSaberBase testbase_x86; #endif - std::vector kernel_h_v{1, 3}; + std::vector kernel_h_v {1, 3}; std::vector kernel_w_v{1, 3}; std::vector pad_h_v{0, 1}; std::vector pad_w_v{0, 1}; @@ -98,6 +111,7 @@ TEST(TestSaberFunc, test_saber_conv_results) { std::vector output_channels_v{4}; std::vector bias_term_v{true, false}; std::vector with_relu_v{true, false}; + if (RUN_BASIC_TEST) { for (int bias_term : bias_term_v) for (int with_relu : with_relu_v) @@ -117,59 +131,64 @@ TEST(TestSaberFunc, test_saber_conv_results) { Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); -#ifdef USE_CUDA -// Tensor weights_dev; -// Tensor bias_dev; -// -// weights_dev.re_alloc(weights_s, AK_FLOAT); -// fill_tensor_rand(weights_dev, -5.f, 5.0f); -// if (bias_term) { -// bias_dev.re_alloc(bias_s, AK_FLOAT); -// fill_tensor_rand(bias_dev, -5.0f, 5.0f); -// } -// ConvParam param_nv(group, pad_h, pad_w, -// stride_h, stride_w, -// dilation_h, dilation_w, -// &weights_dev, &bias_dev); -// if (with_relu) { -// param_nv.activation_param = ActivationParam(Active_relu); -// } -#endif -#ifdef USE_X86_PLACE + #ifdef USE_CUDA + // Tensor weights_dev; + // Tensor bias_dev; + // + // weights_dev.re_alloc(weights_s, AK_FLOAT); + // fill_tensor_rand(weights_dev, -5.f, 5.0f); + // if (bias_term) { + // bias_dev.re_alloc(bias_s, AK_FLOAT); + // fill_tensor_rand(bias_dev, -5.0f, 5.0f); + // } + // ConvParam param_nv(group, pad_h, pad_w, + // stride_h, stride_w, + // dilation_h, dilation_w, + // &weights_dev, &bias_dev); + // if (with_relu) { + // param_nv.activation_param = ActivationParam(Active_relu); + // } + #endif + #ifdef USE_X86_PLACE Tensor weights_x86; weights_x86.re_alloc(weights_s, AK_FLOAT); fill_tensor_rand(weights_x86, -5.f, 5.0f); Tensor bias_x86; + if (bias_term) { bias_x86.re_alloc(bias_s, AK_FLOAT); fill_tensor_rand(bias_x86, -5.0f, 5.0f); } + ConvParam param_x86(group, pad_h, pad_w, - stride_h, stride_w, - dilation_h, dilation_w, - &weights_x86, &bias_x86); + stride_h, stride_w, + dilation_h, dilation_w, + &weights_x86, &bias_x86); + if (with_relu) { param_x86.activation_param = ActivationParam(Active_relu); } -#endif + + #endif + for (auto input_num : input_num_v) - for (auto height : in_h_v) - for (auto width : in_w_v) { -#ifdef USE_CUDA + for (auto height : in_h_v) + for (auto width : in_w_v) { + #ifdef USE_CUDA -// testbase_nv.set_param(param_nv);//set param -// testbase_nv.set_input_shape(Shape({input_num,in_channels,height,width}, -// Layout_NCHW));//add some input shape -// testbase_nv.run_test(conv_cpu_func, 1e-3);//run test -#endif -#ifdef USE_X86_PLACE - testbase_x86.set_param(param_x86);//set param - testbase_x86.set_input_shape(Shape({input_num, in_channels, height, width}, - Layout_NCHW));//add some input shape - testbase_x86.run_test(conv_cpu_func, 1e-3);//run test -#endif - } + // testbase_nv.set_param(param_nv);//set param + // testbase_nv.set_input_shape(Shape({input_num,in_channels,height,width}, + // Layout_NCHW));//add some input shape + // testbase_nv.run_test(conv_cpu_func, 
1e-3);//run test + #endif + #ifdef USE_X86_PLACE + testbase_x86.set_param(param_x86);//set param + testbase_x86.set_input_shape(Shape({input_num, in_channels, height, width}, + Layout_NCHW));//add some input shape + testbase_x86.run_test(conv_cpu_func, 1e-3);//run test + #endif + } } } } @@ -179,28 +198,29 @@ int test_conv_results(int group, int out_channels, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w, int pad_h, int pad_w, bool bias_term, bool with_relu, - SaberImplStrategy strategy, ImplEnum imp) { - - LOG(INFO)<< " conv param: " - << " input_num = " << input_num - << " in_channels = " << in_channels - << " height = " << height - << " width = " << width - << " group = " << group - << " pad_h = " << pad_h - << " pad_w = " << pad_w - << " stride_h = " << stride_h - << " stride_w = " << stride_w - << " dilation_h = " << dilation_h - << " dilation_w = " << dilation_w - << " kernel_h = " << kernel_h - << " kernel_w = " << kernel_w - << " out_channels = " << out_channels - << " bias_term = " << (bias_term ? "true" : "false") - << " with_relu = " << (with_relu ? "true" : "false"); + SaberImplStrategy strategy, ImplEnum imp, float eps = 1e-3, int threads = 1) { + + LOG(INFO) << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? "true" : "false") + << " with_relu = " << (with_relu ? 
"true" : "false") + << " threads = " << threads; Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); - Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + Shape weights_s({out_channels, in_channels / group, kernel_h, kernel_w}, Layout_NCHW); Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); // init input Tensor @@ -209,9 +229,10 @@ int test_conv_results(int group, input_dev.re_alloc(input_s, AK_FLOAT); input_host.re_alloc(input_s, AK_FLOAT); fill_tensor_rand(input_dev, -10.0f, 10.0f); + //fill_tensor_const(input_dev, 1.f); input_host.copy_from(input_dev); -// input_dev.set_scale({10.1f / 128}); -// LOG(INFO) << input_dev.get_scale()[0]; + // input_dev.set_scale({10.1f / 128}); + // LOG(INFO) << input_dev.get_scale()[0]; // init weights Tensor Tensor weights_dev; @@ -219,30 +240,38 @@ int test_conv_results(int group, weights_dev.re_alloc(weights_s, AK_FLOAT); weights_host.re_alloc(weights_s, AK_FLOAT); fill_tensor_rand(weights_dev, -10.0f, 10.0f); + //fill_tensor_const(weights_dev, 1.f); weights_host.copy_from(weights_dev); Tensor bias_dev; Tensor bias_host; + if (bias_term) { bias_dev.re_alloc(bias_s, AK_FLOAT); bias_host.re_alloc(bias_s, AK_FLOAT); fill_tensor_rand(bias_dev, -10.0f, 10.0f); bias_host.copy_from(bias_dev); } + Tensor output_dev; Tensor output_host; Tensor check_host; Context ctx1(0, 1, 1); -// ActivationParam act_param(Active_relu); + #ifdef USE_ARM_PLACE + ctx1.set_run_mode(SABER_POWER_HIGH, threads); + #endif + // ActivationParam act_param(Active_relu); ConvParam param(group, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, &weights_dev, &bias_dev); + if (with_relu) { ActivationParam act_param(Active_relu); param.activation_param = act_param; } + Conv conv; std::vector* > input_v; std::vector* > output_v; @@ -257,7 +286,6 @@ int test_conv_results(int group, param.stride_h, param.stride_w, param.group, imp); conv(input_v, output_v, param, ctx1); - typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); output_v[0]->record_event(stream); output_v[0]->sync(); @@ -265,31 +293,711 @@ int test_conv_results(int group, output_host.copy_from(output_dev); check_host.re_alloc(output_host.valid_shape(), AK_FLOAT); + conv_basic_check(input_host, check_host, + (const float*)weights_host.data(), (const float*)bias_host.data(), + group, kernel_w, kernel_h, stride_w, stride_h, + dilation_w, dilation_h, pad_w, pad_h, bias_term, + param.activation_param.has_active); + + // print_tensor_valid(check_host); + double max_ratio = 0.0; + double max_diff = 0.0; + tensor_cmp_host((const float*)output_host.data(), (const float*)check_host.data(), + check_host.valid_size(), max_ratio, max_diff); + if (max_ratio > eps) { + if (max_diff > eps){ + print_tensor_valid(weights_host); + print_tensor_valid(output_host); + print_tensor_valid(check_host); + LOG(FATAL) << " max_ratio = " << max_ratio << " max_diff = " << max_diff; + } + } + return 0; +} + +template +int count_diff(const dtype* src1, const dtype* src2, + int size, double max_ratio, + bool signed_input = false, bool wino = false) { + if (max_ratio <= 0) { + max_ratio = 0.1; + } + int count = 0; + if (wino) { + // It's a known issue that winograd convolution result is not bitwise identical as direct convolution result. 
+ return count; + } + for (int i = 0; i < size; ++i) { + if (signed_input && (fabs(src1[i] - src2[i]) <= 1)) + continue; + double ratio = fabs(src1[i] - src2[i]) / fabs(src1[i] + src2[i] + 1e-12); + if (ratio > max_ratio) { + ++count; + } + } + return count; +} + +template +int test_conv_results_x86_C16R(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool with_relu, + SaberImplStrategy strategy, ImplEnum imp,bool input_nchw=false, bool output_nhwc=false, + bool output_uint8=false) { + float abs_w_x=1.f; + float abs_b=2.f; + + LOG(INFO) << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? "true" : "false") + << " with_relu = " << (with_relu ? "true" : "false"); + Shape input_s; + if (input_nchw){ + input_s=Shape({input_num, in_channels, height, width}, Layout_NCHW); + }else{ + input_s=Shape({input_num, in_channels, height, width}, Layout_NCHW_C16R); + } + Shape weights_s({out_channels, in_channels / group, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + int out_height = (pad_h * 2 + height - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + int out_width = (pad_w * 2 + width - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + Shape output_dev_s; + if (output_nhwc){ + output_dev_s=Shape({input_num, out_height, out_width,out_channels}, Layout_NHWC); + }else{ + output_dev_s=Shape({input_num, out_channels, out_height, out_width}, Layout_NCHW_C16R); + } + + + // init input Tensor + Tensor input_dev; + Tensor input_host; + input_dev.re_alloc(input_s, AK_FLOAT); + input_host.re_alloc(input_s, AK_FLOAT); + + + fill_tensor_const(input_dev, abs_w_x); +// fill_tensor_seq(input_dev); +// fill_tensor_rand(input_dev, -abs_w_x, abs_w_x); + input_host.copy_from(input_dev); + + + // init weights Tensor + Tensor weights_dev; + Tensor weights_host; + weights_dev.re_alloc(weights_s, AK_FLOAT); + weights_host.re_alloc(weights_s, AK_FLOAT); + + fill_tensor_rand(weights_dev, -abs_w_x, abs_w_x); + bool nothing_flag = false; + std::string nothing_str = ""; +// fill_tensor_const(weights_dev, abs_w_x); +// load_tensor_in_io_format(weights_dev,nothing_flag,nothing_str,"../fp32/record+weights+conv+out+0+64_3_3_3_+nchw+ak_float+0.txt"); + + weights_host.copy_from(weights_dev); + + Tensor bias_dev; + Tensor bias_host; + + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); +// fill_tensor_const(bias_dev, 1); +// fill_tensor_const(bias_dev, abs_b); +// load_tensor_in_io_format(bias_dev,nothing_flag,nothing_str,"../fp32/record+bias+conv+out+0+1_64_1_1_+nchw+ak_float+0.txt"); + fill_tensor_rand(bias_dev, -abs_b, abs_b); + bias_host.copy_from(bias_dev); + } + + Tensor output_dev(output_dev_s); + if (output_uint8){ + output_dev.re_alloc(output_dev_s,AK_UINT8); + float max_out=(in_channels*kernel_h*kernel_w*abs_w_x*abs_w_x+abs_b); + output_dev.set_scale({max_out/127.f}); +// output_dev.set_scale({0.038397}); + LOG(INFO)<<"max out 
"< output_host(output_dev_s); + Tensor check_host; + fill_tensor_rand(output_dev, 0.f, 0.f); + Context ctx1(0, 1, 1); + // ActivationParam act_param(Active_relu); + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + if (with_relu) { + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + + Conv conv; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + conv.compute_output_shape(input_v, output_v, param); + SABER_CHECK(conv.init(input_v, output_v, param, strategy, imp, ctx1)); + SABER_CHECK(conv(input_v, output_v, param, ctx1)); + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + + Tensor nchw_input_tensor(Shape({input_num, in_channels, height, width})); + reorder_nchwc_nchw(input_host, nchw_input_tensor); + check_host.re_alloc(Shape({input_num, out_channels, out_height, out_width}), AK_FLOAT); + Tensor nchw_output_check(check_host.valid_shape()); + conv_basic_check(nchw_input_tensor, check_host, + (const float*)weights_host.data(), (const float*)bias_host.data(), + group, kernel_w, kernel_h, stride_w, stride_h, + dilation_w, dilation_h, pad_w, pad_h, bias_term, + param.activation_param.has_active); + LOG(INFO) << "cal check finish"; + Tensor nchwc16_output_check(check_host.valid_shape()); + if (output_nhwc){ + anakin::saber::reorder_nhwc_nchw(output_dev, nchwc16_output_check); + }else{ + anakin::saber::reorder_nchwc_nchw(output_dev, nchwc16_output_check); + } + + double max_ratio = 0.0; + double max_diff = 0.0; + if (output_uint8){ + tensor_cmp_host_mlu((const float*)nchwc16_output_check.data(), (const float*)check_host.data(), + check_host.valid_size(), max_ratio, max_diff); + if (max_ratio < 0.15) { + LOG(INFO)<<"mean ak "< +int test_conv_results_x86_C8R(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool with_relu, + SaberImplStrategy strategy, ImplEnum imp) { + + LOG(INFO) << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? "true" : "false") + << " with_relu = " << (with_relu ? 
"true" : "false"); + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW_C8R); + Shape weights_s({out_channels, in_channels / group, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + int out_height = (pad_h * 2 + height - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + int out_width = (pad_w * 2 + width - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + Shape output_dev_s({input_num, out_channels, out_height, out_width}, Layout_NCHW_C8R); + // init input Tensor + Tensor input_dev; + Tensor input_host; + input_dev.re_alloc(input_s, AK_FLOAT); + input_host.re_alloc(input_s, AK_FLOAT); +// { +// float *tmp= static_cast(input_dev.mutable_data()); +// for(int i=0;i weights_dev; + Tensor weights_host; + weights_dev.re_alloc(weights_s, AK_FLOAT); + weights_host.re_alloc(weights_s, AK_FLOAT); +// fill_tensor_const(weights_dev, 1.f); + // fill_tensor_seq(weights_dev); + fill_tensor_rand(weights_dev, -2.0f, 2.0f); + weights_host.copy_from(weights_dev); + + Tensor bias_dev; + Tensor bias_host; + + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + // fill_tensor_const(bias_dev, 3.f); + fill_tensor_rand(bias_dev, -2.0f, 2.0f); + bias_host.copy_from(bias_dev); + } + + Tensor output_dev(output_dev_s); + Tensor output_host(output_dev_s); + Tensor check_host; + fill_tensor_rand(output_dev, -2.0f, 2.0f); + Context ctx1(0, 1, 1); + // ActivationParam act_param(Active_relu); + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + if (with_relu) { + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + + Conv conv; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + // output_dev.set_layout_without_shape(Layout_NCHW_C8); + conv.compute_output_shape(input_v, output_v, param); + // LOG(INFO)<<"layout "<::API::stream_t stream = ctx1.get_compute_stream(); + // output_v[0]->record_event(stream); + // output_v[0]->sync(); + // output_host.re_alloc(output_dev.valid_shape(), AK_FLOAT); + // output_host.copy_from(output_dev); + + // print_tensor(input_dev); + // print_tensor(output_dev); + // print_tensor(output_host); + Tensor nchwc8_input_check(Shape({input_num, in_channels, height, width})); + anakin::saber::reorder_nchwc_nchw(input_host, nchwc8_input_check); + check_host.re_alloc(Shape({input_num, out_channels, out_height, out_width}), AK_FLOAT); + Tensor nchw_output_check(check_host.valid_shape()); + conv_basic_check(nchwc8_input_check, check_host, + (const float*)weights_host.data(), (const float*)bias_host.data(), + group, kernel_w, kernel_h, stride_w, stride_h, + dilation_w, dilation_h, pad_w, pad_h, bias_term, + param.activation_param.has_active); + LOG(INFO) << "cal check finish"; + // print_tensor_valid(check_host); + + // anakin::saber::input_reorder_nChwc8(check_host,nchw_output_check); + Tensor nchwc8_output_check(check_host.valid_shape()); + anakin::saber::reorder_nchwc_nchw(output_dev, nchwc8_output_check); + double max_ratio = 0.0; + double max_diff = 0.0; + tensor_cmp_host((const float*)nchwc8_output_check.data(), (const float*)check_host.data(), + check_host.valid_size(), max_ratio, max_diff); + + if (max_ratio > 1e-3 && max_diff > 1e-3) { + print_tensor(nchwc8_output_check); + print_tensor(check_host); +// print_tensor(input_host); +// print_tensor(weights_dev); + LOG(FATAL) << " max_ratio = " << max_ratio << " max_diff = " 
<< max_diff; + } else { + LOG(INFO) << "passed"; + } + + return 0; +} + + +template +int test_conv_results_x86(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool with_relu, + SaberImplStrategy strategy, ImplEnum imp) { + + LOG(INFO) << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? "true" : "false") + << " with_relu = " << (with_relu ? "true" : "false"); + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + int out_height = (pad_h * 2 + height - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + int out_width = (pad_w * 2 + width - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + Shape output_dev_s({input_num, (out_channels + 7) / 8, out_height, out_width, 8}, Layout_NCHW_C8); + // init input Tensor + Tensor input_dev; + Tensor input_host; + input_dev.re_alloc(input_s, AK_FLOAT); + input_host.re_alloc(input_s, AK_FLOAT); +// { +// float *tmp= static_cast(input_dev.mutable_data()); +// for(int i=0;i weights_dev; + Tensor weights_host; + weights_dev.re_alloc(weights_s, AK_FLOAT); + weights_host.re_alloc(weights_s, AK_FLOAT); + fill_tensor_const(weights_dev, 1.f); +// fill_tensor_rand(weights_dev, -2.0f, 2.0f); + weights_host.copy_from(weights_dev); + + Tensor bias_dev; + Tensor bias_host; + + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + // fill_tensor_const(bias_dev, 3.f); + fill_tensor_rand(bias_dev, -2.0f, 2.0f); + bias_host.copy_from(bias_dev); + } + + Tensor output_dev; + Tensor output_host; + Tensor check_host; + + Context ctx1(0, 1, 1); + // ActivationParam act_param(Active_relu); + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + if (with_relu) { + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + + Conv conv; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + // output_dev.set_layout_without_shape(Layout_NCHW_C8); + conv.compute_output_shape(input_v, output_v, param); +// LOG(INFO) << "layout " << output_dev.get_layout(); + output_dev.re_alloc(output_dev.valid_shape(), AK_FLOAT); + + // output_dev.re_alloc(output_dev_s, AK_FLOAT); + +// LOG(INFO) << "layout " << output_dev.get_layout(); + conv.init(input_v, output_v, param, strategy, imp, ctx1); +// LOG(INFO) << "layout " << output_dev.get_layout() << "," +// << output_dev.size() << "," < x86_timer; + x86_timer.start(ctx1); + + for (int i=0; i::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + output_host.re_alloc(output_dev.valid_shape(), AK_FLOAT); + output_host.copy_from(output_dev); + + // print_tensor(input_dev); + // print_tensor(output_dev); + // print_tensor(output_host); + 
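    // Illustrative helper (hypothetical name, not used by the test) capturing the
    // acceptance rule applied below: a case fails only when BOTH the worst relative
    // error and the worst absolute error exceed eps, so near-zero outputs with a
    // large relative error do not cause spurious failures.
    auto within_tolerance = [](double max_ratio, double max_diff, double eps) {
        return !(max_ratio > eps && max_diff > eps);
    };
    (void) within_tolerance;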
check_host.re_alloc(Shape({input_num, out_channels, out_height, out_width}), AK_FLOAT); + Tensor nchw_output_check(check_host.valid_shape()); conv_basic_check(input_host, check_host, (const float*)weights_host.data(), (const float*)bias_host.data(), group, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, pad_w, pad_h, bias_term, param.activation_param.has_active); -// print_tensor_valid(check_host); + // print_tensor_valid(check_host); + + // anakin::saber::input_reorder_nChwc8(check_host,nchw_output_check); + // Tensor nchwc8_output_check(check_host.valid_shape()); + // anakin::saber::reorder_nchwc8_nchw(output_host,nchwc8_output_check); double max_ratio = 0.0; double max_diff = 0.0; tensor_cmp_host((const float*)output_host.data(), (const float*)check_host.data(), check_host.valid_size(), max_ratio, max_diff); - if (max_ratio > 1e-3) { - print_tensor_valid(output_host); + if (max_ratio > 1e-3 && max_diff>1e-3) { +// print_tensor(output_dev); +// print_tensor(check_host); +// print_tensor(input_host); LOG(FATAL) << " max_ratio = " << max_ratio << " max_diff = " << max_diff; + } else { + LOG(INFO) << "passed "<<" max_ratio = " << max_ratio << " max_diff = " << max_diff; } + return 0; } +#if defined(USE_X86_PLACE) +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#define X86_CONV_ONE_TEST 1 +TEST(TestSaberFunc, test_saber_x86_conv_results) { + + Env::env_init(); + bool use_avx512=jit::mayiuse(jit::avx512_common); + bool use_avx2=jit::mayiuse(jit::avx2); + //#ifdef USE_OPENMP + // omp_set_dynamic(0); + // omp_set_num_threads(1); + //#endif + + SaberImplStrategy strategy = SPECIFY; + ImplEnum imp = SABER_IMPL; +#if X86_CONV_ONE_TEST + int group = 1; + int input_num = 1; + int in_channels = 3; + int height = 224; + int width = 224; + int out_channels = 64; + int kernel_h = 3; + int kernel_w = 3; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + int pad_h = 1; + int pad_w = 1; + bool bias_term = true; + bool with_relu = true; +#else + + std::vector kernel_h_v{1, 3}; + std::vector kernel_w_v{1, 3}; + std::vector pad_h_v{0, 1}; + std::vector pad_w_v{0, 1}; + std::vector stride_h_v{1, 2}; + std::vector stride_w_v{1, 2}; + std::vector dilation_h_v{1, 2}; + std::vector dilation_w_v{1, 2}; + std::vector in_channels_v{16}; + std::vector out_channels_v{32}; + std::vector group_v{1}; + std::vector in_h_v{12, 21}; + std::vector in_w_v{12, 21}; + std::vector input_num_v{1, 3}; + std::vector bias_term_v{true, false}; + std::vector with_relu_v{true, false}; + for (auto group : group_v) { + for (auto input_num : input_num_v) { + for (auto out_channels : out_channels_v) { + for (auto in_channels : in_channels_v) { + for (auto kernel_h : kernel_h_v) { + for (auto kernel_w : kernel_w_v) { + for (auto height : in_h_v) { + for (auto width : in_w_v) { + for (auto stride_h : stride_h_v) { + for (auto stride_w : stride_w_v) { + for (auto dilation_h : dilation_h_v) { + for (auto dilation_w : dilation_w_v) { + for (auto pad_h : pad_h_v) { + for (auto pad_w : pad_w_v) { + for (auto bias_term : bias_term_v) { + for (auto with_relu : with_relu_v) { +#endif +if(use_avx512) { + for (int i = 0; i < 1; i++) { + test_conv_results_x86_C16R(group, + input_num, in_channels, + height, width, + out_channels, kernel_h, + kernel_w, + stride_h, stride_w, + dilation_h, dilation_w, + pad_h, pad_w, bias_term, + with_relu, + strategy, SABER_IMPL, true, true,true); + } +} + +// +//if(use_avx2) { +// for (int i = 0; i < 1; i++) { +// test_conv_results_x86_C8R(group, +// input_num, 
in_channels, +// height, width, +// out_channels, kernel_h, +// kernel_w, +// stride_h, stride_w, +// dilation_h, dilation_w, +// pad_h, pad_w, bias_term, +// with_relu, +// strategy, SABER_IMPL); +// } +//} + +// for (int i = 0; i < 1; i++) { +// test_conv_results_x86(group, +// input_num, +// in_channels, +// height, +// width, +// out_channels, +// kernel_h, +// kernel_w, +// stride_h, +// stride_w, +// dilation_h, +// dilation_w, +// pad_h, +// pad_w, +// bias_term, +// with_relu, +// strategy, +// imp); +// } + +#if !X86_CONV_ONE_TEST + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } +#endif + + +} + +#endif + TEST(TestSaberFunc, test_saber_cuda_conv_results) { #ifdef USE_CUDA Env::env_init(); Env::env_init(); #endif - std::vector kernel_h_v{1, 3}; + std::vector kernel_h_v {1, 3}; std::vector kernel_w_v{1, 3}; std::vector pad_h_v{0, 1}; std::vector pad_w_v{0, 1}; @@ -299,79 +1007,478 @@ TEST(TestSaberFunc, test_saber_cuda_conv_results) { std::vector dilation_w_v{1, 2}; std::vector in_channels_v{4, 8}; std::vector out_channels_v{4, 8}; -// std::vector group_v{1, 2, 32}; + // std::vector group_v{1, 2, 32}; std::vector in_h_v{24, 36}; std::vector in_w_v{24, 36}; std::vector input_num_v{1, 3}; std::vector bias_term_v{true, false}; std::vector with_relu_v{true, false}; #ifdef USE_CUDA + if (RUN_BASIC_TEST) { - for (auto input_num : input_num_v) { - for (auto out_channels : out_channels_v) { - for (auto in_channels : in_channels_v) { - for (auto kernel_h : kernel_h_v) { - for (auto kernel_w : kernel_w_v) { - for (auto height : in_h_v) { - for (auto width : in_w_v) { - for (auto stride_h : stride_h_v) { - for (auto stride_w : stride_w_v) { - for (auto dilation_h : dilation_h_v) { - for (auto dilation_w : dilation_w_v) { - for (auto pad_h : pad_h_v) { - for (auto pad_w : pad_w_v) { - for (auto bias_term : bias_term_v) { - for (auto with_relu : with_relu_v) { - test_conv_results(1, - input_num, - in_channels, - height, - width, - out_channels, - kernel_h, - kernel_w, - stride_h, stride_w, - dilation_h, dilation_w, - pad_h, pad_w, bias_term, - with_relu, - SPECIFY, - VENDER_IMPL); - test_conv_results(1, - input_num, - in_channels, - height, - width, - out_channels, - kernel_h, - kernel_w, - stride_h, stride_w, - dilation_h, dilation_w, - pad_h, pad_w, bias_term, - with_relu, - SPECIFY, - SABER_IMPL); - } - } - } - } - } - } - } + for (auto input_num : input_num_v) { + for (auto out_channels : out_channels_v) { + for (auto in_channels : in_channels_v) { + for (auto kernel_h : kernel_h_v) { + for (auto kernel_w : kernel_w_v) { + for (auto height : in_h_v) { + for (auto width : in_w_v) { + for (auto stride_h : stride_h_v) { + for (auto stride_w : stride_w_v) { + for (auto dilation_h : dilation_h_v) { + for (auto dilation_w : dilation_w_v) { + for (auto pad_h : pad_h_v) { + for (auto pad_w : pad_w_v) { + for (auto bias_term : bias_term_v) { + for (auto with_relu : with_relu_v) { + test_conv_results(1, + input_num, + in_channels, + height, + width, + out_channels, + kernel_h, + kernel_w, + stride_h, stride_w, + dilation_h, dilation_w, + pad_h, pad_w, bias_term, + with_relu, + SPECIFY, + VENDER_IMPL); + test_conv_results(1, + input_num, + in_channels, + height, + width, + out_channels, + kernel_h, + kernel_w, + stride_h, stride_w, + dilation_h, dilation_w, + pad_h, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } } + +#endif +} + + +TEST(TestSaberFunc, test_saber_arm_conv_results) { +#ifdef USE_ARM_PLACE + + 
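    //! The blocks below exercise the specialized ARM kernels one family at a time:
    //! 1x1s1, 3x3s1 direct, 3x3s1 winograd, 3x3s2, 3x3 depthwise, 5x5 depthwise
    //! (s1 and s2p2) and the generic gemm fallback, each swept over {1, 2, 4}
    //! threads; the winograd block relaxes eps to 1e-2, the others use 1e-3, and
    //! a few blocks are currently compiled out with #if 0.
    //! Output sizes follow (h + 2 * pad - (dila * (k - 1) + 1)) / stride + 1,
    //! e.g. a 3x3, stride-2, pad-1 conv over a 112x112 input gives 56x56.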
Env::env_init(); +//!ToDO add set_run_mode interface + +//! conv1x1s1 +#if 1 + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1, 2}) { + for (auto out_channels : {1, 5, 16}) { + for (auto in_channels : {1, 3, 8}) { + for (auto kernel_w : {1}) { + for (auto height : {1, 3, 8, 15, 28, 32, 38, 75}) { + for (auto stride_w : {1}) { + for (auto dilation_w : {1}) { + for (auto pad_w : {0}) { + for (auto bias_term : {false, true}) { + for (auto with_relu : {false, true}) { + for (auto group: {1, 2, 4}){ + for (auto threads: {1, 2, 4}){ + if (in_channels % group != 0 || out_channels % group != 0) { + continue; + } + int width = height; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 1e-3f, + threads); + } + } + } + } + } + } + } + } + } + } + } + } } +#endif + +//! conv3x3s1(not winograd) +#if 0 + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1, 2}) { + for (auto out_channels : {3, 5, 16}) { + for (auto in_channels : {1, 3, 8}) { + for (auto kernel_w : {3}) { + for (auto height : {3, 4, 15, 28, 32, 38, 75, 112}) { + for (auto stride_w : {1}) { + for (auto dilation_w : {1}) { + for (auto pad_w : {0, 1, 2}) { + for (auto bias_term : {false, true}) { + for (auto with_relu : {false, true}) { + for (auto group: {1}){ + for (auto threads: {1, 2, 4}){ + if (in_channels % group != 0 || out_channels % group != 0) { + continue; + } + int width = height; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 12-3f, + threads); + } + } + } + } + } + } + } + } + } + } + } + } } +#endif + +//! conv3x3s1(winograd) +#if 0 + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1, 2}) { + for (auto out_channels : {32, 64}) { + for (auto in_channels : {32, 64}) { + for (auto kernel_w : {3}) { + for (auto height : {38, 75, 112}) { + for (auto stride_w : {1}) { + for (auto dilation_w : {1}) { + for (auto pad_w : {0, 1, 2}) { + for (auto bias_term : {false, true}) { + for (auto with_relu : {false, true}) { + for (auto group: {1}){ + for (auto threads: {1, 2, 4}){ + if (in_channels % group != 0 || out_channels % group != 0) { + continue; + } + int width = height; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 1e-2f, + threads); + } + } + } + } + } + } + } + } + } + } + } + } } +#endif + +//! 
conv3x3s2 +#if 1 + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1, 2}) { + for (auto out_channels : {3, 5, 16}) { + for (auto in_channels : {1, 3, 8}) { + for (auto kernel_w : {3}) { + for (auto height : {7, 15, 28, 32, 38, 75, 112}) { + for (auto stride_w : {2}) { + for (auto dilation_w : {1}) { + for (auto pad_w : {0, 1, 2}) { + for (auto bias_term : {false, true}) { + for (auto with_relu : {false, true}) { + for (auto group: {1}){ + for (auto threads: {1, 2, 4}){ + if (in_channels % group != 0 || out_channels % group != 0) { + continue; + } + int width = height; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 1e-3f, + threads); + } + } + } + } + } + } + } + } + } + } + } + } } +#endif + +//! conv3x3dw +#if 1 + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1, 2}) { + for (auto in_channels : {3, 5, 16}) { + for (auto kernel_w : {3}) { + for (auto height : {15, 28, 32, 38, 75, 112}) { + for (auto stride_w : {1, 2}) { + for (auto dilation_w : {1}) { + for (auto pad_w : {0, 1}) { + for (auto bias_term : {false, true}) { + for (auto with_relu : {false, true}) { + for (auto threads: {1, 2, 4}){ + int width = height; + int out_channels = in_channels; + int group = in_channels; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 1e-3f, + threads); + } + } + } + } + } + } + } + } + } + } } +#endif + +//! conv5x5s1dw +#if 0 +#ifdef __aarch64__ + + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1}) { + for (auto in_channels : {3}) { + for (auto kernel_w : {5}) { + for (auto height : {15}) { + for (auto stride_w : {1}) { + for (auto dilation_w : {1}) { + for (auto pad_w : {0}) { + for (auto bias_term : {false}) { + for (auto with_relu : {false}) { + for (auto threads: {1, 2, 4}){ + int width = height; + int out_channels = in_channels; + int group = in_channels; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 1e-3f, + threads); + } + } + } + } + } + } + } + } + } + } } +#endif +#endif + +//! conv5x5s2p2 dw +#if 1 + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1, 2}) { + for (auto in_channels : {3, 5, 16, 32}) { + for (auto kernel_w : {5}) { + for (auto height : {5, 15, 28, 32, 38, 75, 112}) { + for (auto stride_w : {2}) { + for (auto dilation_w : {1}) { + for (auto pad_w : {2}) { + for (auto bias_term : {false, true}) { + for (auto with_relu : {false, true}) { + for (auto threads: {1, 2, 4}){ + int width = height; + int out_channels = in_channels; + int group = in_channels; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 1e-3f, + threads); + } + } + } + } + } + } + } + } + } + } } +#endif + +//! 
otherwise conv, invoke gemm +#if 1 + if (RUN_BASIC_TEST_ARM) { + for (auto input_num : {1, 2}) { + for (auto out_channels : {4, 8, 16}) { + for (auto in_channels : {1, 4, 8}) { + for (auto kernel_w : {2, 4, 5}) { + for (auto height : {15, 28, 32, 38, 75, 112}) { + for (auto stride_w : {1, 2, 4}) { + for (auto dilation_w : {1, 2}) { + for (auto pad_w : {0, 1, 2}) { + for (auto bias_term : {false, true}) { + for (auto with_relu : {false, true}) { + for (auto group: {1, 2}){ + for (auto threads: {1, 2, 4}){ + if (in_channels % group != 0 || out_channels % group != 0) { + continue; + } + int width = height; + test_conv_results( group, + input_num, + in_channels, + height, + width, + out_channels, + kernel_w, + kernel_w, + stride_w, stride_w, + dilation_w, dilation_w, + pad_w, pad_w, bias_term, + with_relu, + SPECIFY, + SABER_IMPL, + 1e-3f, + threads); + } + } + } + } + } + } + } + } + } + } + } + } } #endif -} +#endif +} int main(int argc, const char** argv) { // initial logger - //logger::init(argv[0]); + logger::init(argv[0]); InitTest(); RUN_ALL_TESTS(argv[0]); return 0; diff --git a/test/saber/test_saber_conv_eltwise.cpp b/test/saber/test_saber_conv_eltwise.cpp index df328a97e..a37cd3bfb 100644 --- a/test/saber/test_saber_conv_eltwise.cpp +++ b/test/saber/test_saber_conv_eltwise.cpp @@ -221,8 +221,9 @@ int test_conv_results(int group, group, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, pad_w, pad_h, bias_term, param.activation_param.has_active, 1.f); - +#ifdef USE_CUDA cudaDeviceSynchronize(); +#endif conv.init(input_v, output_v, conv_eltwise_param, strategy, imp, ctx1); conv.trans_weights(*param.mutable_weight(), *param.mutable_bias(), param.pad_h, param.pad_w, param.dilation_h, param.dilation_w, diff --git a/test/saber/test_saber_conv_eltwise_int8.cpp b/test/saber/test_saber_conv_eltwise_int8.cpp new file mode 100644 index 000000000..83d7eebeb --- /dev/null +++ b/test/saber/test_saber_conv_eltwise_int8.cpp @@ -0,0 +1,535 @@ +#include "saber/core/context.h" +#include "saber/funcs/conv_eltwise.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include "conv_func_helper.h" +#include "saber/core/tensor_op.h" +#include +#if defined(USE_X86_PLACE) +#include "jit_generator.h" +#endif +using namespace anakin::saber; + +template +int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio) { + if (max_ratio <= 0) { + max_ratio = 0.1; + } + + int count = 0; + + for (int i = 0; i < size; ++i) { + double ratio = fabs(src1[i] - src2[i]) / fabs(src1[i] + src2[i] + 1e-12); + + if (ratio > max_ratio) { + ++count; + } + } + + return count; +} + +template +int test_conv_results(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool relu, + SaberImplStrategy strategy, ImplEnum imp) { + + LOG(INFO) << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? 
"true" : "false"); + +#ifdef USE_X86_PLACE + Shape input_s({input_num, height, width, in_channels}, Layout_NHWC); + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + Shape weights_s_dw({group, in_channels / group, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + int out_height = (pad_h * 2 + height - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + int out_width = (pad_w * 2 + width - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + Shape output_s({input_num, out_height, out_width, out_channels}, Layout_NHWC); + + // init input Tensor + Tensor input_dev; + Tensor input_host; + input_dev.re_alloc(input_s, AK_UINT8); + input_host.re_alloc(input_s, AK_UINT8); + fill_tensor_rand(input_dev, 0.0f, 32.0f); + input_host.copy_from(input_dev); + input_dev.set_scale({1 / 512.f}); + // LOG(INFO) << input_dev.get_scale()[0]; + + // init weights Tensor + Tensor weights_dev; + Tensor weights_host; + + if (group > 1) { + weights_dev.re_alloc(weights_s_dw, AK_INT8); + weights_host.re_alloc(weights_s_dw, AK_INT8); + } else { + weights_dev.re_alloc(weights_s, AK_INT8); + weights_host.re_alloc(weights_s, AK_INT8); + } + + fill_tensor_rand(weights_dev, -64.0f, 64.0f); + weights_host.copy_from(weights_dev); + std::vector scale_w_init; + + for (int i = 0; i < out_channels; i ++) { + scale_w_init.push_back(1 / 128.f); + } + + weights_dev.set_scale(scale_w_init); + + // int bias + Tensor bias_dev; + Tensor bias_host; + + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_INT32); + bias_host.re_alloc(bias_s, AK_INT32); + fill_tensor_rand(bias_dev, -1.0f, 1.0f); + bias_host.copy_from(bias_dev); + } + + Context ctx1(0, 1, 1); + ActivationParam act_param; + + if (relu) { + ActivationParam act_relu_param(Active_relu); + act_param = act_relu_param; + } + + ConvParam conv_param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, bias_term ? 
&bias_dev : nullptr, + act_param,1.f,0.f,round_mode::nearest); + + std::vector coeff; + coeff.push_back(1.0f); + coeff.push_back(0.5f); + EltwiseParam elt_param(Eltwise_sum, coeff); + ConvEltwiseParam param(conv_param, elt_param); + + // init output Tensor + Tensor output_dev; + Tensor output_host; + Tensor check_host; + + if (conv_param.activation_param.has_active) { + output_dev.re_alloc(output_s, AK_UINT8); + output_host.re_alloc(output_s, AK_UINT8); + output_dev.set_scale({1 / 256.0f}); + check_host.re_alloc(output_host.valid_shape(), AK_UINT8); + } else { + output_dev.re_alloc(output_s, AK_INT8); + output_host.re_alloc(output_s, AK_INT8); + output_dev.set_scale({1 / 128.0f}); + check_host.re_alloc(output_host.valid_shape(), AK_INT8); + } + + fill_tensor_const(output_dev, 4.0f); + check_host.copy_from(output_dev); + output_host.copy_from(output_dev); + + ConvEltwise conv_eltwise; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + + if (conv_eltwise.init(input_v, output_v, param, strategy, imp, ctx1) == SaberSuccess) { + conv_eltwise(input_v, output_v, param, ctx1); + } else { + LOG(INFO) << "conv_eltwise init fail"; + return 0; + } + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + + if (conv_param.activation_param.has_active) { + output_host.re_alloc(output_dev.valid_shape(), AK_UINT8); + output_host.copy_from(output_dev); + } else { + output_host.re_alloc(output_dev.valid_shape(), AK_INT8); + output_host.copy_from(output_dev); + } + + // calc scale info + std::vector scale; + float scale_in = input_dev.get_scale()[0]; + float scale_out = output_dev.get_scale()[0]; + auto scale_w = weights_dev.get_scale(); + std::vector().swap(scale); + + for (int i = 0; i < scale_w.size(); i++) { + scale.push_back((scale_w[i]*scale_in) / scale_out); + } + + conv_basic_check_int8(input_host, check_host, + (const char*)weights_host.data(), bias_term ? (const int*)bias_host.data() : nullptr, + group, kernel_w, kernel_h, stride_w, stride_h, + dilation_w, dilation_h, pad_w, pad_h, bias_term, + conv_param.activation_param.has_active, scale, &elt_param); + int count = count_diff((const unsigned char*)output_host.data(), + (const unsigned char*)check_host.data(), check_host.valid_size(), 2e-1); + + + if ((double)count / output_host.valid_size() < 0.02) { + LOG(INFO) << "PASS!!! count = " << count; + return 0; + } else { + print_tensor_valid(output_host); + print_tensor_valid(check_host); + LOG(FATAL) << "FAIL!!! 
count = " << count + << " conv param: " + << " group = " << group + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels; + return -1; + } +#endif +} + +#ifdef USE_X86_PLACE +template +int test_conv_results_nhwc(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool with_relu, + SaberImplStrategy strategy, ImplEnum imp,bool is_unsigned=true) { + + LOG(INFO)<< " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? "true" : "false"); + + float input_max=5.f; + Shape input_nhwc({input_num, height, width, in_channels}, Layout_NHWC); + Shape input_nchw({input_num, in_channels, height, width}, Layout_NCHW); + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + int out_height = (pad_h * 2 + height - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + int out_width = (pad_w * 2 + width - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + Shape output_nhwc({input_num, out_height, out_width, out_channels}, Layout_NHWC); + Shape output_nchw({input_num, out_channels, out_height, out_width}, Layout_NCHW); + + // init input Tensor + Tensor input_dev; + Tensor input_host; + Tensor input_dev_temp; + input_dev.re_alloc(input_nhwc, AK_INT8); + input_dev_temp.re_alloc(input_nchw, AK_INT8); + input_host.re_alloc(input_nchw, AK_FLOAT); + bool nothing_flag = false; + std::string nothing_str = ""; + + fill_tensor_rand(input_host,-input_max,input_max); +// load_tensor_in_io_format(input_host,nothing_flag,nothing_str,"record+ConvEltwise+res2a_branch2c+in+0+1_64_56_56_+nchw+ak_float+0.txt"); + input_host.set_scale({input_max/127.f}); + utils::ScaleUtils::scale_fp32_int8(input_dev_temp,input_host); + reorder_nhwc_nchw(input_dev_temp,input_dev); + input_dev.set_scale(input_host.get_scale()); + + + // init weights Tensor + Tensor weights_dev; + Tensor weights_host; + weights_dev.re_alloc(weights_s, AK_FLOAT); + weights_host.re_alloc(weights_s, AK_FLOAT); + + fill_tensor_rand(weights_dev,-input_max,input_max); +// load_tensor_in_io_format(weights_dev,nothing_flag,nothing_str,"record+weights+conv_eltwise+out+0+256_64_1_1_+nchw+ak_float+0.txt"); + weights_host.copy_from(weights_dev); + + + Tensor bias_dev; + Tensor bias_host; + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + fill_tensor_rand(bias_dev, -input_max, input_max); +// fill_tensor_const(bias_dev, 0.f); +// load_tensor_in_io_format(bias_dev,nothing_flag,nothing_str,"record+bias+conv_eltwise+out+0+1_256_1_1_+nchw+ak_float+0.txt"); + 
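            // The commented-out load_tensor_in_io_format() calls in these int8 tests
            // appear to replay tensors dumped from a real network run (the
            // "record+..." text files) in place of the random fills, which helps when
            // reproducing a mismatch on a specific layer; by default random data is used.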
bias_host.copy_from(bias_dev); + } + Tensor output_load_temp_fp32(output_nchw,AK_FLOAT); + Tensor output_load_temp_int8(output_nchw,AK_INT8); +// fill_tensor_const(output_load_temp_fp32,0); + fill_tensor_rand(output_load_temp_fp32,-input_max,input_max); +// load_tensor_in_io_format(output_load_temp_fp32,nothing_flag,nothing_str,"record+pre_out+conv_eltwise+out+3+1_256_56_56_+nchw+ak_float+0.txt"); + Tensor output_dev(output_nhwc,AK_INT8); + + output_dev.set_scale({(in_channels*kernel_h*kernel_w*input_max)/127.f}); + +// float elt_scale=0.019590; + float elt_scale=input_max/127.f; + Tensor output_host(output_nchw); + Tensor check_host(output_nchw); + check_host.copy_from(output_load_temp_fp32); + output_load_temp_int8.set_scale({elt_scale}); + output_load_temp_fp32.set_scale({elt_scale}); + LOG(INFO)<<"out scale "< ctx1(0, 1, 1); + EltwiseParam elt_param(Eltwise_sum,{1,1}); + + ConvParam conv_param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + if (with_relu) { + ActivationParam act_param(Active_relu); + conv_param.activation_param = act_param; + elt_param.activation_param=act_param; + } +// EltwiseParam elt_param(Eltwise_sum,{1,0.019590}); + conv_param.beta=elt_scale; + conv_param.beta_type=AK_INT8; + + ConvEltwiseParam conv_elt_param(conv_param,elt_param); + ConvEltwise conv; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); +// write_tensorfile(output_dev,"init_output",false); + conv.compute_output_shape(input_v, output_v, conv_elt_param); + + + conv.init(input_v, output_v, conv_elt_param, strategy, imp, ctx1); + + conv(input_v, output_v, conv_elt_param, ctx1); + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + reorder_nhwc_nchw(output_dev,output_host); + + conv_basic_check(input_host, check_host, + (const float*)weights_host.data(), (const float*)bias_host.data(), + group, kernel_w, kernel_h, stride_w, stride_h, + dilation_w, dilation_h, pad_w, pad_h, bias_term, + conv_elt_param.conv_param.activation_param.has_active,1.f); +// print_tensor_valid(check_host); + double max_ratio = 0.0; + double max_diff = 0.0; + //tensor_cmp_host((const float*)output_host.data(), (const float*)check_host.data(), + // check_host.valid_size(), max_ratio, max_diff); + tensor_cmp_host_mlu((const float*)output_host.data(), (const float*)check_host.data(), + check_host.valid_size(), max_ratio, max_diff); + +// int count = count_diff((const float*)output_host.data(), +// (const float*)check_host.data(), check_host.valid_size(), 2e-1); + if (max_ratio< 0.15) { + //LOG(INFO) << " PASS!!! max_ratio = " << max_ratio << " max_diff = " << max_diff; + write_tensorfile(output_host,"output_host"); + write_tensorfile(check_host,"check_host"); + LOG(INFO) << "PASS!!! ratio = " << max_ratio <<" in "< +#if defined(USE_X86_PLACE) +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#endif using namespace anakin::saber; -#define BASIC_TEST false +#define BASIC_TEST true template -int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio) { +int count_diff(const dtype* src1, const dtype* src2, + int size, double max_ratio, + bool signed_input = false, bool wino = false) { if (max_ratio <= 0) { max_ratio = 0.1; } int count = 0; + if (wino) { + // It's a known issue that winograd convolution result is not bitwise identical as direct convolution result. 
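            // Winograd evaluates the same convolution through input/weight/output
            // transforms, so its floating-point rounding order differs from direct
            // convolution; exact per-element counting is therefore skipped and the
            // winograd path always reports zero mismatches here.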
+ return count; + } for (int i = 0; i < size; ++i) { + if (signed_input && (fabs(src1[i] - src2[i]) <= 1)) + continue; double ratio = fabs(src1[i] - src2[i]) / fabs(src1[i] + src2[i] + 1e-12); if (ratio > max_ratio) { ++count; @@ -24,13 +35,14 @@ int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio) return count; } -template -int test_conv_results(int group, - int input_num, int in_channels, int height, int width, - int out_channels, int kernel_h, int kernel_w, - int stride_h, int stride_w, int dilation_h, int dilation_w, - int pad_h, int pad_w, bool bias_term, - SaberImplStrategy strategy, ImplEnum imp) { +#ifdef USE_X86_PLACE +template +int test_conv_results_nhwc(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool with_relu, + SaberImplStrategy strategy, ImplEnum imp,bool is_unsigned=true) { LOG(INFO)<< " conv param: " << " input_num = " << input_num @@ -48,6 +60,176 @@ int test_conv_results(int group, << " kernel_w = " << kernel_w << " out_channels = " << out_channels << " bias_term = " << (bias_term ? "true" : "false"); + float input_max=1.f; + Shape input_nhwc({input_num, height, width, in_channels}, Layout_NHWC); + Shape input_nchw({input_num, in_channels, height, width}, Layout_NCHW); + Shape weights_s({out_channels, in_channels/group, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + int out_height = (pad_h * 2 + height - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + int out_width = (pad_w * 2 + width - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + Shape output_nhwc({input_num, out_height, out_width, out_channels}, Layout_NHWC); + Shape output_nchw({input_num, out_channels, out_height, out_width}, Layout_NCHW); + + // init input Tensor + Tensor input_dev; + Tensor input_host; + Tensor input_dev_temp; + if (is_unsigned) { + input_dev.re_alloc(input_nhwc, AK_UINT8); + input_dev_temp.re_alloc(input_nchw, AK_UINT8); + }else{ + input_dev.re_alloc(input_nhwc, AK_INT8); + input_dev_temp.re_alloc(input_nchw, AK_INT8); + } + input_host.re_alloc(input_nchw, AK_FLOAT); + bool nothing_flag = false; + std::string nothing_str = ""; + + if (is_unsigned) { + fill_tensor_rand(input_host, 0.f, input_max); +// fill_tensor_const(input_host,input_max); + }else{ + fill_tensor_rand(input_host, -input_max, input_max); +// fill_tensor_const(input_host,input_max); + } +// load_tensor_in_io_format(input_host,nothing_flag,nothing_str,"record+ConvBatchnormScaleRelu+res2a_branch2a+in+0+1_64_56_56_+nchw+ak_float+0.txt"); + input_host.set_scale({input_max/127.f}); + if (is_unsigned) { + utils::ScaleUtils::scale_fp32_uint8(input_dev_temp, input_host); + }else{ + utils::ScaleUtils::scale_fp32_int8(input_dev_temp, input_host); + } + reorder_nhwc_nchw(input_dev_temp,input_dev); + input_dev.set_scale(input_host.get_scale()); + +// LOG(INFO) << input_dev.get_scale()[0]; + + // init weights Tensor + Tensor weights_dev; + Tensor weights_host; + weights_dev.re_alloc(weights_s, AK_FLOAT); + weights_host.re_alloc(weights_s, AK_FLOAT); + + fill_tensor_rand(weights_dev,-input_max,input_max); +// fill_tensor_const(weights_dev, input_max);// +// load_tensor_in_io_format(weights_dev,nothing_flag,nothing_str,"record+weights_int8+conv+out+0+64_64_1_1_+nchw+ak_float+0.txt"); + weights_host.copy_from(weights_dev); + + + Tensor bias_dev; + Tensor bias_host; + if (bias_term) { + 
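        // Unlike the conv_eltwise int8 test earlier in this patch, which quantizes
        // its bias to AK_INT32, this path keeps the bias tensors in FP32.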
bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + fill_tensor_rand(bias_dev,-input_max,input_max); +// fill_tensor_const(bias_dev,input_max); +// load_tensor_in_io_format(bias_dev,nothing_flag,nothing_str,"record+bias_int8+conv+out+0+1_64_1_1_+nchw+ak_float+0.txt"); + bias_host.copy_from(bias_dev); + } + Tensor output_dev(output_nhwc, OutPutDtype); + if (OutPutDtype == AK_UINT8 || OutPutDtype == AK_INT8) { + output_dev.set_scale({in_channels * kernel_h * kernel_w * input_max / 127.f}); + } + Tensor output_host(output_nchw); + Tensor check_host(output_nchw); + + Context ctx1(0, 1, 1); + + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + if (with_relu) { + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + Conv conv; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + conv.compute_output_shape(input_v, output_v, param); + + + SABER_CHECK(conv.init(input_v, output_v, param, strategy, imp, ctx1)); + + SABER_CHECK(conv(input_v, output_v, param, ctx1)); + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + reorder_nhwc_nchw(output_dev,output_host); + + conv_basic_check(input_host, check_host, + (const float*)weights_host.data(), (const float*)bias_host.data(), + group, kernel_w, kernel_h, stride_w, stride_h, + dilation_w, dilation_h, pad_w, pad_h, bias_term, + param.activation_param.has_active); + + double max_ratio = 0.0; + double max_diff = 0.0; + + tensor_cmp_host_mlu((const float*)output_host.data(), (const float*)check_host.data(), + check_host.valid_size(), max_ratio, max_diff); + + + if (max_ratio< 0.15) { + //LOG(INFO) << " PASS!!! max_ratio = " << max_ratio << " max_diff = " << max_diff; +// write_tensorfile(output_host,"output_host"); +// write_tensorfile(check_host,"check_host"); + LOG(INFO) << "PASS!!! ratio = " << max_ratio <<" in "< output_dev; Tensor output_host; Tensor check_host; + Tensor check_host_int8; Context ctx1(0, 1, 1); -// ActivationParam act_param(Active_relu); + + int generate_arch = Env::cur_env()[ctx1.get_device_id()]._info._generate_arch; + // only support 61 arch for now. 
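    // Compute capability 6.1 (e.g. GTX 1080, Tesla P4/P40) is the Pascal variant
    // that adds the dp4a int8 dot-product instruction, which is presumably what
    // restricts the int8 path to this arch; unsupported devices log a message and
    // skip the test instead of failing.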
+ bool arch_check = (generate_arch == 61); + if (!arch_check) { + LOG(INFO) << "device not support int8 op!!"; + return 0; + } + + ActivationParam act_param(Active_relu); ConvParam param(group, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, &weights_dev, &bias_dev); + if (with_relu) { + param.activation_param = act_param; + } Conv conv; std::vector* > input_v; std::vector* > output_v; @@ -99,8 +294,8 @@ int test_conv_results(int group, conv.init(input_v, output_v, param, strategy, imp, ctx1); conv.trans_weights(*param.mutable_weight(), *param.mutable_bias(), - param.pad_h, param.pad_w, param.dilation_h, param.dilation_w, - param.stride_h, param.stride_w, param.group, imp); + param.pad_h, param.pad_w, param.dilation_h, param.dilation_w, + param.stride_h, param.stride_w, param.group, imp); conv(input_v, output_v, param, ctx1); @@ -123,10 +318,12 @@ int test_conv_results(int group, //tensor_cmp_host((const float*)output_host.data(), (const float*)check_host.data(), // check_host.valid_size(), max_ratio, max_diff); int count = count_diff((const float*)output_host.data(), - (const float*)check_host.data(), check_host.valid_size(), 2e-1); + (const float*)check_host_int8.data(), check_host_int8.valid_size(), 2e-1); +// write_tensorfile(output_dev, "int8_output.txt"); +// write_tensorfile(check_host_int8, "fp32_output.txt"); if ((double)count / output_host.valid_size() < 0.02) { //LOG(INFO) << " PASS!!! max_ratio = " << max_ratio << " max_diff = " << max_diff; - LOG(INFO) << "PASS!!! count = " << count; + LOG(INFO) << "PASS!!! count = " << count; return 0; } else { write_tensorfile(output_dev, "int8_output.txt"); @@ -134,34 +331,169 @@ int test_conv_results(int group, // print_tensor_valid(output_host); // print_tensor_valid(check_host); //LOG(FATAL) << "FAIL!!! max_ratio = " << max_ratio << " max_diff = " << max_diff - LOG(FATAL) << "FAIL!!! count = " << count - << " conv param: " - << " input_num = " << input_num - << " in_channels = " << in_channels - << " height = " << height - << " width = " << width - << " group = " << group - << " pad_h = " << pad_h - << " pad_w = " << pad_w - << " stride_h = " << stride_h - << " stride_w = " << stride_w - << " dilation_h = " << dilation_h - << " dilation_w = " << dilation_w - << " kernel_h = " << kernel_h - << " kernel_w = " << kernel_w - << " out_channels = " << out_channels; + LOG(FATAL) << "FAIL!!! 
count = " << count + << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels; return -1; } } +template +int test_conv_results_s8s8(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool with_relu, + SaberImplStrategy strategy, ImplEnum imp) { + + LOG(INFO)<< " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? "true" : "false") + << " with_relu = " << (with_relu ? "true" : "false"); + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); + Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + + // init input Tensor + Tensor input_dev; + Tensor input_host; + input_dev.re_alloc(input_s, AK_FLOAT); + input_host.re_alloc(input_s, AK_FLOAT); + fill_tensor_rand(input_dev, -10.0f, 10.0f); + input_host.copy_from(input_dev); + input_dev.set_scale({10.1f / 128}); + + // init weights Tensor + Tensor weights_dev; + Tensor weights_host; + weights_dev.re_alloc(weights_s, AK_FLOAT); + weights_host.re_alloc(weights_s, AK_FLOAT); + fill_tensor_rand(weights_dev, -10.0f, 10.0f); + weights_host.copy_from(weights_dev); + + Tensor bias_dev; + Tensor bias_host; + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + fill_tensor_rand(bias_dev, -10.0f, 10.0f); + bias_host.copy_from(bias_dev); + } + Tensor output_dev; + output_dev.set_scale({200.1f / 128}); + Tensor output_host; + Tensor check_host; + + Context ctx1(0, 1, 1); + + int generate_arch = Env::cur_env()[ctx1.get_device_id()]._info._generate_arch; + // only support 61 arch for now. 
+ bool arch_check = (generate_arch == 61); + if (!arch_check) { + LOG(INFO) << "device not support int8 op!!"; + return 0; + } + + ActivationParam act_param(Active_relu); + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + if (with_relu) { + param.activation_param = act_param; + } + Conv conv; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + conv.compute_output_shape(input_v, output_v, param); + output_dev.re_alloc(output_dev.valid_shape(), AK_INT8); + + conv.init(input_v, output_v, param, strategy, imp, ctx1); + conv.trans_weights(*param.mutable_weight(), *param.mutable_bias(), + param.pad_h, param.pad_w, param.dilation_h, param.dilation_w, + param.stride_h, param.stride_w, param.group, imp); + + conv(input_v, output_v, param, ctx1); + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + output_host.re_alloc(output_dev.valid_shape(), AK_INT8); + output_host.copy_from(output_dev); + + check_host.re_alloc(output_host.valid_shape(), AK_FLOAT); + + conv_basic_check(input_host, check_host, + (const float*)weights_host.data(), (const float*)bias_host.data(), + group, kernel_w, kernel_h, stride_w, stride_h, + dilation_w, dilation_h, pad_w, pad_h, bias_term, + param.activation_param.has_active); +// print_tensor(output_dev); +// int count = count_diff((const float*)output_host.data(), +// (const float*)check_host.data(), +// check_host.valid_size(), 2e-1); +// write_tensorfile(output_dev, "int8_output.txt"); +// write_tensorfile(check_host, "fp32_output.txt"); +// if ((double)count / output_host.valid_size() < 0.02) { +// //LOG(INFO) << " PASS!!! max_ratio = " << max_ratio << " max_diff = " << max_diff; +// LOG(INFO) << "PASS!!! count = " << count; +// return 0; +// } else { +// LOG(FATAL) << "FAIL!!! 
count = " << count +// << " conv param: " +// << " input_num = " << input_num +// << " in_channels = " << in_channels +// << " height = " << height +// << " width = " << width +// << " group = " << group +// << " pad_h = " << pad_h +// << " pad_w = " << pad_w +// << " stride_h = " << stride_h +// << " stride_w = " << stride_w +// << " dilation_h = " << dilation_h +// << " dilation_w = " << dilation_w +// << " kernel_h = " << kernel_h +// << " kernel_w = " << kernel_w +// << " out_channels = " << out_channels; +// return -1; +// } +} + TEST(TestSaberFunc, test_saber_conv_int8_results) { #ifdef USE_CUDA Env::env_init(); Env::env_init(); #endif -#ifdef USE_X86_PLACE - Env::env_init(); -#endif + std::vector kernel_h_v{1, 3}; std::vector kernel_w_v{1, 3}; std::vector pad_h_v{0, 1}; @@ -170,44 +502,103 @@ TEST(TestSaberFunc, test_saber_conv_int8_results) { std::vector stride_w_v{1, 2}; std::vector dilation_h_v{1}; std::vector dilation_w_v{1}; - std::vector in_channels_v{ 4}; - std::vector out_channels_v{4, 8}; + std::vector in_channels_v{ 16, 32}; + std::vector out_channels_v{16, 32, 8}; // std::vector group_v{1, 2, 32}; - std::vector in_h_v{24, 36}; - std::vector in_w_v{24, 36}; - std::vector input_num_v{1, 3}; - std::vector bias_term_v{true, false}; + std::vector in_h_v{28}; + std::vector in_w_v{28}; + std::vector input_num_v{1}; + std::vector bias_term_v{true}; + std::vector with_relu_v{true}; + #ifdef USE_CUDA if (BASIC_TEST) { - for (auto input_num : input_num_v) - for (auto out_channels : out_channels_v) - for (auto in_channels : in_channels_v) - for (auto kernel_h : kernel_h_v) - for (auto kernel_w : kernel_w_v) - for (auto height : in_h_v) - for (auto width : in_w_v) - for (auto stride_h : stride_h_v) - for (auto stride_w : stride_w_v) - for (auto dilation_h : dilation_h_v) - for (auto dilation_w : dilation_w_v) - for (auto pad_h : pad_h_v) - for (auto pad_w : pad_w_v) - for (auto bias_term : bias_term_v) - test_conv_results(1, - input_num, - in_channels, - height, - width, - out_channels, - kernel_h, - kernel_w, - stride_h, stride_w, dilation_h, dilation_w, - pad_h, pad_w, bias_term, - SPECIFY, - VENDER_IMPL); + for (auto input_num : input_num_v) { + for (auto out_channels : out_channels_v) { + for (auto in_channels : in_channels_v) { + for (auto kernel_h : kernel_h_v) { + for (auto kernel_w : kernel_w_v) { + for (auto height : in_h_v) { + for (auto width : in_w_v) { + for (auto stride_h : stride_h_v) { + for (auto stride_w : stride_w_v) { + for (auto dilation_h : dilation_h_v) { + for (auto dilation_w : dilation_w_v) { + for (auto pad_h : pad_h_v) { + for (auto pad_w : pad_w_v) { + for (auto bias_term : bias_term_v) { + for (auto with_relu : with_relu_v) { + test_conv_results_s8s8(1, + input_num, + in_channels, + height, + width, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + pad_h, pad_w, + bias_term, + with_relu, + SPECIFY, + SABER_IMPL); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } } #endif } +TEST(TestSaberFunc, test_saber_conv_int8_x86_results) { +#ifdef USE_X86_PLACE + Env::env_init(); + + int group = 1; + int input_num = 1; + int in_channels = 23; + int height = 112; + int width = 112; + int out_channels = 64; + int kernel_h = 3; + int kernel_w = 3; + int stride_h = 1; + int stride_w = 1; + int dilation_h = 1; + int dilation_w = 1; + int pad_h = 3; + int pad_w = 3; + bool bias_term = true; + bool with_relu = false; + + if (jit::mayiuse(jit::avx512_core)&&jit::mayiuse(jit::avx512_core_vnni)) { + 
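        // Runs only on CPUs reporting both avx512_core and avx512_core_vnni; VNNI
        // (vpdpbusd) supplies the 8-bit dot-product these NHWC int8 kernels
        // presumably rely on, so on CPUs without it the case is silently skipped.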
test_conv_results_nhwc(group, + input_num, in_channels, + height, width, + out_channels, kernel_h, + kernel_w, + stride_h, stride_w, + dilation_h, dilation_w, + pad_h, pad_w, bias_term,with_relu, + SPECIFY, SABER_IMPL, false); + + } +#endif + +} int main(int argc, const char** argv) { diff --git a/test/saber/test_saber_conv_int8_arm.cpp b/test/saber/test_saber_conv_int8_arm.cpp new file mode 100644 index 000000000..259372542 --- /dev/null +++ b/test/saber/test_saber_conv_int8_arm.cpp @@ -0,0 +1,944 @@ +#include "saber/core/tensor_op.h" +#ifdef USE_ARM_PLACE +#include "saber/core/tensor_op.h" +#include "saber/funcs/timer.h" +#include "test/saber/test_saber_func.h" +#include "saber/funcs/conv.h" +#include "saber/funcs/impl/arm/neon/impl/conv_arm_impl.h" +#include "saber/funcs/type_trans.h" +using namespace anakin::saber; + + + +int g_cluster = 0; +int g_threads = 1; +int g_test_iter = 1; + +bool g_basic_test = false; +bool g_compare_result = true; +bool g_flag_relu = false; +bool g_flag_bias = false; + +int g_num = 1; +int g_chin = 4; +int g_h_in = 10; +int g_w_in = 10; + +int g_ch_out = 4; +int g_group = 1; +int g_kw = 1; +int g_pad_w = 0; +int g_stride_w = 1; +int g_dila_w = 1; +int g_kh = 1; +int g_pad_h = 0; +int g_stride_h = 1; +int g_dila_h = 1; + +typedef Tensor TensorH; + +/** + * \brief basic direct convolution function + */ +//! for float, dtype1 and type2 is float +//! for int8, dytpe1 is char, dtype2 is int +template +static void conv_basic(const Dtype1* din, Dtype2* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const Dtype1* weights, const Dtype2* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu) { + + Dtype2 beta = 0; + auto src_data = din; + auto dst_data_ref = dout; + auto weights_data = weights; + auto with_bias = flag_bias; + auto bias_data = bias; + + int in_num = num; + int out_channels = chout; + int out_h = hout; + int out_w = wout; + + int in_channel = chin; + int in_h = hin; + int in_w = win; + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + + for (int n = 0; n < in_num; ++n) { +#pragma omp parallel for collapse(4) + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * group * out_c_group * out_h * out_w + g * out_c_group * out_h * out_w + + oc * out_h * out_w + oh * out_w + ow; + Dtype2 bias_d = with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0; + dst_data_ref[out_idx] = bias_d;// + dst_data_ref[out_idx] * beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + + int iidx = n * in_channel * in_h * in_w + + g * in_c_group * in_h * in_w + + ic * in_h * in_w + + ih * in_w + + iw; + int widx = g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + + kh * kernel_w + + kw; + + dst_data_ref[out_idx] + += src_data[iidx] + * weights_data[widx]; + } + } + } + if (flag_relu) { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 ? 
dst_data_ref[out_idx] : (Dtype2)0; + } + } + } + } + } + } +} + +template +static int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio, float tensor_scale) { + double sum_abs1 = 0.0; + double sum_abs2 = 0.0; + for (int i = 0; i < size; ++i) { + sum_abs1 += fabs(src1[i]); + sum_abs2 += fabs(src2[i]); + } + double mean_abs1 = sum_abs1 / size; + double mean_abs2 = sum_abs2 / size; + double mean_val = (mean_abs1 + mean_abs2) / 2.0; + if (max_ratio <= 0) { + max_ratio = 0.1; + } + int count = 0; + for (int i = 0; i < size; ++i) { + double abs_diff = fabs(src1[i] - src2[i]); + double ratio = abs_diff / (fabs(src1[i] + src2[i]) + 1e-12); + if (ratio > max_ratio && abs_diff > (tensor_scale + 1e-5f) && abs_diff > mean_val * 0.1f) { + ++count; + } + } + return count; +} + +SaberStatus test_arm_conv_int8(int n, int c, int h, int w, \ + int ch_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, int pad_h, \ + int dila_w, int dila_h, int group, bool is_bias, bool is_relu, int thread_num, int cluster_id) { + + double to = 0; + double min_time = 1000000; + SaberTimer t1; + + Context ctx1; + PowerMode mode = static_cast(cluster_id); + ctx1.set_run_mode(mode, thread_num); + LOG(INFO) << "test threads activated"; +#pragma omp parallel + { +#ifdef USE_OPENMP + int thread = omp_get_num_threads(); + LOG(INFO) << "number of threads: " << thread; +#endif + } + + TensorH tout_basic_int32; + TensorH tout_basic_int8; + TensorH tout_saber_int32; + TensorH tout_saber_int8; + TensorH tout_basic_fp32; + TensorH tout_saber_fp32; + + TensorH thinf; + TensorH thinc; + Shape shin({n, c, h, w}); + thinf.re_alloc(shin, AK_FLOAT); + thinc.re_alloc(shin, AK_INT8); + + int num = n; + int chin = c; + int hin = h; + int win = w; + + LOG(INFO) << "conv param: "; + LOG(INFO) << " img_num = " << num << " in_channels = " << chin << " img_h = " << hin << " img_w = " << win; + LOG(INFO) << " ch_out = " << ch_out << " group = " << group + << " kernel_w = " << kernel_w << " kernel_h = " << kernel_h; + LOG(INFO) << " pad_width = " << pad_w << " pad_height = " << pad_h << \ + " stride_width = " << stride_w << " stride_height = " << stride_h << \ + " dilation_w = " << dila_w << " dilation_h = " << dila_h << \ + " bias flag = " << (is_bias? "true" : "false") << ", relu flag = " << (is_relu? "true" : "false"); + + int kernel_exten = dila_h * (kernel_h - 1) + 1; + int hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + + kernel_exten = dila_w * (kernel_w - 1) + 1; + int wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + + if (hout <= 0 || wout <= 0) { + return SaberSuccess; + } + + Shape shape_out({num, ch_out, hout, wout}); + + Shape shw({ch_out, chin / group, kernel_h, kernel_w}); + Shape shb({1, ch_out, 1, 1}); + + TensorH pweihtf; + TensorH pbiasf; + + TensorH pweihtc; + TensorH pbiasi; + + pweihtf.re_alloc(shw, AK_FLOAT); + //pbiasf.re_alloc(shb, AK_FLOAT); + + pweihtc.re_alloc(shw, AK_FLOAT); + //pbiasi.re_alloc(shb, AK_INT32); + + fill_tensor_rand(thinf, -1.f, 1.f); + fill_tensor_rand(pweihtf, -1.f, 1.f); + // fill_tensor_const(thinf, 1.f); + // fill_tensor_const(pweihtf, 1.f); + + LOG(INFO) << "get input scale"; + pweihtc.copy_from(pweihtf); + //! 
convert input data type + std::vector scale; + get_tensor_scale(thinf, scale, -1, 127.f); + thinf.set_scale(scale); + LOG(INFO) << "input tesnor scale at factor 127.f is " << thinf.get_scale()[0] << ", max_val: " << 127.f * thinf.get_scale()[0]; + + trans_tensor_dtype(thinf, thinc, scale[0], 1.f, {1.f}); + thinc.set_scale(scale); +// print_tensor(thinf); +// print_tensor(thinc); + + LOG(INFO) << "get weights scale"; + //! convert weight data type + + trans_weights_dtype(pweihtc, AK_INT8, 127.f, CONV_TYPE, group); + std::vector w_scale = pweihtc.get_scale(); + // LOG(INFO) << "input tesnor scale at factor 127.f is "; + // for (int j = 0; j < w_scale.size(); ++j) { + // LOG(INFO) << "|-- " << j << ": " << w_scale[j] << ", max_val: " << 127.f * w_scale[j]; + // } + if (is_bias){ + pbiasf.re_alloc(shb, AK_FLOAT); + pbiasi.re_alloc(shb, AK_INT32); + fill_tensor_rand(pbiasf, -1.f, 1.f); + trans_fp32_bias_to_int32(pbiasf, pbiasi, thinf.get_scale()[0], w_scale); + } + +// print_tensor(pweihtf); +// print_tensor(pweihtc); + + std::vector scale_out = {1.f}; + tout_saber_int8.set_scale(scale_out); + tout_basic_int8.set_scale(scale_out); + + //! get int8 and fp32 basic result + if (g_compare_result) { + LOG(INFO) << "run basic conv for precision comparation"; + const int8_t* dinc = static_cast(thinc.data()); + const int8_t* weightc = static_cast(pweihtc.data()); + const int* biasi = static_cast(pbiasi.data()); + const float* dinf = static_cast(thinf.data()); + const float* weightf = static_cast(pweihtf.data()); + const float* biasf = static_cast(pbiasf.data()); + tout_basic_fp32.re_alloc(shape_out, AK_FLOAT); + tout_basic_int32.re_alloc(shape_out, AK_INT32); + tout_basic_int8.re_alloc(shape_out, AK_INT8); + + float* dout_basic_fp32 = static_cast(tout_basic_fp32.mutable_data()); + int* dout_basic_int32 = static_cast(tout_basic_int32.mutable_data()); + + memset(dout_basic_fp32, 0, sizeof(float) * tout_basic_fp32.valid_size()); + memset(dout_basic_int32, 0, sizeof(float) * tout_basic_int32.valid_size()); + +// LOG(INFO) << "do basic fp32 conv"; +// conv_basic(dinf, dout_basic_fp32, num, ch_out, hout, wout, chin, hin, win, \ +// weightf, biasf, group, kernel_w, kernel_h, stride_w, stride_h, \ +// dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); + + LOG(INFO) << "do basic int8 conv, trans basic int32 to fp32"; + conv_basic(dinc, dout_basic_int32, num, ch_out, hout, wout, chin, hin, win, \ + weightc, biasi, group, kernel_w, kernel_h, stride_w, stride_h, \ + dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); + + LOG(INFO) << "trans basic int32 to int8"; + trans_tensor_dtype(tout_basic_int32, tout_basic_int8, thinf.get_scale()[0], tout_basic_int8.get_scale()[0], w_scale); + LOG(INFO) << "trans basic int32 to fp32"; + trans_tensor_dtype(tout_basic_int32, tout_basic_fp32, thinf.get_scale()[0], 1.f, w_scale); + +// print_tensor(tout_basic_fp32); + // LOG(INFO) << "basic in32 result"; + // print_tensor(tout_basic_int32); + } + + Conv conv_int8; + Conv conv_int8_fp32; + Conv conv_int8_int32; + + ConvParam param(group, pad_h, pad_w, stride_h, stride_w, dila_h, dila_w, &pweihtc, &pbiasf); + if (is_relu) { + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + std::vector tvin_fp32; + std::vector tvin_int8; + std::vector tvout_saber_fp32; + std::vector tvout_saber_int32; + std::vector tvout_saber_int8; + + tvin_fp32.push_back(&thinf); + tvin_int8.push_back(&thinc); + tvout_saber_fp32.push_back(&tout_saber_fp32); + tvout_saber_int32.push_back(&tout_saber_int32); + 
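+    // Three output tensors (fp32 / int32 / int8) are kept so that the shape inference,
+    // init, timing and accuracy checks below can exercise each output precision of the
+    // saber int8 conv independently.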
tvout_saber_int8.push_back(&tout_saber_int8); + + //! fp32 + conv_int8_fp32.compute_output_shape(tvin_int8, tvout_saber_fp32, param); + Shape sh_out_saber_fp32 = tvout_saber_fp32[0]->valid_shape(); + //! int32 + conv_int8_int32.compute_output_shape(tvin_int8, tvout_saber_int32, param); + Shape sh_out_saber_int32 = tvout_saber_int32[0]->valid_shape(); + //! int8 + conv_int8.compute_output_shape(tvin_int8, tvout_saber_int8, param); + Shape sh_out_saber = tvout_saber_int8[0]->valid_shape(); + + LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ + << shape_out[2] << ", " << shape_out[3]; + CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; + + //! re_alloc mem for output tensor +// LOG(INFO) << "re-alloc output memory"; + tvout_saber_int32[0]->re_alloc(shape_out, AK_INT32); + tvout_saber_fp32[0]->re_alloc(shape_out, AK_FLOAT); + tvout_saber_int8[0]->re_alloc(shape_out, AK_INT8); + + //! init the op + LOG(INFO) << "saber conv impl init"; + //! fp32 + auto states = conv_int8_fp32.init(tvin_int8, tvout_saber_fp32, param, SPECIFY, SABER_IMPL, ctx1); + // states = conv_int8.init(tvin_int8, tvout_saber_fp32, ctx1); + //! int32 + states = conv_int8_int32.init(tvin_int8, tvout_saber_int32, param, SPECIFY, SABER_IMPL, ctx1); + //! int8 + states = conv_int8.init(tvin_int8, tvout_saber_int8, param, SPECIFY, SABER_IMPL, ctx1); + CHECK_EQ(states, SaberSuccess) << "Saber conv init failed"; + + //! compute + LOG(INFO) << "saber conv compute"; + to = 0; + min_time = 1000000; + for (int i = 0; i < g_test_iter; ++i) { + t1.clear(); + t1.start(ctx1); + //! fp32 + //states = conv_int8.dispatch(tvin_int8, tvout_saber_fp32); + //! int32 + //states = conv_int8.dispatch(tvin_int8, tvout_saber_int32); + //! int8 + states = conv_int8(tvin_int8, tvout_saber_int8, param, ctx1); + t1.end(ctx1); + to += t1.get_average_ms(); + if (t1.get_average_ms() < min_time) { + min_time = t1.get_average_ms(); + } + CHECK_EQ(states, SaberSuccess) << "Saber conv compute failed"; + } + double gops = 2.0 * n * ch_out * wout * hout * (chin / group) * kernel_w * kernel_h; + LOG(INFO) << "saber int8 conv running time, ave: " << to / g_test_iter << ", min time: " << min_time << \ + ", GOPS: " << 0.000001 * gops / min_time; + to = 0; + min_time = 1000000; + for (int i = 0; i < g_test_iter; ++i) { + t1.clear(); + t1.start(ctx1); + //! int32 + states = conv_int8_int32(tvin_int8, tvout_saber_int32, param, ctx1); + t1.end(ctx1); + to += t1.get_average_ms(); + if (t1.get_average_ms() < min_time) { + min_time = t1.get_average_ms(); + } + CHECK_EQ(states, SaberSuccess) << "Saber conv compute failed"; + } + + LOG(INFO) << "saber int32 conv running time, ave: " << to / g_test_iter << ", min time: " << min_time << \ + ", GOPS: " << 0.000001 * gops / min_time; + to = 0; + min_time = 1000000; + for (int i = 0; i < g_test_iter; ++i) { + t1.clear(); + t1.start(ctx1); + //! 
fp32 + states = conv_int8_fp32(tvin_int8, tvout_saber_fp32, param, ctx1); + t1.end(ctx1); + to += t1.get_average_ms(); + if (t1.get_average_ms() < min_time) { + min_time = t1.get_average_ms(); + } + CHECK_EQ(states, SaberSuccess) << "Saber conv compute failed"; + } + LOG(INFO) << "saber fp32 conv running time, ave: " << to / g_test_iter << ", min time: " << min_time << \ + ", GOPS: " << 0.000001 * gops / min_time; + +// print_tensor(tout_saber_fp32); +#if 0 + if (g_compare_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic_fp32, tout_saber_fp32, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + if (fabsf(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "basic result"; + print_tensor(tout_basic_fp32); + LOG(WARNING) << "saber result"; + print_tensor(tout_saber_fp32); + TensorH tdiff(tout_basic_fp32.valid_shape(), AK_FLOAT); + tensor_diff(tout_basic_fp32, tout_saber_fp32, tdiff); + print_tensor(tdiff); + return SaberInvalidValue; + } + } + } +#endif +#if 1 + if (g_compare_result) { + LOG(INFO) << "int32 result: "; + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host((const int*)tout_basic_int32.data(), (const int*)tout_saber_int32.data(), tout_basic_int32.valid_size(), max_ratio, max_diff); + LOG(INFO) << "int32 compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + + //! int32 + double mean_basic = tensor_mean_value(tout_basic_int32, nullptr); + double mean_saber = tensor_mean_value(tout_saber_int32, nullptr); + + LOG(INFO) << "int32 mean_basic: " << mean_basic << ", mean_saber: " << mean_saber; + double max_ratio_thresh = 2e-1f; + //! int32 + long long diff_num = count_diff(static_cast(tout_basic_int32.data()), \ + static_cast(tout_saber_int32.data()), tout_saber_int32.valid_size(), max_ratio_thresh, thinf.get_scale()[0]); + LOG(INFO) << "int32 number of diff ratio > " << max_ratio_thresh << " is: " << diff_num << ", %" \ + << 100.f * diff_num / tout_basic_int32.valid_size(); + + if ((float)diff_num / tout_saber_int32.valid_size() > 0.05/* || mean_diff_ratio > 0.1*/) { + //!int32 + print_tensor(thinc); + print_tensor(pweihtc); + LOG(INFO) << "int32 basic result:"; + print_tensor(tout_basic_int32); + LOG(INFO) << "int32 saber result:"; + print_tensor(tout_saber_int32); + return SaberInvalidValue; + } + LOG(INFO) << "int32 passed"; + } + if (g_compare_result) { + LOG(INFO) << "fp32 result: "; + double max_ratio = 0; + double max_diff = 0; + // ! fp32 + tensor_cmp_host((const float*)tout_basic_fp32.data(), (const float*)tout_saber_fp32.data(), tout_basic_fp32.valid_size(), max_ratio, max_diff); + // ! int8 + LOG(INFO) << "fp32 compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + + //! fp32 + double mean_basic = tensor_mean_value(tout_basic_fp32, nullptr); + double mean_saber = tensor_mean_value(tout_saber_fp32, nullptr); + + LOG(INFO) << "fp32 mean_basic: " << mean_basic << ", mean_saber: " << mean_saber; + double max_ratio_thresh = 2e-1f; + //! fp32 + long long diff_num = count_diff(static_cast(tout_basic_fp32.data()), \ + static_cast(tout_saber_fp32.data()), tout_saber_fp32.valid_size(), max_ratio_thresh, thinf.get_scale()[0]); + LOG(INFO) << "fp32 number of diff ratio > " << max_ratio_thresh << " is: " << diff_num << ", %" \ + << 100.f * diff_num / tout_basic_fp32.valid_size(); + + if ((float)diff_num / tout_saber_fp32.valid_size() > 0.05/* || mean_diff_ratio > 0.1*/) { + //! 
fp32 + print_tensor(thinc); + print_tensor(pweihtc); + + LOG(INFO) << "fp32 basic result-int32:"; + print_tensor(tout_basic_int32); + LOG(INFO) << "fp32 basic result-fp32:"; + print_tensor(tout_basic_fp32); + LOG(INFO) << "fp32 saber result-fp32:"; + print_tensor(tout_saber_fp32); + + return SaberInvalidValue; + } + LOG(INFO) << "fp32 passed"; + } + if (g_compare_result) { + LOG(INFO) << "int8 result: "; + double max_ratio = 0; + double max_diff = 0; + // ! int8 + tensor_cmp_host((const int8_t*)tout_basic_int8.data(), (const int8_t*)tout_saber_int8.data(), \ + tout_basic_int8.valid_size(), max_ratio, max_diff); + LOG(INFO) << "int8 compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + //! int8 + double mean_basic = tensor_mean_value(tout_basic_int8, nullptr); + double mean_saber = tensor_mean_value(tout_saber_int8, nullptr); + + LOG(INFO) << "int8 mean_basic: " << mean_basic << ", mean_saber: " << mean_saber; + double max_ratio_thresh = 2e-1f; + //! int8 + long long diff_num = count_diff(static_cast(tout_basic_int8.data()), \ + static_cast(tout_saber_int8.data()), tout_saber_int8.valid_size(), max_ratio_thresh, thinf.get_scale()[0]); + LOG(INFO) << "int8 number of diff ratio > " << max_ratio_thresh << " is: " << diff_num << ", %" \ + << 100.f * diff_num / tout_saber_int8.valid_size(); + if ((float)diff_num / tout_saber_int8.valid_size() > 0.05/* || mean_diff_ratio > 0.1*/) { + //! int8 + print_tensor(thinc); + print_tensor(pweihtc); + LOG(INFO) << "int8 basic result int32:"; + print_tensor(tout_basic_int32); + LOG(INFO) << "int8 basic result int8:"; + print_tensor(tout_basic_int8); + LOG(INFO) << "int8 saber result:"; + print_tensor(tout_saber_int8); + return SaberInvalidValue; + } + LOG(INFO) << "int8 passed"; +// CHECK_EQ(fabsf(max_ratio) < 1e-4f, true) << "compute result error"; + } +#endif + return SaberSuccess; +} + +#if 1 +TEST(TestSaberFunc, test_func_conv_depthwise_3x3_int8) { + if (g_basic_test) { + for (auto& batch : {1, 2}) { + for (auto& c : {1, 3, 8, 16, 24}) { + for (auto& h : {4, 8, 9, 15, 28, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 112, 128, 256}) { + for (auto &flag_bias : {false, true}) { + for (auto &flag_relu : {false, true}) { + for (auto &th : {1, 2, 4}) { + for (auto & stride : {1, 2}){ + int stride_w = stride; + int stride_h = stride; + int group = c; + int pad_w = 1; + int pad_h = 1; + int dila_w = 1; + int dila_h = 1; + int kw = 3; + int kh = 3; + int w = h; + int chout = c; + LOG(INFO) << "conv_depthwise_3x3_int8 OP"; + auto flag = test_arm_conv_int8(batch, c, h, w, chout, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, group, flag_bias, flag_relu, \ + th, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 3x3s2_dw conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << chout << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(FATAL) << "test int8 3x3s2_dw conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << chout << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? 
"true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " failed!!\n"; + } + } + } + } + } + } + } + } + } +} +#endif + +#ifdef __aarch64__ +#if 0 +TEST(TestSaberFunc, test_func_conv_depthwise_5x5_int8) { + if (g_basic_test) { + for (auto& batch : {1, 2}) { + for (auto& c : { 1, 3, 8, 16, 24}) { + for (auto& h : {1, 2, 4, 8, 9, 15, 28, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,/* 112, 128, 256*/}) { + for (auto &flag_bias : {false, /*true*/}) { + for (auto &flag_relu : {false, /*true*/}) { + for (auto &th : {2 /*1, 2, 4*/}) { + for (auto & stride : {1/*, 2*/}){ + int stride_w = stride; + int stride_h = stride; + int group = c; + int pad_w = 2; + int pad_h = 2; + int dila_w = 1; + int dila_h = 1; + int kw = 5; + int kh = 5; + int w = h; + int chout = c; + LOG(INFO) << "conv_depthwise_5x5_int8 OP"; + auto flag = test_arm_conv_int8(batch, c, h, w, chout, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, group, flag_bias, flag_relu, \ + th, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 5x5s1_dw conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << chout << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(FATAL) << "test int8 5x5s1_dw conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << chout << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " failed!!\n"; + } + } + } + } + } + } + } + } + } +} +#endif +#endif // __aarch64__ + +#if 1 +TEST(TestSaberFunc, test_func_conv_3x3s1_direct_int8) { + if (g_basic_test) { + for (auto& batch : {1, 2}) { + for (auto& c : {1, 3, 8, 16, 32, 64}) { + for (auto& h : {5, 15, 16, 28, 56, 112, 128, 256}) { + for (auto& w : {6, 15, 28, 29, 30, 31, 32, 56, 112, 128, 255, 256}) { + for (auto &flag_bias : {false, true}) { + for (auto &flag_relu : {false, true}) { + for (auto &th : {1, 2, 4}) { + for (auto & chout : {3, 8, 9, 10, 11, 12}){ + int stride_w = 1; + int stride_h = 1; + int group = 1; + int pad_w = 1; + int pad_h = 1; + int dila_w = 1; + int dila_h = 1; + int kw = 3; + int kh = 3; + LOG(INFO) << "conv_3x3s1_direct_int8 OP"; + auto flag = test_arm_conv_int8(batch, c, h, w, chout, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, group, flag_bias, flag_relu, \ + th, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 3x3s1_direct conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << chout << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(FATAL) << "test int8 3x3s1_direct conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << chout << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? 
"true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " failed!!\n"; + } + } + } + } + } + } + } + } + } + } +} +#endif + +#if 1 +TEST(TestSaberFunc, test_func_conv_3x3s2_direct_int8) { + + if (g_basic_test) { + for (auto& batch : {1, 2}) { + for (auto& ci : {2, 3, 8}) { + for (auto& co : {1, 5, 16}) { + for (auto& h : {1, 3, 8, 15, 16, 28, 32, 75}) { + for (auto &flag_bias : {false, true}) { + for (auto &flag_relu : {false, true}) { + for (auto &th : {1, 2, 4}) { + int stride_w = 2; + int stride_h = 2; + int group = 1; + int pad_w = 1; + int pad_h = 1; + int dila_w = 1; + int dila_h = 1; + int kw = 3; + int kh = 3; + LOG(INFO) << "conv_3x3s2_direct_int8 OP"; + auto flag = test_arm_conv_int8(batch, ci, h, h, co, kw, kh, stride_w, stride_h, \ + pad_w, pad_h, dila_w, dila_h, group, flag_bias, flag_relu, \ + th, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 3x3s2_direct conv: batchsize: " << batch << ", channel: " + << ci << ", h & w: " << h << ", ch_out: " << co << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(FATAL) << "test int8 3x3s2_direct conv: batchsize: " << batch << ", channel: " + << ci << ", h & w: " << h << ", ch_out: " << co << ", group: " << group << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " failed!!\n"; + } + } + } + } + } + } + } + } + } +} +#endif + +#if 1 +TEST(TestSaberFunc, test_func_conv_1x1s1_int8) { + + if (g_basic_test) { + for (auto& batch : {1, 2}) { + for (auto& c : {1, 3, 8}) { + for (auto& cout : {1, 5, 16}) { + for (auto& g_div : {1, 2}) { + for (auto& h : {1, 3, 8, 15, 28, 32, 38, 75}) { + for (auto &flag_bias : {false, true}) { + for (auto &flag_relu : {false, true}) { + for (auto &th : {1, 2, 4}) { + int w = h; + int g = g_div; + if ((c % g_div != 0) || (cout % g_div != 0)) { + g = 1; + } + auto flag = test_arm_conv_int8(batch, c, h, w, cout, 1, 1, 1, 1, \ + 0, 0, 1, 1, g, flag_bias, flag_relu, th, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 1x1s1 conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << cout << ", group: " << g << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(FATAL) << "test int8 1x1s1 conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << cout << ", group: " << g << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? 
"true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " failed!!\n"; + } + } + } + } + } + } + } + } + } + } +} +#endif + +#if 1 +TEST(TestSaberFunc, test_func_conv_gemm_int8) { + if (g_basic_test) { + for (auto& batch : {1, 2}) { + for (auto& c : {1, 3, 8}) { + for (auto& cout : {1, 5, 16}) { + for (auto& g_div : {1, 2}) { + for (auto& h : {1, 3, 8, 15, 28, 32, 38, 75}) { + for (auto& kw : {1, 2, 3, 5}) { + for (auto& kh : {1, 2, 3, 5}) { + for (auto& pad : {1, 2}) { + for (auto& stride : {1, 2}) { + for (auto& dila : {1, 2}) { + for (auto &flag_bias : {false, true}) { + for (auto &flag_relu : {false, true}) { + for (auto &th : {1, 2, 4}) { + int w = h; + int g = g_div; + if ((c % g_div != 0) || (cout % g_div != 0)) { + g = 1; + } + //! 3x3s1/s2 direct + if (kw == 3 && kh == 3 && (stride == 1 || stride == 2) && dila == 1) { + continue; + } + //! 3x3 dw + if (kw == 3 && kh == 3 && dila == 1 && pad == 1 && g == cout && g == c) { + continue; + } + //! 5x5 dw + if (kw == 5 && kh == 5 && dila == 1 && pad == 2 && g == cout && g == c) { + continue; + } + auto flag = test_arm_conv_int8(batch, c, h, w, cout, kw, kh, stride, stride, \ + pad, pad, dila, dila, g, flag_bias, flag_relu, th, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << cout << ", group: " << g << \ + ", kernel_h: " << kh << ", kernel_w: " << kw << \ + ", pad: " << pad << ", stride: " << stride << ", dila: " << dila << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(FATAL) << "test int8 conv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", ch_out: " << cout << ", group: " << g << \ + ", kernel_h: " << kh << ", kernel_w: " << kw << \ + ", pad: " << pad << ", stride: " << stride << ", dila: " << dila << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " failed!!\n"; + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } +} +#endif + +#if 1 +TEST(TestSaberFunc, test_conv_int8_custom_size) { + for (int i = 0; i < 1; i++) { + auto flag = test_arm_conv_int8(g_num, g_chin, g_h_in, g_w_in, g_ch_out, g_kw, g_kh, g_stride_w, g_stride_h, \ + g_pad_w, g_pad_h, g_dila_w, g_dila_h, g_group, g_flag_bias, g_flag_relu, g_threads, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 conv: batchsize: " << g_num << ", channel: " \ + << g_chin << ", h & w: " << g_h_in << \ + ", pad: " << g_pad_h << ", stride: " << g_stride_h << ", dila: " << g_dila_h << \ + ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " + << (g_flag_relu ? "true" : "false") << ", threads: " << \ + g_threads << ", cluster: " << g_cluster << " passed!!"; + } else { + LOG(FATAL) << "test int8 conv: batchsize: " << g_num << ", channel: " + << g_chin << ", h & w: " << g_h_in << \ + ", pad: " << g_pad_h << ", stride: " << g_stride_h << ", dila: " << g_dila_h << \ + ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " + << (g_flag_relu ? 
"true" : "false") << ", threads: " << \ + g_threads << ", cluster: " << g_cluster << " failed!!"; + } + } +} +#endif + +int main(int argc, const char** argv){ + Env::env_init(); + LOG(ERROR) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ + " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ + " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; + + if (argc >= 2) { + g_basic_test = atoi(argv[1]) > 0; + } + + if (argc >= 3) { + g_cluster = atoi(argv[2]); + } + if (argc >= 4) { + g_threads = atoi(argv[3]); + } + if (argc >= 5) { + g_test_iter = atoi(argv[4]); + } + if (argc >= 6) { + g_compare_result = atoi(argv[5]) > 0; + } + if (argc >= 7) { + g_flag_bias = atoi(argv[6]) > 0; + } + if (argc >= 8) { + g_flag_relu = atoi(argv[7]) > 0; + } + if (argc >= 9) { + if (argc < 18) { + LOG(FATAL) << "usage: ./" << argv[0] << "basic_test cluster threads test_iter " << \ + " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ + " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; + return -1; + } + g_num = atoi(argv[8]); + g_chin = atoi(argv[9]); + g_h_in = atoi(argv[10]); + g_w_in = atoi(argv[11]); + g_ch_out = atoi(argv[12]); + g_group = atoi(argv[13]); + g_kw = atoi(argv[14]); + g_kh = g_kw; + g_pad_w = atoi(argv[15]); + g_pad_h = g_pad_w; + g_stride_w = atoi(argv[16]); + g_stride_h = g_stride_w; + g_dila_w = atoi(argv[17]); + g_dila_h = g_dila_w; + } + if (argc > 18) { + g_kh = atoi(argv[18]); + } + if (argc > 19) { + g_pad_h = atoi(argv[19]); + } + if (argc > 20) { + g_stride_h = atoi(argv[20]); + } + if (argc > 21) { + g_dila_h = atoi(argv[21]); + } + + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + +#else + +int main(int argc, const char** argv){ + LOG(INFO) << "this unit test only be used in TargetType is ARM"; + return 0; +} + +#endif + diff --git a/test/saber/test_saber_conv_pooling_int8.cpp b/test/saber/test_saber_conv_pooling_int8.cpp new file mode 100644 index 000000000..eb68017bf --- /dev/null +++ b/test/saber/test_saber_conv_pooling_int8.cpp @@ -0,0 +1,388 @@ +#include "saber/core/context.h" +#include "saber/funcs/conv_pooling.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "conv_func_helper.h" +#include + +using namespace anakin::saber; + +template +int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio) { + if (max_ratio <= 0) { + max_ratio = 0.1; + } + + int count = 0; + + for (int i = 0; i < size; ++i) { + double ratio = fabs(src1[i] - src2[i]) / fabs(src1[i] + src2[i] + 1e-12); + + if (ratio > max_ratio) { + ++count; + } + } + + return count; +} + +template +int test_conv_pool_results(int group, + int input_num, int in_channels, int height, int width, + int out_channels, int conv_kernel_h, int conv_kernel_w, + int conv_stride_h, int conv_stride_w, int conv_dilation_h, int conv_dilation_w, + int conv_pad_h, int conv_pad_w, bool bias_term, bool relu, + int pool_stride_h, int pool_stride_w, int pool_pad_h, int pool_pad_w, + int pool_kernel_h, int pool_kernel_w, PoolingType pool_type, + SaberImplStrategy strategy, ImplEnum imp) { + + LOG(INFO) << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " conv_pad_h = " << conv_pad_h + << " conv_pad_w = " << conv_pad_w + << " conv_stride_h = " << conv_stride_h + << " conv_stride_w = " << 
conv_stride_w + << " conv_dilation_h = " << conv_dilation_h + << " conv_dilation_w = " << conv_dilation_w + << " conv_kernel_h = " << conv_kernel_h + << " conv_kernel_w = " << conv_kernel_w + << " pool_pad_h = " << pool_pad_h + << " pool_pad_w = " << pool_pad_w + << " pool_stride_h = " << pool_stride_h + << " pool_stride_w = " << pool_stride_w + << " pool_kernel_h = " << pool_kernel_h + << " pool_kernel_w = " << pool_kernel_w + << " out_channels = " << out_channels + << " relu = " << (relu ? "true" : "false") + << " bias_term = " << (bias_term ? "true" : "false"); + +#ifdef USE_CUDA + return 0; +#endif +#ifdef USE_X86_PLACE + Shape input_s({input_num, height, width, in_channels}, Layout_NHWC); + Shape weights_s({out_channels, in_channels, conv_kernel_h, conv_kernel_w}, Layout_NCHW); + Shape weights_s_dw({group, in_channels / group, conv_kernel_h, conv_kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + + // generate conv_output shape + int conv_out_height = (conv_pad_h * 2 + height - (conv_dilation_h * (conv_kernel_h - 1) + 1)) / + conv_stride_h + 1; + int conv_out_width = (conv_pad_w * 2 + width - (conv_dilation_w * (conv_kernel_w - 1) + 1)) / + conv_stride_w + 1; + Shape conv_output_s({input_num, conv_out_height, conv_out_width, out_channels}, Layout_NHWC); + + // generate conv_pool_output shape + int out_height = (conv_out_height + 2 * pool_pad_h - pool_kernel_h) / pool_stride_h + 1; + int out_width = (conv_out_width + 2 * pool_pad_w - pool_kernel_w) / pool_stride_w + 1; + Shape output_s({input_num, out_height, out_width, out_channels}, Layout_NHWC); + + // init input Tensor + Tensor input_dev; + Tensor input_host; + input_dev.re_alloc(input_s, AK_UINT8); + input_host.re_alloc(input_s, AK_UINT8); + fill_tensor_rand(input_dev, 0.0f, 32.0f); + input_host.copy_from(input_dev); + input_dev.set_scale({1 / 512.f}); + + // init weights Tensor + Tensor weights_dev; + Tensor weights_host; + + if (group > 1) { + weights_dev.re_alloc(weights_s_dw, AK_INT8); + weights_host.re_alloc(weights_s_dw, AK_INT8); + } else { + weights_dev.re_alloc(weights_s, AK_INT8); + weights_host.re_alloc(weights_s, AK_INT8); + } + + fill_tensor_rand(weights_dev, -64.0f, 64.0f); + weights_host.copy_from(weights_dev); + std::vector scale_w_init; + + for (int i = 0; i < out_channels; i ++) { + scale_w_init.push_back(1 / 128.f); + } + + weights_dev.set_scale(scale_w_init); + + Tensor bias_dev; + Tensor bias_host; + + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_INT32); + bias_host.re_alloc(bias_s, AK_INT32); + fill_tensor_rand(bias_dev, -1.0f, 1.0f); + bias_host.copy_from(bias_dev); + } + + Tensor check_host; + + Context ctx1(0, 1, 1); + ActivationParam act_param; + + if (relu) { + ActivationParam act_relu_param(Active_relu); + act_param = act_relu_param; + } + + ConvParam conv_param(group, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + conv_dilation_h, conv_dilation_w, + &weights_dev, bias_term ? 
&bias_dev : nullptr, + act_param, 1.f, 0.f,AK_UINT8, round_mode::nearest); + + PoolingParam pool_param(pool_kernel_h, pool_kernel_w, + pool_pad_h, pool_pad_w, pool_stride_h, pool_stride_w, + pool_type); + ConvPoolingParam param(conv_param, pool_param); + // init output Tensor + Tensor output_dev; + Tensor output_host; + Tensor conv_output_host; + + if (conv_param.activation_param.has_active) { + output_dev.re_alloc(output_s, AK_UINT8); + conv_output_host.re_alloc(conv_output_s, AK_UINT8); + output_host.re_alloc(output_s, AK_UINT8); + output_dev.set_scale({1 / 256.0f}); + conv_output_host.set_scale({1 / 256.0f}); + } else { + output_dev.re_alloc(output_s, AK_INT8); + conv_output_host.re_alloc(conv_output_s, AK_INT8); + output_host.re_alloc(output_s, AK_INT8); + output_dev.set_scale({1 / 128.0f}); + conv_output_host.set_scale({1 / 128.0f}); + } + + output_host.copy_from(output_dev); + + ConvPooling conv_pooling; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + // conv.compute_output_shape(input_v, output_v, param); + // output_dev.re_alloc(output_dev.valid_shape(), AK_INT8); + + if (conv_pooling.init(input_v, output_v, param, strategy, imp, ctx1) == SaberSuccess) { + conv_pooling(input_v, output_v, param, ctx1); + } else { + LOG(INFO) << "init return non Success!"; + return -1; + } + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + + if (conv_param.activation_param.has_active) { + output_host.re_alloc(output_dev.valid_shape(), AK_UINT8); + output_host.copy_from(output_dev); + // print_tensor_valid(output_host); + check_host.re_alloc(output_host.valid_shape(), AK_UINT8); + } else { + output_host.re_alloc(output_dev.valid_shape(), AK_INT8); + output_host.copy_from(output_dev); + check_host.re_alloc(output_host.valid_shape(), AK_INT8); + } + + // calc scale info + std::vector scale; + float scale_in = input_dev.get_scale()[0]; + float scale_out = output_dev.get_scale()[0]; + auto scale_w = weights_dev.get_scale(); + std::vector().swap(scale); + + for (int i = 0; i < scale_w.size(); i++) { + scale.push_back((scale_w[i] * scale_in) / scale_out); + } + + conv_basic_check_int8(input_host, conv_output_host, + (const char*)weights_host.data(), bias_term ? (const int*)bias_host.data() : nullptr, + group, conv_kernel_w, conv_kernel_h, conv_stride_w, conv_stride_h, + conv_dilation_w, conv_dilation_h, conv_pad_w, conv_pad_h, bias_term, + conv_param.activation_param.has_active, scale); + pool_basic_check_int8(conv_output_host, check_host, pool_kernel_w, pool_kernel_h, pool_stride_w, + pool_stride_h, + pool_pad_w, pool_pad_h, pool_type); + int count = count_diff((const unsigned char*)output_host.data(), + (const unsigned char*)check_host.data(), check_host.valid_size(), 2e-1); + + // print_tensor_valid(check_host); + // double max_ratio = 0.0; + // double max_diff = 0.0; + // tensor_cmp_host((const float*)output_host.data(), (const float*)check_host.data(), + // check_host.valid_size(), max_ratio, max_diff); + if ((double)count / output_host.valid_size() < 0.02) { + // LOG(INFO) << " PASS!!! max_ratio = " << max_ratio << " max_diff = " << max_diff; + LOG(INFO) << "PASS!!! count = " << count; + return 0; + } else { + print_tensor_valid(output_host); + print_tensor_valid(check_host); + // LOG(FATAL) << "FAIL!!! max_ratio = " << max_ratio << " max_diff = " << max_diff + + LOG(FATAL) << "FAIL!!! 
count = " << count + << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " conv_pad_h = " << conv_pad_h + << " conv_pad_w = " << conv_pad_w + << " conv_stride_h = " << conv_stride_h + << " conv_stride_w = " << conv_stride_w + << " conv_dilation_h = " << conv_dilation_h + << " conv_dilation_w = " << conv_dilation_w + << " conv_kernel_h = " << conv_kernel_h + << " conv_kernel_w = " << conv_kernel_w + << " pool_pad_h = " << pool_pad_h + << " pool_pad_w = " << pool_pad_w + << " pool_stride_h = " << pool_stride_h + << " pool_stride_w = " << pool_stride_w + << " pool_kernel_h = " << pool_kernel_h + << " pool_kernel_w = " << pool_kernel_w + << " out_channels = " << out_channels + << " relu = " << (relu ? "true" : "false") + << " bias_term = " << (bias_term ? "true" : "false"); + return -1; + } + +#endif +} + +TEST(TestSaberFunc, test_saber_conv_int8_results) { +#ifdef USE_CUDA + Env::env_init(); + Env::env_init(); +#endif +#ifdef USE_X86_PLACE + Env::env_init(); +#endif + std::vector groups{1}; + std::vector conv_kernel_h_v{3}; + std::vector conv_kernel_w_v{3}; + std::vector conv_pad_h_v{0}; + std::vector conv_pad_w_v{0}; + std::vector conv_stride_h_v{1}; + std::vector conv_stride_w_v{1}; + std::vector conv_dilation_h_v{1}; + std::vector conv_dilation_w_v{1}; + std::vector pool_kernel_h_v{2, 3}; + std::vector pool_kernel_w_v{2, 3}; + std::vector pool_pad_h_v{0}; + std::vector pool_pad_w_v{0}; + std::vector pool_stride_h_v{2, 3}; + std::vector pool_stride_w_v{2, 3}; + std::vector pool_type_v{Pooling_max}; + std::vector in_channels_v{16}; + std::vector out_channels_v{16}; + std::vector in_h_v{32}; + std::vector in_w_v{32}; + std::vector input_num_v{1}; + std::vector bias_term_v{true}; + std::vector relu_v{true}; + + for (auto group : groups) { + for (auto input_num : input_num_v) { + for (auto out_channels : out_channels_v) { + for (auto in_channels : in_channels_v) { + for (auto conv_kernel_h : conv_kernel_h_v) { + for (auto conv_kernel_w : conv_kernel_w_v) { + for (auto height : in_h_v) { + for (auto width : in_w_v) { + for (auto conv_stride_h : conv_stride_h_v) { + for (auto conv_stride_w : conv_stride_w_v) { + for (auto conv_dilation_h : conv_dilation_h_v) { + for (auto conv_dilation_w : conv_dilation_w_v) { + for (auto conv_pad_h : conv_pad_h_v) { + for (auto conv_pad_w : conv_pad_w_v) { + for (auto pool_kernel_h : pool_kernel_h_v) { + for (auto pool_kernel_w : pool_kernel_w_v) { + for (auto pool_stride_h : pool_stride_h_v) { + for (auto pool_stride_w : pool_stride_w_v) { + for (auto pool_pad_h : pool_pad_h_v) { + for (auto pool_pad_w : pool_pad_w_v) { + for (auto pool_type : pool_type_v) { + for (auto bias_term : bias_term_v) { + for (auto relu : relu_v) { + #ifdef USE_CUDA + #endif + #ifdef USE_X86_PLACE + + if (jit::mayiuse( + jit::avx512_core)&&jit::mayiuse( + jit::avx512_core_vnni)) { + test_conv_pool_results( + group, + input_num, + in_channels, + height, + width, + out_channels, + conv_kernel_h, + conv_kernel_w, + conv_stride_h, + conv_stride_w, + conv_dilation_h, + conv_dilation_w, + conv_pad_h, + conv_pad_w, + bias_term, + relu, + pool_stride_h, + pool_stride_w, + pool_pad_h, + pool_pad_w, + pool_kernel_h, + pool_kernel_w, + pool_type, + SPECIFY, + SABER_IMPL); + } + + + #endif + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } + +} + + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); 
+// InitTest(); +// RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_cos_sim.cpp b/test/saber/test_saber_cos_sim.cpp new file mode 100644 index 000000000..db936be0b --- /dev/null +++ b/test/saber/test_saber_cos_sim.cpp @@ -0,0 +1,100 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/cos_sim.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; + +template +void cossim_basic(const std::vector*>& inputs, + std::vector*>& outputs, + CosSimParam& param) { + CHECK_EQ(inputs.size(), 2) << "CosSim input num need be 2, but is" << inputs.size(); + CHECK_EQ(outputs.size(), 1) << "CosSim input num need be 1, but is" << outputs.size(); + size_t count_0 = inputs[0]->valid_size(); + size_t count_1 = inputs[1]->valid_size(); + CHECK_EQ(count_0, count_1) << "input0 and input1 valid size is not equal"; + + size_t num = inputs[0]->num(); + size_t inner_size = count_0 / inputs[0]->num(); + const dtype *input0_data = (const dtype*)inputs[0]->data(); + const dtype *input1_data = (const dtype*)inputs[1]->data(); + dtype *output_data = (dtype*)outputs[0]->mutable_data(); + + //z = x'y/ (|x|*|y|) + for (size_t n = 0; n < num; n++) { + auto input0_square_sum = (dtype)0; + auto input1_square_sum = (dtype)0; + auto input01_prod_sum = (dtype)0; + for (size_t i = 0; i < inner_size; i++) { + input0_square_sum += input0_data[i] * input0_data[i]; + input1_square_sum += input1_data[i] * input1_data[i]; + input01_prod_sum += input0_data[i] * input1_data[i]; + } + float bc = input0_square_sum * input1_square_sum; + if (bc < param.epsilon) { + output_data[n] = 0; + } else { + output_data[n] = input01_prod_sum / sqrt(bc); + } + input0_data += inner_size; + input1_data += inner_size; + } + +} + +template +void test_model() { + + TestSaberBase testbase(2, 1); + //test example + for (auto num : {1, 2, 16}) { + for (auto channel : {1, 16, 32}) { + for (auto height : {8, 15, 32}) { + for (auto width: {8, 13, 45}) { + Shape shape({num, channel, height, width}, Layout_NCHW); + CosSimParam param(0.f); + testbase.set_param(param);//set param + testbase.set_input_shape(shape); + testbase.run_test(cossim_basic, 0.00001, false, true);//run test + } + } + } + } +} +TEST(TestSaberFunc, test_func_cos_sim) { + +#ifdef USE_CUDA + //Init the test_base + Env::env_init(); + test_model(); +#endif +#ifdef USE_X86_PLACE + Env::env_init(); + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_deconv.cpp b/test/saber/test_saber_deconv.cpp index c129a9989..087a0d2e2 100644 --- a/test/saber/test_saber_deconv.cpp +++ b/test/saber/test_saber_deconv.cpp @@ -8,6 +8,10 @@ #include #include "debug.h" #include "test/saber/conv_func_helper.h" +#ifdef USE_X86_PLACE +#include "saber/funcs/impl/x86/x86_utils.h" +#include "omp.h" +#endif using namespace anakin::saber; void fill_bias_relu(float* tensor, const float* bias, int channel, int channel_size, @@ -262,23 +266,188 @@ void deconv_test(int img_n = 1, }; +template +int test_deconv_results_x86_C8R(int group, + int input_num, int in_channels, int height, int width, + int 
out_channels, int kernel_h, int kernel_w, + int stride_h, int stride_w, int dilation_h, int dilation_w, + int pad_h, int pad_w, bool bias_term, bool with_relu, + SaberImplStrategy strategy, ImplEnum imp) { + + LOG(INFO) << " conv param: " + << " input_num = " << input_num + << " in_channels = " << in_channels + << " height = " << height + << " width = " << width + << " group = " << group + << " pad_h = " << pad_h + << " pad_w = " << pad_w + << " stride_h = " << stride_h + << " stride_w = " << stride_w + << " dilation_h = " << dilation_h + << " dilation_w = " << dilation_w + << " kernel_h = " << kernel_h + << " kernel_w = " << kernel_w + << " out_channels = " << out_channels + << " bias_term = " << (bias_term ? "true" : "false") + << " with_relu = " << (with_relu ? "true" : "false"); + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW_C8R); + Shape weights_s({out_channels, in_channels / group, kernel_h, kernel_w}, Layout_NCHW); + Shape bias_s({1, out_channels, 1, 1}, Layout_NCHW); + int kernel_extent_h = dilation_h * + (kernel_h - 1) + 1; + int output_dim_h = (height - 1) * + stride_h + kernel_extent_h - 2 * pad_h; + int kernel_extent_w = dilation_w * + (kernel_w - 1) + 1; + int output_dim_w = (width - 1) * + stride_w + kernel_extent_w - 2 * pad_w; + int out_height = output_dim_h; + int out_width = output_dim_w; + Shape output_dev_s({input_num, out_channels, out_height, out_width}, Layout_NCHW_C8R); + // init input Tensor + Tensor input_dev; + Tensor input_host; + input_dev.re_alloc(input_s, AK_FLOAT); + input_host.re_alloc(input_s, AK_FLOAT); +// { +// float *tmp= static_cast(input_dev.mutable_data()); +// for(int i=0;i weights_dev; + Tensor weights_host; + weights_dev.re_alloc(weights_s, AK_FLOAT); + weights_host.re_alloc(weights_s, AK_FLOAT); + fill_tensor_const(weights_dev, 1.f); + // fill_tensor_seq(weights_dev); +// fill_tensor_rand(weights_dev, -2.0f, 2.0f); + weights_host.copy_from(weights_dev); + + Tensor bias_dev; + Tensor bias_host; + + if (bias_term) { + bias_dev.re_alloc(bias_s, AK_FLOAT); + bias_host.re_alloc(bias_s, AK_FLOAT); + fill_tensor_const(bias_dev, -1.f); +// fill_tensor_rand(bias_dev, -2.0f, 2.0f); + bias_host.copy_from(bias_dev); + } + + Tensor output_dev(output_dev_s); + Tensor output_host(output_dev_s); + Tensor check_host; + fill_tensor_const(output_dev, -10.f); + Context ctx1(0, 1, 1); + // ActivationParam act_param(Active_relu); + ConvParam param(group, pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + &weights_dev, &bias_dev); + + if (with_relu) { + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + + Deconv conv; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + // output_dev.set_layout_without_shape(Layout_NCHW_C8); + conv.compute_output_shape(input_v, output_v, param); + // LOG(INFO)<<"layout "<::API::stream_t stream = ctx1.get_compute_stream(); + // output_v[0]->record_event(stream); + // output_v[0]->sync(); + // output_host.re_alloc(output_dev.valid_shape(), AK_FLOAT); + // output_host.copy_from(output_dev); + + // print_tensor(input_dev); + // print_tensor(output_dev); + // print_tensor(output_host); + Tensor nchwc8_input_check(Shape({input_num, in_channels, height, width})); + anakin::saber::reorder_nchwc_nchw(input_host, nchwc8_input_check); + check_host.re_alloc(Shape({input_num, out_channels, out_height, out_width}), AK_FLOAT); + Tensor nchw_output_check(check_host.valid_shape()); + std::vector*> 
check_in_vec{&nchwc8_input_check}; + std::vector*> check_out_vec{&check_host}; + gemm_transpose_conv(check_in_vec, check_out_vec,param); + LOG(INFO) << "cal check finish"; + // print_tensor_valid(check_host); + + // anakin::saber::input_reorder_nChwc8(check_host,nchw_output_check); + Tensor nchwc8_output_check(check_host.valid_shape()); + anakin::saber::reorder_nchwc_nchw(output_dev, nchwc8_output_check); + double max_ratio = 0.0; + double max_diff = 0.0; + tensor_cmp_host((const float*)nchwc8_output_check.data(), (const float*)check_host.data(), + check_host.valid_size(), max_ratio, max_diff); + + if (max_ratio > 1e-3 && max_diff > 1e-3) { +// print_tensor(nchwc8_output_check); +// print_tensor(check_host); + +// print_tensor(input_host); +// print_tensor(weights_dev); + LOG(FATAL) << " max_ratio = " << max_ratio << " max_diff = " << max_diff; + } else { + LOG(INFO) << "passed"; + } + + return 0; +} + template void deconv_testbase() { Env::env_init(); Env::env_init(); TestSaberBase testbase; - std::vector kernel{4}; - std::vector pad{1}; +// std::vector kernel{3,4,5,6,7}; +// std::vector pad{0,1,2}; +// std::vector stride{1,2,3}; + std::vector kernel{3,4,5,6,7}; + std::vector pad{0,1}; std::vector stride{2}; std::vector dilation_v{1}; std::vector group_v{1}; - std::vector in_h_v{64}; - std::vector in_w_v{64}; + std::vector in_h_v{22}; + std::vector in_w_v{23}; std::vector input_num_v{1}; - std::vector input_channels_v{48}; - std::vector output_channels_v{16}; - std::vector bias_term_v{true, false}; - std::vector with_relu_v{true, false}; + std::vector input_channels_v{12}; + std::vector output_channels_v{21}; + std::vector bias_term_v{true,false}; + std::vector with_relu_v{true,false}; for (auto relu_flag : with_relu_v) for (auto kernel_h : kernel) @@ -302,6 +471,7 @@ void deconv_testbase() { weights_dev.re_alloc(weights_s, AK_FLOAT); fill_tensor_rand(weights_dev, -2.f, 2.0f); +// fill_tensor_const(weights_dev,1.f); if (bias_term) { bias_dev.re_alloc(bias_s, AK_FLOAT); @@ -320,7 +490,8 @@ void deconv_testbase() { for (auto height : in_h_v) for (auto width : in_w_v) { testbase.set_param(param_nv);//set param - testbase.set_rand_limit(-1, 1); + testbase.set_rand_limit(-1.f,1.f); +// testbase.set_rand_limit(1.f,1.f); testbase.set_input_shape(Shape({input_num, in_channels, height, width}, Layout_NCHW));//add some input shape LOG(INFO) << kernel_h << "," << kernel_w << "," << pad_h << "," << pad_w << "," << stride_h << "," @@ -339,10 +510,55 @@ TEST(TestSaberFunc, test_func_self_deconv_nv) { TEST(TestSaberFunc, test_func_self_deconv_x86) { #ifdef USE_X86_PLACE - deconv_testbase(); + Env::env_init(); + int group = 1; + int input_num = 1; + int in_channels = 8; + int height = 3; + int width = 3; + int out_channels = 16; + int kernel_h = 3; + int kernel_w = 3; + int stride_h = 2; + int stride_w = 2; + int dilation_h = 1; + int dilation_w = 1; + int pad_h = 0; + int pad_w = 0; + bool bias_term = false; + bool with_relu = false; + +// int group = 1; +// int input_num = 1; +// int in_channels = 16; +// int height = 15; +// int width = 28; +// int out_channels = 16; +// int kernel_h = 3; +// int kernel_w = 3; +// int stride_h = 2; +// int stride_w = 2; +// int dilation_h = 1; +// int dilation_w = 1; +// int pad_h = 0; +// int pad_w = 0; +// bool bias_term = true; +// bool with_relu = false; + test_deconv_results_x86_C8R(group, + input_num, in_channels, + height, width, + out_channels, kernel_h, + kernel_w, + stride_h, stride_w, + dilation_h, dilation_w, + pad_h, pad_w, bias_term, + with_relu, + 
SPECIFY, SABER_IMPL); #endif } + + int main(int argc, const char** argv) { // initial logger //logger::init(argv[0]); diff --git a/test/saber/test_saber_deconv_arm.cpp b/test/saber/test_saber_deconv_arm.cpp new file mode 100644 index 000000000..2748a00d9 --- /dev/null +++ b/test/saber/test_saber_deconv_arm.cpp @@ -0,0 +1,458 @@ +#include "saber/funcs/deconv.h" +#include "saber/funcs/timer.h" +#include "test/saber/test_saber_func.h" +#include "saber/core/tensor_op.h" +using namespace anakin::saber; + +#ifdef USE_ARM_PLACE + +int g_cluster = 0; +int g_threads = 1; +int g_test_iter = 1; + +bool g_basic_test = false; + +bool g_compare_result = true; +bool g_flag_bias = true; +bool g_flag_relu = false; + +int g_num = 1; +int g_ch_in = 128; +int g_h_in = 10; +int g_w_in = 10; + +int g_ch_out = 128; +int g_group = 128; +int g_kernel = 4; +int g_pad = 1; +int g_stride = 2; +int g_dila = 1; + +typedef Tensor TensorHf4; + +template +static void fill_bias_relu(Dtype* tensor, const Dtype* bias, int channel, int channel_size, \ + bool flag_bias, bool flag_relu) { + Dtype* data = tensor; + for (int j = 0; j < channel; ++j) { + Dtype bias_c = flag_bias? bias[j] : 0; + for (int i = 0; i < channel_size; i++) { + data[i] += bias_c; + if (flag_relu) { + data[i] = data[i] > 0 ? data[i] : 0.f; + } + } + data += channel_size; + } +} + +inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { + return static_cast(a) < static_cast(b); +} + +template +void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + Dtype* data_im) { + memset(data_im, 0, height * width * channels * sizeof(Dtype)); + const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channel_size = height * width; + for (int channel = channels; channel--; data_im += channel_size) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_row = -pad_h + kernel_row * dilation_h; + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + data_col += output_w; + } else { + int input_col = -pad_w + kernel_col * dilation_w; + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + data_im[input_row * width + input_col] += *data_col; + } + data_col++; + input_col += stride_w; + } + } + input_row += stride_h; + } + } + } + } +} + +template +static void basic_gemm(int m, int n, int k, const type* a, const type* b, const type2* bias, type2* c, \ + type2 alpha, type2 beta, \ + bool trans_a = false, bool trans_b = false, bool flag_bias = false, bool flag_relu = false) { +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + type2 bias_data = (type2)0; + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + type2 sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * m + i]; + } else{ + av = a[i * k + l]; + } + if (trans_b) { + bv = b[j * k + l]; + } else { + bv = b[l * n + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data; + if (flag_relu) { + c[i * n + j] = tmp > (type2)0? 
tmp : (type2)0; + } else { + c[i * n + j] = tmp; + } + } + } +} + +//! for float, dtype1 and type2 is float +//! for int8, dytpe1 is char, dtype2 is int +template +void deconv_basic(const Dtype1* din, Dtype2* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const Dtype1* weights, const Dtype2* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, \ + int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu) { + + + int m = chout * kernel_w * kernel_h / group; + int n = hin * win; + int k = chin / group; + + if (chin != chout || group != chin) { + CHECK_EQ(chin % group, 0) << "input channel or group size error"; + CHECK_EQ(chout % group, 0) << "output channel or group size error"; + } + + Tensor workspace_tensor; + Shape workspace_shape({1, 1, 1, group * m * n}); + workspace_tensor.re_alloc(workspace_shape, anakin::saber::AK_FLOAT); + + int group_size_in = win * hin * chin / group; + int group_size_out = wout * hout * chout / group; + int group_size_coldata = m * n; + int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); + bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && \ + (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && \ + (dila_w == 1) && (dila_h == 1); + + Dtype2* workspace_ptr = static_cast(workspace_tensor.mutable_data()); + + for (int i = 0; i < num; ++i) { + const Dtype1* din_batch = din + i * chin * hin * win; + Dtype2* dout_batch = dout + i * chout * hout * wout; + + Dtype2* col_data = workspace_ptr; + if (flag_1x1s1p1) { + col_data = dout_batch; + } + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + for (int g = 0; g < group; ++g) { + const Dtype1* din_group = din_batch + g * group_size_in; + const Dtype1* weights_group = weights + g * group_size_weights; + Dtype2* coldata_group = col_data + g * group_size_coldata; + basic_gemm(m, n, k, weights_group, din_group, nullptr, coldata_group, \ + (Dtype2)1, (Dtype2)0, true, false, false, (!flag_bias && flag_relu)); + } + if (!flag_1x1s1p1) { + col2im(col_data, chout, hout, wout, kernel_h, kernel_w, pad_h, pad_w, \ + stride_h, stride_w, dila_h, dila_w, dout_batch); + } + //! add bias + if (flag_bias) { + fill_bias_relu(dout_batch, bias, chout, wout * hout, flag_bias, flag_relu); + } + } +} + +SaberStatus test_arm_deconv(int n, int c, int h, int w, \ + int ch_out, int kernel, int stride, int pad, \ + int dila, int group, bool flag_bias, bool flag_relu, \ + int thread_num, int cluster_id) { + + double to = 0; + double min_time = 1000000; + SaberTimer t1; + + Context ctx1; + ctx1.set_run_mode(PowerMode(cluster_id), thread_num); + LOG(INFO) << "test threads activated"; +#pragma omp parallel + { +#ifdef USE_OPENMP + int thread = omp_get_num_threads(); + LOG(INFO) << "number of threads: " << thread; +#endif + } + + TensorHf4 tout_basic; + TensorHf4 tout_saber; + + TensorHf4 thin; + thin.re_alloc(Shape({n, c, h, w}), AK_FLOAT); + + std::vector tin; + std::vector tvout_saber; + + tin.push_back(&thin); + tvout_saber.push_back(&tout_saber); + + int num = n; + int chin = c; + int hin = h; + int win = w; + + LOG(INFO) << "deconv param: " << " img_num = " << num << " in_channels = " << chin \ + << " img_h = " << hin << " img_w = " << win << " group = " << group << " pad = " \ + << pad << " stride = " << stride << " dilation = " << dila << " kernel = " \ + << kernel << " out_channels = " << ch_out << " bias flag = " << (flag_bias? "true" : "false ") \ + << " relu flag = " << (flag_relu ? 
"true" : "false"); + + int kernel_exten = dila * (kernel - 1) + 1; + int hout = (h - 1) * stride + kernel_exten - 2 * pad; + + kernel_exten = dila * (kernel - 1) + 1; + int wout = (w - 1) * stride + kernel_exten - 2 * pad; + + if (hout <=0 || wout <= 0) { + return SaberSuccess; + } + + Shape shape_out({num, ch_out, hout, wout}); + + Shape shw({ch_out/group, chin, kernel, kernel}); + Shape shb({1, ch_out, 1, 1}); + TensorHf4 pweiht(shw); + TensorHf4 pweihtb(shw); + TensorHf4 pbias; + + fill_tensor_rand(thin, -1.f, 1.f); + fill_tensor_rand(pweiht, -1.f, 1.f); + +// fill_tensor_const(thin, 1.f); +// fill_tensor_const(pweiht, 1.f); +// fill_tensor_const(pbias, 1.f); + + TensorHf4* bias_ptr = nullptr; + if (flag_bias) { + pbias.re_alloc(shb); + fill_tensor_rand(pbias, -1.f, 1.f); + } + std::vector scale(ch_out, 1.f); + const float* din = static_cast(thin.data()); + + if (g_compare_result) { + LOG(INFO) << "run basic deconv for precision comparation"; + tout_basic.re_alloc(shape_out); + float* dout = static_cast(tout_basic.mutable_data()); + deconv_basic(din, dout, num, ch_out, hout, wout, chin, hin, win, \ + static_cast(pweiht.data()), static_cast(pbias.data()), \ + group, kernel, kernel, stride, stride, \ + dila, dila, pad, pad, flag_bias, flag_relu); +// print_tensor(tout_basic); + } + + Deconv deconv; + + ConvParam param(group, pad, pad, stride, stride, dila, dila, &pweiht, &pbias); + if (flag_relu){ + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + + deconv.compute_output_shape(tin, tvout_saber, param); + + Shape sh_out_saber = tvout_saber[0]->valid_shape(); + LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ + << shape_out[2] << ", " << shape_out[3]; + LOG(INFO) << "saber output shape: " << sh_out_saber[0] << ", " << sh_out_saber[1] << ", " \ + << sh_out_saber[2] << ", " << shape_out[3]; + //CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; + + //! re_alloc mem for output tensor + tvout_saber[0]->re_alloc(shape_out); + +// LOG(INFO) << "saber deconv impl init"; + CHECK_EQ(deconv.init(tin, tvout_saber, param, SPECIFY, SABER_IMPL, ctx1), SaberSuccess) << "Saber deconv init failed"; + + //! 
compute +// LOG(INFO) << "saber conv compute"; + to = 0; + + for (int i = 0; i < g_test_iter; ++i) { + t1.clear(); + t1.start(ctx1); + deconv(tin, tvout_saber, param, ctx1); + //tvout_saber[0]->record_event(ctx1.get_compute_stream()); + //tvout_saber[0]->sync(); + t1.end(ctx1); + to += t1.get_average_ms(); + if (t1.get_average_ms() < min_time) { + min_time = t1.get_average_ms(); + } + } + LOG(INFO) << "saber deconv running time, ave: " << to / g_test_iter << ", min time: " << min_time; +// print_tensor(tout_saber); + + if (g_compare_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host((const float*)tout_basic.data(), (const float*)tout_saber.data(), + tout_basic.valid_size(), max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + if (fabsf(max_ratio) > 1e-4f) { + LOG(INFO) << "basic result:"; + print_tensor(tout_basic); + LOG(INFO) << "saber result:"; + print_tensor(tout_saber); + return SaberInvalidValue; + } +// CHECK_EQ(fabsf(max_ratio) < 1e-4f, true) << "compute result error"; + } + return SaberSuccess; +} + +TEST(TestSaberFunc, test_deconv_custom_size) { + + int num = g_num; + int chin = g_ch_in; + int hin = g_h_in; + int win = g_w_in; + + int dilation = g_dila; + int chout = g_ch_out; + + test_arm_deconv(num, chin, hin, win, chout, g_kernel, g_stride, g_pad, \ + dilation, g_group, g_flag_bias, g_flag_relu, g_threads, g_cluster); +} + +TEST(TestSaberFunc, fp32_deconv_basic_test) { + + if (g_basic_test) { + for (auto& n : {1, 2}) { + for (auto& c : {1, 3, 8, 15}) { + for (auto& cout : {1, 3, 8, 16}) { + for (auto& h : {8, 15, 28, 32, 38, 75}) { + for (auto& kh : {2, 3, 4}) { + for (auto& stride : {1, 2}) { + for (auto &dila : {1, 2}) { + for (auto &g : {1, 2}) { + for (auto &bias : {false, true}) { + for (auto &relu : {false, true}) { + for (auto &threads : {1, 2, 4}) { + int w = h; + int group = g; + if (c % g != 0 || cout % g != 0) { + group = 1; + } + int pad = kh / 2; + auto flag = test_arm_deconv(n, c, h, w, cout, kh, stride, pad, dila, group, bias, relu, threads, 0); + if (flag == SaberSuccess) { + LOG(INFO) << "test fp32 depthwise conv: batchsize: " << n << ", channel: " << c << ", h & w: " << h << \ + "num_out: " << cout << ", group:" << group << ", kernel: " << kh << ", stride: " << stride << \ + ", pad: " << pad << ", dila: " << dila << \ + ", bias: " << (bias? "true" : "false") << ", relu: " << (relu? "true" : "false") << ", threads: " << \ + threads << ", cluster: " << g_cluster << " passed!!"; + } else { + LOG(FATAL) << "test fp32 depthwise conv: batchsize: " << n << ", channel: " << c << ", h & w: " << h << \ + "num_out: " << cout << ", group:" << group << ", kernel: " << kh << ", stride: " << stride << \ + ", pad: " << pad << ", dila: " << dila << \ + ", bias: " << (bias? "true" : "false") << ", relu: " << (relu? 
"true" : "false") << ", threads: " << \ + threads << ", cluster: " << g_cluster << " failed!!"; + } + + } + } + } + } + } + } + } + } + } + } + } + } +} + + +int main(int argc, const char** argv){ + Env::env_init(); + LOG(INFO) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ + " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ + " kernel pad stride dila"; + if (argc >= 2) { + g_basic_test = atoi(argv[1]) > 0; + } + if (argc >= 3) { + g_cluster = atoi(argv[2]); + } + if (argc >= 4) { + g_threads = atoi(argv[3]); + } + if (argc >= 5) { + g_test_iter = atoi(argv[4]); + } + if (argc >= 6) { + g_compare_result = atoi(argv[5]) > 0; + } + if (argc >= 7) { + g_flag_bias = atoi(argv[6]) > 0; + } + if (argc >= 8) { + g_flag_relu = atoi(argv[7]) > 0; + } + if (argc >= 9) { + if (argc < 18) { + LOG(ERROR) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ + " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ + " kernel pad stride dila"; + return 0; + } + g_num = atoi(argv[8]); + g_ch_in = atoi(argv[9]); + g_h_in = atoi(argv[10]); + g_w_in = atoi(argv[11]); + g_ch_out = atoi(argv[12]); + g_group = atoi(argv[13]); + g_kernel = atoi(argv[14]); + g_pad = atoi(argv[15]); + g_stride = atoi(argv[16]); + g_dila = atoi(argv[17]); + } + + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + +#else + +int main(int argc, const char** argv){ + LOG(INFO) << "this unit test only be used in TargetType is ARM"; + return 0; +} + +#endif + diff --git a/test/saber/test_saber_deconv_int8_arm.cpp b/test/saber/test_saber_deconv_int8_arm.cpp new file mode 100644 index 000000000..06bcad092 --- /dev/null +++ b/test/saber/test_saber_deconv_int8_arm.cpp @@ -0,0 +1,596 @@ +#include "saber/funcs/deconv.h" +#include "saber/funcs/type_trans.h" +#include "saber/funcs/timer.h" +#include "test/saber/test_saber_func.h" +#include "saber/core/tensor_op.h" + +using namespace anakin::saber; + +#ifdef USE_ARM_PLACE + +int g_cluster = 0; +int g_threads = 1; +int g_test_iter = 10; + +bool g_basic_test = false; +bool g_compare_result = true; +bool g_flag_relu = false; +bool g_flag_bias = false; + +int g_num = 1; +int g_chin = 32; +int g_h_in = 112; +int g_w_in = 112; + +int g_ch_out = 32; +int g_group = 32; +int g_kw = 3; +int g_pad_w = 1; +int g_stride_w = 1; +int g_dila_w = 1; +int g_kh = 3; +int g_pad_h = 1; +int g_stride_h = 1; +int g_dila_h = 1; + +typedef Tensor TensorH; + +template +static int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio, float tensor_scale) { + double sum_abs1 = 0.0; + double sum_abs2 = 0.0; + for (int i = 0; i < size; ++i) { + sum_abs1 += fabs(src1[i]); + sum_abs2 += fabs(src2[i]); + } + double mean_abs1 = sum_abs1 / size; + double mean_abs2 = sum_abs2 / size; + double mean_val = (mean_abs1 + mean_abs2) / 2.0; + if (max_ratio <= 0) { + max_ratio = 0.1; + } + int count = 0; + for (int i = 0; i < size; ++i) { + double abs_diff = fabs(src1[i] - src2[i]); + double ratio = abs_diff / (fabs(src1[i] + src2[i]) + 1e-12); + if (ratio > max_ratio && abs_diff > (tensor_scale + 1e-5f) && abs_diff > mean_val * 0.1f) { + ++count; + } + } + return count; +} + +template +static void basic_gemm(int m, int n, int k, const type* a, const type* b, const type2* bias, type2* c, \ + type2 alpha, type2 beta, \ + bool trans_a = false, bool trans_b = false, bool flag_bias = false, bool flag_relu = false) { +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + 
type2 bias_data = (type2)0; + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + type2 sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * m + i]; + } else{ + av = a[i * k + l]; + } + if (trans_b) { + bv = b[j * k + l]; + } else { + bv = b[l * n + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data; + if (flag_relu) { + c[i * n + j] = tmp > (type2)0? tmp : (type2)0; + } else { + c[i * n + j] = tmp; + } + } + } +} + +inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { + return static_cast(a) < static_cast(b); +} + +template +static void col2im(const Dtype* data_col, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + Dtype* data_im) { + + memset(data_im, 0, height * width * channels * sizeof(Dtype)); + const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channel_size = height * width; + + for (int channel = channels; channel--; data_im += channel_size) { + for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { + for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + int input_row = -pad_h + kernel_row * dilation_h; + + for (int output_rows = output_h; output_rows; output_rows--) { + if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { + data_col += output_w; + } else { + int input_col = -pad_w + kernel_col * dilation_w; + + for (int output_col = output_w; output_col; output_col--) { + if (is_a_ge_zero_and_a_lt_b(input_col, width)) { + data_im[input_row * width + input_col] += *data_col; + } + data_col++; + input_col += stride_w; + } + } + input_row += stride_h; + } + } + } + } +} + +template +static void fill_bias_relu(Dtype* tensor, const Dtype* bias, int channel, int channel_size, \ + bool flag_bias, bool flag_relu) { + Dtype* data = tensor; + for (int j = 0; j < channel; ++j) { + Dtype bias_c = flag_bias? bias[j] : 0; + for (int i = 0; i < channel_size; i++) { + data[i] += bias_c; + if (flag_relu) { + data[i] = data[i] > 0 ? data[i] : 0.f; + } + } + data += channel_size; + } +} + +//! for float, dtype1 and type2 is float +//! 
for int8, dytpe1 is char, dtype2 is int +template +static void deconv_basic(const Dtype1* din, Dtype2* dout, \ + int num, int chout, int hout, int wout, \ + int chin, int hin, int win, \ + const Dtype1* weights, const Dtype2* bias, \ + int group, int kernel_w, int kernel_h, int stride_w, \ + int stride_h, int dila_w, int dila_h, \ + int pad_w, int pad_h, bool flag_bias, bool flag_relu) { + + + int m = chout * kernel_w * kernel_h / group; + int n = hin * win; + int k = chin / group; + + if (chin != chout || group != chin) { + CHECK_EQ(chin % group, 0) << "input channel or group size error"; + CHECK_EQ(chout % group, 0) << "output channel or group size error"; + } + + Tensor workspace_tensor; + Shape workspace_shape({1, 1, 1, group * m * n}); + workspace_tensor.re_alloc(workspace_shape, anakin::saber::AK_FLOAT); + + int group_size_in = win * hin * chin / group; + int group_size_out = wout * hout * chout / group; + int group_size_coldata = m * n; + int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); + bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && \ + (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && \ + (dila_w == 1) && (dila_h == 1); + + Dtype2* workspace_ptr = static_cast(workspace_tensor.mutable_data()); + + for (int i = 0; i < num; ++i) { + const Dtype1* din_batch = din + i * chin * hin * win; + Dtype2* dout_batch = dout + i * chout * hout * wout; + + Dtype2* col_data = workspace_ptr; + if (flag_1x1s1p1) { + col_data = dout_batch; + } + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + for (int g = 0; g < group; ++g) { + const Dtype1* din_group = din_batch + g * group_size_in; + const Dtype1* weights_group = weights + g * group_size_weights; + Dtype2* coldata_group = col_data + g * group_size_coldata; + basic_gemm(m, n, k, weights_group, din_group, nullptr, coldata_group, \ + (Dtype2)1, (Dtype2)0, true, false, false, (!flag_bias && flag_relu)); + } + + if (!flag_1x1s1p1) { + col2im(col_data, chout, hout, wout, kernel_h, kernel_w, pad_h, pad_w, \ + stride_h, stride_w, dila_h, dila_w, dout_batch); + } + //! 
add bias + if (flag_bias) { + fill_bias_relu(dout_batch, bias, chout, wout * hout, flag_bias, flag_relu); + } + } +} + +SaberStatus test_arm_deconv_int8(int n, int c, int h, int w, \ + int ch_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, int pad_h, \ + int dila_w, int dila_h, int group, bool is_bias, bool is_relu, int thread_num, int cluster_id) { + + double to = 0; + double min_time = 1000000; + SaberTimer t1; + + Context ctx1; + PowerMode mode = static_cast(cluster_id); + ctx1.set_run_mode(mode, thread_num); + LOG(INFO) << "test threads activated"; +#pragma omp parallel + { +#ifdef USE_OPENMP + int thread = omp_get_num_threads(); + LOG(INFO) << "number of threads: " << thread; +#endif + } + + TensorH tout_basic_int32; + TensorH tout_basic_int8; + TensorH tout_saber_int32; + TensorH tout_saber_int8; + TensorH tout_basic_fp32; + TensorH tout_saber_fp32; + + TensorH thinf; + TensorH thinc; + Shape shin ({n, c, h, w}); + thinf.re_alloc(shin, AK_FLOAT); + thinc.re_alloc(shin, AK_INT8); + + std::vector tvin_fp32; + std::vector tvin_int8; + std::vector tvout_saber_fp32; + std::vector tvout_saber_int32; + std::vector tvout_saber_int8; + + tvin_fp32.push_back(&thinf); + tvin_int8.push_back(&thinc); + tvout_saber_fp32.push_back(&tout_saber_fp32); + tvout_saber_int32.push_back(&tout_saber_int32); + tvout_saber_int8.push_back(&tout_saber_int8); + + int num = n; + int chin = c; + int hin = h; + int win = w; + + LOG(INFO) << "conv param: "; + LOG(INFO) << " img_num = " << num << " in_channels = " << chin << " img_h = " << hin << " img_w = " << win; + LOG(INFO) << " num_out = " << ch_out << " group = " << group << " kernel_w = " << kernel_w << " kernel_h = " << kernel_h << \ + " stride_width = " << stride_w << " stride_height = " << stride_h << \ + " pad_width = " << pad_w << " pad_height = " << pad_h << \ + " dilation_w = " << dila_w << " dilation_h = " << dila_h; + LOG(INFO) << " bias flag = " << (is_bias? "true" : "false") << ", relu flag = " << (is_relu? "true" : "false"); + + int kernel_extent_h = dila_h * (kernel_h - 1) + 1; + int hout = (h - 1) * stride_h + kernel_extent_h - 2 * pad_h; + int kernel_extent_w = dila_w * (kernel_w - 1) + 1; + int wout = (w - 1) * stride_w + kernel_extent_w - 2 * pad_w; + + Shape shape_out({num, ch_out, hout, wout}); + + Shape shw({ch_out, chin / group, kernel_h, kernel_w}); + Shape shb({1, ch_out, 1, 1}); + + TensorH pweihtf; + TensorH pbiasf; + + TensorH pweihtc; + TensorH pbiasi; + + if (is_bias) { + pbiasf.re_alloc(shb, AK_FLOAT); + pbiasi.re_alloc(shb, AK_INT32); + fill_tensor_rand(pbiasf, -10, 10); + } + + pweihtf.re_alloc(shw, AK_FLOAT); + pweihtc.re_alloc(shw, AK_FLOAT); + + fill_tensor_rand(thinf, -20, 20); + fill_tensor_rand(pweihtf, -10, 10); + // LOG(INFO) << "thinf:"; + // print_tensor(thinf); +// fill_tensor_const(thinf, 1.f); +// fill_tensor_const(pweihtf, 1.f); +// fill_tensor_const(pbiasf, 1.f); + + pweihtc.copy_from(pweihtf); + + //! 
convert input data type + std::vector scale; + std::vector weights_scale(ch_out, 1.f); + get_tensor_scale(thinf, scale, 0, 63.f); + thinf.set_scale(scale); +// LOG(INFO) << "input tesnor scale at factor 63.f is " << thinf.get_scale()[0] << ", max_val: " << 63.f * thinf.get_scale()[0]; + trans_tensor_dtype(thinf, thinc, scale[0], 1.f, {1.f}); + thinc.set_scale(scale); + // LOG(INFO) << "thinc:"; + // print_tensor(thinc); + trans_weights_dtype(pweihtc, AK_INT8, 127.f, DECONV_TYPE, group); + std::vector w_scale = pweihtc.get_scale(); + trans_fp32_bias_to_int32(pbiasf, pbiasi, thinc.get_scale()[0], w_scale); +// print_tensor(pweihtc); +// print_tensor(pbiasi); + + //! get int8 and fp32 basic result + if (g_compare_result) { + LOG(INFO) << "run basic conv for precision comparation"; + const char* dinc = static_cast(thinc.data()); + const char* weightc = static_cast(pweihtc.data()); + const int* biasi = static_cast(pbiasi.data()); + const float* dinf = static_cast(thinf.data()); + const float* weightf = static_cast(pweihtf.data()); + const float* biasf = static_cast(pbiasf.data()); + tout_basic_fp32.re_alloc(shape_out, AK_FLOAT); + tout_basic_int32.re_alloc(shape_out, AK_INT32); + tout_basic_int8.re_alloc(shape_out, AK_INT8); + + float* dout_basic_fp32 = static_cast(tout_basic_fp32.mutable_data()); + int* dout_basic_int32 = static_cast(tout_basic_int32.mutable_data()); + +// LOG(INFO) << "do basic fp32 conv"; + deconv_basic(dinf, dout_basic_fp32, num, ch_out, hout, wout, chin, hin, win, \ + weightf, biasf, group, kernel_w, kernel_h, stride_w, stride_h, \ + dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); + +// LOG(INFO) << "do basic int8 conv, trans basic int32 to fp32"; +// deconv_basic(dinc, dout_basic_int32, num, ch_out, hout, wout, chin, hin, win, \ + weightc, biasi, group, kernel_w, kernel_h, stride_w, stride_h, \ + dila_w, dila_h, pad_w, pad_h, is_bias, is_relu); + +// LOG(INFO) << "trans basic int32 to int8"; +// trans_tensor_int32_to_int8(tout_basic_int32, tout_basic_int8, thinf.get_scale()[0], w_scale, &ctx1); + +// trans_tensor_int32_to_fp32(tout_basic_int32, tout_basic_fp32, thinf.get_scale()[0], w_scale, &ctx1); + +// print_tensor(tout_basic_fp32); +// print_tensor(tout_basic_int32); + } + + Deconv deconv_int8; + + ConvParam param(group, pad_h, pad_w, stride_h, stride_w, dila_h, dila_w, &pweihtf, &pbiasf); + if (is_relu){ + ActivationParam act_param(Active_relu); + param.activation_param = act_param; + } + +// deconv_int8.compute_output_shape(tvin_int8, tvout_saber_int32); +// Shape sh_out_saber = tvout_saber_int32[0]->valid_shape(); + deconv_int8.compute_output_shape(tvin_int8, tvout_saber_fp32, param); + Shape sh_out_saber = tvout_saber_fp32[0]->valid_shape(); + + + LOG(INFO) << "output shape: " << shape_out[0] << ", " << shape_out[1] << ", " \ + << shape_out[2] << ", " << shape_out[3]; + CHECK_EQ(shape_out == sh_out_saber, true) << "compute output shape error"; + + //! re_alloc mem for output tensor +// LOG(INFO) << "re-alloc output memory"; + tvout_saber_int32[0]->re_alloc(shape_out, AK_INT32); + tvout_saber_fp32[0]->re_alloc(shape_out, AK_FLOAT); + tvout_saber_int8[0]->re_alloc(shape_out, AK_INT8); + + //! init the op +// LOG(INFO) << "saber conv impl init"; +// states = deconv_int8.init(tvin_int8, tvout_saber_int32, ctx1); + auto states = deconv_int8.init(tvin_int8, tvout_saber_fp32, param, SPECIFY, SABER_IMPL, ctx1); + CHECK_EQ(states, SaberSuccess) << "Saber conv init failed"; + + //! 
compute +// LOG(INFO) << "saber conv compute"; + to = 0; + for (int i = 0; i < g_test_iter; ++i) { + t1.clear(); + t1.start(ctx1); +// states = deconv_int8.dispatch(tvin_int8, tvout_saber_int32); + states = deconv_int8(tvin_int8, tvout_saber_fp32, param, ctx1); + t1.end(ctx1); + to += t1.get_average_ms(); + if (t1.get_average_ms() < min_time) { + min_time = t1.get_average_ms(); + } + CHECK_EQ(states, SaberSuccess) << "Saber conv compute failed"; + } + long long gops = n * ch_out * wout * ch_out * (chin / group) * kernel_w * kernel_h; + LOG(INFO) << "saber conv running time, ave: " << to / g_test_iter << ", min time: " << min_time << \ + ", GOPS: " << 0.000001 * gops / min_time; + +// print_tensor(tout_saber_fp32); + + if (g_compare_result) { + + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host((const float*)tout_basic_fp32.data(), (const float*)tout_saber_fp32.data(), \ + tout_basic_fp32.valid_size(), max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + double mean_basic = tensor_mean_value(tout_basic_fp32, nullptr); + double mean_saber = tensor_mean_value(tout_saber_fp32, nullptr); + LOG(INFO) << "mean_basic: " << mean_basic << ", mean_saber: " << mean_saber; + double max_ratio_thresh = 2e-1f; + long long diff_num = count_diff(static_cast(tout_basic_fp32.data()), \ + static_cast(tout_saber_fp32.data()), tout_saber_fp32.valid_size(), max_ratio_thresh, thinf.get_scale()[0]); + LOG(INFO) << "number of diff ratio > " << max_ratio_thresh << " is: " << diff_num << ", %" \ + << 100.f * diff_num / tout_basic_fp32.valid_size(); +// double mean_diff_ratio = fabs(mean_basic - mean_saber) / (fabs(mean_basic) + fabs(mean_saber)); +// LOG(INFO) << "mean val diff ratio: " << mean_diff_ratio; + if ((float)diff_num / tout_saber_fp32.valid_size() > 0.05/* || mean_diff_ratio > 0.1*/) { + LOG(INFO) << "basic result:"; + print_tensor(tout_basic_fp32); + LOG(INFO) << "saber result:"; + print_tensor(tout_saber_fp32); + return SaberInvalidValue; + } + } + return SaberSuccess; +} + +#if 1 +TEST(TestSaberFunc, test_func_deconv_gemm_int8) { + if (g_basic_test) { + for (auto& batch : {1, 2}) { + for (auto& c : {1, 3, 8, 16}) { + for (auto& cout : {1, 5, 16}) { + for (auto& g_div : {1, 2}) { + for (auto& h : {10, 28, 56, 112, 128, 150, 224, 300}) { + for (auto& kw : {1, 2, 3, 5}) { + for (auto& kh : {1, 2, 3, 5}) { + for (auto& pad : {1, 2}) { + for (auto& stride : {1, 2}) { + for (auto& dila : {1, 2}) { + for (auto &flag_bias : {false, true}) { + for (auto &flag_relu : {false, true}) { + for (auto &th : {1/*, 2, 4*/}) { + int w = h; + int g = g_div; + if ((c % g_div != 0) || (cout % g_div != 0)) { + g = 1; + } + auto flag = test_arm_deconv_int8(batch, c, h, w, cout, 1, 1, 1, 1, \ + 0, 0, 1, 1, g, flag_bias, flag_relu, th, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 deconv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? "true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(FATAL) << "test int8 deconv: batchsize: " << batch << ", channel: " + << c << ", h & w: " << h << ", num_out: " << cout << ", group: " << g << \ + ", bias: " << (flag_bias ? "true" : "false") << ", relu: " + << (flag_relu ? 
"true" : "false") << ", threads: " << \ + th << ", cluster: " << g_cluster << " failed!!\n"; + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } +} +#endif + +#if 1 +TEST(TestSaberFunc, test_deconv_int8_costom_size) { + auto flag = test_arm_deconv_int8(g_num, g_chin, g_h_in, g_w_in, g_ch_out, g_kw, g_kh, g_stride_w, g_stride_h, \ + g_pad_w, g_pad_h, g_dila_w, g_dila_h, g_group, g_flag_bias, g_flag_relu, g_threads, g_cluster); + if (flag == SaberSuccess) { + LOG(INFO) << "test int8 deconv: batchsize: " << g_num << ", channel: " + << g_chin << ", h & w: " << g_h_in << ", num_out: " << g_ch_out << ", group: " << g_group << \ + ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " + << (g_flag_relu ? "true" : "false") << ", threads: " << \ + g_threads << ", cluster: " << g_cluster << " passed!!\n"; + } else { + LOG(INFO) << "test int8 deconv: batchsize: " << g_num << ", channel: " + << g_chin << ", h & w: " << g_h_in << ", num_out: " << g_ch_out << ", group: " << g_group << \ + ", bias: " << (g_flag_bias ? "true" : "false") << ", relu: " + << (g_flag_relu ? "true" : "false") << ", threads: " << \ + g_threads << ", cluster: " << g_cluster << " failed!!\n"; + } +} +#endif + +int main(int argc, const char** argv){ + Env::env_init(); + LOG(ERROR) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ + " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ + " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; + + if (argc >= 2) { + g_basic_test = atoi(argv[1]) > 0; + } + + if (argc >= 3) { + g_cluster = atoi(argv[2]); + } + if (argc >= 4) { + g_threads = atoi(argv[3]); + } + if (argc >= 5) { + g_test_iter = atoi(argv[4]); + } + if (argc >= 6) { + g_compare_result = atoi(argv[5]) > 0; + } + if (argc >= 7) { + g_flag_bias = atoi(argv[6]) > 0; + } + if (argc >= 8) { + g_flag_relu = atoi(argv[7]) > 0; + } + if (argc >= 9) { + if (argc < 18) { + LOG(FATAL) << "usage: ./" << argv[0] << " basic_test cluster threads test_iter " << \ + " compare_result flag_bias flag_relu num ch_in h_in w_in ch_out group" << \ + " kernel pad stride dila [kernel_h] [pad_h] [stride_h] [dila_h]"; + return -1; + } + g_num = atoi(argv[8]); + g_chin = atoi(argv[9]); + g_h_in = atoi(argv[10]); + g_w_in = atoi(argv[11]); + g_ch_out = atoi(argv[12]); + g_group = atoi(argv[13]); + g_kw = atoi(argv[14]); + g_kh = g_kw; + g_pad_w = atoi(argv[15]); + g_pad_h = g_pad_w; + g_stride_w = atoi(argv[16]); + g_stride_h = g_stride_w; + g_dila_w = atoi(argv[17]); + g_dila_h = g_dila_w; + } + if (argc > 18) { + g_kh = atoi(argv[18]); + } + if (argc > 19) { + g_pad_h = atoi(argv[19]); + } + if (argc > 20) { + g_stride_h = atoi(argv[20]); + } + if (argc > 21) { + g_dila_h = atoi(argv[21]); + } + + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + +#else + +int main(int argc, const char** argv){ + LOG(INFO) << "this unit test only be used in TargetType is ARM"; + return 0; +} + +#endif + diff --git a/test/saber/test_saber_depthwise_conv.cpp b/test/saber/test_saber_depthwise_conv.cpp index 24dc773dc..daa5f791f 100644 --- a/test/saber/test_saber_depthwise_conv.cpp +++ b/test/saber/test_saber_depthwise_conv.cpp @@ -58,7 +58,7 @@ TEST(TestSaberFunc, test_saber_depthwise_conv_results) { int out_channels = group; int in_channels = group; - Shape weights_s({out_channels, in_channels, kernel_h, kernel_w}, Layout_NCHW); + Shape weights_s({out_channels, 1, kernel_h, kernel_w}, Layout_NCHW); Shape bias_s({1, out_channels, 1, 1}, 
Layout_NCHW); #ifdef USE_CUDA Tensor weights_dev; diff --git a/test/saber/test_saber_detection_output.cpp b/test/saber/test_saber_detection_output.cpp new file mode 100644 index 000000000..3e1abb774 --- /dev/null +++ b/test/saber/test_saber_detection_output.cpp @@ -0,0 +1,266 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "test_saber_base.h" +#include "saber/saber_types.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/debug.h" +#include "saber/funcs/detection_output.h" +#include +#include +using namespace anakin::saber; +#if defined(USE_CUDA) +using Target = NV; +using Target_H = NVHX86; +#elif defined(USE_X86_PLACE) +using Target = X86; +using Target_H = X86; +#elif defined(USE_ARM_PLACE) +using Target = ARM; +using Target_H = ARM; +#elif defined(AMD_GPU) +using Target = AMD; +using Target_H = X86; +#endif + +std::string g_bbox_file = "/home/public/multiclass_nms/result_box_clip_0.tmp_0.txt"; +std::string g_conf_file = "/home/public/multiclass_nms/result_softmax_0.tmp_0.txt"; +std::string g_priorbox_file = ""; +std::string g_result_file = "/home/public/multiclass_nms/result_multiclass_nms_0.tmp_0.txt"; +std::string g_img_file = "/home/public/000000000139.jpg"; + +#ifdef USE_OPENCV +#include "opencv2/opencv.hpp" + +using namespace cv; + +struct Object{ + int batch_id; + cv::Rect rec; + int class_id; + float prob; +}; + +void detect_object(Tensor& tout, const float thresh, std::vector& image, const std::string& name) { + int img_num = image.size(); + const float* dout = static_cast(tout.data()); + std::vector objects; + for (int iw = 0; iw < tout.height(); iw++) { + Object object; + const float *values = dout + iw * tout.width(); + int batch_id = static_cast(values[0]); + object.batch_id = batch_id; + object.class_id = (int)values[1]; + object.prob = values[2]; + object.rec.x = (int)(values[3]); + object.rec.y = (int)(values[4]); + object.rec.width = (int)(values[5] - values[3]); + object.rec.height = (int)(values[6] - values[4]); + objects.push_back(object); + } + + for (int i = 0; i < objects.size(); ++i) { + Object object = objects.at(i); + if (object.prob > thresh && object.batch_id < image.size()) { + cv::rectangle(image[object.batch_id], object.rec, cv::Scalar(255, 0, 0)); + std::ostringstream pro_str; + pro_str << "class: " << object.class_id << " + score: " << object.prob; + cv::putText(image[object.batch_id], pro_str.str(), cv::Point(object.rec.x, object.rec.y), \ + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + LOG(INFO) << "detection in batch: " << object.batch_id << ", image size: " << \ + image[object.batch_id].cols << ", " << image[object.batch_id].rows << \ + ", detect object: " << object.class_id << ", location: x=" << \ + object.rec.x << ", y=" << object.rec.y << ", width=" << object.rec.width << \ + ", height=" << object.rec.height; + } + } + for (int j = 0; j < image.size(); ++j) { + std::ostringstream str; + str << name << "_detection_out_" << j << ".jpg"; + cv::imwrite(str.str(), image[j]); + } +} +#endif + +template +static bool sort_score_pair_descend(const std::pair& pair1, \ + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +void get_max_score_index(const dtype* scores, int num, std::vector >* score_index_vec) { + //! Generate index score pairs. + for (int i = 0; i < num; ++i) { + score_index_vec->push_back(std::make_pair(scores[i], i)); + } + + //! 
Sort the score pair according to the scores in descending order + std::stable_sort(score_index_vec->begin(), score_index_vec->end(), \ + sort_score_pair_descend); +} + +void sort_result(const float* res, int count, Tensor& tout, const std::vector& offset = {}) { + std::vector>> vres; + tout.reshape(Shape({1, 1, count / 6, 7}, Layout_NCHW)); + float* dout = static_cast(tout.mutable_data()); + int batch_size = 1; + if (offset.size() > 0) { + batch_size = offset.size() - 1; + } + for (int k = 0; k < batch_size; ++k) { + int batch_id = k; + int cls_id = -1; + std::vector score; + for (int i = 0; i < count; i += 6) { + int id = static_cast(res[i]); + if (cls_id >= 0) { + if (id != cls_id) { + vres.emplace_back(std::make_pair(cls_id, score)); + cls_id = id; + score.clear(); + score.push_back(res[i + 1]); + } else { + score.push_back(res[i + 1]); + } + } else { + cls_id = id; + score.clear(); + score.push_back(res[i + 1]); + } + } + vres.emplace_back(std::make_pair(cls_id, score)); + LOG(INFO) << "num of classes: " << vres.size(); + const float* din = res; + for (int j = 0; j < vres.size(); ++j) { + float* scores = vres[j].second.data(); + int count = vres[j].second.size(); + std::vector> score_index_vec; + get_max_score_index(scores, count, &score_index_vec); + for (int i = 0; i < score_index_vec.size(); ++i) { + *(dout++) = batch_id; + *(dout++) = vres[j].first; + *(dout++) = score_index_vec[i].first; + *(dout++) = din[score_index_vec[i].second * 6 + 2]; + *(dout++) = din[score_index_vec[i].second * 6 + 3]; + *(dout++) = din[score_index_vec[i].second * 6 + 4]; + *(dout++) = din[score_index_vec[i].second * 6 + 5]; + } + din += score_index_vec.size() * 6; + } + } +} + +TEST(TestSaberFunc, test_func_detection_output) { + const int batch0_start = 0; + const int batch0_end = 112; + std::vector offset = {batch0_start, batch0_end}; + std::vector> seq_offset; + seq_offset.push_back(offset); + Shape shbbox({batch0_end - batch0_start, 81, 4, 1}, Layout_NCHW); + Shape shconf({batch0_end - batch0_start, 81, 1, 1}, Layout_NCHW); + Shape shres({1, 1, 112, 7}, Layout_NCHW); + Tensor thbbox(shbbox); + Tensor thconf(shconf); + Tensor thres_gt(shres); + Tensor tdbbox(shbbox); + Tensor tdconf(shconf); + Tensor tdres(shres); + Tensor thres(shres); + + std::vector vbbox; + std::vector vconf; + std::vector vres; + if (!read_file(vbbox, g_bbox_file.c_str())) { + LOG(ERROR) << "load bbox file failed"; + return; + } + if (!read_file(vconf, g_conf_file.c_str())) { + LOG(ERROR) << "load conf file failed"; + return; + } + if (!read_file(vres, g_result_file.c_str())) { + LOG(ERROR) << "load ground truth failed"; + return; + } + + thres_gt.reshape(Shape({1, 1, vres.size() / 6, 6}, Layout_NCHW)); + + memcpy(thbbox.mutable_data(), vbbox.data(), sizeof(float) * vbbox.size()); + memcpy(thconf.mutable_data(), vconf.data(), sizeof(float) * vconf.size()); + memcpy(thres_gt.mutable_data(), vres.data(), sizeof(float) * vres.size()); + + //! sort the ground truth + sort_result(static_cast(thres_gt.data()), thres_gt.valid_size(), thres); + print_tensor_valid(thres); +// print_tensor_valid(thbbox); +// print_tensor_valid(thconf); +// print_tensor_valid(thres); + tdbbox.copy_from(thbbox); + tdconf.copy_from(thconf); + tdbbox.set_seq_offset(seq_offset); + tdconf.set_seq_offset(seq_offset); + + //! 
init params + DetectionOutputParam det_param; + det_param.background_id = 0; + det_param.share_location = false; + det_param.class_num = 0; + det_param.type = CORNER; + det_param.conf_thresh = 0.05f; + det_param.keep_top_k = 100; + det_param.variance_encode_in_target = false; + det_param.nms_eta = 1.f; + det_param.nms_top_k = -1; + det_param.nms_thresh = 0.5f; + + //! create op + DetectionOutput det_op; + + //! create io + std::vector *> input_v; + std::vector *> output_v; + input_v.push_back(&tdbbox); + input_v.push_back(&tdconf); + output_v.push_back(&tdres); + + //! create context + Context ctx; + + //! init op + det_op.compute_output_shape(input_v, output_v, det_param); + output_v[0]->reshape(output_v[0]->valid_shape()); + SABER_CHECK(det_op.init(input_v, output_v, det_param, SPECIFY, SABER_IMPL, ctx)); + + //! op dispatch + SABER_CHECK(det_op(input_v, output_v, det_param, ctx)); + print_tensor_valid(*output_v[0]); + + Tensor thres_res(output_v[0]->valid_shape()); + thres_res.copy_from(*output_v[0]); + +#ifdef USE_OPENCV + cv::Mat img = cv::imread(g_img_file); + if (img.empty()) { + return; + } + cv::Mat img_gt = img.clone(); + cv::Mat img_res = img.clone(); + std::vector v_gt = {img_gt}; + std::vector v_res = {img_res}; + LOG(INFO) << "draw gt box to image"; + detect_object(thres, 0.05f, v_gt, "gt"); + LOG(INFO) << "draw test box to image"; + detect_object(thres_res, 0.05f, v_res, "test"); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + Env::env_init(); + Env::env_init(); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_eltwise.cpp b/test/saber/test_saber_eltwise.cpp index 104623af0..58456a7ca 100644 --- a/test/saber/test_saber_eltwise.cpp +++ b/test/saber/test_saber_eltwise.cpp @@ -55,6 +55,17 @@ void eltwise_cpu(const std::vector*>& input,std::vectordata(); + for (int e = 0; e < in_size; e++) { + dst[e] = dst[e] / src[e]; + } + } + break; default: break; @@ -73,58 +84,60 @@ void eltwise_cpu(const std::vector*>& input,std::vector +void test_eltwise() { + //Eltwise test; + for (int inputs_num: {2, 3}) { + TestSaberBase testbase(inputs_num, 1); + for (int num_in:{2, 3, 32}) { + for (int c_in:{1, 3, 32}) { + for (int h_in:{2, 3, 32}) { + for (int w_in:{2, 3, 32}) { + for (EltwiseType type:{Eltwise_prod, Eltwise_sum, Eltwise_max, Eltwise_div}) { + LOG(INFO)<<"input = "< -#include - -using namespace anakin::saber; - -/** - * @brief formula: x * scale / max(max(abs(x)) . - * where, - * local_size = 5(default), means 5 channels in succession. - * sigma((x(i))^2): sum of x^2 of k channels in succession. - * - * - * @tparam dtype - * @tparam TargetType_D - * @tparam TargetType_H - * @param input - * @param output - * @param param - */ -template -void fake_quantize_abs_max_cpu_base(const std::vector* >& input, - std::vector* >& output, FakeQuantizeAbsMaxParam& param) { - const dtype* src = (const dtype*)input[0]->data(); - auto dst = output[0]->mutable_data(); - int valid_size = input[0]->valid_size(); - auto max_data = 0.f; - for (int i = 0; i < valid_size; i++) { - auto abs_data = src[i] > 0.f ? src[i] : -src[i]; - max_data = abs_data > max_data ? 
abs_data : max_data; - } - auto range = (1<< (param.bit_length - 1)) - 1; - auto scale = 1.f / max_data * range; - LOG(INFO) <<"max_data" << max_data ; - LOG(INFO) << "range" << range; - if (param.bit_length == 8) { - char* dst_tmp = (char*)dst; - for (int i = 0; i < valid_size; i++) { - dst_tmp[i] = round(src[i] * scale); - //LOG(INFO) << i << " " << int(dst_tmp[i]); - } - } else if (param.bit_length == 16) { - int16_t* dst_tmp = (int16_t*)dst; - for (int i = 0; i < valid_size; i++) { - dst_tmp[i] = round(src[i] * scale); - LOG(INFO) << i << " " << dst_tmp[i]; - } - } else { - //LOG(FATAL) <<"other bit length has not been supported"; - } -} - -TEST(TestSaberFunc, test_op_fake_quantize_abs_max) { - -#ifdef USE_CUDA - TestSaberBase testbase; - - for (int w_in : {8, 8, 16}) { - for (int h_in : {2, 8, 32}) { - for (int ch_in : {2, 3, 8, 64}) { - for (int num_in : {1, 21, 32}) { - //for (int w_in : {8,}) { - // for (int h_in : {2,}) { - // for (int ch_in : {2,}) { - // for (int num_in : {3}) { - Shape shape({num_in, ch_in, h_in, w_in}); - for(int bit_length: {8}) { - FakeQuantizeAbsMaxParam param(bit_length); - testbase.set_param(param); - testbase.set_rand_limit(-5.0, 5.0); - testbase.set_input_shape(shape); - testbase.run_test(fake_quantize_abs_max_cpu_base, 2.1e-5f); - } - } - } - } - } -#endif - -#ifdef USE_X86_PLACE - TestSaberBase testbase_x86; - - //for (int w_in : {8,}) { - // for (int h_in : {2,}) { - // for (int ch_in : {2,}) { - // for (int num_in : {3}) { - for (int w_in : {8, 8, 16}) { - for (int h_in : {2, 8, 32}) { - for (int ch_in : {2, 3, 8, 64}) { - for (int num_in : {1, 21, 32}) { - Shape shape_x86({num_in, ch_in, h_in, w_in}); - for (int bit_length : {8}) { - FakeQuantizeAbsMaxParam param_x86(bit_length); - testbase_x86.set_param(param_x86); - testbase_x86.set_rand_limit(-5.0, 5.0); - testbase_x86.set_input_shape(shape_x86); - testbase_x86.run_test(fake_quantize_abs_max_cpu_base); - } - } - } - } - } -#endif - -} - -int main(int argc, const char** argv) { - // initial logger - //logger::init(argv[0]); - InitTest(); - RUN_ALL_TESTS(argv[0]); - return 0; -} diff --git a/test/saber/test_saber_fc.cpp b/test/saber/test_saber_fc.cpp index db07f1983..a8822529d 100644 --- a/test/saber/test_saber_fc.cpp +++ b/test/saber/test_saber_fc.cpp @@ -12,8 +12,9 @@ using namespace anakin::saber; //fc compute (native cpu version) template -void fc_cpu_base(const std::vector* > &input, std::vector* > &output, FcParam ¶m) { - +void fc_cpu_base(const std::vector* > &input, std::vector* > &output, \ + FcParam ¶m) { + const dtype *data_in = (const dtype*)input[0]->data(); const dtype *bias = param.bias ? 
(const dtype*)param.bias->data() : nullptr; @@ -42,20 +43,19 @@ void fc_cpu_base(const std::vector* > &input, std::vector testbase; - + Tensor weights_h; Tensor weights_d; - + //Shape shape_weight({}) - for(int w_in : {2, 8, 16}) { - for(int h_in : {2, 8, 32}){ - for(int ch_in : {2, 3, 8, 64}){ - for(int num_in:{1, 21, 32}){ + for (int w_in : {2, 8, 16}) { + for (int h_in : {2, 8, 32}){ + for (int ch_in : {2, 3, 8, 64}){ + for (int num_in:{1, 21, 32}){ int out_num = w_in * 2; Shape shape({num_in, ch_in, h_in, w_in}); Shape shape_w({ch_in, h_in, w_in, out_num}); @@ -77,14 +77,14 @@ TEST(TestSaberFunc, test_op_fc) { #ifdef USE_X86_PLACE TestSaberBase testbase0; - + Tensor weights_h0; - + //Shape shape_weight({}) - for(int w_in : {2, 8, 16}) { - for(int h_in : {2, 8, 32}){ - for(int ch_in : {2, 3, 8, 64}){ - for(int num_in:{1, 21, 32}){ + for (int w_in : {2, 8, 16}) { + for (int h_in : {2, 8, 32}){ + for (int ch_in : {2, 3, 8, 64}){ + for (int num_in:{1, 21, 32}){ int out_num = w_in * 2; Shape shape({num_in, ch_in, h_in, w_in}); Shape shape_w({ch_in, h_in, w_in, out_num}); @@ -92,20 +92,45 @@ TEST(TestSaberFunc, test_op_fc) { fill_tensor_rand(weights_h0, 0.1, 1.5); FcParam param(&weights_h0, out_num); testbase0.set_param(param); - testbase0.set_rand_limit(1, 12); + testbase0.set_rand_limit(-12, 12); testbase0.set_input_shape(shape); - testbase0.run_test(fc_cpu_base, 2.1e-5f); + testbase0.run_test(fc_cpu_base, 1.0e-3f); } } } } #endif -} +#ifdef USE_ARM_PLACE + TestSaberBase testbase1; + + Tensor weights_h1; + + for (int w_in : {2, 8, 16}) { + for (int h_in : {2, 8, 32}){ + for (int ch_in : {2, 3, 8}){ + for (int num_in:{1, 2, 16}){ + int out_num = w_in * 2; + //printf("w_in, h_in, ch_in, num_in, out_num: %d, %d, %d, %d, %d\n", w_in, h_in, ch_in, num_in, out_num); + Shape shape({num_in, ch_in, h_in, w_in}); + Shape shape_w({ch_in, h_in, w_in, out_num}); + weights_h1.re_alloc(shape_w, AK_FLOAT); + fill_tensor_rand(weights_h1, 0.1, 1.5); + FcParam param(&weights_h1, out_num); + testbase1.set_param(param); + testbase1.set_rand_limit(-12, 12); + testbase1.set_input_shape(shape); + testbase1.run_test(fc_cpu_base, 1.0e-3f); + } + } + } + } +#endif +} int main(int argc, const char** argv) { - // initial logger - //logger::init(argv[0]); + //!initial logger + logger::init(argv[0]); InitTest(); RUN_ALL_TESTS(argv[0]); return 0; diff --git a/test/saber/test_saber_fc_int8.cpp b/test/saber/test_saber_fc_int8.cpp new file mode 100644 index 000000000..b1a96f546 --- /dev/null +++ b/test/saber/test_saber_fc_int8.cpp @@ -0,0 +1,456 @@ +#include "saber/funcs/fc.h" +#include "saber/saber_types.h" +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" + +#include "test_saber_func.h" +#include "test_saber_base.h" +#if defined(USE_X86_PLACE) +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#endif + +using namespace anakin::saber; + +template +int count_diff(const void* input1, const void* input2, int size, + double max_ratio, bool with_print = false) { + auto src1 = static_cast(input1); + auto src2 = static_cast(input2); + + if (max_ratio <= 0) { + max_ratio = 1e-2; + } + + int count = 0; + + for (int i = 0; i < size; ++i) { + double ratio = fabs(src1[i] - src2[i]) / + fabs(src1[i] + src2[i] + 1e-12); + + if (ratio > max_ratio) { + if (with_print) { + LOG(ERROR) << "out = " << (float)src1[i] + << "\nout_ref = " << (float)src2[i]; + } + + ++count; + } + } + + return count; +} + +template +void fc_cpu_common(const std::vector* >& src, + std::vector* >& dst, + FcParam& param) { + int 
output_channel = dst[0]->count_valid(1, dst[0]->dims()); + int batch_size = src[0]->num(); + + Shape OutShape({batch_size, output_channel, 1, 1}, Layout_NCHW); + Tensor dst_tmp; + dst_tmp.re_alloc(OutShape, AK_INT32); + + auto dst_tmp_data = static_cast(dst_tmp.mutable_data()); + auto dst_data = static_cast(dst[0]->mutable_data()); + auto weights_data = static_cast(param.weights->data()); + auto bias_data = param.bias ? + static_cast(param.bias->data()) : + nullptr; + + for (int i = 0; i < src.size(); i++) { + int IC = src[i]->count_valid(1, src[i]->dims()); + auto src_data = static_cast(src[i]->data()); + + #pragma omp parallel for collapse(2) schedule(static) + + for (int mb = 0; mb < batch_size; mb++) { + for (int oc = 0; oc < output_channel; oc++) { + int oidx = mb * output_channel + oc; + + if (i == 0) { + if (src[0]->get_dtype() == AK_UINT8) { + dst_tmp_data[oidx] = bias_data ? bias_data[oc] : dst_dtype{0}; + } else { + dst_data[oidx] = bias_data ? bias_data[oc] : dst_dtype{0}; + } + } + + for (int ic = 0; ic < IC; ic++) { + int iidx = mb * IC + ic; + int widx = oc * IC + ic; + + if (src[0]->get_dtype() == AK_UINT8) { + dst_tmp_data[oidx] += src_data[iidx] * weights_data[widx]; + } else { + dst_data[oidx] += src_data[iidx] * weights_data[widx]; + } + } + } + } + + weights_data += output_channel * IC; + } + + if (src[0]->get_dtype() == AK_UINT8) { + for (int mb = 0; mb < batch_size; mb++) { + for (int oc = 0; oc < output_channel; oc++) { + int dst_index = mb * output_channel + oc; + float scale = (src[0]->get_scale()[0] * param.weights->get_scale()[oc]) / + dst[0]->get_scale()[0]; + dst_data[dst_index] = scale * dst_tmp_data[dst_index]; + } + } + } +} + +template +void test_fc_cpu(int mb, + std::vector ic, + int oc, + bool with_bias = false, + std::vectorscale = {1.f, 1.f, 1.f}, + LayoutType layout = Layout_NCHW) { + Env::env_init(); + Context ctx_host; + + std::vector *> inputs, outputs, outputs_ref; + Tensor weights, bias; + + int total_ic = 0; + + for (int i = 0; i < ic.size(); i++) { + total_ic += ic[i]; + Shape InputShape({mb, layout == Layout_NCHW ? ic[i] : 1, + 1, layout == Layout_NCHW ? 1 : ic[i]}, layout); + inputs.push_back(new Tensor); + inputs[i]->re_alloc(InputShape, inDtype); + + if (inDtype == AK_FLOAT) { + fill_tensor_rand(*inputs[i], -10.f, 10.f); + } else { + fill_tensor_rand(*inputs[i], 0, 255); + inputs[i]->set_scale({scale[0]}); + } + } + + Shape WeightShape({oc, layout == Layout_NCHW ? total_ic : 1, + 1, layout == Layout_NCHW ? 1 : total_ic}, layout); + Shape BiasShape({layout == Layout_NCHW ? oc : 1, 1, + 1, layout == Layout_NCHW ? 1 : oc}, layout); + Shape OutShape({mb, layout == Layout_NCHW ? oc : 1, + 1, layout == Layout_NCHW ? 1 : oc}, layout); + + outputs.push_back(new Tensor); + outputs_ref.push_back(new Tensor); + + weights.re_alloc(WeightShape, opDtype); + bias.re_alloc(BiasShape, biasDtype); + outputs[0]->re_alloc(OutShape, outDtype); + outputs_ref[0]->re_alloc(OutShape, outDtype); + + fill_tensor_rand(weights, -10, 10); + fill_tensor_rand(bias, -10, 10); + + std::vector scale_weights; + + for (int i = 0; i < oc; i ++) { + scale_weights.push_back(scale[1]); + } + + weights.set_scale(scale_weights); + outputs[0]->set_scale({scale[2]}); + outputs_ref[0]->set_scale({scale[2]}); + + FcParam param(&weights, with_bias ? 
&bias : nullptr, oc); + Fc VenderFc; + + VenderFc.init(inputs, outputs, param, SPECIFY, VENDER_IMPL, ctx_host); + VenderFc(inputs, outputs, param, ctx_host); + + int flag = 10; + + if (opDtype == AK_FLOAT) { + fc_cpu_common(inputs, outputs_ref, param); + flag = count_diff(outputs[0]->data(), outputs_ref[0]->data(), + outputs[0]->valid_size(), 1e-3); + } else { + if (outDtype == AK_FLOAT) { + fc_cpu_common(inputs, outputs_ref, param); + flag = count_diff(outputs[0]->data(), outputs_ref[0]->data(), + outputs[0]->valid_size(), 1e-5); + } else if (outDtype == AK_INT32) { + fc_cpu_common(inputs, outputs_ref, param); + flag = count_diff(outputs[0]->data(), outputs_ref[0]->data(), + outputs[0]->valid_size(), 1e-5); + } else if (outDtype == AK_INT8) { + fc_cpu_common(inputs, outputs_ref, param); + flag = count_diff(outputs[0]->data(), outputs_ref[0]->data(), + outputs[0]->valid_size(), 1e-5); + } + } + + if (flag <= 5) { + LOG(INFO) << "Test fc x86 passed"; + } else { + LOG(ERROR) << "Test fc x86 failed"; + } + + return; +} + +template +static void fc_cpu_base(const std::vector* >& input, + std::vector* >& output, FcParam& param) { + + const dtype* data_in = (const dtype*)input[0]->data(); + const dtype* bias = param.bias ? (const dtype*)param.bias->data() : nullptr; + + Tensor weights_h(param.weights->valid_shape()); + weights_h.copy_from(*param.weights); + + const dtype* weights = (const dtype*)weights_h.data(); + dtype* data_out = (dtype*)output[0]->mutable_data(); + + //is_trans: flase. + //output: data_out; inputs: data_in ; weights: weights. + //data_out = data_in * weights. Get weights' elements continuosly. + int out_rows = input[0]->num(); + int in_cols = input[0]->valid_size() / out_rows; + int out_cols = param.weights->valid_size() / in_cols; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + int index_out = i * out_cols + j; + data_out[index_out] = bias ? 
bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + //data_out[index_out] += data_in[i * in_cols + k] * weights[k * out_cols + j]; + data_out[index_out] += data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +template +static int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio) { + if (max_ratio <= 0) { + max_ratio = 0.1; + } + + int count = 0; + + for (int i = 0; i < size; ++i) { + double ratio = fabs(src1[i] - src2[i]) / fabs(src1[i] + src2[i] + 1e-12); + + if (ratio > max_ratio) { + ++count; + } + } + + return count; +} +template +static void test_fc_int8(int in_num, int in_channel, int in_height, int in_width, int num_output, + bool with_bias) { + Env::env_init(); + Env::env_init(); + Shape input_shape({in_num, in_channel, in_height, in_width}); + Shape weights_shape({1, 1, num_output, in_channel* in_height * in_width}); + Shape bias_shape({1, 1, 1, num_output}); + Tensor host_input(input_shape); + Tensor dev_input{input_shape}; + Tensor host_weights(weights_shape); + Tensor dev_weights{weights_shape}; + Tensor host_bias; + Tensor dev_bias; + Tensor host_output; + Tensor dev_output; + Tensor check_output; + + float input_max = 1.f; + fill_tensor_rand(host_input, -input_max, input_max); + // fill_tensor_const(host_input, input_max); + dev_input.copy_from(host_input); + dev_input.set_scale({input_max / 127.f}); + + fill_tensor_rand(host_weights, -input_max, input_max); + // fill_tensor_seq(host_weights); + // fill_tensor_const(host_weights, input_max); + dev_weights.copy_from(host_weights); + + + if (with_bias) { + host_bias.re_alloc(bias_shape); + dev_bias.re_alloc(bias_shape); + fill_tensor_const(host_bias, input_max); + // fill_tensor_rand(host_bias, -input_max, input_max); + dev_bias.copy_from(host_bias); + } + + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&dev_input); + output_v.push_back(&dev_output); + + Context ctx1(0, 1, 1); + FcParam param(&dev_weights, &dev_bias, num_output); + Fc fc; + + fc.compute_output_shape(input_v, output_v, param); + dev_output.re_alloc(dev_output.valid_shape()); + dev_output.set_scale({1.f}); + host_output.re_alloc(dev_output.valid_shape()); + check_output.re_alloc(dev_output.valid_shape()); + + SABER_CHECK(fc.init(input_v, output_v, param, SPECIFY, VENDER_IMPL, ctx1)); + SABER_CHECK(fc(input_v, output_v, param, ctx1)); + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + + std::vector* > input_h; + std::vector* > output_h; + input_h.push_back(&host_input); + output_h.push_back(&check_output); + fc_cpu_base(input_h, output_h, param); + + + host_output.copy_from(dev_output); + double max_ratio = 0.0; + double max_diff = 0.0; + tensor_cmp_host_mlu((const float*)check_output.data(), (const float*)host_output.data(), + host_output.valid_size(), max_ratio, max_diff); + + if (max_ratio > 0.1) { + write_tensorfile(dev_weights, "input_weights"); + write_tensorfile(dev_output, "output_dev"); + write_tensorfile(check_output, "check_host"); + LOG(FATAL) << "ratio " << max_ratio; + } else { + // write_tensorfile(dev_output,"output_dev"); + // write_tensorfile(check_output,"check_host"); + LOG(ERROR) << "passed " << max_ratio; + } + + +}; +#ifdef USE_X86_PLACE +void test_int8_perf(int m, int n, int k, int iter = 100) { + signed char* ptr_a = new signed char[m * k]; + unsigned char* ptr_b = new unsigned char[k * n]; + int* ptr_c = new int[m * n]; + Tensora(Shape({1, 1, m, k}), AK_INT8); + Tensorb(Shape({1, 1, k, 
n}), AK_UINT8); + Tensorc(Shape({1, 1, 1, m}), AK_INT32); + + for (int i = 0; i < m * k; i++) { + ptr_a[i] = 127; + } + + for (int i = 0; i < k * n; i++) { + ptr_b[i] = 255; + } + + int c_offset = 0; + cblas_gemm_s8u8s32(CblasColMajor, // Layout + CblasTrans, // a need to transpose or not + CblasNoTrans, // b need to transpose or not + CblasFixOffset, // c_offset_layout + m, // m + n, // n + k, // k + 1.0, // scale + ptr_a, // a + k, // lda + 0, // a_offset + ptr_b, // b + k, // ldb + 0, // b_offset + 0.0, // beta + ptr_c, // c + m, // ldc + &c_offset); + Context ctx(0, 1, 1); + SaberTimer timer; + timer.start(ctx); + + for (int i = 0; i < iter; i++) { + cblas_gemm_s8u8s32(CblasColMajor, // Layout + CblasTrans, // a need to transpose or not + CblasNoTrans, // b need to transpose or not + CblasFixOffset, // c_offset_layout + m, // m + n, // n + k, // k + 1.0, // scale + ptr_a, // a + k, // lda + 0, // a_offset + ptr_b, // b + k, // ldb + 0, // b_offset + 0.0, // beta + ptr_c, // c + m, // ldc + &c_offset); + } + + timer.end(ctx); + double work = 2 * m * n * k; + double use_time = timer.get_average_ms() / iter; + double speed = work / use_time / 1000 / 1000; + LOG(INFO) << m << "," << n << "," << k << "::" << "gfloat " << speed; +} +#endif + +TEST(TestSaberFunc, test_op_fc) { +#ifdef USE_CUDA +#endif + +#ifdef USE_X86_PLACE + Env::env_init(); + + if (jit::mayiuse(jit::avx512_core_vnni)) { + + for (auto m : { + 1, 3, 5, 7 + }) { + for (auto n : { + 3, 12, 17 + }) { + for (auto k : { + 7, 16, 22 + }) { + for (auto with_bias : { + false, true + }) { + test_fc_int8(m, 1, 1, k, n, with_bias); + } + } + } + } + + int m = 3; + int n = 5; + int k = 7; + test_fc_int8(m, 1, 1, k, n, true); + } + +#endif +} + + +int main(int argc, const char** argv) { + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} \ No newline at end of file diff --git a/test/saber/test_saber_func.h b/test/saber/test_saber_func.h index 16b2a9bfc..f3267c31f 100644 --- a/test/saber/test_saber_func.h +++ b/test/saber/test_saber_func.h @@ -33,8 +33,27 @@ static void split_string(const std::string& s, char delim, elems.push_back(item); } } +template +void tensor_cmp_host_mlu(const Dtype* src1, const Dtype* src2, \ + int size, double& max_ratio, double& max_diff) { + double sum_diff_sq = 0.0; + double sum_x_sq = 0.0; + double eps = 1e-10; + for (size_t i = 0; i < size; i++) { + if (std::isnan(src1[i]) || std::isnan(src2[2])){ + max_ratio = 9999; + max_diff = 9999; + return; + } + sum_diff_sq += (src1[i] - src2[i]) * (src1[i] - src2[i]); + sum_x_sq += src2[i] * src2[i]; + } + + max_ratio = sqrt(sum_diff_sq / (sum_x_sq + eps)); + max_diff = max_ratio; +} -int read_file(std::vector& results, const char* file_name, char split_char, int index) { +bool read_file(std::vector& results, const char* file_name, char split_char, int index) { std::ifstream infile(file_name); @@ -52,9 +71,9 @@ int read_file(std::vector& results, const char* file_name, char split_cha results.push_back((float)atof(vec[index].c_str())); } - return 0; + return true; } -int read_file(std::vector& results, const char* file_name) { +bool read_file(std::vector& results, const char* file_name) { std::ifstream infile(file_name); @@ -70,7 +89,7 @@ int read_file(std::vector& results, const char* file_name) { results.push_back((float)atof(line.c_str())); } - return 0; + return true; } class TestSaberFunc : public Test { public: diff --git a/test/saber/test_saber_gemm.cpp b/test/saber/test_saber_gemm.cpp index bdd58d69c..fd2df125c 100644 --- 
a/test/saber/test_saber_gemm.cpp +++ b/test/saber/test_saber_gemm.cpp @@ -6,9 +6,35 @@ #include "saber/saber_types.h" #include "test_saber_func.h" #include "conv_func_helper.h" + +#if defined(USE_X86_PLACE) +#include "saber/funcs/impl/x86/mkl_gemm.h" +#include "saber/funcs/impl/x86/mkl_packed_int8_gemm.h" +#include +#include "saber/funcs/impl/x86/kernel/jit_generator.h" +#endif + #include +#include "debug.h" using namespace anakin::saber; +#define CLEAR_CACHE 0 + + +#ifdef USE_X86_PLACE + +void flush_tensor_cache_out(Tensor& tensor) { +#ifdef USE_X86_PLACE + char* ptr = static_cast(tensor.data()); + size_t amount = tensor.valid_size() * tensor.get_dtype_size(); + + for (size_t i = 0; i < amount; i += 32) { + _mm_clflush(ptr + i); + } + +#endif +} +#endif void gemm_check(const int m, const int n, const int k, const float* a, const float* b, float* c, @@ -18,9 +44,11 @@ void gemm_check(const int m, const int n, const int k, int lda = k; int ldb = n; int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { for (int n_i = 0; n_i < n; ++n_i) { c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { c[m_i * ldc + n_i] += alpha * a[m_i * lda + k_i] * b[k_i * ldb + n_i]; } @@ -30,9 +58,11 @@ void gemm_check(const int m, const int n, const int k, int lda = k; int ldb = k; int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { for (int n_i = 0; n_i < n; ++n_i) { c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { c[m_i * ldc + n_i] += alpha * a[m_i * lda + k_i] * b[n_i * ldb + k_i]; } @@ -42,9 +72,11 @@ void gemm_check(const int m, const int n, const int k, int lda = m; int ldb = n; int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { for (int n_i = 0; n_i < n; ++n_i) { c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { c[m_i * ldc + n_i] += alpha * a[k_i * lda + m_i] * b[k_i * ldb + n_i]; } @@ -54,9 +86,11 @@ void gemm_check(const int m, const int n, const int k, int lda = m; int ldb = k; int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { for (int n_i = 0; n_i < n; ++n_i) { c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { c[m_i * ldc + n_i] += alpha * a[k_i * lda + m_i] * b[n_i * ldb + k_i]; } @@ -66,8 +100,195 @@ void gemm_check(const int m, const int n, const int k, } +#if defined(USE_X86_PLACE) +template +void test_gemm_result_mkldnn(int m, int n, int k, bool trans_a, bool trans_b, + MKLGemmMode gemm_mode = NORMAL_MKLGEMM) { + + Tensor a_dev, b_dev, c_dev; + Tensor a_host, b_host, c_host, c_check; + + Context ctx1(0, 1, 0); + MklDnnGemm gemm_vender; + + + float alpha = 1.f; + float beta = 0.f; + + Shape a_shape({m, k}, Layout_HW); + Shape b_shape({k, n}, Layout_HW); + Shape c_shape({m, n}, Layout_HW); + + a_dev.re_alloc(a_shape, AK_FLOAT); + b_dev.re_alloc(b_shape, AK_FLOAT); + c_dev.re_alloc(c_shape, AK_FLOAT); + + a_host.re_alloc(a_shape, AK_FLOAT); + b_host.re_alloc(b_shape, AK_FLOAT); + c_host.re_alloc(c_shape, AK_FLOAT); + c_check.re_alloc(c_shape, AK_FLOAT); + fill_tensor_rand(a_dev, -10.f, 10.f); + fill_tensor_rand(b_dev, -10.f, 10.f); + a_host.copy_from(a_dev); + b_host.copy_from(b_dev); + SaberTimer vender_time, saber_time; + int ts = 1000; + int warm_up = 100; + + SaberStatus vender_status = gemm_vender.init(trans_a, trans_b, 1, n, k, ctx1, + static_cast(b_dev.data()), gemm_mode); + + if (vender_status == SaberSuccess) { + gemm_vender.dispatch(alpha, beta, m, + (const float*) a_dev.data(), + (const float*) b_dev.data(), + (float*) c_dev.mutable_data()); + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + 
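+        // Note: dispatch() may run asynchronously on the context's compute stream, so the
+        // test records an event on that stream and syncs on it before c_dev is copied back
+        // to the host and compared against the gemm_check() reference.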
c_dev.record_event(stream); + c_dev.sync(); + c_host.copy_from(c_dev); + gemm_check(m, n, k, (const float*) a_host.data(), (const float*) b_host.data(), + (float*) c_check.mutable_data(), + alpha, beta, trans_a, trans_b); + double max_ratio = 0.f, max_diff = 0.f; + tensor_cmp_host((const float*) c_check.data(), (const float*) c_host.data(), + c_check.valid_size(), max_ratio, max_diff); + + if (max_ratio > 1e-3) { + print_tensor(a_dev); + print_tensor(b_dev); + print_tensor_valid(c_check); + print_tensor_valid(c_host); + LOG(FATAL) << "VENDER: FAIL!!!! max_ratio = " << max_ratio << " max_diff: " << max_diff + << "m = " << m << " n = " << n << " k = " << k; + } + + for (int t = 0; t < warm_up; t++) { + gemm_vender.dispatch(alpha, beta, m, + (const float*) a_dev.data(), + (const float*) b_dev.data(), + (float*) c_dev.mutable_data()); + } + + for (int t = 0; t < ts; ++t) { + vender_time.start(ctx1); + gemm_vender.dispatch(alpha, beta, m, + (const float*) a_dev.data(), + (const float*) b_dev.data(), + (float*) c_dev.mutable_data()); + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + vender_time.end(ctx1); + } + } + + double work = (double)m * n * k * 2; + double vender_time_ms = (vender_status == SaberSuccess ? vender_time.get_average_ms() : 1e10); + double vender_speed = work / vender_time_ms / 1000.0 / 1000.0; + LOG(INFO) << "mkldnn " << m << "," << n << "," << k << "::" << "gops " << vender_speed << ", ms = " + << vender_time_ms; + // LOG(INFO) << "Vender time: " << (vender_status == SaberSuccess ? vender_time.get_average_ms() : 0) + // << "ms ,speed = " << vender_speed << "gfloat/s"; +} + +template +void test_gemm_result_mkl_warp(int m, int n, int k, bool trans_a, bool trans_b) { + + Tensor a_dev, b_dev, c_dev; + Tensor a_host, b_host, c_host, c_check; + + Context ctx1(0, 1, 0); + PackedMKLInt8Gemm gemm_vender; + float input_max = 3.f; + + float alpha = 1.f; + float beta = 0.f; + + Shape a_shape({1, 1, m, k}, Layout_NCHW); + Shape b_shape({1, 1, k, n}, Layout_NCHW); + Shape c_shape({1, 1, m, n}, Layout_NCHW); + + a_dev.re_alloc(a_shape, AK_FLOAT); + b_dev.re_alloc(b_shape, AK_FLOAT); + c_dev.re_alloc(c_shape, AK_FLOAT); + + a_host.re_alloc(a_shape, AK_FLOAT); + b_host.re_alloc(b_shape, AK_FLOAT); + c_host.re_alloc(c_shape, AK_FLOAT); + c_check.re_alloc(c_shape, AK_FLOAT); +// fill_tensor_rand(a_dev, -input_max, input_max); +// fill_tensor_rand(b_dev, -input_max, input_max); + fill_tensor_const(a_dev,input_max); + fill_tensor_const(b_dev,input_max); + a_host.copy_from(a_dev); + b_host.copy_from(b_dev); + SaberTimer vender_time, saber_time; + int ts = 1000; + int warm_up = 100; + + a_dev.set_scale({input_max / 127.f}); + SaberStatus vender_status = gemm_vender.init(trans_a, trans_b, 1, n, k, b_dev, + a_dev.get_scale()[0]); + + if (vender_status == SaberSuccess) { + gemm_vender.dispatch(alpha, beta, m, + a_dev, + c_dev, nullptr); + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + c_host.copy_from(c_dev); + gemm_check(m, n, k, (const float*) a_host.data(), (const float*) b_host.data(), + (float*) c_check.mutable_data(), + alpha, beta, trans_a, trans_b); + double max_ratio = 0.f, max_diff = 0.f; + + tensor_cmp_host_mlu((const float*)c_host.data(), (const float*)c_check.data(), + c_check.valid_size(), max_ratio, max_diff); + + if (max_ratio > 0.1) { + write_tensorfile(c_dev, "output_dev"); + write_tensorfile(c_check, "check_host"); + LOG(FATAL) << "VENDER: FAIL!!!! 
max_ratio = " << max_ratio << " max_diff: " << max_diff + << "m = " << m << " n = " << n << " k = " << k; + } else { + LOG(INFO) << "passed " << max_ratio; + } + + for (int t = 0; t < warm_up; t++) { + gemm_vender.dispatch(alpha, beta, m, + a_dev, + c_dev, nullptr); + } + + for (int t = 0; t < ts; ++t) { + vender_time.start(ctx1); + gemm_vender.dispatch(alpha, beta, m, + a_dev, + c_dev, nullptr); + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + vender_time.end(ctx1); + } + } + + double work = (double)m * n * k * 2; + double vender_time_ms = (vender_status == SaberSuccess ? vender_time.get_average_ms() : 1e10); + double vender_speed = work / vender_time_ms / 1000.0 / 1000.0; + LOG(INFO) << "mkldnn " << m << "," << n << "," << k << "::" << "gops " << vender_speed << ", ms = " + << vender_time_ms; + // LOG(INFO) << "Vender time: " << (vender_status == SaberSuccess ? vender_time.get_average_ms() : 0) + // << "ms ,speed = " << vender_speed << "gfloat/s"; +} + +#endif + template -void test_gemm_result (int m, int n, int k, bool trans_a, bool trans_b) { +void test_gemm_result(int m, int n, int k, bool trans_a, bool trans_b) { Tensor a_dev, b_dev, c_dev; Tensor a_host, b_host, c_host, c_check; @@ -98,76 +319,110 @@ void test_gemm_result (int m, int n, int k, bool trans_a, bool trans_b) { a_host.copy_from(a_dev); b_host.copy_from(b_dev); SaberTimer vender_time, saber_time; - int ts = 100; + + int ts = 300; + int warm_up = 50; + if (vender_status == SaberSuccess) { gemm_vender.dispatch(alpha, beta, - (const float *) a_dev.data(), - (const float *) b_dev.data(), - (float *) c_dev.mutable_data()); + (const float*) a_dev.data(), + (const float*) b_dev.data(), + (float*) c_dev.mutable_data()); typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); c_dev.record_event(stream); c_dev.sync(); c_host.copy_from(c_dev); - gemm_check(m, n, k, (const float *) a_host.data(), (const float *) b_host.data(), - (float *) c_check.mutable_data(), + gemm_check(m, n, k, (const float*) a_host.data(), (const float*) b_host.data(), + (float*) c_check.mutable_data(), alpha, beta, trans_a, trans_b); double max_ratio = 0.f, max_diff = 0.f; - tensor_cmp_host((const float *) c_check.data(), (const float *) c_host.data(), + tensor_cmp_host((const float*) c_check.data(), (const float*) c_host.data(), c_check.valid_size(), max_ratio, max_diff); + if (max_ratio > 1e-3) { print_tensor_valid(c_check); print_tensor_valid(c_host); - LOG(FATAL) << "VENDER: FAIL!!!! max_ratio = " < -void test_gemv_result (int m, int n, bool trans) { +void test_gemv_result(int m, int n, bool trans) { Tensor a_dev, b_dev, c_dev; Tensor a_host, b_host, c_host, c_check; @@ -209,8 +471,8 @@ void test_gemv_result (int m, int n, bool trans) { float beta = 0.f; Shape a_shape({m, n}, Layout_HW); - Shape b_shape({(trans? m : n)}, Layout_W); - Shape c_shape({(trans? n : m)}, Layout_W); + Shape b_shape({(trans ? m : n)}, Layout_W); + Shape c_shape({(trans ? 
n : m)}, Layout_W); a_dev.re_alloc(a_shape, AK_FLOAT); b_dev.re_alloc(b_shape, AK_FLOAT); @@ -226,102 +488,160 @@ void test_gemv_result (int m, int n, bool trans) { b_host.copy_from(b_dev); SaberTimer vender_time, saber_time; - int ts = 100; + int ts = 1000; + if (vender_status == SaberSuccess) { gemv_vender.dispatch(alpha, beta, - (const float *) a_dev.data(), - (const float *) b_dev.data(), - (float *) c_dev.mutable_data()); + (const float*) a_dev.data(), + (const float*) b_dev.data(), + (float*) c_dev.mutable_data()); typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); c_dev.record_event(stream); c_dev.sync(); c_host.copy_from(c_dev); - gemv_check(m, n, (const float *) a_host.data(), (const float *) b_host.data(), - (float *) c_check.mutable_data(), + gemv_check(m, n, (const float*) a_host.data(), (const float*) b_host.data(), + (float*) c_check.mutable_data(), alpha, beta, trans); double max_ratio = 0.f, max_diff = 0.f; - tensor_cmp_host((const float *) c_check.data(), (const float *) c_host.data(), + tensor_cmp_host((const float*) c_check.data(), (const float*) c_host.data(), c_check.valid_size(), max_ratio, max_diff); + if (max_ratio > 1e-3) { print_tensor_valid(a_host); print_tensor_valid(b_host); print_tensor_valid(c_check); print_tensor_valid(c_host); - LOG(FATAL) << "VENDER: FAIL!!!! max_ratio = " < m_v = {5, 100, 150, 200, 250, 300}; - std::vector n_v = {5, 100, 150, 200, 250, 300}; - std::vector k_v = {5, 100, 150, 200, 250, 300}; - std::vector trans_a_v{false}; - std::vector trans_b_v{false}; + // std::vector m_v = {5, 100, 150, 200, 250, 300}; + // std::vector n_v = {5, 100, 150, 200, 250, 300}; + // std::vector k_v = {5, 100, 150, 200, 250, 300}; + // std::vector trans_a_v{false}; + // std::vector trans_b_v{false}; + // + // for (auto m : m_v) + // for (auto n : n_v) + // for (auto k : k_v) + // for (auto trans_a : trans_a_v) + // for (auto trans_b : trans_b_v) { + // + //#ifdef USE_CUDA + // test_gemm_result(m, n, k, trans_a, trans_b); + //#endif + // + //#ifdef USE_X86_PLACE + // test_gemm_result(m, n, k, trans_a, trans_b); + //#endif + // } - for (auto m : m_v) - for (auto n : n_v) - for (auto k : k_v) - for (auto trans_a : trans_a_v) - for (auto trans_b : trans_b_v) { + // test_gemm_result_mkldnn(2,3,4,false,false); -#ifdef USE_CUDA - test_gemm_result(m, n, k, trans_a, trans_b); -#endif + // int n = 4096, k = 1024; + // + // for (int m : { + // 1, 2, 3, 4, 12, 16 + // }) { + //// test_gemm_result(m, n, k, false, false); + // test_gemm_result_mkldnn(m,n,k,false,false); + // } -#ifdef USE_X86_PLACE - test_gemm_result(m, n, k, trans_a, trans_b); -#endif + if (jit::mayiuse(jit::avx512_core_vnni)) { + test_gemm_result_mkl_warp(222, 333, 444, false, false); } + test_gemm_result_mkldnn(12, 1536 * 4, 512, false, false, PACKED_MKLGEMM); + test_gemm_result_mkldnn(12, 1536 * 4, 2048, false, false, PACKED_MKLGEMM); + test_gemm_result_mkldnn(4, 1536 * 4, 512, false, false, PACKED_MKLGEMM); + test_gemm_result_mkldnn(4, 512, 1536, false, false, PACKED_MKLGEMM); + test_gemm_result_mkldnn(1, 1536 * 4, 512, false, false, PACKED_MKLGEMM); + test_gemm_result_mkldnn(1, 512, 1536, false, false, PACKED_MKLGEMM); + + // + // test_gemm_result(16,1536*4,512,false,false); + // test_gemm_result(16,1536*4,2048,false,false); + // test_gemm_result(4,1536*4,512,false,false); + // test_gemm_result(512,512,512,false,false); + // test_gemm_result(1024,1024,1024,false,false); + + // test_gemm_result(4,512,1536,false,false); + // test_gemm_result(1,1536*4,512,false,false); + // 
test_gemm_result(1,512,1536,false,false); + // test_gemm_result(256,24*24,128,false,false); + // test_gemm_result(256,24*24,128,false,false); + + + // test_mkl_gemm(32,32,32); + // test_mkl_gemm(64,64,64); + // test_mkl_gemm(128,128,128); + // test_mkl_gemm(256,256,256); + // test_mkl_gemm(512,512,512); + + // test_gemm_result(32,32,32,false,false); + // test_gemm_result(64,64,64,false,false); + // test_gemm_result(128,128,128,false,false); + // test_gemm_result(256,256,256,false,false); + // test_gemm_result(512,512,512,false,false); + // test_gemm_result(2048,2048,2048,false,false); + // test_gemm_result(4096,4096,4096,false,false); +#endif } TEST(TestSaberFunc, test_vender_gemv_float) { @@ -331,17 +651,17 @@ TEST(TestSaberFunc, test_vender_gemv_float) { std::vector trans_v{false, true}; for (auto m : m_v) - for (auto n : n_v) - for (auto trans : trans_v) { + for (auto n : n_v) + for (auto trans : trans_v) { #ifdef USE_CUDA - test_gemv_result(m, n, trans); + test_gemv_result(m, n, trans); #endif #ifdef USE_X86_PLACE - test_gemv_result(m, n, trans); + // test_gemv_result(m, n, trans); #endif - } + } } int main(int argc, char* argv[]) { diff --git a/test/saber/test_saber_gemm_int8.cpp b/test/saber/test_saber_gemm_int8.cpp index 27ec98d1e..98e89bd9c 100644 --- a/test/saber/test_saber_gemm_int8.cpp +++ b/test/saber/test_saber_gemm_int8.cpp @@ -8,8 +8,30 @@ #include "test_saber_func.h" #include "conv_func_helper.h" #include - +#if defined(USE_X86_PLACE) +#include "saber/funcs/impl/x86/mkl_gemm.h" +#include "saber/funcs/impl/x86/intrinsic_gemm.h" +#include "saber/funcs/impl/x86/intrinsic_packed_fc.h" +#include +#define CLEAR_CACHE 1 +#endif using namespace anakin::saber; +#if defined(USE_X86_PLACE) +const size_t g_cache_size = 10 * 1000 * 1000; +char g_cache[g_cache_size]; +void clear_cache(){ + for (int i = 0;i < g_cache_size;i += 64){ + g_cache[i]++; + } +} +void flush_tensor_cache_out(Tensor& tensor){ + char* ptr = static_cast(tensor.data()); + size_t amount=tensor.valid_size() * tensor.get_dtype_size(); + for (size_t i = 0;i < amount;i += 32){ + _mm_clflush(ptr + i); + } +} +#endif void gemm_check(const int m, const int n, const int k, const float* a, const float* b, float* c, @@ -66,6 +88,95 @@ void gemm_check(const int m, const int n, const int k, } } +template +void gemm_check_int8(const int m, const int n, const int k, + const AType* a, const BType* b, CType* c, + const float alpha, const float beta, + const bool trans_a, const bool trans_b,bool is_base_gemm=false) { + if(is_base_gemm){ +// LOG(INFO)<<"in"; + int lda = k; + int ldb = k; + int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { + for (int n_i = 0; n_i < n; ++n_i) { + c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { + c[m_i * ldc + n_i] += static_cast(alpha * (int)a[m_i * lda + k_i] * (int)b[n_i * ldb + k_i]); + } + } + } + return; + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + int32_t old_c = (beta == 0) ? 
0 : c[i * ldc + j]; + int32_t res = 0; + c[i * ldc + j]*=beta; + for (int d = 0; d < k; ++d) { + res += a[i * lda + d] * b[j * ldb + d]; + } + c[i * ldc + j] += res * alpha; + } + } + return; + } + if (!trans_a && !trans_b) { + int lda = k; + int ldb = n; + int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { + for (int n_i = 0; n_i < n; ++n_i) { + c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { + c[m_i * ldc + n_i] += static_cast(alpha * (int)a[m_i * lda + k_i] * (int)b[k_i * ldb + n_i]); + } + } + } + } else if (!trans_a && trans_b) { + int lda = k; + int ldb = k; + int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { + for (int n_i = 0; n_i < n; ++n_i) { + c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { + c[m_i * ldc + n_i] += static_cast(alpha * a[m_i * lda + k_i] * b[n_i * ldb + k_i]); + } + } + } + } else if (trans_a && !trans_b) { + int lda = m; + int ldb = n; + int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { + for (int n_i = 0; n_i < n; ++n_i) { + c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { + c[m_i * ldc + n_i] += static_cast(alpha * a[k_i * lda + m_i] * b[k_i * ldb + n_i]); + } + } + } + } else { + int lda = m; + int ldb = k; + int ldc = n; + for (int m_i = 0; m_i < m; ++m_i) { + for (int n_i = 0; n_i < n; ++n_i) { + c[m_i * ldc + n_i] *= beta; + for (int k_i = 0; k_i < k; ++k_i) { + c[m_i * ldc + n_i] += static_cast(alpha * a[k_i * lda + m_i] * b[n_i * ldb + k_i]); + } + } + } + } +} +template<> +void gemm_check_int8(const int m, const int n, const int k, + const float* a, const float* b, float* c, + const float alpha, const float beta, + const bool trans_a, const bool trans_b,bool is_base_gemm){ + gemm_check(m,n,k,a,b,c,alpha,beta,trans_a,trans_b); +} template int count_diff(const dtype* src1, const dtype* src2, int size, double max_ratio) { if (max_ratio <= 0) { @@ -88,6 +199,13 @@ void test_gemm_int8_result (int m, int n, int k, bool trans_a, bool trans_b) { Tensor a_host, b_host, c_host, c_check; Context ctx1(0, 1, 0); + int generate_arch = Env::cur_env()[ctx1.get_device_id()]._info._generate_arch; + // only support 61 arch for now. + bool arch_check = (generate_arch == 61); + if (!arch_check) { + LOG(INFO) << "device not support int8 op!!"; + return; + } Gemm gemm_vender; Gemm gemm_saber; SaberStatus vender_status = gemm_vender.init(trans_a, trans_b, m, n, k, ctx1); @@ -266,14 +384,379 @@ void test_gemm_int8_result (int m, int n, int k, bool trans_a, bool trans_b) { << " ms Saber time: " << (saber_status == SaberSuccess ? 
saber_time.get_average_ms() : 0) << " ms"; } +#if defined(USE_X86_PLACE) +template +void test_gemm_result_mkldnn(int m, int n, int k, bool trans_a, bool trans_b, bool packed_gemm = false) { + + Tensor a_dev, b_dev, c_dev; + Tensor a_host, b_host, c_host, c_check; + typedef typename DataTrait::Dtype AType; + typedef typename DataTrait::Dtype BType; + Context ctx1(0, 1, 0); + MklDnnGemm gemm_vender; + + + float alpha = 1.f; + float beta = 0.f; + + Shape a_shape({m, k}, Layout_HW); + Shape b_shape({k, n}, Layout_HW); + Shape c_shape({m, n}, Layout_HW); + a_dev.re_alloc(a_shape, AK_AType); + b_dev.re_alloc(b_shape, AK_BType); + c_dev.re_alloc(c_shape, AK_INT32); + + a_host.re_alloc(a_shape, AK_AType); + b_host.re_alloc(b_shape, AK_BType); + c_host.re_alloc(c_shape, AK_INT32); + c_check.re_alloc(c_shape, AK_INT32); + if (AK_AType==AK_UINT8){ + fill_tensor_rand(a_dev, 0.f, 240.f); + fill_tensor_rand(b_dev, -150.f, 150.f); + }else if(AK_AType==AK_INT8){ + fill_tensor_rand(a_dev, -126.f, 126.f); + fill_tensor_rand(b_dev, -126.f, 126.f); + }else{ + fill_tensor_rand(a_dev, -126.f, 126.f); + fill_tensor_rand(b_dev, -126.f, 126.f); + } + + a_host.copy_from(a_dev); + b_host.copy_from(b_dev); + + SaberStatus vender_status =SaberSuccess; + if(packed_gemm) { + vender_status = gemm_vender.init(trans_a, trans_b, m, n, k, ctx1, (BType *) b_dev.data(),PACKED_MKLGEMM); + }else{ + vender_status = gemm_vender.init(trans_a, trans_b, m, n, k, ctx1, (BType *) b_dev.data(),NORMAL_MKLGEMM); + fill_tensor_rand(b_dev, -150.f, 150.f); + b_host.copy_from(b_dev); + } + + SaberTimer vender_time, saber_time; + int ts = 200; + + if (vender_status == SaberSuccess) { + gemm_vender.dispatch(alpha, beta,m, + (const AType*) a_dev.data(), + (const BType*) b_dev.data(), + (int*) c_dev.mutable_data()); + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + c_host.copy_from(c_dev); + + gemm_check_int8(m, n, k, (const AType*) a_host.data(), (const BType*) b_host.data(), + (int*) c_check.mutable_data(), + alpha, beta, trans_a, trans_b); + double max_ratio = 0.f, max_diff = 0.f; + tensor_cmp_host_mlu((const int*) c_check.data(), (const int*) c_host.data(), + c_check.valid_size(), max_ratio, max_diff); + + if (max_ratio > 1e-3) { + LOG(FATAL) << "VENDER: FAIL!!!! max_ratio = " << max_ratio << " max_diff: " << max_diff + << "m = " << m << " n = " << n << " k = " << k; + } + + for (int t = 0; t < ts; ++t) { +#if CLEAR_CACHE + flush_tensor_cache_out(a_dev); + flush_tensor_cache_out(b_dev); + flush_tensor_cache_out(c_dev); +#endif + vender_time.start(ctx1); + gemm_vender.dispatch(alpha, beta,m, + (const AType*) a_dev.data(), + (const BType*) b_dev.data(), + (int*) c_dev.mutable_data()); + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + vender_time.end(ctx1); + } + }else{ + LOG(ERROR)<<"MklDnnGemm not impl"; + } + + double work = (double)m * n * k * 2; + double vender_time_ms = (vender_status == SaberSuccess ? 
vender_time.get_average_ms() : 1e10); + double vender_speed = work / vender_time_ms / 1000.0 / 1000.0; + LOG(INFO)<<"mkldnn " < +struct MyDataTrait { + typedef __invalid_type Dtype; +}; +template <> +struct MyDataTrait { + typedef float Dtype; +}; +template <> +struct MyDataTrait { + typedef int Dtype; +}; +template <> +struct MyDataTrait { + typedef int8_t Dtype; +}; +template <> +struct MyDataTrait { + typedef uint8_t Dtype; +}; + +template +void test_gemm_result_intrin_me(int m, int n, int k, bool trans_a, bool trans_b,bool check_correct=true,PackedFCAlg alg=DotReduction) { + + Tensor a_dev, b_dev, c_dev; + Tensor a_host, b_host, c_host, c_check; + typedef typename MyDataTrait::Dtype AType; + typedef typename MyDataTrait::Dtype BType; + typedef typename MyDataTrait::Dtype CType; + Context ctx1(0, 1, 0); + PackedFC gemm_vender; + + + float alpha = 1.f; + float beta = 0.f; + + Shape a_shape({1,1,m, k}, Layout_NCHW); + Shape b_shape({1,1,k, n}, Layout_NCHW); + Shape c_shape({1,1,m, n}, Layout_NCHW); + + a_dev.re_alloc(a_shape, AK_AType); + b_dev.re_alloc(b_shape, AK_BType); + c_dev.re_alloc(c_shape, AK_CType); + + a_host.re_alloc(a_shape, AK_AType); + b_host.re_alloc(b_shape, AK_BType); + c_host.re_alloc(c_shape, AK_CType); + c_check.re_alloc(c_shape, AK_CType); + if(AK_AType==AK_UINT8){ + fill_tensor_rand(a_dev, 0.f, 220.f); + + }else if(AK_AType==AK_FLOAT){ + fill_tensor_rand(a_dev, -1.f, 1.f); + a_dev.set_scale({1.f/127.f}); + } else{ +// fill_tensor_const(a_dev,1); + fill_tensor_rand(a_dev); + + } + + if(AK_BType==AK_INT8){ +// fill_tensor_const(b_dev,1); + fill_tensor_rand(b_dev); + }else if(AK_BType==AK_FLOAT){ + fill_tensor_rand(b_dev,-1.f,1.f); + b_dev.set_scale({1.f/127.f}); + }else{ + LOG(FATAL)<<"not impl"; + } + + if(AK_CType==AK_FLOAT){ + c_dev.set_scale({1.f}); + } + + a_host.copy_from(a_dev); + b_host.copy_from(b_dev); + + + SaberStatus vender_status = SaberNotInitialized; + if(AK_CType==AK_FLOAT){ + CHECK_EQ(a_dev.get_scale().size(),1); + CHECK_EQ(c_dev.get_scale().size(),1); + vender_status=gemm_vender.init(n,k,b_dev,a_dev.get_scale()[0],c_dev.get_scale()[0],alg); + + }else{ + vender_status=gemm_vender.init(n,k,b_dev,1.f,1.f,alg); + } + + + + if (vender_status == SaberSuccess) { + +// LOG(INFO)<<"m = "<::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + c_host.copy_from(c_dev); + gemm_check_int8(m, n, k, (const AType*) a_host.data(), (const BType*) b_host.data(), + (CType*) c_check.mutable_data(), + alpha, beta, trans_a, trans_b); + double max_ratio = 0.f, max_diff = 0.f; + double mlu_diff=0.f; +// tensor_cmp_host((const CType*) c_check.data(), (const CType*) c_host.data(), +// c_check.valid_size(), max_ratio, max_diff); + + tensor_cmp_host_mlu((const CType*) c_check.data(), (const CType*) c_host.data(), + c_check.valid_size(), mlu_diff); +// LOG(INFO)<<"mludiff = "< 1e-2) { +// print_tensor(a_dev); +// print_tensor(b_dev); + print_tensor_valid(c_check); + print_tensor_valid(c_host); + LOG(FATAL) << "VENDER: FAIL!!!! 
max_ratio = " << max_ratio << " max_diff: " << max_diff + << "m = " << m << " n = " << n << " k = " << k; + } +// LOG(INFO)<<"passed"; + } + + }else{ + LOG(ERROR)<<"MklDnnGemm not impl"; + } + + SaberTimer vender_time, saber_time; + int ts = 300; + int warm_up=0; + + for (int t = 0; t < warm_up; ++t) { + gemm_vender.dispatch(m,n,k, + a_dev, + c_dev); + } + for (int t = 0; t < ts; ++t) { +#if CLEAR_CACHE + flush_tensor_cache_out(a_dev); + flush_tensor_cache_out(b_dev); + flush_tensor_cache_out(c_dev); + flush_tensor_cache_out((gemm_vender._inner_weights)); +#endif + vender_time.start(ctx1); + gemm_vender.dispatch(m,n,k, + a_dev, + c_dev); + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + vender_time.end(ctx1); + } + + double work = (double)m * n * k * 2; + double vender_time_ms = (vender_status == SaberSuccess ? vender_time.get_average_ms() : 1e10); + double vender_speed = work / vender_time_ms / 1000.0 / 1000.0; + LOG(INFO)<<"me " < +void test_gemm_result_intrin(int m, int n, int k, bool trans_a, bool trans_b,bool is_base_gemm=false) { + + Tensor a_dev, b_dev, c_dev; + Tensor a_host, b_host, c_host, c_check; + typedef typename DataTrait::Dtype AType; + typedef typename DataTrait::Dtype BType; + Context ctx1(0, 1, 0); + IntrinsicGemm gemm_vender; + SaberStatus vender_status = gemm_vender.init(trans_a, trans_b, m, n, k, ctx1); + + float alpha = 1.f; + float beta = 0.f; + + Shape a_shape({m, k}, Layout_HW); + Shape b_shape({k, n}, Layout_HW); + Shape c_shape({m, n}, Layout_HW); + + a_dev.re_alloc(a_shape, AK_AType); + b_dev.re_alloc(b_shape, AK_BType); + c_dev.re_alloc(c_shape, AK_INT32); + + a_host.re_alloc(a_shape, AK_AType); + b_host.re_alloc(b_shape, AK_BType); + c_host.re_alloc(c_shape, AK_INT32); + c_check.re_alloc(c_shape, AK_INT32); + if(AK_AType==AK_UINT8){ +// fill_tensor_rand(a_dev, 0.f, 250.f); +// fill_tensor_rand(b_dev, -126.f, 126.f); + fill_tensor_rand(a_dev, 0.f, 220.f); + fill_tensor_rand(b_dev, -150.f, 150.f); + }else{ + fill_tensor_rand(a_dev); + fill_tensor_rand(b_dev); + } + + a_host.copy_from(a_dev); + b_host.copy_from(b_dev); + SaberTimer vender_time, saber_time; + int ts = 1000; + int warm_up = 100; +// LOG(INFO)<<"vender_status "<::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + c_host.copy_from(c_dev); + gemm_check_int8(m, n, k, (const AType*) a_host.data(), (const BType*) b_host.data(), + (int*) c_check.mutable_data(), + alpha, beta, trans_a, trans_b,is_base_gemm); + double max_ratio = 0.f, max_diff = 0.f; + tensor_cmp_host((const int*) c_check.data(), (const int*) c_host.data(), + c_check.valid_size(), max_ratio, max_diff); + + if (max_ratio > 1e-3) { + + LOG(FATAL) << "VENDER: FAIL!!!! max_ratio = " << max_ratio << " max_diff: " << max_diff + << "m = " << m << " n = " << n << " k = " << k; + } + for (int t = 0; t < warm_up; ++t) { + gemm_vender.dispatch(alpha, beta, + (const AType*) a_dev.data(), + (const BType*) b_dev.data(), + (int*) c_dev.mutable_data()); + } + + for (int t = 0; t < ts; ++t) { + vender_time.start(ctx1); + gemm_vender.dispatch(alpha, beta, + (const AType*) a_dev.data(), + (const BType*) b_dev.data(), + (int*) c_dev.mutable_data()); + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + c_dev.record_event(stream); + c_dev.sync(); + vender_time.end(ctx1); + } + }else{ + LOG(ERROR)<<"MklDnnGemm not impl"; + } + + double work = m * n * k * 2; + double vender_time_ms = (vender_status == SaberSuccess ? 
vender_time.get_average_ms() : 1e10); + double vender_speed = work / vender_time_ms / 1000.0 / 1000.0; + LOG(INFO)<<"audio "< m_v = {40, 20, 140, 200, 300}; std::vector n_v = {10, 20, 140, 200, 300}; std::vector k_v = {40, 20, 140, 200, 300}; - std::vector trans_a_v{false, true}; - std::vector trans_b_v{false, true}; + std::vector trans_a_v{false}; + std::vector trans_b_v{false}; for (auto m : m_v) for (auto n : n_v) @@ -284,11 +767,84 @@ TEST(TestSaberFunc, test_vender_gemm_float) { #ifdef USE_CUDA test_gemm_int8_result(m, n, k, trans_a, trans_b); #endif - -#ifdef USE_X86_PLACE -// test_gemm_int8_result(m, n, k, trans_a, trans_b); + } +#if defined(USE_X86_PLACE) +#if 1//defined(__AVX2__) + // test_gemm_result_intrin(12,1536*4,512,false,false,true); + // test_gemm_result_intrin(12,1536*4,2048,false,false,true); + // test_gemm_result_intrin(4,1536*4,512,false,false,true); + // test_gemm_result_intrin(4,512,1536,false,false,true); + // test_gemm_result_intrin(1,1536*4,512,false,false,true); + // test_gemm_result_intrin(1,512,1536,false,false,true); + + + // test_gemm_result_intrin_me(4,4,32,false,false); + // test_gemm_result_intrin_me(16,1536*4,512,false,false); + // test_gemm_result_intrin_me(16,1536*4,2048,false,false); + + +// test_gemm_result_intrin_me(16,1536*4,512,false,false,true,DotSplitK); + + +// test_gemm_result_intrin_me(16,1536*4,512,false,false,true,DotReductionPacked); +// test_gemm_result_intrin_me(16,1536*4,2048,false,false,true,DotReductionPacked); +// test_gemm_result_intrin_me(4,1536*4,512,false,false,true,DotReductionPacked); +// test_gemm_result_intrin_me(512,512,512,false,false,true,DotReductionPacked); +// test_gemm_result_intrin_me(1024,1024,1024,false,false,true,DotReductionPacked); + +// test_gemm_result_intrin_me(16,1536*4,512,false,false,true,DotReduction); +// test_gemm_result_intrin_me(16,1536*4,2048,false,false,true,DotReduction); +// test_gemm_result_intrin_me(4,1536*4,512,false,false,true,DotReduction); +// test_gemm_result_intrin_me(512,512,512,false,false,true,DotReduction); +// test_gemm_result_intrin_me(1024,1024,1024,false,false,true,DotReduction); + +// test_gemm_result_intrin_me(16,1536*4,512,false,false,true,DotAdd); +// test_gemm_result_intrin_me(16,1536*4,2048,false,false,true,DotAdd); +// test_gemm_result_intrin_me(4,1536*4,512,false,false,true,DotAdd); +// test_gemm_result_intrin_me(512,512,512,false,false,true,DotAdd); +// test_gemm_result_intrin_me(1024,1024,1024,false,false,true,DotAdd); + +// test_gemm_result_intrin_me(16,1536*4,512,false,false,true,DotSplitK); +// test_gemm_result_intrin_me(16,1536*4,2048,false,false,true,DotSplitK); +// test_gemm_result_intrin_me(4,1536*4,512,false,false,true,DotSplitK); +// test_gemm_result_intrin_me(512,512,512,false,false,true,DotSplitK); +// test_gemm_result_intrin_me(1024,1024,1024,false,false,true,DotSplitK); +// test_gemm_result_mkldnn(16, 16, 16, false, false); #endif + + if (jit::mayiuse(jit::avx512_core_vnni)) { + for (auto m : { + 1, 3, 6, 16 + }) { + for (auto n : { + 4, 12, 17, 23 + }) { + for (auto k : { + 3, 12, 16, 32, 33 + }) { + test_gemm_result_mkldnn(m, n, k, false, false, true); + test_gemm_result_mkldnn(m, n, k, false, false, true); + } + } + } +// test_gemm_result_mkldnn(4, 4, 4, false, false, true); +// test_gemm_result_mkldnn(2, 3, 4, false, false, true); +// test_gemm_result_mkldnn(2, 3, 4, false, false, true); +// test_gemm_result_mkldnn(2, 3, 4, false, false, false); + +// test_gemm_result_mkldnn(16, 1536 * 4, 512, false, false, true); +// test_gemm_result_mkldnn(16, 1536 * 4, 
512, false, false, false); +// test_gemm_result_mkldnn(16, 1536 * 4, 512, false, false, true); +// test_gemm_result_mkldnn(16, 1536 * 4, 512, false, false, false); + +// test_gemm_result_mkldnn(16, 1536 * 4, 2048, false, false); +// test_gemm_result_mkldnn(4, 1536 * 4, 512, false, false); +// test_gemm_result_mkldnn(512, 512, 512, false, false); +// test_gemm_result_mkldnn(1024, 1024, 1024, false, false); } + +#endif + } int main(int argc, char* argv[]) { diff --git a/test/saber/test_saber_generate_proposals.cpp b/test/saber/test_saber_generate_proposals.cpp new file mode 100644 index 000000000..c067dab8c --- /dev/null +++ b/test/saber/test_saber_generate_proposals.cpp @@ -0,0 +1,548 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/generate_proposals.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include +#include +//#define TEST_GENERATE_PROPOSALS +#ifdef TEST_GENERATE_PROPOSALS + +using namespace anakin::saber; + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +void read_tensor_from_file(float* data, int length, const char* path) { + std::fstream fs(path); + int i = 0; + if (fs.is_open()) { + std::string str; + while (true) { + std::getline(fs, str); + std::size_t found = str.find(" "); + if (found != std::string::npos) { + std::cout << "first 'needle' found at: " << found << '\n'; + break; + } + data[i++] = (atof)(str.c_str()); + } + fs.close(); + } +} + +/*NCHW->NHWC*/ +template +static inline void trans(Tensor* out, Tensor* in) { + auto shape = in->valid_shape(); + out->reshape(Shape({shape[0], shape[2], shape[3], shape[1]}, Layout_NCHW)); + auto stride = in->get_stride(); + auto dst = (Dtype*) out->mutable_data(); + auto src = (const Dtype*) in->data(); + for (auto i = 0; i < shape.count(); i++) { + int n = i / stride[0]; + int c = (i / stride[1]) % shape[1]; + int hw = i % (stride[1]); + int out_id = n * stride[0] + hw*shape[1] + c; + dst[out_id] = src[i]; + } +} + +template +static inline void box_coder(Tensor* proposals, + const Tensor* anchors, + const Tensor* bbox_deltas, + const Tensor* variances, + std::vector& index + ) { + proposals->reshape(Shape({index.size(), 4, 1, 1}, Layout_NCHW)); + int anchor_nums = index.size(); + int len = anchors->shape()[3]; + CHECK_EQ(len, 4) << "anchor length is 4"; + auto anchor_data = (const Dtype*) anchors->data(); + auto bbox_deltas_data = (const Dtype*) bbox_deltas->data(); + auto proposals_data = (Dtype*) proposals->data(); + const Dtype *variances_data = nullptr; + if (variances) { + variances_data = (const Dtype*)variances->data(); + } + for (int i = 0; i < index.size(); i++) { + int offset = index[i] * len; + auto anchor_data_tmp = anchor_data + offset; + auto variances_data_tmp = variances_data + offset; + auto bbox_deltas_data_tmp = bbox_deltas_data + offset; + auto proposals_data_tmp = proposals_data + i*len; + auto anchor_width = anchor_data_tmp[2] - anchor_data_tmp[0] + 1.0; + auto anchor_height = anchor_data_tmp[3] - anchor_data_tmp[1] + 1.0; + auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; + auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; + Dtype bbox_center_x = 0, bbox_center_y = 0; + Dtype bbox_width = 0, bbox_height = 0; + if (variances) { + bbox_center_x = + variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data_tmp[1] * + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = 
std::exp(std::min(variances_data_tmp[ 2] * + bbox_deltas_data_tmp[2], + kBBoxClipDefault)) * anchor_width; + bbox_height = std::exp(std::min(variances_data_tmp[3] * + bbox_deltas_data_tmp[3], + kBBoxClipDefault)) * anchor_height; + } else { + bbox_center_x = + bbox_deltas_data_tmp[0] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data_tmp[1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data_tmp[2], + kBBoxClipDefault)) * anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data_tmp[3], + kBBoxClipDefault)) * anchor_height; + } + proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; + proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; + proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - 1; + } +} + +template +static inline void clip_tiled_boxes(Tensor *boxes, + const Tensor *im_info) { + Dtype *boxes_data = (Dtype*)boxes->mutable_data(); + auto im_info_data = (const Dtype*)im_info->data(); + Dtype zero(0); + for (int64_t i = 0; i < boxes->valid_size(); i += 4) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); //left + boxes_data[i+1] = + std::max(std::min(boxes_data[i+1], im_info_data[0] - 1), zero); //top + boxes_data[i+2] = + std::max(std::min(boxes_data[i+2], im_info_data[1] - 1), zero); // right + boxes_data[i+3] = + std::max(std::min(boxes_data[i+3], im_info_data[0] - 1), zero);//bottom + } +} + +template +void filter_boxes(std::vector& keep, + const Tensor *boxes, + const float min_size, + const Tensor *im_info) { + const Dtype *im_info_data = (const Dtype*)im_info->data(); + const Dtype *boxes_data = (const Dtype*)boxes->data(); + Dtype im_scale = im_info_data[2]; + auto min_size_final = std::max(min_size, 1.0f); + keep.clear(); + + for (int i = 0; i < boxes->valid_size(); i += 4 ) { + Dtype left = boxes_data[i]; + Dtype right = boxes_data[i+2]; + Dtype top = boxes_data[i+1]; + Dtype bottom = boxes_data[i+3]; + Dtype ws = right - left + 1; + Dtype hs = bottom - top + 1; + Dtype ws_origin_scale = + (right - left) / im_scale + 1; + Dtype hs_origin_scale = + (bottom - top) / im_scale + 1; + Dtype x_ctr = left + ws / 2; + Dtype y_ctr = top + hs / 2; + if (ws_origin_scale >= min_size_final && hs_origin_scale >= min_size_final && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { + keep.push_back(i>>2); + } else { + //LOG(INFO) << "filter id : " << (i>>2); + } + } +} + +template +static inline std::vector> get_sorted_score_index( + const std::vector &scores) { + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + +std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &a, const std::pair &b) { + return a.first > b.first; + }); + return sorted_indices; +} + +template +static inline Dtype BBoxArea(const Dtype *box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + return static_cast(0.); + } else { + const Dtype w = box[2] - box[0]; + const Dtype h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + return (w + 1) * (h + 1); + } + } +} + +template +static inline Dtype jaccard_overlap(const Dtype *box1, const Dtype *box2, bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const Dtype inter_xmin = std::max(box1[0], box2[0]); + const Dtype 
inter_ymin = std::max(box1[1], box2[1]); + const Dtype inter_xmax = std::min(box1[2], box2[2]); + const Dtype inter_ymax = std::min(box1[3], box2[3]); + const Dtype inter_w = std::max(Dtype(0), inter_xmax - inter_xmin + 1); + const Dtype inter_h = std::max(Dtype(0), inter_ymax - inter_ymin + 1); + const Dtype inter_area = inter_w * inter_h; + const Dtype bbox1_area = BBoxArea(box1, normalized); + const Dtype bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline void NMS(std::vector& selected_indices, + Tensor *bbox, + std::vector& indices, + Dtype nms_threshold, + float eta) { + int64_t num_boxes = bbox->num(); + int64_t box_size = bbox->channel(); + + int selected_num = 0; + Dtype adaptive_threshold = nms_threshold; + const Dtype *bbox_data = (const Dtype*)(bbox->data()); + selected_indices.clear(); + //while (indices.size() != 0) { + for (int i = 0; i < indices.size(); i++) { + //int idx = indices.back(); + auto idx = indices[i]; + bool flag = true; + for (int kept_idx : selected_indices) { + if (flag) { + Dtype overlap = jaccard_overlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + ++selected_num; + } + //indices.erase(indices.end() - 1); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void gather(Tensor* out, + const Tensor* in, + std::vector& index, + const int inner_dim) { + Shape shape = in->valid_shape(); + int index_num = index.size(); + shape[0] = index_num; + out->reshape(shape); + auto in_data = (const Dtype*) in->data(); + auto out_data = (Dtype*)out->data(); + for (int i = 0; i < index_num; i++) { + memcpy(out_data + i * inner_dim, in_data + index[i] * inner_dim, sizeof(Dtype) * inner_dim); + } +} + + +template +void get_score_sorted_index(const Tensor* scores, + int sort_num, + std::vector& sorted_score, + std::vector& score_index) { + auto scores_data = (const Dtype*)scores->data(); + std::vector> index; + for (int i = 0; i < scores->valid_size(); i++) { + index.emplace_back(std::make_pair(scores_data[i], i)); + } + std::partial_sort(index.begin(), index.begin() + sort_num, index.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first;}); + + sorted_score.resize(sort_num); + score_index.resize(sort_num); + for (int i = 0; i < sort_num; i++) { + sorted_score[i] = index[i].first; + score_index[i] = index[i].second; + } +} + +template +void proposal_for_one_image( + Tensor &proposals_sel, + Tensor &scores_sel, + Tensor &proposals, + const Tensor &im_info_slice,//[1, 3] + const Tensor &anchors_slice,//[H, W, A, 4] + const Tensor &variances_slice, //[H, W, A, 4] + const Tensor &bbox_deltas_slice, // [1, H, W, A*4] + const Tensor &scores_slice, // [1, H, W, A] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + + int scores_num = scores_slice.valid_size(); + int index_num = 0; + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_num) { + index_num = scores_num; + } else { + index_num = pre_nms_top_n; + } + std::vector scores_sorted; + std::vector index; + get_score_sorted_index(&scores_slice, index_num, scores_sorted, index); + + + box_coder(&proposals, &anchors_slice, &bbox_deltas_slice, &variances_slice, index); + + clip_tiled_boxes(&proposals, &im_info_slice); + + std::vector keep; + filter_boxes(keep, &proposals, min_size, 
&im_info_slice); + //for (int i = 0; i < keep.size(); i++) { + // LOG(INFO) << "cpu filter box keep : " << i <<" , "<< keep[i]; + //} + + if (nms_thresh <= 0) { + gather(&proposals_sel, &proposals, keep, 4); + std::vector scores_index; + for (int i = 0; i < keep.size(); i++) { + scores_index[i] = index[keep[i]]; + } + gather(&scores_sel, &scores_slice, scores_index, 1); + return; + } + + std::vector keep_nms; + NMS(keep_nms, &proposals, keep, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.size()) { + keep_nms.resize(post_nms_top_n); + } + + std::vector scores_index(keep_nms.size()); + for (int id = 0; id < keep_nms.size(); id++) { + scores_index[id] = index[keep_nms[id]]; + } + gather(&scores_sel, &scores_slice, scores_index, 1); + gather(&proposals_sel, &proposals, keep_nms, 4); +} + +template +void AppendProposals(Tensor *dst, + int64_t offset, + const int im_id, + const Tensor *src) { + auto *out_data = (Dtype*)dst->data(); + auto *in_data = (const Dtype*)src->data(); + out_data += offset; + for (int i = 0; i < src->valid_size()/4; i++) { + out_data[0] = im_id; + std::memcpy(out_data + 1, in_data, 4* sizeof(Dtype)); + out_data += 5; + in_data += 4; + } +} + +template +void AppendScores(Tensor *dst, + int64_t offset, + const Tensor *src) { + auto *out_data = (Dtype*)dst->data(); + auto *in_data = (const Dtype*)src->data(); + out_data += offset; + std::memcpy(out_data, in_data, src->valid_size() * sizeof(Dtype)); +} + + +template +void generate_proposals_basic(const std::vector*>& inputs, + std::vector*>& outputs, + GenerateProposalsParam& param) { + auto anchors = *inputs[0]; + auto bbox_deltas = *inputs[1]; + auto im_info = *inputs[2]; + auto scores = *inputs[3]; + auto variances = *inputs[4]; + auto rpn_rois = outputs[0]; + auto rpn_roi_probs = outputs[1]; + int pre_nms_top_n = param.pre_nms_top_n;; + int post_nms_top_n = param.post_nms_top_n; + float nms_thresh = param.nms_thresh;; + float min_size = param.min_size;; + float eta = param.eta; + auto scores_shape = scores.valid_shape(); + auto bbox_shape = bbox_deltas.valid_shape(); + rpn_rois->reshape(Shape({bbox_deltas.valid_size() / 4, 4, 1, 1}, Layout_NCHW)); + rpn_roi_probs->reshape(Shape({scores.valid_size(), 1, 1, 1}, Layout_NCHW)); + Tensor bbox_deltas_swap; + Tensor scores_swap; + Tensor proposals; + Tensor proposals_sel; + Tensor scores_sel; + + trans(&scores_swap, &scores); + trans(&bbox_deltas_swap, &bbox_deltas); + + int num_proposals = 0; + int img_num = scores_shape[0]; + Shape im_info_slice_shape = im_info.valid_shape(); + Shape bbox_deltas_slice_shape = bbox_deltas.valid_shape(); + Shape scores_slice_shape({scores.valid_size()/ img_num, 1, 1, 1}, Layout_NCHW); + im_info_slice_shape[0] = 1; + bbox_deltas_slice_shape[0] = 1; + std::vector proposals_offset; + for (int i = 0; i < img_num; i++) { + Tensor im_info_slice((void*)((dtype*)im_info.mutable_data() + i * im_info.get_stride()[0]), TargetType_H(), 0, im_info_slice_shape); + Tensor bbox_deltas_slice((void*)((dtype*)bbox_deltas_swap.mutable_data() + i * bbox_deltas.get_stride()[0]), TargetType_H(), 0, bbox_deltas_slice_shape); + Tensor scores_slice((void*)((dtype*)scores_swap.mutable_data() + i * scores.get_stride()[0]), TargetType_H(), 0, scores_slice_shape); + proposal_for_one_image(proposals_sel, + scores_sel, + proposals, + im_info_slice, + anchors, + variances, + bbox_deltas_slice, // [M, 4] + scores_slice, // [N, 1] + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + eta); + + AppendProposals(rpn_rois, 5 * num_proposals, i, 
&proposals_sel); + AppendScores(rpn_roi_probs, num_proposals, &scores_sel); + num_proposals += scores_sel.valid_size();; + proposals_offset.push_back(num_proposals); + } + rpn_roi_probs->reshape(Shape({num_proposals, 1, 1, 1}, Layout_NCHW)); + rpn_rois->reshape(Shape({num_proposals, 5, 1, 1}, Layout_NCHW)); + + std::vector> out_offset; + out_offset.push_back(proposals_offset); + for (size_t i = 0; i < outputs.size(); i++) { + outputs[i]->set_seq_offset(out_offset); + } +} +template +void test_model() { + typedef typename DataTrait::Dtype dtype; + int pre_nms_top_n = 6000; + int post_nms_top_n = 1000; + float eta = 1.0f; + dtype nms_thresh = 0.699999; + dtype min_size = 0.f; + //std::string file_path = "/home/chengyujuan/baidu/sys-hic-gpu/Anakin-2.0/generate_proposals_data/"; + //std::string scores_file = file_path + "result_rpn_cls_score_prob.tmp_0.txt"; + //std::string bbox_deltas_file = file_path + "result_rpn_bbox_pred.tmp_1.txt"; + //std::string im_info_file = file_path + "result_im_info.txt"; + //std::string anchors_file = file_path + "result_anchor_generator_0.tmp_0.txt"; + //std::string variances_file = file_path + "result_anchor_generator_0.tmp_1.txt"; + //TestSaberBase testbase(5, 2); + //Shape bbox_deltas_shape({1, 60, 84, 84}, Layout_NCHW); + //Shape im_info_shape({1, 3, 1, 1}, Layout_NCHW); + //Shape anchors_shape({84, 84, 15, 4}, Layout_NCHW); + //Shape variances_shape({84, 84, 15, 4}, Layout_NCHW); + //Shape scores_shape({1, 15, 84, 84},Layout_NCHW); + std::string file_path = "/home/chengyujuan/baidu/sys-hic-gpu/Anakin-2.0/generate_proposal/"; + std::string scores_file = file_path + "scores.txt"; + std::string bbox_deltas_file = file_path + "box_deltas.txt"; + std::string im_info_file = file_path + "im_info.txt"; + std::string anchors_file = file_path + "anchors.txt"; + std::string variances_file = file_path + "var.txt"; + TestSaberBase testbase(5, 2); + Shape anchors_shape({27, 40, 15, 4}, Layout_NCHW); + Shape bbox_deltas_shape({1, 60, 27, 40}, Layout_NCHW); + Shape im_info_shape({1, 3, 1, 1}, Layout_NCHW); + Shape scores_shape({1, 15, 27, 40},Layout_NCHW); + Shape variances_shape({27, 40, 15, 4}, Layout_NCHW); + std::vector input_shape_vec = {anchors_shape, bbox_deltas_shape, im_info_shape, scores_shape, variances_shape}; + GenerateProposalsParam param(pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); + testbase.set_param(param); + testbase.add_inputs_shape(input_shape_vec); + + Tensor scores(scores_shape); + Tensor bbox_deltas(bbox_deltas_shape); + Tensor im_info(im_info_shape); + Tensor anchors(anchors_shape); + Tensor variances(variances_shape); + std::vector*> input_vec; + input_vec.push_back(&anchors); + input_vec.push_back(&bbox_deltas); + input_vec.push_back(&im_info); + input_vec.push_back(&scores); + input_vec.push_back(&variances); + Tensor h_scores(scores_shape); + Tensor h_bbox_deltas(bbox_deltas_shape); + Tensor h_im_info(im_info_shape); + Tensor h_anchors(anchors_shape); + Tensor h_variances(variances_shape); + + read_tensor_from_file((dtype*)h_scores.mutable_data(), h_scores.valid_size(), scores_file.c_str()); + read_tensor_from_file((dtype*)h_bbox_deltas.mutable_data(), h_bbox_deltas.valid_size(), bbox_deltas_file.c_str()); + read_tensor_from_file((dtype*)h_im_info.mutable_data(), h_im_info.valid_size(), im_info_file.c_str()); + read_tensor_from_file((dtype*)h_anchors.mutable_data(), h_anchors.valid_size(), anchors_file.c_str()); + read_tensor_from_file((dtype*)h_variances.mutable_data(), h_variances.valid_size(), variances_file.c_str()); + 
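+    // Note: read_tensor_from_file() above parses one float per line and stops at the first
+    // line containing a space, so these *.txt dumps are expected to be plain single-column
+    // listings; the number of lines read is not checked against the tensor's valid_size().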
scores.copy_from(h_scores); + bbox_deltas.copy_from(h_bbox_deltas); + im_info.copy_from(h_im_info); + anchors.copy_from(h_anchors); + variances.copy_from(h_variances); + testbase.add_custom_input(input_vec); +#ifdef USE_CUDA + cudaDeviceSynchronize(); +#endif + testbase.run_test(generate_proposals_basic); +} + +TEST(TestSaberFunc, test_func_generate_proposals) { + +#ifdef USE_CUDA + //Init the test_base + Env::env_init(); + test_model(); +#endif +#ifdef USE_X86_PLACE + Env::env_init(); + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +#endif + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_gru.cpp b/test/saber/test_saber_gru.cpp index 73ba2bc57..1393843e0 100644 --- a/test/saber/test_saber_gru.cpp +++ b/test/saber/test_saber_gru.cpp @@ -16,7 +16,7 @@ using namespace std; template static Dtype InValidAct(Dtype a) { - CHECK(false)<<"InValidAct"; + return 0; } template diff --git a/test/saber/test_saber_lrn.cpp b/test/saber/test_saber_lrn.cpp index 8fe7e19af..4c93c81cf 100644 --- a/test/saber/test_saber_lrn.cpp +++ b/test/saber/test_saber_lrn.cpp @@ -160,6 +160,25 @@ TEST(TestSaberFunc, test_op_lrn) { } #endif +#ifdef USE_ARM_PLACE + TestSaberBase testbase_arm; + + for (int w_in : {8, 8, 16}) { + for (int h_in : {2, 8, 32}) { + for (int ch_in : {2, 3, 8, 64}) { + for (int num_in : {1, 21, 32}) { + Shape shape_arm({num_in, ch_in, h_in, w_in}); + LrnParam param_arm(local_size, alpha, beta, k, norm_region); + testbase_arm.set_param(param_arm); + testbase_arm.set_rand_limit(-5.0, 5.0); + testbase_arm.set_input_shape(shape_arm); + testbase_arm.run_test(lrn_cpu_base, 0.00001, true, true); + } + } + } + } +#endif + } int main(int argc, const char** argv) { @@ -168,4 +187,4 @@ int main(int argc, const char** argv) { InitTest(); RUN_ALL_TESTS(argv[0]); return 0; -} \ No newline at end of file +} diff --git a/test/saber/test_saber_lstm.cpp b/test/saber/test_saber_lstm.cpp index bbfa8c05d..c90d5066f 100644 --- a/test/saber/test_saber_lstm.cpp +++ b/test/saber/test_saber_lstm.cpp @@ -5,6 +5,7 @@ #include "saber/core/context.h" #include "saber/funcs/lstm.h" +#include "saber/funcs/lstmp.h" #include "saber/funcs/impl/x86/x86_utils.h" #include "saber/core/tensor_op.h" #include "debug.h" @@ -17,7 +18,7 @@ using namespace std; template static Dtype InValidAct(Dtype a) { - CHECK(false)<<"InValidAct"; + return 0; } template @@ -71,7 +72,7 @@ template void compute_ref_lstm_one_word(const Dtype* wx_i,const Dtype* wx_f,const Dtype* wx_c,const Dtype* wx_o,Dtype* h_new,const Dtype* cell_old,Dtype* cell_new, const Dtype* bias_i,const Dtype* bias_f,const Dtype* bias_c,const Dtype* bias_o,const Dtype* w_c_i, const Dtype* w_c_f,const Dtype* w_c_o,int hidden_size, - ActiveType gate_activity,ActiveType cell_activity,ActiveType candidate_activity, bool with_peephole){ + ActiveType gate_activity,ActiveType cell_activity,ActiveType candidate_activity, bool with_peephole,bool show=false){ typename ACTIVATION::Act gate_func=Activate(gate_activity); typename ACTIVATION::Act cell_func=Activate(cell_activity); @@ -85,7 +86,6 @@ void compute_ref_lstm_one_word(const Dtype* wx_i,const Dtype* wx_f,const Dtype* Dtype gate_o = gate_func(wx_o[i] + w_c_o[i] * gate_c + bias_o[i]); h_new[i] = gate_o * 
candi_func(gate_c); cell_new[i] = gate_c; -// DLOG(INFO)<<"gate_i = "< vec_c(seq_sum*hidden_size,0); +// vector vec_wx(seq_sum*4*hidden_size,0); + + Dtype *c= static_cast(vec_c.data()); + Dtype *wx= static_cast(vec_wx.data()); + std::vector seq_offset = input_tensor->get_seq_offset()[input_tensor->get_seq_offset().size()-1]; + + gemm_naive(seq_sum,4*hidden_size,word_size,1,x,weights_x,0,wx); +// write_tensorfile(vec_wx,"ref_wx_tensor"); +// print_tensor(vec_wx); + if(param.skip_num>1){ + CHECK_EQ(param.is_reverse,false); + CHECK_EQ(seq_offset.size(),2)<<"only support batch = 1 now"; +// CHECK_EQ(seq_sum%param.skip_num,0); + int skip_num=param.skip_num; + for(int seq_id=0;seq_id=4); +// printf_pointer(h_new,hidden_size); + Dtype *output_h_this_word=output_h+word_id*output_hidden_size; + gemm_naive(1,output_hidden_size,hidden_size,1.f,h_new,weights_project,0.f,output_h_this_word); + for(int i=0;i0){ +// +// gemm_naive(seq_sum,param.project_dim,hidden_size,1.f,(Dtype*)inner_tensor.mutable_data(),weights_project,0.f, +// static_cast(dst[0]->mutable_data())); +//// Dtype* gemm_output=static_cast(inner_tensor.mutable_data()); +// Dtype* output=(Dtype*)dst[0]->mutable_data(); +// for(int i=0;i void lstm_ut(int word_size = 222, @@ -213,7 +340,7 @@ void lstm_ut(int word_size = 222, ActiveType gate_activity=Active_sigmoid, ActiveType cell_activity=Active_tanh, ActiveType candi_activity=Active_tanh, - int perf_iter=0,ImplEnum test_mode=SABER_IMPL){ + int perf_iter=0,ImplEnum test_mode=SABER_IMPL,bool perf=false){ typedef Tensor TensorHf4; typedef Tensor TensorDf4; Context ctx_dev(0, 1, 1); @@ -313,15 +440,270 @@ void lstm_ut(int word_size = 222, LOG(INFO)<<"impl = "< +void lstm_ut_int8(int word_size = 222, + int hidden_size = 333, + std::vector offsets = {0, 3,13,22,30,50}, + bool is_reverse = true, + bool with_peephole= true, + ActiveType gate_activity=Active_sigmoid, + ActiveType cell_activity=Active_tanh, + ActiveType candi_activity=Active_tanh, + int perf_iter=0,ImplEnum test_mode=SABER_IMPL,bool perf=false){ + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + Context ctx_dev(0, 1, 1); + + Shape shape_weight({1, 1, 1,hidden_size*hidden_size*4+hidden_size*word_size*4},Layout_NCHW); + Shape shape_bias; + if(with_peephole){ + shape_bias=Shape({1,1,1,hidden_size*7},Layout_NCHW); + }else{ + shape_bias=Shape({1,1,1,hidden_size*4},Layout_NCHW); + } + Shape shape_x({offsets[offsets.size() - 1], word_size, 1, 1},Layout_NCHW); + Shape shape_h({offsets[offsets.size() - 1], hidden_size, 1, 1},Layout_NCHW); + TensorHf4 host_x(shape_x); + TensorHf4 host_weight(shape_weight); + TensorHf4 host_bias(shape_bias); + TensorHf4 host_hidden_out(shape_h); + TensorDf4 dev_x(shape_x); + TensorDf4 dev_weight(shape_weight); + TensorDf4 dev_bias(shape_bias); + TensorDf4 dev_hidden_out(shape_h); +#ifdef COMPARE_FILE + readTensorData(host_weight, "host_w"); + readTensorData(host_x, "host_x"); + readTensorData(host_bias, "host_b"); +#else + fill_tensor_rand(host_weight,-1,1); + fill_tensor_rand(host_x,-1,1); +// fill_tensor_const(host_weight,0.f); +// fill_tensor_const(host_x,0.f); + fill_tensor_rand(host_bias,-1,1); +#endif + dev_weight.copy_from(host_weight); + dev_x.copy_from(host_x); + dev_bias.copy_from(host_bias); + + host_x.set_seq_offset({offsets}); + dev_x.set_seq_offset({offsets}); + LstmParam param(&dev_weight, &dev_bias,nullptr,Active_unknow,gate_activity,cell_activity,candi_activity, + with_peephole,false,is_reverse); + Lstm lstm_op; + + std::vector inputs; + std::vector outputs; + inputs.push_back(&dev_x); 
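+    // Call sequence exercised below (same pattern as the fp32 lstm_ut above): init() selects
+    // the implementation, compute_output_shape() sizes dev_hidden_out, re_alloc() allocates it,
+    // then the functor runs on ctx_dev and the output event is synced before the result is
+    // compared against compute_ref_lstm_fwd_me().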
+ outputs.push_back(&dev_hidden_out); + + SABER_CHECK(lstm_op.init(inputs, outputs, param, SPECIFY, test_mode, ctx_dev)); + SABER_CHECK(lstm_op.compute_output_shape(inputs, outputs, param)); + outputs[0]->re_alloc(outputs[0]->valid_shape(),outputs[0]->get_dtype()); + SABER_CHECK(lstm_op(inputs, outputs, param, ctx_dev)); + outputs[0]->record_event(ctx_dev.get_compute_stream()); + outputs[0]->sync(); + + if(perf_iter>0) { + SaberTimer t1; + t1.start(ctx_dev); + for (int i = 0; i < perf_iter; ++i) { + SABER_CHECK(lstm_op(inputs, outputs, param, ctx_dev)); + outputs[0]->record_event(ctx_dev.get_compute_stream()); + outputs[0]->sync(); + } + t1.end(ctx_dev); + LOG(INFO) << "!!saber care: iter = " << perf_iter << " , total time: " << t1.get_average_ms() << + "avg time : " << t1.get_average_ms() / perf_iter << " args [" << offsets[offsets.size() - 1] + << "," << offsets.size() - 1 << ","<< word_size << "," << hidden_size << "]"; + } + + host_hidden_out.copy_from(dev_hidden_out); + TensorHf4 compare_g(shape_h); +#ifdef COMPARE_FILE + readTensorData(compare_g, "host_correct"); + write_tensorfile(host_hidden_out, "host_g.txt"); + write_tensorfile(compare_g, "host_correct.txt"); +#else + std::vector inputs_ref; + std::vector outputs_ref; + outputs_ref.push_back(&compare_g); + inputs_ref.push_back(&host_x); + LstmParam param_ref(&host_weight, &host_bias,nullptr,Active_unknow,gate_activity,cell_activity,candi_activity, + with_peephole,false,is_reverse); + compute_ref_lstm_fwd_me(inputs_ref,outputs_ref,param_ref); +#endif + double maxdiff = 0; + double maxratio = 0; + tensor_cmp_host((const float*)host_hidden_out.data(), (const float*)compare_g.data(), host_hidden_out.valid_size(), maxratio, maxdiff); + if (abs(maxratio) <= 0.005||abs(maxdiff)<0.005) { + LOG(INFO) << "passed " << maxratio<<","< +void lstmp_ut(int word_size , + int hidden_size , + int project_size, + std::vector offsets, + int skip_num, + bool is_reverse , + bool with_peephole, + ActiveType gate_activity, + ActiveType cell_activity, + ActiveType candi_activity, + int perf_iter=0,ImplEnum test_mode=SABER_IMPL){ + typedef Tensor TensorHf4; + typedef Tensor TensorDf4; + Context ctx_dev(0, 1, 1); + + Shape shape_weight({1, 1, 1,project_size*hidden_size*4+hidden_size*word_size*4+hidden_size*project_size},Layout_NCHW); + Shape shape_bias; + if(with_peephole){ + shape_bias=Shape({1,1,1,hidden_size*7},Layout_NCHW); + }else{ + shape_bias=Shape({1,1,1,hidden_size*4},Layout_NCHW); + } + Shape shape_x({offsets[offsets.size() - 1], word_size, 1, 1},Layout_NCHW); + Shape shape_h({offsets[offsets.size() - 1], project_size, 1, 1},Layout_NCHW); + TensorHf4 host_x(shape_x); + TensorHf4 host_weight(shape_weight); + TensorHf4 host_bias(shape_bias); + TensorHf4 host_hidden_out(shape_h); + TensorDf4 dev_x(shape_x); + TensorDf4 dev_weight(shape_weight); + TensorDf4 dev_bias(shape_bias); + TensorDf4 dev_hidden_out(shape_h); +#ifdef COMPARE_FILE + readTensorData(host_weight, "host_w"); + readTensorData(host_x, "host_x"); + readTensorData(host_bias, "host_b"); +#else + fill_tensor_rand(host_weight); + fill_tensor_rand(host_x); + fill_tensor_rand(host_bias); + +// fill_tensor_rand(host_weight,-1,1); +// fill_tensor_rand(host_x,-1,1); +// fill_tensor_rand(host_bias,-1,1); + +// fill_tensor_const(host_weight,1.f); +// fill_tensor_const(host_x,1.f); +// fill_tensor_const(host_bias,1); + +#endif + dev_weight.copy_from(host_weight); + dev_x.copy_from(host_x); + dev_bias.copy_from(host_bias); + + host_x.set_seq_offset({offsets}); + dev_x.set_seq_offset({offsets}); + if 
(precise==AK_INT8){ + dev_x.set_scale({1.f/127.f}); + } + LstmParam param(&dev_weight, &dev_bias,nullptr,Active_unknow,gate_activity,cell_activity,candi_activity, + with_peephole,false,is_reverse,1,1,1,skip_num,project_size,hidden_size); + Lstmp lstm_op; + + std::vector inputs; + std::vector outputs; + inputs.push_back(&dev_x); + outputs.push_back(&dev_hidden_out); + + SABER_CHECK(lstm_op.init(inputs, outputs, param, SPECIFY, test_mode, ctx_dev)); + SABER_CHECK(lstm_op.compute_output_shape(inputs, outputs, param)); + outputs[0]->re_alloc(outputs[0]->valid_shape(),outputs[0]->get_dtype()); + LOG(INFO)<<"output ptr = "<data(); + SABER_CHECK(lstm_op(inputs, outputs, param, ctx_dev)); +// float* output_ptr = static_cast(outputs[0]->mutable_data()); +// for(int i=0;ivalid_size();i++){ +// output_ptr[i]=12; +// } + outputs[0]->record_event(ctx_dev.get_compute_stream()); + outputs[0]->sync(); + + if(perf_iter>0) { + SaberTimer t1; + t1.start(ctx_dev); + for (int i = 0; i < perf_iter; ++i) { + SABER_CHECK(lstm_op(inputs, outputs, param, ctx_dev)); + outputs[0]->record_event(ctx_dev.get_compute_stream()); + outputs[0]->sync(); + } + t1.end(ctx_dev); + LOG(INFO) << "!!saber care: iter = " << perf_iter << " , total time: " << t1.get_average_ms() << + "avg time : " << t1.get_average_ms() / perf_iter << " args [" << offsets[offsets.size() - 1] + << "," << word_size << ","<< hidden_size << "," << project_size << "]"; + } + + host_hidden_out.copy_from(dev_hidden_out); + TensorHf4 compare_g(shape_h); +#ifdef COMPARE_FILE + readTensorData(compare_g, "host_correct"); + write_tensorfile(host_hidden_out, "host_g.txt"); + write_tensorfile(compare_g, "host_correct.txt"); +#else + std::vector inputs_ref; + std::vector outputs_ref; + outputs_ref.push_back(&compare_g); + inputs_ref.push_back(&host_x); + LstmParam param_ref(&host_weight, &host_bias,nullptr,Active_unknow,gate_activity,cell_activity,candi_activity, + with_peephole,false,is_reverse,1,1,1,skip_num,project_size,hidden_size); + compute_ref_lstmp_fwd_me(inputs_ref,outputs_ref,param_ref); +#endif + double maxdiff = 0; + double maxratio = 0; + double mlu_ration = 0.0; + tensor_cmp_host_mlu((const float*)host_hidden_out.data(), (const float*)compare_g.data(), host_hidden_out.valid_size(), mlu_ration); + tensor_cmp_host((const float*)host_hidden_out.data(), (const float*)compare_g.data(), host_hidden_out.valid_size(), maxratio, maxdiff); + LOG(INFO)<<"ratios :: "<< maxratio<<","<::env_init(); -#ifdef COMPARE_FILE + srand(12345); +// lstmp_ut(512,1536,512,{0,8},4,false,true,Active_sigmoid,Active_tanh,Active_tanh); +// lstmp_ut(32,32,32,{0,4},4,false,true,Active_sigmoid,Active_tanh,Active_tanh); +// exit(0); + lstmp_ut(8,8,8,{0,8},4,false,true,Active_sigmoid,Active_tanh,Active_tanh); +// lstmp_ut(32,32,32,{0,16},4,false,true,Active_sigmoid,Active_tanh,Active_tanh,100); + lstmp_ut(512,1536,512,{0,16},4,false,true,Active_sigmoid,Active_tanh,Active_tanh,100); +// lstmp_ut(512,1536,512,{0,16},4,false,true,Active_sigmoid,Active_tanh,Active_tanh,100); + +// return; +#if 0 lstm_ut(15,333,{0,5}, true, true,Active_tanh,Active_tanh,Active_tanh,0,SABER_IMPL); #else for(int word_size:{15,222}) @@ -342,7 +724,9 @@ TEST(TestSaberFunc, test_func_lstm_x86) { #ifdef NVIDIA_GPU TEST(TestSaberFunc, test_func_lstm_nv) { Env::env_init(); - + srand(12345); + lstmp_ut(512,1536,512,{0,10},6,false,true,Active_sigmoid,Active_tanh,Active_tanh,100); +// exit(0); for(int word_size:{15,222}) for(int hidden_size:{15,333}) for(bool reverse:{true,false}) diff --git 
a/test/saber/test_saber_match_matrix.cpp b/test/saber/test_saber_match_matrix.cpp index 5e5a6d69e..e64e9b2bf 100644 --- a/test/saber/test_saber_match_matrix.cpp +++ b/test/saber/test_saber_match_matrix.cpp @@ -114,24 +114,43 @@ void match_matrix_basic(const std::vector*>& inputs, dtype* input_l_transform_reorganize = (dtype*)_input_l_transform_reorganize.mutable_data(); dtype* output_tmp = (dtype*)_output_tmp.mutable_data(); dtype* output_data = (dtype*) outputs[0]->mutable_data(); - gemm(weight_data, - input_l, - dim_t * dim_in, len_l, dim_in, - true, true, - 1.0f, 0.0f, input_l_transform); - for (int i = 0; i < dim_t; i++) { - int offset = i * dim_in * len_l; - transpose(input_l_transform + offset, dim_in, len_l, input_l_transform_reorganize + offset); - } - gemm(input_r, - input_l_transform_reorganize, - len_r, dim_t*len_l, dim_in, - false, true, - 1.0f, 0.0f, output_tmp); + if (param.is_l_same) { + gemm(weight_data, + input_l, + dim_t * dim_in, len_l, dim_in, + true, true, + 1.0f, 0.0f, input_l_transform); + for (int i = 0; i < dim_t; i++) { + int offset = i * dim_in * len_l; + transpose(input_l_transform + offset, dim_in, len_l, input_l_transform_reorganize + offset); + } + gemm(input_r, + input_l_transform_reorganize, + len_r, dim_t*len_l, dim_in, + false, true, + 1.0f, 0.0f, output_tmp); + } else { + for (int i = 0; i < batch; i++) { + gemm(weight_data, + input_l + i * len_l * dim_in, + dim_t * dim_in, len_l, dim_in, + true, true, + 1.0f, 0.0f, input_l_transform); + for (int i = 0; i < dim_t; i++) { + int offset = i * dim_in * len_l; + transpose(input_l_transform + offset, dim_in, len_l, input_l_transform_reorganize + offset); + } + gemm(input_r+offset_r[i]*dim_in, + input_l_transform_reorganize, + offset_r[i+1] - offset_r[i], dim_t*len_l, dim_in, + false, true, + 1.0f, 0.0f, output_tmp + offset_r[i] * dim_t * len_l); + } + } padding_out(output_tmp, offset_r, dim_t, len_l, output_data); LOG(INFO )<< "*******************************"; - write_tensorfile(_input_l_transform, "./_input_l_transform"); + // write_tensorfile(_input_l_transform, "./_input_l_transform"); // record_dev_tensorfile(input_l_transform_reorganize, _input_l_transform_reorganize.valid_size(), ("_input_l_transform_reorganize").c_str()); // record_dev_tensorfile(output_tmp, _output_tmp.valid_size(), ("_output_tmp").c_str()); // record_dev_tensorfile(output_data, outputs[0]->valid_size(), ("output").c_str()); @@ -151,12 +170,13 @@ void test_model(){ TestSaberBase testbase(2,1); //test example + for (auto is_l_same : {false, true}) { for (auto dim_t: {1, 3, 5}) { Shape weight_shape = std::vector{dim_in*dim_t*dim_in, 1, 1, 1}; Tensor weight(weight_shape); fill_tensor_rand(weight, -1, 1); - MatchMatrixParam param(dim_in, dim_t, &weight); + MatchMatrixParam param(dim_in, dim_t, is_l_same, &weight); testbase.set_param(param);//set param std::vector> left_seq_offset; std::vector> right_seq_offset; @@ -188,6 +208,7 @@ void test_model(){ testbase.add_custom_input (input_vec); testbase.run_test(match_matrix_basic, 5e-5);//run test } + } } #endif diff --git a/test/saber/test_saber_mean.cpp b/test/saber/test_saber_mean.cpp new file mode 100644 index 000000000..b8f632a06 --- /dev/null +++ b/test/saber/test_saber_mean.cpp @@ -0,0 +1,80 @@ +#include "saber/core/context.h" +#include "saber/funcs/mean.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include + +using namespace anakin::saber; +/** + * @brief compute a mean of input tensor's all elements. 
+ * + * + * @tparam dtype + * @tparam TargetType_D + * @tparam TargetType_H + * @param input + * @param output + * @param param + */ +template +void mean_cpu_base(const std::vector* >& input, + std::vector* >& output, MeanParam& param) { + + int n = input[0]->valid_size(); + const dtype* input_ptr = (const dtype*)input[0]->data(); + dtype* output_ptr = (dtype*)output[0]->mutable_data(); + dtype s = (dtype)0.0; + for (int i = 0; i < n; i++) { + s += input_ptr[i]; + } + s /= n; + output_ptr[0] = s; +} + +template +void test_mean(){ + TestSaberBase testbase; + MeanParam param; + + for (int w_in : {8, 8, 16}) { + for (int h_in : {2, 8, 32}) { + for (int ch_in : {3, 4, 8, 64}) { + for (int num_in:{1, 21, 32}) { + Shape shape({num_in, ch_in, h_in, w_in}); + testbase.set_param(param); + //testbase.set_rand_limit(); + testbase.set_input_shape(shape); + testbase.run_test(mean_cpu_base); + } + } + } + } +} + +TEST(TestSaberFunc, test_op_Mean) { + +#ifdef USE_CUDA + //Init the test_base + test_mean(); +#endif +#ifdef USE_X86_PLACE + test_mean(); +#endif +#ifdef USE_ARM_PLACE + //test_Mean(); +#endif +#ifdef USE_BM + // Env::env_init(); + //test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_normalize.cpp b/test/saber/test_saber_normalize.cpp index ada081197..70ea4a8cc 100644 --- a/test/saber/test_saber_normalize.cpp +++ b/test/saber/test_saber_normalize.cpp @@ -9,15 +9,64 @@ #include using namespace anakin::saber; +template +void group_normlize(const dtype* in_data, const dtype* scale, const dtype* bias, + int n, int c, int h, int w, float eps, int group, + dtype* out_data, dtype* out_mean, dtype* out_var){ + int group_size = (c - 1) / group + 1; + int im_size = h * w; + for (int n_index = 0; n_index < n; ++n_index){ + for (int g_index = 0; g_index < group; ++g_index){ + dtype t_mean = 0; + dtype t_var = 0; + int real_channels = c - g_index * group_size >= group_size ? 
+ group_size : c - g_index * group_size; + int compute_size = im_size * real_channels; + for (int im_index = 0; im_index < compute_size; ++im_index){ + t_mean += in_data[im_index]; + t_var += in_data[im_index] * in_data[im_index]; + } + t_mean /= compute_size; + t_var /= compute_size; + t_var -= t_mean * t_mean; + dtype t_var_inv = 1 / sqrt(t_var + eps); + if (out_mean){ + out_mean[n * group + g_index] = t_mean; + } + if (out_var){ + out_var[n * group + g_index] = t_var; + } + + int scale_bias_start_index = g_index * group_size; + for (int c_index = 0; c_index < real_channels; ++c_index){ + int c_start = c_index * im_size; + for (int im_index = 0; im_index < im_size; ++im_index){ + dtype dest_val = (in_data[c_start + im_index] - t_mean) * t_var_inv; + if (scale){ + dest_val *= scale[scale_bias_start_index + c_index]; + } + if (bias){ + dest_val += bias[scale_bias_start_index + c_index]; + } + out_data[c_start + im_index] = dest_val; + } + + } + out_data += compute_size; + in_data += compute_size; + } + } +} /*CPU function form: void FuncName(const std::vector*>& input,std::vector*>& output,Param& param,Shape shape) */ template void norm_cpu_func(const std::vector*>& input,std::vector*>& output,NormalizeParam& param) { - + int p=param.p; bool across_spatial=param.across_spatial; bool has_scale=param.has_scale; + bool has_bias = param.has_bias; bool channel_shared=param.channel_shared; dtype eps=param.eps; int n=input[0]->num(); @@ -25,22 +74,43 @@ void norm_cpu_func(const std::vector*>& input,std::vectorheight(); int w=input[0]->width(); Tensor th_scale; - const dtype* scale; - if(has_scale){ - th_scale.re_alloc(param.scale->shape(),AK_FLOAT); + Tensor th_bias; + dtype* scale = nullptr; + dtype* bias = nullptr; + dtype* out_mean = nullptr; + dtype* out_var = nullptr; + if (has_scale){ + th_scale.re_alloc(param.scale->shape(), AK_FLOAT); th_scale.copy_from(*param.scale); - scale=static_cast(th_scale.data()); + scale = static_cast(th_scale.data()); + } + if (has_bias){ + th_bias.re_alloc(param.bias->shape(), AK_FLOAT); + th_bias.copy_from(*param.bias); + bias = static_cast(th_bias.data()); } + const dtype* src_ptr = static_cast(input[0]->data()); dtype* dst_ptr = static_cast(output[0]->mutable_data()); - + if (param.group > 0){ + //group>1, do group normal + if (output.size() > 1){ + out_mean = static_cast(output[1]->mutable_data()); + } + if (output.size() > 2){ + out_var = static_cast(output[2]->mutable_data()); + } + group_normlize(src_ptr, scale, bias, n, c, h, w, eps, param.group, + dst_ptr, out_mean, out_var); + return; + } if (across_spatial) { int compute_size = h * w * c; int outer_size = n * c * h * w / compute_size; - + for (int i = 0; i < outer_size; ++i) { dtype sum = 0; - + for (int j = 0; j < compute_size; ++j) { if (p == 1) { sum += fabsf(src_ptr[j]); @@ -48,15 +118,15 @@ void norm_cpu_func(const std::vector*>& input,std::vector*>& input,std::vector*>& input,std::vector*>& input,std::vector :: Dtype dtype; //Init the test_base TestSaberBase testbase; - + //combine param by yourself bool scale_flag=false; int total_count=2 * 2 * 2 * 3 * 3 * 2 * 2; @@ -143,7 +213,6 @@ void test_normalize(){ for (bool sp_flag : {false}){ for (bool channel_flag : {false,true}) { for (int p : {1, 2}) { - for(int w_in:{32, 64}){ for(int h_in: {32, 64}){ for(int ch_in:{3, 8}){ @@ -166,14 +235,12 @@ void test_normalize(){ NormalizeParam param_tmp(sp_flag, eps, p); param = param_tmp; } - + //testbase test testbase.set_param(param);//set param //testbase.set_rand_limit(255,255); 
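// Editorial note: the group_normlize reference above computes, for every
// (batch, group) pair, the mean and biased variance over real_channels * h * w
// elements, then applies (x - mean) / sqrt(var + eps) with an optional per-channel
// scale and bias. The helper below restates that arithmetic for one contiguous
// group buffer; it is an editor's illustration, not code from the patch.
#include <cmath>
#include <cstddef>

static void normalize_one_group(const float* in, float* out,
                                std::size_t channels, std::size_t spatial,
                                float eps,
                                const float* scale /* may be null */,
                                const float* bias  /* may be null */) {
    const std::size_t count = channels * spatial;  // elements in this group
    float mean = 0.f;
    float sqsum = 0.f;
    for (std::size_t i = 0; i < count; ++i) {
        mean  += in[i];
        sqsum += in[i] * in[i];
    }
    mean /= count;
    const float var = sqsum / count - mean * mean;      // E[x^2] - (E[x])^2
    const float inv_std = 1.f / std::sqrt(var + eps);
    for (std::size_t c = 0; c < channels; ++c) {
        const float s = scale ? scale[c] : 1.f;
        const float b = bias  ? bias[c]  : 0.f;
        for (std::size_t i = 0; i < spatial; ++i) {
            out[c * spatial + i] = (in[c * spatial + i] - mean) * inv_std * s + b;
        }
    }
}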
testbase.set_input_shape(Shape({num_in, ch_in, h_in, w_in}));//add some input shape testbase.run_test(norm_cpu_func);//run test - - } } } @@ -181,6 +248,37 @@ void test_normalize(){ } } } + + for (int w_in:{2}){ + for (int h_in: {2}){ + for (int ch_in:{3, 8}){ + for (int num_in:{1, 2}){ + for (int group : {1, 2 ,3}){ + LOG(ERROR) << w_in << "," << h_in << "," << ch_in << "," << num_in << "," << group; + //make param + NormalizeParam param; + Shape sh_slope({1, 1, 1, ch_in}); + Tensor th_scale(sh_slope); + Tensor tdscale; + tdscale.re_alloc(sh_slope,AK_FLOAT); + for (int i = 0; i < ch_in; ++i) { + static_cast(th_scale.mutable_data())[i] = 0.1f * (i + 1); + } + tdscale.copy_from(th_scale); + NormalizeParam param_tmp(true, &tdscale, false, nullptr, group, 0.00001); + param = param_tmp; + + //testbase test + testbase.set_param(param);//set param + //testbase.set_rand_limit(255,255); + testbase.set_input_shape(Shape({num_in, ch_in, h_in, w_in}));//add some input shape + testbase.run_test(norm_cpu_func);//run test + } + + } + } + } + } } TEST(TestSaberFunc, test_func_normalize) { @@ -197,9 +295,9 @@ TEST(TestSaberFunc, test_func_normalize) { int main(int argc, const char** argv) { // initial logger //logger::init(argv[0]); - + InitTest(); RUN_ALL_TESTS(argv[0]); - + return 0; } diff --git a/test/saber/test_saber_one_hot.cpp b/test/saber/test_saber_one_hot.cpp new file mode 100644 index 000000000..e97e546c2 --- /dev/null +++ b/test/saber/test_saber_one_hot.cpp @@ -0,0 +1,70 @@ +#include "saber/core/context.h" +#include "test_saber_base.h" +#include "test_saber_func.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "saber/funcs/one_hot.h" +#include "saber/core/data_traits.h" + +using namespace anakin::saber; + +template +void one_hot_cpu_func(const std::vector*>& input, + std::vector*>& output, + OneHotParam& param) { + + memset(output[0]->mutable_data(), 0, output[0]->valid_size() * output[0]->get_dtype_size()); + + int depth = param.depth; + const float* in_ptr = (const float*)input[0]->data(); + float* out_ptr = (float*)output[0]->mutable_data(); + int dims = input[0]->valid_size(); + for (int i = 0; i < dims; ++i) { + out_ptr[i * depth + (int)in_ptr[i]] = 1.0; + } +} + +//test template for different device and dtype +template +void test_one_hot() { + + std::vector in_n_v{2, 3, 4, 5, 6}; + std::vector in_c_v{2, 3, 4, 5, 6}; + std::vector in_h_v{2, 3, 4, 5, 6}; + std::vector in_w_v{1}; + + std::vector depth_v{4, 5, 6, 7, 8, 9}; + Env::env_init(); + Env::env_init(); + TestSaberBase testbase; + + for (int in_n : in_n_v) + for (int in_c : in_c_v) + for (int in_h : in_h_v) + for (int in_w : in_w_v) + for (int depth : depth_v) { + OneHotParam param(depth); + testbase.set_param(param);//set param + testbase.set_rand_limit(0, depth); + testbase.set_input_shape(Shape({in_n, in_c, in_h, in_w})); //add some input shape + testbase.run_test(one_hot_cpu_func, 0.0001);//run test + + } +} + +TEST(TestSaberFunc, test_func_pool) { +#ifdef USE_CUDA + test_one_hot(); +#endif +#ifdef USE_X86_PLACE + test_one_hot(); +#endif +} + +int main(int argc, const char** argv) { + // initial logger +// logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_pad.cpp b/test/saber/test_saber_pad.cpp index 9b9f98aa4..2fd7b6010 100644 --- a/test/saber/test_saber_pad.cpp +++ b/test/saber/test_saber_pad.cpp @@ -58,29 +58,60 @@ void test_pad(){ typedef typename DataTrait :: Dtype dtype; TestSaberBase testbase; - for (int pad_c0 : {0, 1, 2}){ - for 
(int pad_c1 : {0, 1, 2}){ + for (int pad_c0 : {0, 1}){ + for (int pad_c1 : {0, 1}){ std::vector pad_c{pad_c0, pad_c1}; - for (int pad_h0 : {0, 1, 2}){ - for (int pad_h1 : {0, 1, 2}){ + for (int pad_h0 : {0, 1}){ + for (int pad_h1 : {0, 1}){ std::vector pad_h{pad_h0, pad_h1}; - for (int pad_w0 : {0, 1, 2}){ - for (int pad_w1 : {0, 1, 2}){ + for (int pad_w0 : {0, 1}){ + for (int pad_w1 : {0, 1}){ std::vector pad_w{pad_w0, pad_w1}; PadParam param(pad_c, pad_h, pad_w); LOG(INFO)<); } } } } - + + } + } + } + } + } + } + + + for (int pad_c0 : {1}){ + for (int pad_c1 : {2}){ + std::vector pad_c{pad_c0, pad_c1}; + for (int pad_h0 : {1}){ + for (int pad_h1 : {2}){ + std::vector pad_h{pad_h0, pad_h1}; + for (int pad_w0 : {1}){ + for (int pad_w1 : {2}){ + std::vector pad_w{pad_w0, pad_w1}; + PadParam param(pad_c, pad_h, pad_w); + LOG(INFO)<); + } + } + } + } + } } } @@ -94,6 +125,10 @@ TEST(TestSaberFunc, test_func_pool) #ifdef USE_CUDA test_pad(); #endif + +#ifdef USE_X86_PLACE + test_pad(); +#endif } int main(int argc, const char** argv) { diff --git a/test/saber/test_saber_pad2d.cpp b/test/saber/test_saber_pad2d.cpp new file mode 100644 index 000000000..59a934bae --- /dev/null +++ b/test/saber/test_saber_pad2d.cpp @@ -0,0 +1,168 @@ +#include +#include "saber/core/context.h" +#include "test/saber/test_saber_base.h" +#include "test/saber/test_saber_func.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "saber/funcs/pad2d.h" +#include "saber/core/data_traits.h" + +using namespace anakin::saber; + +template +void pad_cpu_func(const std::vector*>& input, \ + std::vector*>& output, PadParam& param) +{ + const dtype* src_ptr = static_cast(input[0]->data()); + dtype* dst_ptr = static_cast(output[0]->mutable_data()); + + int in_n = input[0]->num(); + int in_c = input[0]->channel(); + int in_h = input[0]->height(); + int in_w = input[0]->width(); + int out_n = output[0]->num(); + int out_c = output[0]->channel(); + int out_h = output[0]->height(); + int out_w = output[0]->width(); + Shape in_stride = input[0]->get_stride(); + Shape out_stride = output[0]->get_stride(); + int in_idn = input[0]->num_index(); + int in_idc = input[0]->channel_index(); + int in_idh = input[0]->height_index(); + int in_idw = input[0]->width_index(); + int out_idn = output[0]->num_index(); + int out_idc = output[0]->channel_index(); + int out_idh = output[0]->height_index(); + int out_idw = output[0]->width_index(); + + fill_tensor_const(*output[0], 0); + + int c0 = param.pad_c[0]; + int h0 = param.pad_h[0]; + int w0 = param.pad_w[0]; + int offset = c0 * out_stride[out_idc] + h0 * out_stride[out_idh] + w0 * out_stride[out_idw]; + for (int id = 0; id < input[0]->valid_size(); ++id){ + int i_n = (id / in_stride[in_idn]) % in_n; + int i_c = (id / in_stride[in_idc]) % in_c; + int i_h = (id / in_stride[in_idh]) % in_h; + int i_w = (id / in_stride[in_idw]) % in_w; + int out_id = i_n * out_stride[out_idn] + i_c * out_stride[out_idc] + \ + i_h * out_stride[out_idh] + i_w * out_stride[out_idw]; + dst_ptr[out_id + offset] = src_ptr[id]; + } + +} +template +void pad_cpu_func(const std::vector*>& input, \ + std::vector*>& output, Pad2DParam& param){ + const dtype* din = static_cast(input[0]->data()); + dtype* dout = static_cast(output[0]->mutable_data()); + int n = output[0]->num(); + int c = output[0]->channel(); + int h = output[0]->height(); + int w = output[0]->width(); + int pad_top = param._pad_h[0]; + int pad_bottom = param._pad_h[1]; + int pad_left = param._pad_w[0]; + int pad_right = param._pad_w[1]; + 
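// Editorial note: the switch in the loop that follows maps each output pixel
// (x, y) back to an input pixel for the three Pad2D modes. The two helpers here
// spell out that mapping on a single axis; they mirror the PAD_EDGE and
// PAD_REFLECT cases of the reference loop and are illustrative only (the helper
// names are the editor's, not part of the patch).
#include <algorithm>

// PAD_EDGE: clamp the padded coordinate into [0, in_size - 1] of the input.
static inline int pad_edge_index(int out_coord, int pad_before, int in_size) {
    return std::min(std::max(pad_before, out_coord), in_size + pad_before - 1) - pad_before;
}

// PAD_REFLECT: mirror around the borders without repeating the edge pixel.
static inline int pad_reflect_index(int out_coord, int pad_before, int in_size) {
    int i = out_coord - pad_before;
    i = std::max(i, -i);                   // reflect off the left/top border
    i = std::min(i, 2 * in_size - i - 2);  // reflect off the right/bottom border
    return i;
}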
PadMode pad_mode = param._mode; + float pad_value = param._pad_value; + + int in_w = w - pad_left - pad_right; + int in_h = h - pad_bottom - pad_top; + int spatial_size_out = w * h; + int spatial_size_in = in_w * in_h; +#pragma omp parallel for + for (int i = 0; i < n * c; ++i) { + const float* din_batch = din + i * spatial_size_in; + float* dout_batch = dout + i * spatial_size_out; + int in_y = 0; + int in_x = 0; + for (int y = 0; y < h; ++y){ + for (int x = 0; x < w; ++x){ + switch (pad_mode){ + case PAD_CONSTANT: + in_y = y - pad_top; + in_x = x - pad_left; + dout_batch[y * w + x] = (in_x >= 0 && in_x < in_w) && (in_y >= 0 && in_y < in_h) ? \ + din_batch[in_y * in_w + in_x] : pad_value; + break; + case PAD_EDGE: + in_x = std::min(std::max(pad_left, x), in_w + pad_left - 1) - pad_left; + in_y = std::min(std::max(pad_top, y), in_h + pad_top - 1) - pad_top; + dout_batch[y * w + x] = din_batch[in_y * in_w + in_x]; + break; + case PAD_REFLECT: + in_y = y - pad_top; + in_x = x - pad_left; + in_y = std::max(in_y, -in_y); + in_y = std::min(in_y, 2 * in_h - in_y - 2); + in_x = std::max(in_x, -in_x); + in_x = std::min(in_x, 2 * in_w - in_x - 2); + dout_batch[y * w + x] = din_batch[in_y * in_w + in_x]; + break; + default: + LOG(ERROR) << "ERROR: unknown pad mode:" << pad_mode; + } + } + } + } +} + +//test template for different device and dtype +template +void test_pad(){ + typedef typename DataTrait::Dtype dtype; + TestSaberBase testbase; + + for (int pad_top : {0, 1}){ + for (int pad_bottom : {0, 1}){ + std::vector pad_h{pad_top, pad_bottom}; + for (int pad_left : {0, 1}){ + for (int pad_right : {0, 1}){ + std::vector pad_w{pad_left, pad_right}; + for (int pad_mode : {0, 1, 2}){ + for (float pad_value : {0.f, 1.0f}){ + Pad2DParam param(pad_h, pad_w, pad_value, pad_mode); + LOG(INFO) << "pad param: " << pad_mode<<" "<< pad_value<<" "<); + } + } + } + } + + } + } + } + } + } + } +} + +TEST(TestSaberFunc, test_func_pad2d) +{ +#ifdef USE_CUDA + // test_pad(); +#endif + +#ifdef USE_X86_PLACE + // test_pad(); +#endif +#ifdef USE_ARM_PLACE + test_pad(); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_permute.cpp b/test/saber/test_saber_permute.cpp index 643eeb931..e21552975 100644 --- a/test/saber/test_saber_permute.cpp +++ b/test/saber/test_saber_permute.cpp @@ -75,6 +75,9 @@ TEST(TestSaberFunc, test_func_permute) #ifdef USE_X86_PLACE test_permute(); #endif +#ifdef USE_ARM_PLACE + test_permute(); +#endif } int main(int argc, const char** argv) { diff --git a/test/saber/test_saber_pixel_shuffle.cpp b/test/saber/test_saber_pixel_shuffle.cpp new file mode 100644 index 000000000..7e4f3c36f --- /dev/null +++ b/test/saber/test_saber_pixel_shuffle.cpp @@ -0,0 +1,143 @@ +#include +#include + +#include "saber/core/context.h" +#include "test/saber/test_saber_base.h" +#include "test/saber/test_saber_func.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "saber/funcs/pixel_shuffle.h" + +using namespace anakin::saber; + +template +void pixel_shuffle_cpu_func(const std::vector*>& input, + std::vector*>& output, + PixelShuffleParam& param) +{ + const float* src_ptr = static_cast(input[0]->data()); + float* dst_ptr = static_cast(output[0]->mutable_data()); + + int out_size = output[0]->valid_size(); + Shape in_sh = input[0]->valid_shape(); + + int num_axes = input[0]->valid_shape().size() + 2; + int rw = param.rw; + int rh = param.rh; + int new_c = 
in_sh.channel()/(rw*rh); + std::vector order; + Shape in_new_sh; + Shape out_new_sh; + Shape out_sh; + + in_new_sh.push_back(in_sh.num()); + out_new_sh.push_back(in_sh.num()); + if (param.channel_first){ + in_new_sh.push_back(new_c); + in_new_sh.push_back(param.rh); + in_new_sh.push_back(param.rw); + in_new_sh.push_back(in_sh.height()); + in_new_sh.push_back(in_sh.width()); + order = std::vector({0, 1, 4, 2, 5, 3}); + out_new_sh.push_back(new_c); + out_new_sh.push_back(in_sh.height()); + out_new_sh.push_back(param.rh); + out_new_sh.push_back(in_sh.width()); + out_new_sh.push_back(param.rw); + out_sh = Shape({in_sh.num(), new_c, + param.rh * in_sh.height(), param.rw * in_sh.width()}); + + } else { + in_new_sh.push_back(in_sh.height()); + in_new_sh.push_back(in_sh.width()); + in_new_sh.push_back(param.rh); + in_new_sh.push_back(param.rw); + in_new_sh.push_back(new_c); + order = std::vector({0, 1, 3, 2, 4, 5}); + out_new_sh.push_back(in_sh.height()); + out_new_sh.push_back(param.rh); + out_new_sh.push_back(in_sh.width()); + out_new_sh.push_back(param.rw); + out_new_sh.push_back(new_c); + out_sh = Shape({in_sh.num(), + param.rh * in_sh.height(), param.rw * in_sh.width(), new_c}); + + } + Shape out_step = out_new_sh.get_stride(); + Shape in_step = in_new_sh.get_stride(); + + if (input[0]->is_continue_mem() && output[0]->is_continue_mem()){ + for (int j=0; j= 0; --i) { + int ord = order[i]; + int new_step = out_step[i]; + int old_step = in_step[ord]; + int id = (j / new_valid_stride) % out_new_sh[i]; + in_idx += id * old_step; + out_idx += id * new_step; + new_valid_stride *= out_new_sh[i]; + } + dst_ptr[out_idx] = src_ptr[in_idx]; + } + } + + output[0]->set_shape(out_sh); + +} + +template +void test_pixel_shuffle(){ + typedef typename DataTrait :: Dtype dtype; + TestSaberBase testbase; + for (int rw : {2, 3, 4}){ + for (int rh : {2, 3, 4}){ + PixelShuffleParam param(rh, rw); + for (int n : {1, 3}){ + for (int c : {144, 288}){ + for (int h : {8, 32}){ + for (int w: {8, 32}){ + testbase.set_param(param); + testbase.set_input_shape(Shape({n, c, h, w})); + testbase.run_test(pixel_shuffle_cpu_func); + } + } + } + } + } + } +} + +TEST(TestSaberFunc, test_func_permute) +{ +#ifdef USE_CUDA + test_pixel_shuffle(); +#endif +#ifdef USE_X86_PLACE + test_pixel_shuffle(); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_pooling.cpp b/test/saber/test_saber_pooling.cpp index 4e0c4dc7e..88aabf89d 100644 --- a/test/saber/test_saber_pooling.cpp +++ b/test/saber/test_saber_pooling.cpp @@ -45,7 +45,7 @@ void pooling_cpu_func(const std::vector*>& input, ew = (ew - param.pad_w) > in_w ? 
in_w : ew - param.pad_w; - dtype result; + dtype result= static_cast(0); int dst_ind = ind_n * size_out_n + ind_c * size_out_c + ind_h * out_w + ind_w; @@ -103,14 +103,88 @@ void pooling_cpu_func(const std::vector*>& input, } } +template +int test_pooling_results(int window_h,int window_w,int pad_h,int pad_w,PoolingType pooling_type,int stride_h,int stride_w, + int in_n,int in_c,int in_h,int in_w) { + + Env::env_init(); + Env::env_init(); + Shape input_s({in_n, in_c, in_h, in_w}, Layout_NCHW); + Shape input_nchwc8({in_n, in_c,in_h,in_w}, Layout_NCHW_C8R); + int out_h = static_cast((static_cast( + in_h + 2 * pad_h - window_h) / stride_h)) + 1; + + int out_w = static_cast((static_cast( + in_w + 2 * pad_w - window_w) / stride_w)) + 1; + Shape output_s({in_n, in_c, out_h, out_w}, Layout_NCHW); + Shape output_nchwc8({in_n, in_c, out_h, out_w}, Layout_NCHW_C8R); + // init input Tensor + Tensor input_dev(input_nchwc8); + Tensor input_host(input_nchwc8); + fill_tensor_rand(input_dev, -10.0f, 10.0f); + input_host.copy_from(input_dev); + + Tensor output_dev(output_nchwc8); + Tensor output_host(output_nchwc8); + Tensor check_host; + + Context ctx1(0, 1, 1); +// ActivationParam act_param(Active_relu); + PoolingParam param(window_h,window_w,pad_h,pad_w,stride_h,stride_w,pooling_type); + + Pooling pooling; + std::vector* > input_v; + std::vector* > output_v; + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); +// pooling.compute_output_shape(input_v, output_v, param); +// output_dev.re_alloc(output_dev.valid_shape(), AK_FLOAT); + + pooling.init(input_v, output_v, param, SPECIFY, SABER_IMPL, ctx1); + pooling(input_v, output_v, param, ctx1); + + typename Tensor::API::stream_t stream = ctx1.get_compute_stream(); + output_v[0]->record_event(stream); + output_v[0]->sync(); + output_host.re_alloc(output_dev.valid_shape(), AK_FLOAT); + output_host.copy_from(output_dev); + + Tensor input_check(input_s); + Tensor output_check(output_s); + Tensor output_check_from_dev(output_s); + reorder_nchwc8_nchw(input_host,input_check); + reorder_nchwc8_nchw(output_dev,output_check_from_dev); + std::vector* > input_v_h; + std::vector* > output_v_h; + input_v_h.push_back(&input_check); + output_v_h.push_back(&output_check); + pooling_cpu_func(input_v_h,output_v_h,param); + +// print_tensor_valid(check_host); + double max_ratio = 0.0; + double max_diff = 0.0; + tensor_cmp_host((const float*)output_check.data(), (const float*)output_check_from_dev.data(), + check_host.valid_size(), max_ratio, max_diff); +// print_tensor(input_check); +// print_tensor(output_check); +// print_tensor(output_dev); + if (max_ratio > 1e-3) { + print_tensor(output_check); + print_tensor_valid(output_check_from_dev); + LOG(FATAL) << " max_ratio = " << max_ratio << " max_diff = " << max_diff; + }else{ + LOG(INFO)<<"passed"; + } + return 0; +} + + //test template for different device and dtype template void test_pooling() { typedef typename DataTrait :: Dtype dtype; TestSaberBase testbase; - - for (int window_h : {2, 3, 5, 7}) { for (int window_w : {2, 3, 5, 7}) { for (int pad_h : {1, 2}) { @@ -118,7 +192,7 @@ void test_pooling() { if (pad_h >= window_h || pad_w >= window_w){ continue; } - for (int pooling_type : {Pooling_max, Pooling_average_include_padding, Pooling_average_exclude_padding}) { + for (PoolingType pooling_type : {Pooling_max, Pooling_average_include_padding, Pooling_average_exclude_padding}) { for (int stride_h : {1, 2 }) { for (int stride_w : {1, 2}) { PoolingParam param(window_h, window_w, pad_h, pad_w, stride_h, 
stride_w, @@ -134,7 +208,7 @@ void test_pooling() { for (int in_w : {7, 8, 13, 28, 32, 64}) { LOG(INFO) << "n:" << in_n << ",in_c:" << in_c << ",in_h:" << in_h << ",in_w:" << in_w; testbase.set_param(param);//set param - testbase.set_input_shape(Shape({in_n, in_c, in_h, in_w})); //add some input shape + testbase.set_input_shape(Shape({in_n, in_c, in_h, in_w}), SPECIAL); //add some input shape testbase.run_test(pooling_cpu_func, 0.0001);//run test } @@ -156,10 +230,118 @@ TEST(TestSaberFunc, test_func_pool) { test_pooling(); #endif #ifdef USE_X86_PLACE - test_pooling(); +// test_pooling(); +#if 0 + int window_h=2; + int window_w=3; + int pad_h=1; + int pad_w=1; + PoolingType pooling_type=Pooling_max; + int stride_h=1; + int stride_w=2; + int in_n=1; + int in_c=1; + int in_h=7; + int in_w=8; + test_pooling_results( window_h, window_w, pad_h, pad_w, pooling_type, stride_h, stride_w, + in_n, in_c, in_h, in_w); +#else + for (int window_h : {2, 3, 5, 7}) { + for (int window_w : {2, 3, 5, 7}) { + for (int pad_h : {1, 2}) { + for (int pad_w : {1, 2}) { + if (pad_h >= window_h || pad_w >= window_w) { + continue; + } + for (PoolingType pooling_type : {Pooling_max, Pooling_average_include_padding, + Pooling_average_exclude_padding}) { + for (int stride_h : {1, 2}) { + for (int stride_w : {1, 2}) { + + LOG(INFO) << "win_h:" << window_h << "win_w:" << window_w \ + << "pad_h:" << pad_h << "pad_w:" << pad_w \ + << "stride_h:" << stride_h << "stride_w:" << stride_w \ + << "pooling_type:" << pooling_type; + + for (int in_n : {1, 2}) { + for (int in_c : {1, 3}) { + for (int in_h : {7, 8, 13, 28, 32, 64}) { + for (int in_w : {7, 8, 13, 28, 32, 64}) { + LOG(INFO) << "n:" << in_n << ",in_c:" << in_c << ",in_h:" << in_h << ",in_w:" << in_w; + + test_pooling_results( window_h, window_w, pad_h, pad_w, pooling_type, stride_h, stride_w, + in_n, in_c, in_h, in_w); + + } + } + } + } + + } + } + + } + } + } + } + } +#endif +#endif +#ifdef USE_ARM_PLACE + test_pooling(); #endif } + +#ifdef USE_CUDA +TEST(TestSaberFunc, test_func_pool_res) { + Env::env_init(); + Env::env_init(); + + int window_h = 2; + int window_w = 2; + int pad_h = 0; + int pad_w = 0; + PoolingType pooling_type = Pooling_max; + int stride_h = 2; + int stride_w = 2; + int input_num = 1; + int in_channels = 4; + int height = 4; + int width = 4; + + Shape input_s({input_num, in_channels, height, width}, Layout_NCHW); + input_s.set_layout(Layout_NCHW_C4); + Tensor input_dev; + Tensor output_dev; + + input_dev.re_alloc(input_s, AK_INT8); + fill_tensor_rand(input_dev, -10, 10); + PoolingParam param(window_h,window_w,pad_h,pad_w,stride_h,stride_w,pooling_type); + + std::vector*> input_v; + std::vector*> output_v; + + input_dev.set_scale({1.f}); + output_dev.set_scale({1.f}); + input_v.push_back(&input_dev); + output_v.push_back(&output_dev); + + Pooling pool; + pool.compute_output_shape(input_v, output_v, param); + output_dev.re_alloc(output_dev.valid_shape(), AK_INT8); + fill_tensor_const(output_dev, 0); +// output_dev.set_layout(Layout_NCHW_C4); + Context ctx(0, 0, 1); + pool.init(input_v, output_v, param, SPECIFY, SABER_IMPL, ctx); + + pool(input_v, output_v, param, ctx); + cudaDeviceSynchronize(); +// print_tensor(input_dev); +// print_tensor(output_dev); +// cudaDeviceSynchronize(); +} +#endif int main(int argc, const char** argv) { // initial logger logger::init(argv[0]); diff --git a/test/saber/test_saber_pooling_int8.cpp b/test/saber/test_saber_pooling_int8.cpp new file mode 100644 index 000000000..93151bfca --- /dev/null +++ 
b/test/saber/test_saber_pooling_int8.cpp @@ -0,0 +1,190 @@ +#include +#include + +#include "saber/core/context.h" +#include "test/saber/test_saber_base.h" +#include "test_saber_func.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "saber/funcs/pooling.h" +#include "saber/core/data_traits.h" +#if defined(USE_X86_PLACE) +#include "jit_generator.h" +#endif +using namespace anakin::saber; + +template +void pooling_cpu_func(const std::vector*>& input, + std::vector*>& output, + PoolingParam& param) { + typedef typename DataTrait :: Dtype dtype_in; + typedef typename DataTrait :: Dtype dtype_out; + + const dtype_in* src_ptr = static_cast(input[0]->data()); + dtype_out* dst_ptr = static_cast(output[0]->mutable_data()); + + int in_n = input[0]->num(); + int in_c = input[0]->channel(); + int in_h = input[0]->height(); + int in_w = input[0]->width(); + int size_in_n = in_c * in_h * in_w; + int size_in_c = 1; + + int out_h = output[0]->height(); + int out_w = output[0]->width(); + int size_out_n = in_c * out_h * out_w; + int size_out_c = 1; + + for (int ind_n = 0; ind_n < in_n; ++ind_n) { + for (int ind_h = 0; ind_h < out_h; ++ind_h) { + int sh = ind_h * param.stride_h; + int eh = sh + param.window_h; + + if (param.pad_h > 0) { + sh = (sh - param.pad_h) < 0 ? 0 : sh - param.pad_h; + eh = (eh - param.pad_h) > in_h ? in_h : eh - param.pad_h; + } + + for (int ind_w = 0; ind_w < out_w; ++ind_w) { + int sw = ind_w * param.stride_w; + int ew = sw + param.window_w; + + if (param.pad_w > 0) { + sw = (sw - param.pad_w) < 0 ? 0 : sw - param.pad_w; + ew = (ew - param.pad_w) > in_w ? in_w : ew - param.pad_w; + } + + float result = 0; + + for (int ind_c = 0; ind_c < in_c; ++ind_c) { + int dst_ind = ind_n * size_out_n + ind_h * out_w * in_c + ind_w * in_c + ind_c; + + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_ind = ind_n * size_in_n + kh * in_w * in_c + kw * in_c + ind_c; + + if (kh == sh && kw == sw) { + result = src_ptr[src_ind]; + } else { + if (param.pooling_type == Pooling_max) { + result = result >= src_ptr[src_ind] ? 
result : src_ptr[src_ind]; + } + + if (param.pooling_type == Pooling_average_include_padding) { + result += src_ptr[src_ind]; + } + + if (param.pooling_type == Pooling_average_exclude_padding) { + result += src_ptr[src_ind]; + } + } + } + } + + if (param.pooling_type == Pooling_average_include_padding) { + result /= param.window_h * param.window_w; + } + + if (param.pooling_type == Pooling_average_exclude_padding) { + result /= (ew - sw) * (eh - sh); + } + + if (Dtype_OUT != AK_FLOAT) { + dst_ptr[dst_ind] = static_cast(nearbyintf(result)); + } else { + dst_ptr[dst_ind] = result; + } + } + } + } + } +} + +//test template for different device and dtype +template +void test_pooling() { + typedef typename DataTrait :: Dtype dtype_in; + typedef typename DataTrait :: Dtype dtype_out; + TestSaberBase testbase; + + for (int window_h : { + 2, 4 + }) { + for (int window_w : { + 2, 4 + }) { + for (int pad_h : { + 0, 1 + }) { + for (int pad_w : { + 0, 1 + }) { + for (PoolingType pooling_type : { + Pooling_max, Pooling_average_include_padding, Pooling_average_exclude_padding + }) { + for (int stride_h : { + 1, 2 + }) { + for (int stride_w : { + 1, 2 + }) { + PoolingParam param(window_h, window_w, pad_h, pad_w, stride_h, stride_w, + pooling_type); + LOG(INFO) << "win_h:" << window_h << "win_w:" << window_w \ + << "pad_h:" << pad_h << "pad_w:" << pad_w \ + << "stride_h:" << stride_h << "stride_w:" << stride_w \ + << "pooling_type:" << pooling_type; + + for (int in_n : { + 1, 2 + }) { + for (int in_c : { + 1, 3, 8 + }) { + for (int in_h : { + 32, 64 + }) { + for (int in_w : { + 32, 64 + }) { + LOG(INFO) << "n:" << in_n << ",in_h:" << in_h << ",in_w:" << in_w << ",in_c:" << in_c; + testbase.set_param(param);//set param + testbase.set_input_datatype(Dtype_IN); + testbase.set_input_shape(Shape({in_n, in_h, in_w, in_c}, Layout_NHWC),{1.f},{1.f});//add some input shape + testbase.set_ouput_datatype(Dtype_OUT); + testbase.run_test(pooling_cpu_func);//run test + + } + } + } + } + } + } + } + } + } + } + } + +} + +TEST(TestSaberFunc, test_func_pool) { +#ifdef USE_X86_PLACE + + // test_pooling(); + // test_pooling(); + if (jit::mayiuse(jit::avx512_core)) { + test_pooling(); +// test_pooling(); + } + +#endif +} + +int main(int argc, const char** argv) { + // initial logger + logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_power.cpp b/test/saber/test_saber_power.cpp index 2bec00c11..77bd370d6 100644 --- a/test/saber/test_saber_power.cpp +++ b/test/saber/test_saber_power.cpp @@ -15,10 +15,10 @@ void power_cpu_func(const std::vector*>& input, std::vector float p = param.power; float scale = param.scale; float shift = param.shift; - - const dtype* src_ptr = static_cast(input[0] -> data()); - dtype* dst_ptr = static_cast(output[0] -> mutable_data()); - + + const dtype* src_ptr = static_cast(input[0]->data()); + dtype* dst_ptr = static_cast(output[0]->mutable_data()); + for (int i=0; i < input[0] -> valid_size(); ++i){ dst_ptr[i] = pow(src_ptr[i]* scale +shift, p); } @@ -26,7 +26,7 @@ void power_cpu_func(const std::vector*>& input, std::vector template void test_power(){ - + typedef typename DataTrait :: Dtype dtype; //Init the test_base TestSaberBase testbase; @@ -34,13 +34,13 @@ void test_power(){ for (float scale : {0.5, 1.0, 2.0}){ for (float shift : {0, 1, 2}){ PowerParam param(p, scale, shift); - + for (int n : {1, 2}){ for (int c : {1, 3}){ for (int h: {32, 64}){ for (int w : {32, 64}){ testbase.set_param(param); - testbase.set_input_shape(Shape({n, 
c, h, w})); + testbase.set_input_shape(Shape({n, c, h, w}), SPECIAL); testbase.run_test(power_cpu_func); } } @@ -58,6 +58,9 @@ TEST(TestSaberFunc, test_func_power) { #ifdef USE_X86_PLACE test_power(); #endif +#ifdef USE_ARM_PLACE + test_power(); +#endif } @@ -65,9 +68,9 @@ TEST(TestSaberFunc, test_func_power) { int main(int argc, const char** argv) { // initial logger //logger::init(argv[0]); - + InitTest(); RUN_ALL_TESTS(argv[0]); - + return 0; } diff --git a/test/saber/test_saber_priorbox.cpp b/test/saber/test_saber_priorbox.cpp index 64a54f70d..230d3e348 100644 --- a/test/saber/test_saber_priorbox.cpp +++ b/test/saber/test_saber_priorbox.cpp @@ -25,7 +25,7 @@ void priorbox_cpu_base(const std::vector* > &input, \ unsigned long long out_size = output[0]->valid_size(); - float* _cpu_data = output[0]->mutable_data(); + float* _cpu_data = static_cast(output[0]->mutable_data()); float* min_buf = (float*)fast_malloc(sizeof(float) * 4); float* max_buf = (float*)fast_malloc(sizeof(float) * 4); diff --git a/test/saber/test_saber_product_quant_embedding_with_vsum.cpp b/test/saber/test_saber_product_quant_embedding_with_vsum.cpp new file mode 100644 index 000000000..61da33473 --- /dev/null +++ b/test/saber/test_saber_product_quant_embedding_with_vsum.cpp @@ -0,0 +1,331 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/product_quant_embedding_with_vsum.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include +#include +int g_num_threads = 1; + +using namespace anakin::saber; +bool decode_4d12b( const unsigned char *in, + unsigned int ilen, + unsigned int *out, + unsigned int olen) { + if (ilen % 3 != 0) { + LOG(INFO) << "error, ilen mod 3 != 0"; + return false; + } + if (ilen * 2 != olen * 3) { + LOG(INFO) << "error, ilen * 2 != olen * 3"; + return false; + } + memset(out, 0, olen * sizeof(unsigned int)); + for (unsigned int i = 0; i < ilen / 3; i++) { + unsigned char *raw_ptr = (unsigned char *)(out + i * 2); + raw_ptr[0] = in[3 * i]; + raw_ptr[1] = in[3 * i + 1] & 0x0f; + raw_ptr[4] = in[3 * i + 2]; + raw_ptr[5] = in[3 * i + 1] >> 4; + } + return true; +} + +void get_cur_idx(size_t word_idx, const size_t* word_offset, int offset_len, size_t* real_idx, int* case_idx) { + CHECK_EQ(offset_len, 9); + if (word_idx < word_offset[0]) { + *case_idx = 0; + *real_idx = word_idx; + } else if (word_idx < word_offset[1]) { + *case_idx = 1; + *real_idx = word_idx - word_offset[0]; + } else if (word_idx < word_offset[2]) { + *case_idx = 2; + *real_idx = word_idx - word_offset[1]; + } else if (word_idx < word_offset[3]) { + *case_idx = 0; + *real_idx = word_idx - word_offset[2] + word_offset[0]; + } else if (word_idx < word_offset[4]) { + *case_idx = 1; + *real_idx = word_idx - word_offset[3] + word_offset[1] - word_offset[0]; + } else if (word_idx < word_offset[5]) { + *case_idx = 2; + *real_idx = word_idx - word_offset[4] + word_offset[2] - word_offset[1]; + } else if (word_idx < word_offset[6]) { + *case_idx = 0; + *real_idx = word_idx - word_offset[5] + word_offset[0] + word_offset[3] - word_offset[2]; + } else if (word_idx < word_offset[7]) { + *case_idx = 1; + *real_idx = word_idx - word_offset[6] + word_offset[1] - word_offset[0] + word_offset[4] - word_offset[3]; + } else if (word_idx < word_offset[8]) { + *case_idx = 2; + *real_idx = word_idx - word_offset[7] + word_offset[2] - word_offset[1] + word_offset[5] - word_offset[4]; + } +} + +template +void product_quant_embedding_with_vsum_basic(const 
std::vector*>& inputs, + std::vector*>& outputs, + ProductQuantEmbeddingWithVsumParam& param) { + size_t voc_size; + size_t emb_size; + size_t max_seq_len; + size_t unigram_num[3]; + size_t bigram_num[3]; + size_t collocation_num[3]; + size_t chnl_num[3]; + size_t word_len[3]; + size_t word_num[3]; + size_t dict_size[3]; + size_t word_offset[9]; + const unsigned char* weights[3]; + const float* quant_dict[3]; + voc_size = param.word_voc; + emb_size = param.word_emb; + max_seq_len = param.max_seq_len; + + unigram_num[0] = param.top_unigram; + unigram_num[1] = param.sec_unigram; + unigram_num[2] = param.thd_unigram; + + bigram_num[0] = param.top_bigram; + bigram_num[1] = param.sec_bigram; + bigram_num[2] = param.thd_bigram; + + collocation_num[0] = param.top_collocation; + collocation_num[1] = param.sec_collocation; + collocation_num[2] = param.thd_collocation; + int level_num = 3; + for (unsigned int i = 0; i < level_num; i++) { + word_num[i] = unigram_num[i] + bigram_num[i] + collocation_num[i]; + quant_dict[i] = NULL; + } + + chnl_num[0] = 1; // log quant + chnl_num[1] = emb_size / 2; // 2d8b product quant + chnl_num[2] = emb_size / 4; // 4d12b product quant + + word_len[0] = emb_size; + word_len[1] = chnl_num[1]; + word_len[2] = chnl_num[2] / 2 * 3; + + dict_size[0] = 256; + dict_size[1] = 2 * 256; + dict_size[2] = 4 * 4096; + word_offset[0] = unigram_num[0]; + word_offset[1] = word_offset[0] + unigram_num[1]; + word_offset[2] = word_offset[1] + unigram_num[2]; + + word_offset[3] = word_offset[2] + bigram_num[0]; + word_offset[4] = word_offset[3] + bigram_num[1]; + word_offset[5] = word_offset[4] + bigram_num[2]; + + word_offset[6] = word_offset[5] + collocation_num[0]; + word_offset[7] = word_offset[6] + collocation_num[1]; + word_offset[8] = word_offset[7] + collocation_num[2]; + + unsigned int* buf = new unsigned int[chnl_num[2]]; + float* top_pos = new float[emb_size]; + + weights[0] = (const unsigned char*)param.embedding_0->data(); + weights[1] = (const unsigned char*)param.embedding_1->data(); + weights[2] = (const unsigned char*)param.embedding_2->data(); + + //CHECK_NE(weights[0], NULL) << "embedding weights 0 is NULL"; + //CHECK_NE(weights[1], NULL) << "embedding weights 1 is NULL"; + //CHECK_NE(weights[2], NULL) << "embedding weights 2 is NULL"; + quant_dict[0] = (const float*)param.quant_dict_0->data(); + quant_dict[1] = (const float*)param.quant_dict_1->data(); + quant_dict[2] = (const float*)param.quant_dict_2->data(); + //CHECK_NE(quant_dict[0], NULL) << "quant dict 0 is NULL"; + //CHECK_NE(quant_dict[1], NULL) << "quant dict 1 is NULL"; + //CHECK_NE(quant_dict[2], NULL) << "quant dict 2 is NULL"; + + + auto offset = inputs[0]->get_seq_offset()[0]; + int seq_num = offset.size() - 1; + + outputs[0]->reshape(Shape({seq_num, emb_size, 1, 1}, Layout_NCHW)); + + const dtype *input_data = (const dtype*)inputs[0]->data(); + dtype *output_data = (dtype*)outputs[0]->mutable_data(); + memset(output_data, 0, sizeof(dtype) * outputs[0]->valid_size()); + for (int seq_id = 0; seq_id < seq_num; seq_id++) { + size_t cur_len = offset[seq_id+1] - offset[seq_id]; + size_t len = max_seq_len == -1 ? 
cur_len : std::min(cur_len, max_seq_len); + auto tmp_out_data = output_data + seq_id * emb_size; + for (size_t i = 0; i < len; i++) { + size_t word_idx = static_cast(input_data[offset[seq_id] + i]); + size_t real_idx = 0; + int case_idx = 0; + get_cur_idx(word_idx, word_offset, 9, &real_idx, &case_idx); + + if (case_idx == 0) { + const unsigned char* word_pos = weights[0] + real_idx * word_len[0]; + for (size_t j = 0; j < word_len[0]; j++) { + top_pos[j] = quant_dict[0][word_pos[j]]; + } + } else if (case_idx == 1) { + const unsigned char* word_pos = weights[1] + real_idx * word_len[1]; + for (size_t j = 0; j < chnl_num[1]; j++) { + const float *curr_dict = quant_dict[1] + j * dict_size[1]; + memcpy(top_pos + j * 2, + curr_dict + word_pos[j] * 2, 2 * sizeof(float)); + } + } else { + const unsigned char* word_pos = weights[2] + real_idx * word_len[2]; + decode_4d12b(word_pos, word_len[2], buf, chnl_num[2]); + for (size_t j = 0; j < chnl_num[2]; j++) { + const float *curr_dict = quant_dict[2] + j * dict_size[2]; + memcpy(top_pos + j * 4, + curr_dict + buf[j] * 4, 4 * sizeof(float)); + } + } + for (size_t i = 0; i < emb_size; i++) { + tmp_out_data[i] += top_pos[i]; + } + } + } + + delete [] buf; + delete [] top_pos; + +} + +template +void test_model() { + //for (auto num_threads: {1}) { + int proc_num = omp_get_num_procs(); + CHECK_LE(g_num_threads, proc_num); + omp_set_num_threads(g_num_threads); + + TestSaberBase testbase(1, 1); + size_t word_emb = 256; + size_t word_voc = 10000; + size_t top_unigram = 1000; + size_t top_bigram = 500; + size_t top_collocation = 500; + size_t sec_unigram = 2000; + size_t sec_bigram = 500; + size_t sec_collocation = 500; + size_t thd_unigram = 3000; + size_t thd_bigram = 1000; + size_t thd_collocation = 1000; + int max_seq_len{512}; + int word_num[3]; + int word_len[3]; + int dict_size[3]; + int chnl_num[3]; + + int level_num = 3; + word_num[0] = top_unigram + top_bigram + top_collocation; + word_num[1] = sec_unigram + sec_bigram + sec_collocation; + word_num[2] = thd_unigram + thd_bigram + thd_collocation; + + chnl_num[0] = 1; // log quant + chnl_num[1] = word_emb / 2; // 2d8b product quant + chnl_num[2] = word_emb / 4; // 4d12b product quant + + word_len[0] = word_emb; + word_len[1] = chnl_num[1]; + word_len[2] = chnl_num[2] / 2 * 3; + + dict_size[0] = 256; + dict_size[1] = 2 * 256; + dict_size[2] = 4 * 4096; + + Shape embedding_shape_0(std::vector{word_num[0], word_len[0], 1, 1}, Layout_NCHW); + Shape embedding_shape_1(std::vector{word_num[1], word_len[1], 1, 1}, Layout_NCHW); + Shape embedding_shape_2(std::vector{word_num[2], word_len[2], 1, 1}, Layout_NCHW); + Tensor embedding_0(embedding_shape_0, AK_UINT8); + Tensor embedding_1(embedding_shape_1, AK_UINT8); + Tensor embedding_2(embedding_shape_2, AK_UINT8); + + Shape quant_dict_shape_0(std::vector{dict_size[0], chnl_num[0], 1, 1}, Layout_NCHW); + Shape quant_dict_shape_1(std::vector{dict_size[1], chnl_num[1], 1, 1}, Layout_NCHW); + Shape quant_dict_shape_2(std::vector{dict_size[2], chnl_num[2], 1, 1}, Layout_NCHW); + Tensor quant_dict_0(quant_dict_shape_0); + Tensor quant_dict_1(quant_dict_shape_1); + Tensor quant_dict_2(quant_dict_shape_2); + //test example + // + //for (auto seq_num : {1, 2, 16, 40}) { + // for (auto seq_len : {10, 16, 32}) { + for (auto seq_num : {40}) { + for (auto seq_len : {32}) { + fill_tensor_rand(embedding_0, 0, 128); + fill_tensor_rand(embedding_1, 0, 128); + fill_tensor_rand(embedding_2, 0, 128); + fill_tensor_rand(quant_dict_0, -1, 1); + fill_tensor_rand(quant_dict_1, -1, 
1); + fill_tensor_rand(quant_dict_2, -1, 1); + + ProductQuantEmbeddingWithVsumParam param(word_emb, word_voc, + top_unigram, top_bigram, top_collocation, + sec_unigram, sec_bigram, sec_collocation, + thd_unigram, thd_bigram, thd_collocation, + max_seq_len, &embedding_0, &embedding_1, &embedding_2, + &quant_dict_0, &quant_dict_1, &quant_dict_2); + + testbase.set_param(param);//set param + std::vector> seq_offset; + seq_offset.resize(1); + int cumsum = 0; + seq_offset[0].push_back(cumsum); + for (int i = 0; i < seq_num; i++) { + int len = std::rand() % seq_len + 1; + cumsum += len; + seq_offset[0].push_back(cumsum); + } + + Shape shape_0 = std::vector{cumsum, 1, 1, 1}; + std::vector*> input_vec; + Tensor input_0(shape_0); + fill_tensor_rand(input_0, 0, word_voc); + input_0.set_seq_offset(seq_offset); + input_vec.push_back(&input_0); + testbase.add_custom_input(input_vec); + testbase.run_test(product_quant_embedding_with_vsum_basic, 0.00001, false, true);//run test + } + //} + } +} + +TEST(TestSaberFunc, test_func_product_quant_embedding_with_vsum) { + +#ifdef USE_CUDA + //Init the test_base + //Env::env_init(); + //test_model(); +#endif +#ifdef USE_X86_PLACE + Env::env_init(); + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + if (argc >= 2) { + g_num_threads = atoi(argv[1]); + } + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_ps_roi_pooling.cpp b/test/saber/test_saber_ps_roi_pooling.cpp new file mode 100644 index 000000000..28bfecc90 --- /dev/null +++ b/test/saber/test_saber_ps_roi_pooling.cpp @@ -0,0 +1,192 @@ +#include "saber/core/context.h" +#include "saber/funcs/ps_roi_pooling.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include +#include +#include +using namespace anakin::saber; + +template +void ps_roi_pool_cpu(const std::vector*>& input, std::vector*>& output,\ + PsRoiPoolParam& param){ + int in_n = input[0]->num(); + int in_c = input[0]->channel(); + int in_h = input[0]->height(); + int in_w = input[0]->width(); + int o_n = output[0]->num(); + int o_h = output[0]->height(); + int o_w = output[0]->width(); + int o_c = output[0]->channel(); + int pooled_h = param.pooled_height; + int pooled_w = param.pooled_width; + int crop_width = param.crop_width / param.pooled_width; + int crop_height = param.crop_height / param.pooled_height; + int num_rois = o_n; + int im_h = in_h; + int im_w = in_w; + float extra_value = 0; + int method = 0; + int global_pooling = true; + //float spatial_scale = param.spatial_scale; + const Dtype* in_data = (const Dtype*)input[0]->data(); + const Dtype* rois = (const Dtype*)input[1]->data(); + Dtype* out_data = (Dtype*)output[0]->mutable_data(); + Tensor inter; + inter.re_alloc(Shape({pooled_w*pooled_h*o_c, o_n, crop_height, crop_width})); + Dtype* inter_data = (Dtype*)inter.mutable_data(); + int count = output[0]->valid_size(); + int inter_count = inter.valid_size(); + + for (int index = 0; index < inter_count; ++index){ + int temp_ind = index; + int cur_w = temp_ind % crop_width; + temp_ind /= crop_width; + int cur_h = temp_ind % crop_height; + temp_ind /= crop_height; + int cur_n = temp_ind % num_rois; + int cur_c = temp_ind / num_rois; + + 
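// Editorial note: the "method == 0" branch below resamples the feature map with a
// four-tap bilinear blend. The standalone helper here shows the textbook form of
// that blend for a single-channel h x w image; the name bilinear_sample and the
// function itself are the editor's illustration, not code from the patch. For
// comparison, the reference loop below blends left/right with y_lerp and
// top/bottom with x_lerp (the two weights swapped relative to this sketch),
// presumably to mirror the device kernel it is checked against.
#include <cmath>

static float bilinear_sample(const float* img, int h, int w, float y, float x) {
    (void)h;  // height kept in the signature to document the image extent
    int top    = static_cast<int>(std::floor(y));
    int bottom = static_cast<int>(std::ceil(y));
    int left   = static_cast<int>(std::floor(x));
    int right  = static_cast<int>(std::ceil(x));
    float y_lerp = y - top;
    float x_lerp = x - left;
    float tl = img[top * w + left],    tr = img[top * w + right];
    float bl = img[bottom * w + left], br = img[bottom * w + right];
    float top_row    = tl + (tr - tl) * x_lerp;   // blend along x on the top row
    float bottom_row = bl + (br - bl) * x_lerp;   // blend along x on the bottom row
    return top_row + (bottom_row - top_row) * y_lerp;  // then blend along y
}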
const Dtype* rois_data = rois + cur_n * 4; + + float y1 = rois_data[0] * (im_h - 1); + float x1 = rois_data[1] * (im_w - 1); + float y2 = rois_data[2] * (im_h - 1); + float x2 = rois_data[3] * (im_w - 1); + + float height_scale = crop_height > 1 ? (y2 - y1) / (crop_height - 1) : 0; + float width_scale = crop_width > 1 ? (x2 - x1) / (crop_width - 1) : 0; + + float in_y = crop_height > 1 ? y1 + cur_h * height_scale : (y1 + y2) / 2; + + if (in_y < 0 || in_y > im_h - 1){ + out_data[index] = extra_value; + continue; + } + + float in_x = crop_width > 1 ? x1 + cur_w * width_scale : (x1 + x2) / 2; + if (in_x < 0 || in_x > im_w - 1){ + out_data[index] = extra_value; + continue; + } + + const Dtype* im_data = in_data + cur_c * im_h * im_w; + + //resize method 0 means bilinear + if (method == 0){ + int top_y = floor(in_y); + int bot_y = ceil(in_y); + float y_lerp = in_y - top_y; + + int left_x = floor(in_x); + int right_x = ceil(in_x); + float x_lerp = in_x - left_x; + + Dtype top_left = im_data[top_y*im_w + left_x]; + Dtype top_right = im_data[top_y*im_w + right_x]; + Dtype bot_left = im_data[bot_y*im_w + left_x]; + Dtype bot_right = im_data[bot_y*im_w + right_x]; + float top = top_left + (top_right - top_left) * y_lerp; + float bot = bot_left + (bot_right - bot_left) * y_lerp; + inter_data[index] = top + (bot - top) * x_lerp; + } + } + int channel = o_c; + int pooled_size = pooled_w * pooled_h; + int crop_size = crop_height * crop_width; + for (int index = 0; index < count; ++index){ + int cur_n = index / channel; + int cur_c = index % channel; + int crop_size = crop_height * crop_width; + Dtype sum = 0; + for (int i = 0; i < crop_size; ++i){ + Dtype tmp_sum = 0; + for (int j = 0; j < pooled_size; ++j){ + tmp_sum += inter_data[(j * num_rois + cur_n) * crop_size + i]; + } + sum += tmp_sum / pooled_size; + } + out_data[index] = sum / crop_size; + } + +} + +template +void test_ps_roi_pool(){ + typedef typename DataTrait::Dtype dtype; + TestSaberBase testbase(2, 1); + float spatial_scale = 2.0f; + for (auto num_in :{1, 2}){ + for (auto c_in:{4, 8}){ + for (auto h_in:{6}){ + for (auto w_in:{6}){ + for (auto roi_num:{1, 2}){ + for (auto pool_h:{2}){ + for (auto pool_w:{2}){ + for (auto ch : {2, 4}){ + for (auto cw : {2, 4}){ + Shape in_shape({num_in, c_in, h_in, w_in}, Layout_NCHW); + Shape roi_shape({roi_num, 4, 1, 1}, Layout_NCHW); + Tensor th_in, th_roi; + Tensor td_in, td_roi; + th_in.re_alloc(in_shape, Dtype); + th_roi.re_alloc(roi_shape, Dtype); + td_in.re_alloc(in_shape, Dtype); + td_roi.re_alloc(roi_shape, Dtype); + // prepare host data + fill_tensor_rand(th_in, 0.0, 1.0); + // prepare roi data + dtype* roi_data = (dtype*)th_roi.mutable_data(); + srand(time(0)); + for (int i = 0; i < roi_num; ++i){ + //roi_data[i * 5] = rand() % num_in; + roi_data[i * 4 + 0] = 0.5; + roi_data[i * 4 + 1] = 0.5; + roi_data[i * 4 + 2] = 1; + roi_data[i * 4 + 3] = 1; + } + td_in.copy_from(th_in); + td_roi.copy_from(th_roi); + std::vector*> input; + input.push_back(&td_in); + input.push_back(&td_roi); + LOG(ERROR) << num_in <<"," << c_in << ","<< h_in << ","<< w_in << ","<< + roi_num << ","<< pool_h << ","<< pool_w; + testbase.add_custom_input(input); + PsRoiPoolParam param(pool_h, pool_w, ch, cw); + testbase.set_param(param); + testbase.run_test(ps_roi_pool_cpu); + } + } + } + } + } + } + } + } + } +} + +TEST(TestSaberFunc, test_func_roi_pooling){ +//for (int i=0; i< 10000; ++i){ +#ifdef USE_CUDA + test_ps_roi_pool(); + LOG(INFO)<<"NV test end."; +#endif +#ifdef USE_X86_PLACE + test_ps_roi_pool(); + LOG(INFO)<<"X86 
test end."; +#endif +//} + + +} +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_reduce.cpp b/test/saber/test_saber_reduce.cpp new file mode 100644 index 000000000..767d7f44a --- /dev/null +++ b/test/saber/test_saber_reduce.cpp @@ -0,0 +1,280 @@ +#include "saber/core/context.h" +#include "saber/funcs/reduce_min.h" +#include "saber/funcs/reduce.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include + +using namespace anakin::saber; + +template +void reduce_n(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index = 0; + int src_index = 0; + int src_index0 = 0; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = src[data_index]; + for (int n = 1; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] = dst[data_index] < src[src_index]? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_c(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = src[src_index0]; + for (int c = 1; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] = dst[data_index] < src[src_index]? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_h(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index = 0; + int src_index = 0; + int src_index0 = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = src[src_index0]; + for (int h = 1; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] = dst[data_index] < src[src_index]? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_w(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = src[src_index0]; + for (int w = 1; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] = dst[data_index] < src[src_index] ? 
dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_all(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + dtype min = src[0]; + int src_index = 0; + int n_id = 0; + int c_id = 0; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + min = src[src_index] < min? src[src_index] : min; + } + } + } + } + dst[0] = min; +} +template +void reduce_nc(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + //reduce n first. + Shape shape_tmp({1, channel_in, height_in, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +template +void reduce_ch(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + //reduce c first + Shape shape_tmp({num_in, 1, height_in, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +template +void reduce_hw(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + //reduce h first + Shape shape_tmp({num_in, channel_in, 1, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +/** + * @brief This operator is to reduce input tensor according to the given dimentions. + * For details, please see saber_reduce_min.cu. + * + * @tparam dtype + * @tparam TargetType_D + * @tparam TargetType_H + * @param input + * @param output + * @param param + */ +template +void reduce_min_cpu_base(const std::vector* >& input, + std::vector* >& output, + ReduceParam& param) { + + int n = input[0]->num(); + int c = input[0]->channel(); + int h = input[0]->height(); + int w = input[0]->width(); + int count = input[0]->valid_size(); + int rank = input[0]->valid_shape().size(); + const dtype* input_ptr = (const dtype*)input[0]->data(); + dtype* output_ptr = (dtype*)output[0]->mutable_data(); + std::vector reduce_dim = param.reduce_dim; + //we don't need to check whether reduce_dim is valid because it will be checked in cuda/x86 impl. + if (!reduce_dim.empty()) { + //not empty + for (int i = 0; i < reduce_dim.size(); ++i) { + if (reduce_dim[i] < 0) { + reduce_dim[i] += rank; + } + } + } + + if (reduce_dim.empty()) { + //reduce all. 
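Aside (illustrative only, not part of the patch): when reduce_dim is empty the reference falls through to reduce_all below, which is just the minimum over every element of the flat buffer; a minimal equivalent using the standard library:

    #include <algorithm>

    // Minimum over the whole flat buffer; equivalent to the reduce_all
    // reference for a tensor holding count > 0 valid elements.
    template <typename dtype>
    dtype min_over_all(const dtype* src, int count) {
        return *std::min_element(src, src + count);
    }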
+ reduce_all(input_ptr, output_ptr, n, c, h, w); + }else { + if (reduce_dim.size() == 1) { + switch (reduce_dim[0]) { + case 0: reduce_n(input_ptr, output_ptr, n, c, h, w); break; + case 1: reduce_c(input_ptr, output_ptr, n, c, h, w); break; + case 2: reduce_h(input_ptr, output_ptr, n, c, h, w); break; + case 3: reduce_w(input_ptr, output_ptr, n, c, h, w); break; + default: LOG(FATAL) << "error!!!"; + } + }else if (reduce_dim.size() == 2) { + if (reduce_dim[0] == 0 && reduce_dim[1] == 1) { + reduce_nc(input_ptr, output_ptr, n, c, h, w); + }else if (reduce_dim[0] == 1 && reduce_dim[1] == 2) { + reduce_ch(input_ptr, output_ptr, n, c, h, w); + }else if (reduce_dim[0] == 2 && reduce_dim[1] == 3) { + reduce_hw(input_ptr, output_ptr, n, c, h, w); + }else { + LOG(FATAL) <<"invalid reduce_dim!!"; + } + } else { + LOG(FATAL) << "reduce_dim's size over than 2, which is not supported now!!"; + } + } + +} + +template +void test_reduce_min() { + TestSaberBase testbase; + std::vector reduce_type_v{Reduce_min}; + std::vector> reduce_dim{{0}, {1}, {2}, {3}, + {0, 1}, {1, 2}, {2, 3}}; + + for (auto t : reduce_type_v) { + for (auto d : reduce_dim) { + ReduceParam param(d, t, true, false); + for (int w_in : {2, 8, 16, 32}) { + for (int h_in : {2, 8, 16, 32, 64}) { + for (int ch_in : {2, 7, 8, 64}) { + for (int num_in:{2, 21, 32, 64}) { + Shape shape({num_in, ch_in, h_in, w_in}); + testbase.set_param(param); + //testbase.set_rand_limit(); + testbase.set_input_shape(shape); + testbase.run_test(reduce_min_cpu_base); + } + } + } + } + } + } +} + +TEST(TestSaberFunc, test_op_ReduceMin) { + +#ifdef USE_CUDA + //Init the test_base + test_reduce_min(); +#endif +#ifdef USE_X86_PLACE + test_reduce_min(); +#endif +#ifdef USE_ARM_PLACE + //test_ReduceMin(); +#endif +#ifdef USE_BM + // Env::env_init(); + //test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_reduce_min.cpp b/test/saber/test_saber_reduce_min.cpp new file mode 100644 index 000000000..2815341d1 --- /dev/null +++ b/test/saber/test_saber_reduce_min.cpp @@ -0,0 +1,272 @@ +#include "saber/core/context.h" +#include "saber/funcs/reduce_min.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include + +using namespace anakin::saber; + +template +void reduce_n(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index = 0; + int src_index = 0; + int src_index0 = 0; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = src[data_index]; + for (int n = 1; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] = dst[data_index] < src[src_index]? 
dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_c(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = src[src_index0]; + for (int c = 1; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] = dst[data_index] < src[src_index]? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_h(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index = 0; + int src_index = 0; + int src_index0 = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = src[src_index0]; + for (int h = 1; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] = dst[data_index] < src[src_index]? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_w(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = src[src_index0]; + for (int w = 1; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] = dst[data_index] < src[src_index] ? dst[data_index] : src[src_index]; + } + } + } + } +} + +template +void reduce_all(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + dtype min = src[0]; + int src_index = 0; + int n_id = 0; + int c_id = 0; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + min = src[src_index] < min? src[src_index] : min; + } + } + } + } + dst[0] = min; +} +template +void reduce_nc(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + + //reduce n first. 
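Aside (illustrative only, not part of the patch): the two-axis reductions here are built by chaining single-axis reductions through a temporary tensor, as reduce_nc does below (reduce over N, then over C on the intermediate result). A generic sketch of that composition:

    #include <functional>
    #include <vector>

    // Run two reduction stages back to back through a temporary buffer of
    // tmp_size elements, the same pattern reduce_nc/reduce_ch/reduce_hw use.
    template <typename dtype>
    void compose_reduce(const dtype* src, dtype* dst, int tmp_size,
                        const std::function<void(const dtype*, dtype*)>& first,
                        const std::function<void(const dtype*, dtype*)>& second) {
        std::vector<dtype> tmp(tmp_size);
        first(src, tmp.data());
        second(tmp.data(), dst);
    }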
+ Shape shape_tmp({1, channel_in, height_in, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +template +void reduce_ch(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + //reduce c first + Shape shape_tmp({num_in, 1, height_in, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +template +void reduce_hw(const dtype* src, dtype* dst, + const int num_in, const int channel_in, const int height_in, const int width_in) { + //reduce h first + Shape shape_tmp({num_in, channel_in, 1, width_in}); + Tensor tensor_tmp(shape_tmp); + dtype* tmp_out = (dtype*)tensor_tmp.mutable_data(); + reduce_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +/** + * @brief This operator is to reduce input tensor according to the given dimentions. + * For details, please see saber_reduce_min.cu. + * + * @tparam dtype + * @tparam TargetType_D + * @tparam TargetType_H + * @param input + * @param output + * @param param + */ +template +void reduce_min_cpu_base(const std::vector* >& input, + std::vector* >& output, ReduceMinParam& param) { + + int n = input[0]->num(); + int c = input[0]->channel(); + int h = input[0]->height(); + int w = input[0]->width(); + int count = input[0]->valid_size(); + int rank = input[0]->valid_shape().size(); + const dtype* input_ptr = (const dtype*)input[0]->data(); + dtype* output_ptr = (dtype*)output[0]->mutable_data(); + std::vector reduce_dim = param.reduce_dim; + //we don't need to check whether reduce_dim is valid because it will be checked in cuda/x86 impl. + if (!reduce_dim.empty()) { + //not empty + for (int i = 0; i < reduce_dim.size(); ++i) { + if (reduce_dim[i] < 0) { + reduce_dim[i] += rank; + } + } + } + + if (reduce_dim.empty()) { + //reduce all. 
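Aside (illustrative only, not part of the patch): a few lines above, negative entries of reduce_dim are wrapped into [0, rank) before dispatch; the same rule as a standalone helper:

    #include <vector>

    // Wrap negative axes, e.g. axis -1 on a rank-4 tensor becomes 3.
    inline std::vector<int> normalize_axes(std::vector<int> axes, int rank) {
        for (int& a : axes) {
            if (a < 0) {
                a += rank;
            }
        }
        return axes;
    }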
+ reduce_all(input_ptr, output_ptr, n, c, h, w); + }else { + if (reduce_dim.size() == 1) { + switch (reduce_dim[0]) { + case 0: reduce_n(input_ptr, output_ptr, n, c, h, w); break; + case 1: reduce_c(input_ptr, output_ptr, n, c, h, w); break; + case 2: reduce_h(input_ptr, output_ptr, n, c, h, w); break; + case 3: reduce_w(input_ptr, output_ptr, n, c, h, w); break; + default: LOG(FATAL) << "error!!!"; + } + }else if (reduce_dim.size() == 2) { + if (reduce_dim[0] == 0 && reduce_dim[1] == 1) { + reduce_nc(input_ptr, output_ptr, n, c, h, w); + }else if (reduce_dim[0] == 1 && reduce_dim[1] == 2) { + reduce_ch(input_ptr, output_ptr, n, c, h, w); + }else if (reduce_dim[0] == 2 && reduce_dim[1] == 3) { + reduce_hw(input_ptr, output_ptr, n, c, h, w); + }else { + LOG(FATAL) <<"invalid reduce_dim!!"; + } + } else { + LOG(FATAL) << "reduce_dim's size over than 2, which is not supported now!!"; + } + } + +} + +template +void test_reduce_min(){ + TestSaberBase testbase; + std::vector reduce_dim{2, 3}; + ReduceMinParam param(reduce_dim, false); + + for (int w_in : {2, 8, 16, 32}) { + for (int h_in : {2, 8, 16, 32, 64}) { + for (int ch_in : {2, 7, 8, 64}) { + for (int num_in:{2, 21, 32, 64}) { + Shape shape({num_in, ch_in, h_in, w_in}); + testbase.set_param(param); + //testbase.set_rand_limit(); + testbase.set_input_shape(shape); + testbase.run_test(reduce_min_cpu_base); + } + } + } + } +} + +TEST(TestSaberFunc, test_op_ReduceMin) { + +#ifdef USE_CUDA + //Init the test_base + test_reduce_min(); +#endif +#ifdef USE_X86_PLACE + test_reduce_min(); +#endif +#ifdef USE_ARM_PLACE + //test_ReduceMin(); +#endif +#ifdef USE_BM + // Env::env_init(); + //test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_resize.cpp b/test/saber/test_saber_resize.cpp index 0ea00650d..64f0c5c0a 100644 --- a/test/saber/test_saber_resize.cpp +++ b/test/saber/test_saber_resize.cpp @@ -10,7 +10,7 @@ using namespace anakin::saber; template -void resize_cpu(const std::vector*>& input, +void resize_bilinear_custom_cpu(const std::vector*>& input, std::vector*>& output, \ ResizeParam& param) { int win = input[0]->width(); @@ -60,84 +60,210 @@ void resize_cpu(const std::vector*>& input, } +template +void resize_bilinear_align_cpu(const std::vector*>& input, + std::vector*>& output, \ + ResizeParam& param) { + int win = input[0]->width(); + int hin = input[0]->height(); + int channels = input[0]->channel(); + int num = input[0]->num(); + int wout = output[0]->width(); + int hout = output[0]->height(); + dtype scale_w = (dtype)(win - 1) / (wout - 1); + dtype scale_h = (dtype)(hin - 1) / (hout - 1); + const dtype* src = (const dtype*)input[0]->data(); + dtype* dst = (dtype*)output[0]->mutable_data(); + int dst_stride_w = 1, dst_stride_h = wout, dst_stride_c = wout * hout, + dst_stride_batch = wout * hout * channels; + int src_stride_w = 1, src_stride_h = win, src_stride_c = win * hin, + src_stride_batch = win * hin * channels; -TEST(TestSaberFunc, test_func_resize) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + int src_index = n * src_stride_batch + c * src_stride_c; -#ifdef USE_CUDA + for (int h = 0; h < hout; ++h) { + for (int w = 0; w < wout; ++w) { + dtype fw = w * scale_w; + dtype fh = h * scale_h; + int w_start = (int)fw; + int w_id = w_start < win - 1 ? 
1 : 0; + int w_end = (int)fw + w_id; + int h_start = (int)fh; + int h_id = h_start < hin - 1 ? 1 : 0; + int h_end = (int)fh + h_id; + fw -= w_start; + fh -= h_start; + const dtype w00 = (1.0 - fh) * (1.0 - fw); + const dtype w01 = fw * (1.0 - fh); + const dtype w10 = fh * (1.0 - fw); + const dtype w11 = fw * fh; + dtype tl = src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + dtype tr = src[src_index + w_end * src_stride_w + h_start * src_stride_h]; + dtype bl = src[src_index + w_start * src_stride_w + h_end * src_stride_h]; + dtype br = src[src_index + w_end * src_stride_w + h_end * src_stride_h]; + int dst_index = n * dst_stride_batch + c * dst_stride_c + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = static_cast(w00 * tl + w01 * tr + w10 * bl + w11 * br); + } + } + } + } - LOG(INFO) << "NV test......"; - //Init the test_base - TestSaberBase testbase; - - for (int num_in : { - 3, 5, 8 - }) { - for (int c_in : { - 3, 5, 8 - }) { - for (int h_in : { - 3, 5, 8 - }) { - for (int w_in : { - 2, 5, 8 - }) { - for (float scale_w : { - 1.0f, 3.3f - }) { - for (float scale_h : { - 1.0f, 4.4f - }) { - LOG(INFO) << scale_w << " " << scale_h; - ResizeParam param(scale_w, scale_h); - testbase.set_param(param); - testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); - testbase.run_test(resize_cpu, 0.001); - } - } +} + +template +void resize_bilinear_no_align_cpu(const std::vector*>& input, + std::vector*>& output, \ + ResizeParam& param) { + int win = input[0]->width(); + int hin = input[0]->height(); + int channels = input[0]->channel(); + int num = input[0]->num(); + int wout = output[0]->width(); + int hout = output[0]->height(); + dtype scale_w = (dtype)win / wout; + dtype scale_h = (dtype)hin / hout; + const dtype* src = (const dtype*)input[0]->data(); + dtype* dst = (dtype*)output[0]->mutable_data(); + int dst_stride_w = 1, dst_stride_h = wout, dst_stride_c = wout * hout, + dst_stride_batch = wout * hout * channels; + int src_stride_w = 1, src_stride_h = win, src_stride_c = win * hin, + src_stride_batch = win * hin * channels; + + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + int src_index = n * src_stride_batch + c * src_stride_c; + + for (int h = 0; h < hout; ++h) { + for (int w = 0; w < wout; ++w) { + dtype fw = scale_w * (w + 0.5f) - 0.5f; + fw = (fw < 0) ? 0 : fw; + dtype fh = scale_h * (h + 0.5f) - 0.5f; + fh = (fh < 0) ? 0 : fh; + int w_start = (int)fw; + int w_id = w_start < win - 1 ? 1 : 0; + int w_end = (int)fw + w_id; + int h_start = (int)fh; + int h_id = h_start < hin - 1 ? 
1 : 0; + int h_end = (int)fh + h_id; + fw -= w_start; + fh -= h_start; + const dtype w00 = (1.0 - fh) * (1.0 - fw); + const dtype w01 = fw * (1.0 - fh); + const dtype w10 = fh * (1.0 - fw); + const dtype w11 = fw * fh; + dtype tl = src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + dtype tr = src[src_index + w_end * src_stride_w + h_start * src_stride_h]; + dtype bl = src[src_index + w_start * src_stride_w + h_end * src_stride_h]; + dtype br = src[src_index + w_end * src_stride_w + h_end * src_stride_h]; + int dst_index = n * dst_stride_batch + c * dst_stride_c + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = static_cast(w00 * tl + w01 * tr + w10 * bl + w11 * br); } } } } +} +template +void resize_nearest_align_cpu(const std::vector*>& input, + std::vector*>& output, \ + ResizeParam& param) { + int win = input[0]->width(); + int hin = input[0]->height(); + int channels = input[0]->channel(); + int num = input[0]->num(); + int wout = output[0]->width(); + int hout = output[0]->height(); + dtype scale_w = (dtype)(win - 1) / (wout - 1); + dtype scale_h = (dtype)(hin - 1) / (hout - 1); + const dtype* src = (const dtype*)input[0]->data(); + dtype* dst = (dtype*)output[0]->mutable_data(); + int dst_stride_w = 1, dst_stride_h = wout, dst_stride_c = wout * hout, + dst_stride_batch = wout * hout * channels; + int src_stride_w = 1, src_stride_h = win, src_stride_c = win * hin, + src_stride_batch = win * hin * channels; + + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + int src_index = n * src_stride_batch + c * src_stride_c; -#endif + for (int h = 0; h < hout; ++h) { + for (int w = 0; w < wout; ++w) { + dtype fw = scale_w * w + 0.5; + fw = (fw < 0) ? 0 : fw; + dtype fh = scale_h * h + 0.5; + fh = (fh < 0) ? 
0 : fh; + int w_start = (int)fw; + int h_start = (int)fh; + int dst_index = n * dst_stride_batch + c * dst_stride_c + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + } + } + } + } -#ifdef USE_X86_PLACE +} - LOG(INFO) << "x86 test......"; - //Init the test_base - TestSaberBase testbase1; - - for (int num_in : { - 3, 5, 8 - }) { - for (int c_in : { - 3, 5, 8 - }) { - for (int h_in : { - 3, 5, 8 - }) { - for (int w_in : { - 2, 5, 8 - }) { - for (float scale_w : { - 1.0f, 3.3f - }) { - for (float scale_h : { - 1.0f, 4.4f - }) { - LOG(INFO) << scale_w << " " << scale_h; - ResizeParam param(scale_w, scale_h); - testbase1.set_param(param); - testbase1.set_input_shape(Shape({num_in, c_in, h_in, w_in})); - testbase1.run_test(resize_cpu); +template +void test_resize(){ + typedef typename DataTrait::Dtype dtype; + TestSaberBase testbase; + + for (int num_in : {3, 5, 8}) { + for (int c_in : {3, 5, 8}) { + for (int h_in : {3, 5, 8}) { + for (int w_in : {2, 5, 8}) { + for (float scale_w : {1.0f, 3.3f}) { + for (float scale_h : {1.0f, 4.4f}) { + for (int resize_type : {0, 1, 2, 3}){ + LOG(INFO) << scale_w << " " << scale_h << " " << resize_type; + ResizeParam param((ResizeType)resize_type, scale_w, scale_h); + testbase.set_param(param); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + switch (resize_type){ + case 0: + LOG(INFO) << "resize_type: " << "bilinear_align"; + testbase.run_test(resize_bilinear_align_cpu, 0.0001); + break; + case 1: + LOG(INFO) << "resize_type: " << "bilinear no align"; + testbase.run_test(resize_bilinear_no_align_cpu, 0.0001); + break; + case 2: + LOG(INFO) << "resize_type: " << "custom"; + testbase.run_test(resize_bilinear_custom_cpu, 0.0001); + break; + case 3: + LOG(INFO) << "resize_type: " << "nearest"; + testbase.run_test(resize_nearest_align_cpu, 0.0001); + break; + default: + break; + } + } } } } } } } +} +TEST(TestSaberFunc, test_func_resize) { +#ifdef USE_CUDA + test_resize(); + +#endif + +#ifdef USE_X86_PLACE + + test_resize(); + +#endif + +#ifdef USE_ARM_PLACE + test_resize(); #endif @@ -146,10 +272,8 @@ int main(int argc, const char** argv) { // initial logger //logger::init(argv[0]); InitTest(); + RUN_ALL_TESTS(argv[0]); - for (int i = 0; i < 100; i++) { - RUN_ALL_TESTS(argv[0]); - } return 0; } diff --git a/test/saber/test_saber_roi_align.cpp b/test/saber/test_saber_roi_align.cpp new file mode 100644 index 000000000..5a53f8d4d --- /dev/null +++ b/test/saber/test_saber_roi_align.cpp @@ -0,0 +1,259 @@ +#include "saber/core/context.h" +#include "saber/funcs/roi_align.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include + +using namespace anakin::saber; + + +/** + * @brief This operator is Region of Interest(ROIAlign) Align. + * The main steps of RoiAlign are as follows: + * For each ROI, extract fixed-size map ([pooled_height, pooled_width]something like 3*3): + * 1. chose a sampling_ratio[the number of sampling points] for each pixel of fixed-size map + * 2. then, for each smapling point, compute the src coordinate, and + * suppose that we get the src's coordinate (x, y). + * using the fomula to calculate coordinate (x, y). + * 3. for each (x, y) , do bilinear interpolate and suppose we get val. + * 4. sum up val and calculate the mean of them. 
+ * + * + * @tparam dtype + * @tparam TargetType_D + * @tparam TargetType_H + * @param input + * @param output + * @param param + */ + +template +void PreCalcForBilinearInterpolate( + const int height, const int width, + const int pooled_height, const int pooled_width, const int iy_upper, + const int ix_upper, dtype roi_ymin, dtype roi_xmin, dtype bin_size_h, dtype bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, const int kROISize, + const int prePosROISize, Tensor* pre_pos, Tensor* pre_w) { + int pre_calc_index = 0; + int* pre_pos_data = (int*)pre_pos->mutable_data(); + dtype* pre_w_data = (dtype*)pre_w->mutable_data(); + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + // calculate y of sample points + dtype y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + // calculate x of samle points + for (int ix = 0; ix < ix_upper; ix++) { + dtype x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + // deal with elements out of map + if (y < -1.0 || y > height || x < -1.0 || x > width) { + for (int i = 0; i < prePosROISize; ++i) { + pre_pos_data[i + pre_calc_index * prePosROISize] = 0; + pre_w_data[i + pre_calc_index * prePosROISize] = 0; + } + pre_calc_index += 1; + continue; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high = 0; + int x_high = 0; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + dtype ly = y - y_low; + dtype lx = x - x_low; + dtype hy = 1. - ly; + dtype hx = 1. - lx; + pre_pos_data[pre_calc_index * prePosROISize] = y_low * width + x_low; + pre_pos_data[pre_calc_index * prePosROISize + 1] = y_low * width + x_high; + pre_pos_data[pre_calc_index * prePosROISize + 2] = y_high * width + x_low; + pre_pos_data[pre_calc_index * prePosROISize + 3] = y_high * width + x_high; + pre_w_data[pre_calc_index * prePosROISize] = hy * hx; + pre_w_data[pre_calc_index * prePosROISize + 1] = hy * lx; + pre_w_data[pre_calc_index * prePosROISize + 2] = ly * hx; + pre_w_data[pre_calc_index * prePosROISize + 3] = ly * lx; + pre_calc_index += 1; + } + } + } + } +} + + +template +void roi_align_cpu_base(const std::vector* >& input, + std::vector* >& output, RoiAlignParam& param) { + + CHECK_EQ(input.size(), 2) << "input size must be 2!!!"; + int batch_size = input[0]->num(); + int channels = input[0]->channel(); + int height = input[0]->height(); + int width = input[0]->width(); + int rois_num = input[1]->num(); + // int count = input[0]->valid_size(); + const int kROISize = 5; + const int prePosROISize = 4; + + Shape in_stride = input[0]->get_stride(); + Shape roi_stride = input[1]->get_stride(); + Shape out_stride = output[0]->get_stride(); + + const dtype* input_data = (const dtype*)input[0]->data(); + const dtype* rois = (const dtype*)input[1]->data(); + dtype* output_data = (dtype*)output[0]->mutable_data(); + // For each ROIs, do fix-sized align. 
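Aside (illustrative only, not part of the patch): each ROI row below is [batch_id, x1, y1, x2, y2] in input-image coordinates; scaling by spatial_scale maps the box onto the feature map, and the box is then split into pooled_height x pooled_width bins. A minimal sketch of that mapping, assuming float ROIs:

    #include <algorithm>

    struct RoiBinSizes { float bin_h; float bin_w; };

    // Map one ROI onto the feature map and compute per-bin sizes, mirroring
    // the roi_xmin/roi_ymin/bin_size_h/bin_size_w computation in the loop below.
    inline RoiBinSizes roi_bin_sizes(const float roi[5], float spatial_scale,
                                     int pooled_h, int pooled_w) {
        float x1 = roi[1] * spatial_scale;
        float y1 = roi[2] * spatial_scale;
        float x2 = roi[3] * spatial_scale;
        float y2 = roi[4] * spatial_scale;
        float roi_w = std::max(x2 - x1, 1.f);  // clamp to at least one pixel
        float roi_h = std::max(y2 - y1, 1.f);
        return {roi_h / pooled_h, roi_w / pooled_w};
    }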
+ for (int n = 0; n < rois_num; ++n) { + const dtype* cur_rois = rois + n * kROISize; + int rois_id = cur_rois[0]; + dtype roi_xmin = cur_rois[1] * param.spatial_scale; + dtype roi_ymin = cur_rois[2] * param.spatial_scale; + dtype roi_xmax = cur_rois[3] * param.spatial_scale; + dtype roi_ymax = cur_rois[4] * param.spatial_scale; + + dtype roi_width = std::max(roi_xmax - roi_xmin, static_cast(1.)); + dtype roi_height = std::max(roi_ymax - roi_ymin, static_cast(1.)); + dtype bin_size_h = static_cast(roi_height) / static_cast(param.pooled_height); + dtype bin_size_w = static_cast(roi_width) / static_cast(param.pooled_width); + const dtype* batch_data = input_data + rois_id * in_stride[0]; + int roi_bin_grid_h = (param.sampling_ratio > 0)? param.sampling_ratio : ceil(roi_height / param.pooled_height); + int roi_bin_grid_w = (param.sampling_ratio > 0)? param.sampling_ratio : ceil(roi_width / param.pooled_width); + int count = roi_bin_grid_h * roi_bin_grid_w; + Tensor pre_pos; + Tensor pre_w; + int pre_size = count * out_stride[1]; + pre_pos.reshape(Shape({pre_size, prePosROISize, 1, 1})); //pre ROI + pre_w.reshape(Shape({pre_size, prePosROISize, 1, 1})); // pre ROI weights. + + PreCalcForBilinearInterpolate(height, width, + param.pooled_height, param.pooled_width, + roi_bin_grid_h,roi_bin_grid_w, + roi_ymin, roi_xmin, + bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, + kROISize, prePosROISize, + &pre_pos, &pre_w); + const int* pre_pos_data = (const int*)pre_pos.data(); + const dtype* pre_w_data = (const dtype*)pre_w.data(); + for (int c = 0; c < channels; c++) { + int pre_calc_index = 0; + for (int ph = 0; ph < param.pooled_height; ph++) { + for (int pw = 0; pw < param.pooled_width; pw++) { + const int pool_index = ph * param.pooled_width + pw; + dtype output_val = 0; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + for (int i = 0; i < prePosROISize; i++) { + int pos = pre_pos_data[pre_calc_index * prePosROISize + i]; + dtype w = pre_w_data[pre_calc_index * prePosROISize + i]; + output_val += w * batch_data[pos]; + } + pre_calc_index += 1; + } + } + output_val /= count; + output_data[pool_index] = output_val; + } + } + batch_data += in_stride[1]; + output_data += out_stride[1]; + } + } +} + +template +void test_roi_align(){ + TestSaberBase testbase(2); + float spatial_scale = 1.0f; + int sampling_ratio = -1.0; + // RoiAlignParam param; + + + for (int num_in : {2, 8, 16, 32}) { + for (int c_in : {2, 8, 16, 32}) { + for (int h_in : {2, 7, 8, 16}) { + for (int w_in:{2, 21, 16, 32}) { + for (auto roi_num:{1, 3, 6}){ + for (auto pooled_height:{1, 2, 4}){ + for (auto pooled_width:{1, 2, 4}){ + Shape in_shape({num_in, c_in, h_in, w_in}); + Shape roi_shape({roi_num, 5, 1, 1}); + RoiAlignParam param(pooled_height, + pooled_width, spatial_scale, sampling_ratio); + Tensor th_in, th_roi; + Tensor td_in, td_roi; + th_in.re_alloc(in_shape, AK_FLOAT); + th_roi.re_alloc(roi_shape, AK_FLOAT); + td_in.re_alloc(in_shape, AK_FLOAT); + td_roi.re_alloc(roi_shape, AK_FLOAT); + // prepare host data + fill_tensor_rand(th_in, 0.0, 1.0); + // prepare roi data + float* roi_data = (float*)th_roi.mutable_data(); + srand(time(0)); + for (int i = 0; i < roi_num; ++i) { + roi_data[i * 5] = rand() % num_in; + roi_data[i * 5 + 1] = floor(rand() % (w_in/2) / spatial_scale); + roi_data[i * 5 + 2] = floor(rand() % (h_in/2) / spatial_scale); + roi_data[i * 5 + 3] = floor((rand() % (w_in/2) + w_in/2) / spatial_scale); + roi_data[i * 5 + 4] = floor((rand() % (h_in/2) + h_in/2) 
/ spatial_scale); + } + td_in.copy_from(th_in); + td_roi.copy_from(th_roi); + std::vector*> input; + input.push_back(&td_in); + input.push_back(&td_roi); + testbase.add_custom_input(input); + testbase.set_param(param); + testbase.run_test(roi_align_cpu_base); + } + } + } + } + } + } + } +} + +TEST(TestSaberFunc, test_op_RoiAlign) { + +#ifdef USE_CUDA + //Init the test_base + test_roi_align(); +#endif +#ifdef USE_X86_PLACE +// test_roi_align(); +#endif +#ifdef USE_ARM_PLACE + //test_RoiAlign(); +#endif +#ifdef USE_BM + // Env::env_init(); + //test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/test/saber/test_saber_scale.cpp b/test/saber/test_saber_scale.cpp index 54317ed94..3b3bf1a7f 100644 --- a/test/saber/test_saber_scale.cpp +++ b/test/saber/test_saber_scale.cpp @@ -209,6 +209,75 @@ TEST(TestSaberFunc, test_func_scale) { testbase.run_test(scale_cpu); } while (0); +#endif +#ifdef USE_ARM_PLACE + LOG(INFO) << "ARM test......"; + + do { + TestSaberBase testbase; + //test1 + int num_in = 2; + int c_in = 2; + int h_in = 4; + int w_in = 4; + int axis = 1; + int num_axes = 1; + bool bias_term = true; + int scale_dim = 2; + std::vector scale_data(scale_dim); + std::vector bias_data(scale_dim); + fill_vector_rand(scale_data); + fill_vector_rand(bias_data); + ScaleParam param1(scale_data, bias_data, bias_term, axis, num_axes); + testbase.set_param(param1); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase.run_test(scale_cpu); + //test2 + bias_term = false; + ScaleParam param2(scale_data, bias_data, bias_term, axis, num_axes); + testbase.set_param(param2); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase.run_test(scale_cpu); + //test3 + axis = 0; + num_axes = -1; + bias_term = true; + scale_dim = 64; + scale_data.resize(scale_dim); + bias_data.resize(scale_dim); + fill_vector_rand(scale_data); + fill_vector_rand(bias_data); + ScaleParam param3(scale_data, bias_data, bias_term, axis, num_axes); + testbase.set_param(param3); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase.run_test(scale_cpu); + //test4 + bias_term = false; + ScaleParam param4(scale_data, bias_data, bias_term, axis, num_axes); + testbase.set_param(param4); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase.run_test(scale_cpu); + //test5 + axis = 0; + num_axes = 0; + bias_term = true; + scale_dim = 1; + scale_data.resize(scale_dim); + bias_data.resize(scale_dim); + fill_vector_rand(scale_data); + fill_vector_rand(bias_data); + ScaleParam param5(scale_data, bias_data, bias_term, axis, num_axes); + testbase.set_param(param5); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase.run_test(scale_cpu); + //test6 + bias_term = false; + ScaleParam param6(scale_data, bias_data, bias_term, axis, num_axes); + testbase.set_param(param6); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase.run_test(scale_cpu); + } while (0); + #endif } diff --git a/test/saber/test_saber_seq_concat_seq_pool_soft_sign.cpp b/test/saber/test_saber_seq_concat_seq_pool_soft_sign.cpp new file mode 100644 index 000000000..c6931e7b7 --- /dev/null +++ b/test/saber/test_saber_seq_concat_seq_pool_soft_sign.cpp @@ -0,0 +1,123 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/seq_concat_seq_pool_soft_sign.h" +#include 
"saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; + +template +void seq_concat_seq_pool_soft_sign_basic(const std::vector*>& inputs, + std::vector*>& outputs, + SeqConcatSeqPoolSoftSignParam& param) { + + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + int emb_size = inputs[0]->valid_size() / inputs[0]->num(); + for (int i = 1; i < inputs.size(); i++) { + int cur_emb_size = inputs[i]->valid_size() / inputs[i]->num(); + int cur_seq_num = inputs[i]->get_seq_offset()[0].size() - 1 ; + CHECK_EQ(emb_size, cur_emb_size) << "emb size must be the same"; + CHECK_EQ(seq_num, cur_seq_num) << "seq num must be the same"; + } + + outputs[0]->reshape(Shape({seq_num, emb_size, 1, 1}, Layout_NCHW)); + dtype *output_data = (dtype*)outputs[0]->mutable_data(); + std::vector> offset_vecs; + for (int i = 0; i < inputs.size(); i++) { + offset_vecs.push_back(inputs[i]->get_seq_offset()[0]); + } + dtype buf[emb_size]; + for (size_t i = 0; i < seq_num; i++) { + memset(buf, 0, sizeof(dtype) * emb_size); + for (int j = 0; j < inputs.size(); j++) { + const dtype *in_data = (const dtype*)inputs[j]->data(); + for (int k = offset_vecs[j][i]; k < offset_vecs[j][i + 1]; k++) { + int start = k * emb_size; + for (int m = 0; m < emb_size; m++) { + buf[m] += in_data[k * emb_size + m]; + } + } + } + + for (int m = 0; m < emb_size; m++) { + auto tmp = buf[m] > 0 ? buf[m] : -buf[m]; + output_data[i * emb_size + m] = buf[m] / (1 + tmp); + } + } +} + +template +void test_model() { + int max_seq_len = 1; + int emb_size = 256; + for (auto input_size : {4}) { + TestSaberBase testbase(input_size, 1); + for (auto seq_num: {1}) { + std::vector> seq_offset_vec; + seq_offset_vec.resize(input_size); + std::vector*> input_vec; + for (int i = 0; i < input_size; i++) { + int num = 0; + seq_offset_vec[i].push_back(num); + for (int j = 0; j < seq_num; j++) { + //int len = std::rand() % max_seq_len; + int len = 1; + num += len; + seq_offset_vec[i].push_back(num); + } + std::vector> cur_seq_offset = {seq_offset_vec[i]}; + Shape shape({num, emb_size, 1, 1}, Layout_NCHW); + Tensor* input = new Tensor(shape); + input->set_seq_offset(cur_seq_offset); + fill_tensor_rand(*input); + input_vec.push_back(input); + } + //test example + SoftSignParam soft_sign_param; + SequenceConcatParam seq_concat_param; + SequencePoolParam seq_pool_param(Sequence_pool_sum); + SeqConcatSeqPoolSoftSignParam param(seq_concat_param, seq_pool_param, soft_sign_param); + testbase.set_param(param);//set param + testbase.add_custom_input(input_vec); + testbase.run_test(seq_concat_seq_pool_soft_sign_basic, 0.00001, false, true);//run test + for (int i = 0; i < input_size; i++) { + delete input_vec[i]; + } + } + } +} +TEST(TestSaberFunc, test_func_soft_sign) { + +#ifdef USE_CUDA + //Init the test_base + //Env::env_init(); + //test_model(); +#endif +#ifdef USE_X86_PLACE + Env::env_init(); + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_sequence_concat.cpp b/test/saber/test_saber_sequence_concat.cpp new file mode 100644 index 000000000..aa6ecf1db --- /dev/null +++ b/test/saber/test_saber_sequence_concat.cpp @@ 
-0,0 +1,125 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/sequence_concat.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; +template +void sequence_concat_basic(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceConcatParam& param) { + dtype *output_data = (dtype*)outputs[0]->mutable_data(); + int emb_size = inputs[0]->valid_size() / inputs[0]->num(); + int seq_num = inputs[0]->get_seq_offset()[0].size() - 1; + for (int i = 1; i < inputs.size(); i++) { + int cur_emb_size = inputs[i]->valid_size() / inputs[i]->num(); + int cur_seq_num = inputs[i]->get_seq_offset()[0].size() - 1; + CHECK_EQ(emb_size, cur_emb_size) << "sequence concat emb size must be the same"; + CHECK_EQ(seq_num, cur_seq_num) << "sequence concat seq num must be the same"; + } + + for (int i = 0; i < seq_num; i++) { + for (int j = 0; j < inputs.size(); j++) { + size_t cur_len = inputs[j]->get_seq_offset()[0][i+1] - inputs[j]->get_seq_offset()[0][i]; + + const dtype *input_data = (const dtype*)inputs[j]->data() + inputs[j]->get_seq_offset()[0][i] * emb_size; + memcpy(output_data, input_data, sizeof(dtype) * cur_len * emb_size); + output_data += cur_len * emb_size; + } + } + + std::vector> out_offset; + out_offset.resize(1); + int seq_len = inputs[0]->get_seq_offset()[0].size() - 1; + out_offset[0].push_back(0); + int cur_off = 0; + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < inputs.size(); j++) { + cur_off += inputs[j]->get_seq_offset()[0][i + 1]; + } + out_offset[0].push_back(cur_off); + } + outputs[0]->set_seq_offset(out_offset); +} + +std::vector generate_sequence_offset(int seq_num, int max_seq_len) { + std::vector offset; + int cumsum = 0; + offset.push_back(cumsum); + for (int i = 0; i < seq_num; i++){ + int cur_len = rand() % max_seq_len + 1; + cumsum += cur_len; + offset.push_back(cumsum); + } + return offset; +} + + + +template +void test_model() { + //test example + //for (auto seq_num : {1, 2, 8}) { + // for (auto max_seq_len: {10, 16, 30}) { + // for (auto emb_size: {32, 128, 61}) { + for (auto seq_num : {4, 40}) { + for (auto max_seq_len: {50}) { + for (auto emb_size: {128, 256}) { + for (auto in_num: {2, 5}) { + TestSaberBase testbase(in_num, 1); + std::vector*> inputs; + for (int i = 0; i < in_num; i++) { + std::vector seq_offset_0 = generate_sequence_offset(seq_num, max_seq_len); + int word_num_0 = seq_offset_0.back(); + Tensor* input_0 = new Tensor(Shape({word_num_0, emb_size, 1, 1}), AK_FLOAT); + //input_0.re_alloc(Shape({word_num_0, emb_size, 1, 1}), AK_FLOAT); + fill_tensor_rand(*input_0, -1.f, 1.f); + std::vector> vseq_offset_0 = {seq_offset_0}; + input_0->set_seq_offset(vseq_offset_0); + inputs.push_back(input_0); + } + testbase.add_custom_input(inputs); + SequenceConcatParam param; + testbase.set_param(param); + testbase.run_test(sequence_concat_basic, 0.00001, true, true); + } + } + } + } +} + +TEST(TestSaberFunc, test_func_sequence_concat) { + +#ifdef USE_CUDA + //Init the test_base + test_model(); +#endif +#ifdef USE_X86_PLACE + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff 
--git a/test/saber/test_saber_sequence_depadding.cpp b/test/saber/test_saber_sequence_depadding.cpp new file mode 100644 index 000000000..d754d4633 --- /dev/null +++ b/test/saber/test_saber_sequence_depadding.cpp @@ -0,0 +1,128 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/sequence_depadding.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; +template +void sequence_depadding_basic(const std::vector*>& inputs, + std::vector*>& outputs, + SequenceDePaddingParam& param) { + dtype *input_data = (dtype*)inputs[0]->mutable_data(); + dtype *output_data = (dtype*)outputs[0]->mutable_data(); + auto pad_offset = inputs[0]->get_seq_offset()[0]; + auto src_offset = inputs[1]->get_seq_offset()[0]; + int seq_num = src_offset.size() - 1; + int emb_size = inputs[0]->count_valid(1, inputs[0]->dims()); + + for (size_t i = 0; i < seq_num; i++) { + int src_len_i = src_offset[i+1] - src_offset[i]; + int pad_len_i = pad_offset[i+1] - pad_offset[i]; + CHECK_LE(src_len_i, pad_len_i) << "pad sequence length is bigger than source sequence length"; + memcpy(output_data + src_offset[i] * emb_size, input_data + i * pad_len_i * emb_size, src_len_i * emb_size * sizeof(dtype)); + } +} + + +void generate_sequence_offset(int seq_num, int max_seq_len, + std::vector& offset) { + offset.clear(); + int cumsum = 0; + offset.push_back(cumsum); + for (int i = 0; i < seq_num; i++){ + int cur_len = rand() % max_seq_len + 1; + cumsum += cur_len; + offset.push_back(cumsum); + } +} + +int get_max_len(std::vector& offset) { + int max_len = 0; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i+1] - offset[i]; + max_len = max_len < cur_len ? 
cur_len : max_len; + } + return max_len; +} + +void generate_equal_step_offset(int seq_num, int max_seq_len, std::vector& offset) { + offset.clear(); + offset.push_back(0); + for (int i = 0; i < seq_num; i++){ + offset.push_back((i+1)* max_seq_len); + } +} + +template +void test_model() { + //test example + TestSaberBase testbase(2, 1); + for (auto seq_num : {1, 3, 8}) { + for (auto max_seq_len: {3, 30}) { + for (auto emb_size: {5, 128, 256}) { + std::vector*> inputs; + std::vector seq_offset_1; + std::vector seq_offset_0; + generate_sequence_offset(seq_num, max_seq_len, seq_offset_1); + int max_len = get_max_len(seq_offset_1); + generate_equal_step_offset(seq_num, max_len, seq_offset_0); + int word_num_0 = seq_offset_1.back(); + Tensor* input_0 = new Tensor(Shape({seq_num * max_len, emb_size, 1, 1}), AK_FLOAT); + Tensor* input_1 = new Tensor(Shape({word_num_0, emb_size, 1, 1}), AK_FLOAT); + fill_tensor_rand(*input_0, -1.f, 1.f); + std::vector> vseq_offset_0 = {seq_offset_0}; + input_0->set_seq_offset(vseq_offset_0); + + fill_tensor_rand(*input_1, -1.f, 1.f); + std::vector> vseq_offset_1 = {seq_offset_1}; + input_1->set_seq_offset(vseq_offset_1); + + inputs.push_back(input_0); + inputs.push_back(input_1); + testbase.add_custom_input(inputs); + SequenceDePaddingParam param; + testbase.set_param(param); + testbase.run_test(sequence_depadding_basic, 0.00001, true, true); + for (auto input: inputs) { + delete input; + } + } + } + } +} + +TEST(TestSaberFunc, test_func_sequence_depadding) { + +#ifdef USE_CUDA + //Init the test_base + test_model(); +#endif +#ifdef USE_X86_PLACE + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_sequence_padding.cpp b/test/saber/test_saber_sequence_padding.cpp new file mode 100644 index 000000000..a76a0b196 --- /dev/null +++ b/test/saber/test_saber_sequence_padding.cpp @@ -0,0 +1,120 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/sequence_padding.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; +template +void sequence_padding_basic(const std::vector*>& inputs, + std::vector*>& outputs, + SequencePaddingParam& param) { + size_t len = inputs[0]->valid_size(); + dtype *input_data = (dtype*)inputs[0]->mutable_data(); + dtype *output_data = (dtype*)outputs[0]->mutable_data(); + int max_len = 0; + auto seq_offset = inputs[0]->get_seq_offset()[0]; + int seq_num = seq_offset.size() - 1; + int emb_size = inputs[0]->count_valid(1, inputs[0]->dims()); + for (int i = 0; i < seq_num; i++) { + int cur_len = seq_offset[i+1] - seq_offset[i]; + max_len = cur_len > max_len ? 
cur_len : max_len; + } + + Shape out_shape = inputs[0]->valid_shape(); + out_shape[0] = seq_num * max_len; + outputs[0]->reshape(out_shape); + for (size_t i = 0; i < seq_num; i++) { + int start = i * max_len * emb_size; + int cur_len = seq_offset[i+1] - seq_offset[i]; + int pad_start = start + cur_len * emb_size; + int pad_num = max_len - cur_len; + memcpy(output_data + start, input_data + seq_offset[i] * emb_size, cur_len * emb_size * sizeof(dtype)); + if (pad_num > 0) { + memset(output_data + pad_start, 0, pad_num * emb_size * sizeof(dtype)); + } + } + + std::vector out_offset; + for (int i = 0; i < seq_num + 1; i++) { + out_offset.push_back(i * max_len); + } + outputs[0]->set_seq_offset({out_offset}); +} + + +std::vector generate_sequence_offset(int seq_num, int max_seq_len) { + std::vector offset; + int cumsum = 0; + offset.push_back(cumsum); + for (int i = 0; i < seq_num; i++){ + int cur_len = rand() % max_seq_len + 1; + cumsum += cur_len; + offset.push_back(cumsum); + } + return offset; +} + + + +template +void test_model() { + //test example + TestSaberBase testbase(1, 1); + for (auto seq_num : {4, 40}) { + for (auto max_seq_len: {50}) { + for (auto emb_size: {128, 256}) { + std::vector*> inputs; + std::vector seq_offset_0 = generate_sequence_offset(seq_num, max_seq_len); + int word_num_0 = seq_offset_0.back(); + Tensor* input_0 = new Tensor(Shape({word_num_0, emb_size, 1, 1}), AK_FLOAT); + fill_tensor_rand(*input_0, -1.f, 1.f); + std::vector> vseq_offset_0 = {seq_offset_0}; + input_0->set_seq_offset(vseq_offset_0); + inputs.push_back(input_0); + testbase.add_custom_input(inputs); + SequencePaddingParam param; + testbase.set_param(param); + testbase.run_test(sequence_padding_basic, 0.00001, true, true); + for (auto input: inputs) { + delete input; + } + } + } + } +} + +TEST(TestSaberFunc, test_func_sequence_padding) { + +#ifdef USE_CUDA + //Init the test_base + test_model(); +#endif +#ifdef USE_X86_PLACE + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/lite/test_lite_sgemm_prepacked.cpp b/test/saber/test_saber_sgemm_prepacked_arm.cpp similarity index 69% rename from test/lite/test_lite_sgemm_prepacked.cpp rename to test/saber/test_saber_sgemm_prepacked_arm.cpp index 02a04ecfb..e0066888e 100644 --- a/test/lite/test_lite_sgemm_prepacked.cpp +++ b/test/saber/test_saber_sgemm_prepacked_arm.cpp @@ -1,8 +1,13 @@ -#include "test_lite.h" -#include "saber/lite/funcs/neon/impl/sgemm_arm.h" -#include "saber/lite/funcs/neon/impl/sgemm_conv.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/timer.h" +#include "test/saber/test_saber_func.h" + +#ifdef USE_ARM_PLACE +#include "saber/funcs/impl/arm/neon/impl/sgemm_arm.h" +#include "saber/funcs/impl/arm/neon/impl/sgemm_prepacked.h" + using namespace anakin::saber; -using namespace anakin::saber::lite; + int cluster = 0; int threads = 1; @@ -15,20 +20,57 @@ bool traA = false; bool traB = false; bool flag_relu = false; bool flag_bias = false; -ARMArch flag_arch = A73; int test_iter = 1; bool COMPARE_RESULT = true; -typedef Tensor TensorHf4; +typedef Tensor TensorHf4; + + +template +static void basic_gemm(int m, int n, int k, const type* a, const type* b, const type2* bias, type2* 
c, \ + type2 alpha, type2 beta, \ + bool trans_a = false, bool trans_b = false, bool flag_bias = false, bool flag_relu = false) { +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + type2 bias_data = (type2)0; + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + type2 sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * m + i]; + } else{ + av = a[i * k + l]; + } + if (trans_b) { + bv = b[j * k + l]; + } else { + bv = b[l * n + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data; + if (flag_relu) { + c[i * n + j] = tmp > (type2)0? tmp : (type2)0; + } else { + c[i * n + j] = tmp; + } + } + } +} + SaberStatus test_arm_sgemm(int M, int N, int K, bool tra, bool trb, bool flag_bias, bool flag_relu, int in_threads) { double to = 0; double min_time = 1000000; - SaberTimer t1; - Context ctx1; + SaberTimer t1; + Context ctx1; PowerMode mode = (PowerMode)cluster; ctx1.set_run_mode(mode, in_threads); - ctx1.set_arch(flag_arch); - LOG(INFO) << "CPU ARCH: A" << flag_arch; + LOG(INFO) << "CPU ARCH: A" << ctx1.get_arch(); LOG(INFO) << "test threads activated"; #pragma omp parallel { @@ -37,15 +79,15 @@ SaberStatus test_arm_sgemm(int M, int N, int K, bool tra, bool trb, bool flag_bi LOG(INFO) << "number of threads: " << in_threads; #endif } - Shape sha(1, 1, M, K); - Shape shb(1, 1, N, K); - Shape shc(1, 1, M, N); + Shape sha({1, 1, M, K}); + Shape shb({1, 1, N, K}); + Shape shc({1, 1, M, N}); TensorHf4 ta; TensorHf4 tb; TensorHf4 tbias; ta.reshape(sha); tb.reshape(shb); - tbias.reshape(Shape(M)); + tbias.reshape(Shape({1, 1, 1, M})); fill_tensor_rand(ta, -1.f, 1.f); // fill_tensor_const(ta, 1.f); fill_tensor_rand(tb, -1.f, 1.f); @@ -89,7 +131,7 @@ SaberStatus test_arm_sgemm(int M, int N, int K, bool tra, bool trb, bool flag_bi ldb = n; } ldc = n; - long long ops = m * n * k; + double ops = 2.0 * m * n * k; float* dc_saber = static_cast(tout_saber.mutable_data()); to = 0; @@ -97,46 +139,53 @@ SaberStatus test_arm_sgemm(int M, int N, int K, bool tra, bool trb, bool flag_bi int hblock = get_hblock(ctx1.get_arch()); int round_up_a = ((hblock + m - 1) / hblock) * hblock; LOG(INFO) << "hblock = " << hblock << ", round up = " << round_up_a; - TensorHf4 tpackedA(Shape(round_up_a, K)); + TensorHf4 tpackedA(Shape({1, 1, round_up_a, K})); prepackA(static_cast(tpackedA.mutable_data()), da, lda, 0, m, 0, k, tra, &ctx1); + /// warm up + for (int i = 0; i < 5; ++i) { + sgemm_prepack(static_cast(tpackedA.data()), db, static_cast(tbias.data()), \ + dc_saber, m, n, k, flag_bias, flag_relu, trb, &ctx1); + } for (int i = 0; i < test_iter; ++i) { t1.clear(); - t1.start(); + t1.start(ctx1); sgemm_prepack(static_cast(tpackedA.data()), db, static_cast(tbias.data()), \ dc_saber, m, n, k, flag_bias, flag_relu, trb, &ctx1); - t1.end(); + t1.end(ctx1); to += t1.get_average_ms(); if (t1.get_average_ms() < min_time) { min_time = t1.get_average_ms(); } } + + float cpu_freq_cur = mode == SABER_POWER_HIGH + ? 
Env::cur_env()[0]._info._max_frequence : Env::cur_env()[0]._info._min_frequence; + float cpu_ca_theory = cpu_freq_cur * 8.0f / 1000; + int th_num = threads; + LOG(INFO) << "saber packed gemm running time, ave: " << to / test_iter << ", min time: " << min_time; - LOG(WARNING) << "mean gops: " << 0.000001f * ops * test_iter / to << " GFLOPS, max gops: " << 0.000001f * ops / min_time << " GFLOPS"; + LOG(INFO) << "calculate: OPS: " << ops << " timer: " << to / test_iter << " mean GOPS: " << 0.000001f * ops * test_iter / to + << " GFLOPS, max gops: " << 0.000001f * ops / min_time << " GFLOPS cpu potential: " + << 0.000001f * ops / min_time / cpu_ca_theory / th_num * 100; //print_tensor(tout_saber); if (COMPARE_RESULT) { double max_ratio = 0; double max_diff = 0; - tensor_cmp_host(tout_basic, tout_saber, max_ratio, max_diff); - if (fabs(max_ratio) > 1e-4f) { - TensorHf4 tdiff(tout_basic.valid_shape()); - tensor_diff(tout_basic, tout_saber, tdiff); + tensor_cmp_host((const float*)tout_basic.data(), (const float*)tout_saber.data(), + tout_basic.valid_size(), max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; + if (fabs(max_ratio) > 1e-4f && fabsf(max_diff) > 5e-5f) { LOG(INFO) << "basic result: "; print_tensor(tout_basic); LOG(INFO) << "saber result: "; print_tensor(tout_saber); - LOG(INFO) << "diff result: "; - print_tensor(tdiff); - } - LOG(INFO) << "compare result, max diff: " << max_diff << ", max ratio: " << max_ratio; - if (fabs(max_ratio) > 1e-4f) { - if (fabsf(max_diff) > 5e-5f) { - return SaberInvalidValue; - } + return SaberInvalidValue; } } return SaberSuccess; } -TEST(TestSaberLite, test_func_sgemm_prepacked) { + +TEST(TestSaberFunc, test_func_sgemm_prepacked) { if (Basic_test) { LOG(INFO) << "run basic sgemm test"; for (auto& m : {1, 8, 16, 111, 256, 397, 512, 777, 1024}) { @@ -169,7 +218,7 @@ TEST(TestSaberLite, test_func_sgemm_prepacked) { } } } -TEST(TestSaberLite, test_func_sgemm_prepacked_custom) { +TEST(TestSaberFunc, test_func_sgemm_prepacked_custom) { auto flag = test_arm_sgemm(M, N, K, traA, traB, flag_bias, flag_relu, threads); if (flag != SaberSuccess) { LOG(FATAL) << "test m = " << M << ", n=" << N << ", k=" << K << \ @@ -180,8 +229,10 @@ TEST(TestSaberLite, test_func_sgemm_prepacked_custom) { ", trans A: " << traA << ", trans B: " << traB << ", bias: " << flag_bias << \ ", relu: " << flag_relu << " passed!!"; } + + int main(int argc, const char** argv){ - anakin::saber::lite::Env::env_init(); + anakin::saber::Env::env_init(); LOG(ERROR) << "usage: ./" << argv[0] << " [do_basic_test] [cluster] [threads] [m] [n] [k] [transA] [transB] [relu] [bias] [test iter] [compare result]"; if (argc > 1) { Basic_test = atoi(argv[1]) > 0; @@ -211,16 +262,18 @@ int main(int argc, const char** argv){ if (argc > 12) { COMPARE_RESULT = atoi(argv[12]) > 0; } - if (argc > 13) { - if (atoi(argv[13]) > 0) { - flag_arch = A72; - } else { - flag_arch = A73; - } - } // initial logger - //logger::init(argv[0]); + logger::init(argv[0]); InitTest(); RUN_ALL_TESTS(argv[0]); return 0; } + +#else + +int main(int argc, const char** argv){ + LOG(INFO) << "this unit test only be used in TargetType is ARM"; + return 0; +} + +#endif diff --git a/test/saber/test_saber_shape.cpp b/test/saber/test_saber_shape.cpp index 4c58cf5d5..448339577 100644 --- a/test/saber/test_saber_shape.cpp +++ b/test/saber/test_saber_shape.cpp @@ -195,12 +195,14 @@ TEST(TestSaberFunc, test_dim_4) { LOG(INFO) << "Layout_NCHW PASS"; test_dim4(Layout_NHWC); LOG(INFO) << 
"Layout_NHWC PASS"; +#if 0 test_dim4(Layout_NCHW_C4); LOG(INFO) << "Layout_NCHW_C4 PASS"; test_dim4(Layout_NCHW_C8); LOG(INFO) << "Layout_NCHW_C8 PASS"; test_dim4(Layout_NCHW_C16); LOG(INFO) << "Layout_NCHW_C16 PASS"; +#endif } TEST(TestSaberFunc, test_dim_2) { @@ -232,6 +234,7 @@ TEST(TestSaberFunc, test_set_layout) { CHECK_EQ(test_shape[1], H); CHECK_EQ(test_shape[2], W); CHECK_EQ(test_shape[3], C); +#if 0 if (C % 4 ==0) { test_shape.set_layout(Layout_NCHW_C4); CHECK_EQ(test_shape[0], N); @@ -259,6 +262,7 @@ TEST(TestSaberFunc, test_set_layout) { CHECK_EQ(test_shape[4], 16); CHECK_EQ(test_shape.channel(), C); } +#endif test_shape.set_layout(Layout_HW); CHECK_EQ(test_shape[0], H); CHECK_EQ(test_shape[1], W); diff --git a/test/saber/test_saber_slice.cpp b/test/saber/test_saber_slice.cpp index f9e8c25f8..66e219149 100644 --- a/test/saber/test_saber_slice.cpp +++ b/test/saber/test_saber_slice.cpp @@ -53,7 +53,7 @@ TEST(TestSaberFunc, test_func_slice){ testbase.set_param(param); testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); testbase.run_test(slice_cpu); - + //test1 TestSaberBase testbase1(1,4); num_in = 10; @@ -149,6 +149,63 @@ TEST(TestSaberFunc, test_func_slice){ }while(0); #endif +#ifdef USE_ARM_PLACE + LOG(INFO)<<"ARM test......"; + do + { + //test 0 + TestSaberBase testbase(1,4); + int num_in = 4; + int c_in = 9; + int h_in = 12; + int w_in = 12; + int slice_axis = 1; + std::vector slice_points = {1,3,6}; + SliceParam param(slice_axis, slice_points); + testbase.set_param(param); + testbase.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase.run_test(slice_cpu); + + //test1 + TestSaberBase testbase1(1,4); + num_in = 10; + c_in = 3; + h_in = 2; + w_in = 3; + slice_axis = 0; + slice_points = {4,6,8}; + SliceParam param1(slice_axis, slice_points); + testbase1.set_param(param1); + testbase1.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase1.run_test(slice_cpu); + + //test2 + TestSaberBase testbase2(1,2); + num_in = 6; + c_in = 4; + h_in = 10; + w_in = 2; + slice_axis = 2; + slice_points = {5}; + SliceParam param2(slice_axis, slice_points); + testbase2.set_param(param2); + testbase2.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase2.run_test(slice_cpu); + //test3 + TestSaberBase testbase3(1,3); + num_in = 10; + c_in = 11; + h_in = 1; + w_in = 11; + slice_axis = 3; + slice_points = {1,9}; + SliceParam param3(slice_axis, slice_points); + testbase3.set_param(param3); + testbase3.set_input_shape(Shape({num_in, c_in, h_in, w_in})); + testbase3.run_test(slice_cpu); + + }while(0); +#endif } int main(int argc, const char** argv) { diff --git a/test/saber/test_saber_slice_v2.cpp b/test/saber/test_saber_slice_v2.cpp new file mode 100644 index 000000000..5f71fd580 --- /dev/null +++ b/test/saber/test_saber_slice_v2.cpp @@ -0,0 +1,123 @@ +#include "saber/core/context.h" +#include "saber/funcs/slice_v2.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include "saber/core/tensor_op.h" +#include "saber_types.h" +#include + +using namespace anakin::saber; + + +template +void slice_v2_cpu(const std::vector*>& inputs, + std::vector*>& outputs,\ + SliceV2Param& param){ + + auto starts = param.starts; + auto ends = param.ends; + auto axes = param.axes; + CHECK_EQ(axes.size(), starts.size()) << "the size of axes and starts are not equal "; + CHECK_EQ(ends.size(), starts.size()) << "the size of starts and ends are not valid"; + Shape shape_in = inputs[0]->valid_shape(); + Shape out_shape = shape_in; + std::vector valid_starts; + std::vector 
valid_ends; + valid_starts.resize(starts.size()); + valid_ends.resize(ends.size()); + for (int i = 0; i < starts.size(); i++) { + int dim_value = shape_in[axes[i]]; + int start = starts[i] < 0 ? starts[i] + dim_value : starts[i]; + int end = ends[i] < 0 ? ends[i] + dim_value : ends[i]; + start = std::max(start, 0); + start = std::min(start, dim_value); + end = std::max(end, 0); + end = std::min(end, dim_value); + out_shape[axes[i]] = end - start; + valid_starts[i] = start; + valid_ends[i] = end; + } + CHECK_EQ(outputs.size(), 1) << "SliceV2 only support one output"; + const dtype* in_data = (const dtype*)inputs[0]->data(); + dtype* out_data = (dtype*)outputs[0]->mutable_data(); + auto out_stride = outputs[0]->get_stride(); + auto in_stride = inputs[0]->get_stride(); + int inner = inputs[0]->count_valid(param.axes.back() + 1, outputs[0]->dims()); + int out_outer_stride = outputs[0]->count_valid(param.axes[0], outputs[0]->dims()); + int in_outer_stride = inputs[0]->count_valid(param.axes[0], inputs[0]->dims()); + int count = outputs[0]->valid_size(); + + for (int i = 0; i < count; i++) { + int out_id = i / out_outer_stride; + int inner_id = i % inner; + int new_i = i / inner; + int in_offset = inner_id + out_id * in_outer_stride; + for (int k = valid_starts.size() - 1; k >= 0; k--) { + int cur_id = new_i % out_shape[axes[k]]; + in_offset += (cur_id + valid_starts[k]) * in_stride[axes[k]]; + new_i /= out_shape[axes[k]]; + } + out_data[i] = in_data[in_offset]; + } + +} + +template +void test_model() { + Shape input_shape({2, 5, 2, 2}, Layout_NCHW); + std::vector starts_0 = {1, 0}; + std::vector ends_0 = {3, 1}; + std::vector axes_0 = {1, 2}; + std::vector starts_1 = {0, 1, 0, 1}; + std::vector ends_1 = {1, 3, 1, 2}; + std::vector axes_1 = {0, 1, 2, 3}; + std::vector starts_2 = {1}; + std::vector ends_2 = {3}; + std::vector axes_2 = {1}; + + TestSaberBase testbase(1, 1); + for (auto i : {0, 1, 2}) { + std::vector axes; + std::vector starts; + std::vector ends; + if (i == 0) { + axes = axes_0; + starts = starts_0; + ends = ends_0; + } else if (i == 1) { + axes = axes_1; + starts = starts_1; + ends = ends_1; + } else if (i == 2) { + axes = axes_2; + starts = starts_2; + ends = ends_2; + } else { + LOG(FATAL) << "no other param"; + } + SliceV2Param param(axes, starts, ends); + testbase.set_param(param);//set param + testbase.set_input_shape(input_shape); + testbase.run_test(slice_v2_cpu, 0.0001, true, false); + } +} + +TEST(TestSaberFunc, test_func_slice_v2) { + +#ifdef USE_CUDA + //Init the test_base + test_model(); +#endif +#ifdef USE_X86_PLACE + test_model(); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_soft_sign.cpp b/test/saber/test_saber_soft_sign.cpp new file mode 100644 index 000000000..c6216cf6b --- /dev/null +++ b/test/saber/test_saber_soft_sign.cpp @@ -0,0 +1,84 @@ +#include "saber/core/context.h" +#include "saber/core/tensor_op.h" +#include "saber/funcs/soft_sign.h" +#include "saber/saber_types.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include +#include + +using namespace anakin::saber; + +template +void softsign_basic(const std::vector*>& inputs, + std::vector*>& outputs, + SoftSignParam& param) { + + int num = inputs[0]->num(); + int channel = inputs[0]->channel(); + int height = inputs[0]->height(); + int width = inputs[0]->width(); + + dtype* dout = (dtype*)outputs[0]->mutable_data(); + const dtype* din = (const 
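slice_v2_cpu above normalises each (start, end) pair before slicing: negative indices wrap by the dimension extent, both ends are clamped to [0, dim], and the output extent along that axis is end - start. The same logic in isolation:

#include <algorithm>
#include <cstdio>

static void normalize_slice(int dim, int start, int end, int* out_start, int* out_len) {
    if (start < 0) start += dim;           // negative indices count from the end
    if (end < 0) end += dim;
    start = std::min(std::max(start, 0), dim);
    end   = std::min(std::max(end, 0), dim);
    *out_start = start;
    *out_len = end - start;
}

int main() {
    int s, len;
    normalize_slice(5, 1, 3, &s, &len);    // keep [1, 3) -> start 1, 2 elements
    printf("start=%d len=%d\n", s, len);
    normalize_slice(5, -4, -1, &s, &len);  // wraps to [1, 4) -> start 1, 3 elements
    printf("start=%d len=%d\n", s, len);
    return 0;
}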
dtype*)inputs[0]->data(); + size_t count = inputs[0]->valid_size(); + + //y = x / (1 + |x|) + for (size_t i = 0; i < count; i++) { + dtype tmp = din[i] > 0 ? din[i] : -din[i]; + dout[i] = din[i] / (1 + tmp); + } + +} + +template +void test_model() { + + TestSaberBase testbase(1, 1); + //test example + for (auto num : {1, 2, 16}) { + for (auto channel : {1, 16, 32}) { + for (auto height : {8, 15, 32}) { + for (auto width: {8, 13, 45}) { + Shape shape({num, channel, height, width}, Layout_NCHW); + SoftSignParam param; + testbase.set_param(param);//set param + testbase.set_input_shape(shape); + testbase.run_test(softsign_basic);//run test + } + } + } + } +} +TEST(TestSaberFunc, test_func_soft_sign) { + +#ifdef USE_CUDA + //Init the test_base + Env::env_init(); + test_model(); +#endif +#ifdef USE_X86_PLACE + Env::env_init(); + test_model(); +#endif +#ifdef USE_ARM_PLACE + //test_model(); +#endif +#ifdef AMD_GPU + // Env::env_init(); + // test_model(); +#endif +#ifdef USE_BM_PLACE + // Env::env_init(); + // test_accuracy(num, channel, height, width,VENDER_IMPL); +#endif +} + +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} + diff --git a/test/saber/test_saber_softmax.cpp b/test/saber/test_saber_softmax.cpp index d111d3778..4bee055e3 100644 --- a/test/saber/test_saber_softmax.cpp +++ b/test/saber/test_saber_softmax.cpp @@ -169,6 +169,36 @@ TEST(TestSaberFunc, test_func_softmax) { LOG(INFO) << "x86 test end."; #endif +#ifdef USE_ARM_PLACE + LOG(INFO) << "ARM test......"; + TestSaberBase testbase2; + + for (auto num : { + 1, 3, 4, 12 + }) { + for (auto c : { + 1, 3, 11, 3 + }) { + for (auto h : { + 3, 1, 11, 2 + }) { + for (auto w : { + 1, 3, 4, 11 + }) { + for (auto axis : { + 0, 1, 2, 3 + }) { + SoftmaxParam param(axis); + testbase2.set_param(param); + testbase2.set_input_shape(Shape({num, c, h, w})); + testbase2.run_test(softmax_cpu); + } + } + } + } + } + LOG(INFO) << "x86 test end."; +#endif #if 0 Env::env_init(); diff --git a/test/saber/test_saber_tensor.cpp b/test/saber/test_saber_tensor.cpp index e674099bc..4cc2e8640 100644 --- a/test/saber/test_saber_tensor.cpp +++ b/test/saber/test_saber_tensor.cpp @@ -103,7 +103,11 @@ void tensor_constructor() { copy_API::sync_memcpy(dev_data_ptr, 0, DAPI::get_device_id(), \ static_cast(host_data_ptr), 0, HAPI::get_device_id(), \ - sizeof(dtype) * sh1.count(), __HtoD()); + sizeof(dtype) * sh1.count(), flag_type()); + + // copy_API::sync_memcpy(dev_data_ptr, 0, DAPI::get_device_id(), \ + // static_cast(host_data_ptr), 0, HAPI::get_device_id(), \ + // sizeof(dtype) * sh1.count(), __HtoD()); LOG(INFO) << "|--construct host tensor from host data ptr"; TensorH thost3(host_data_ptr, TargetH(), HAPI::get_device_id(), sh1, Dtype); @@ -277,7 +281,7 @@ TEST(TestSaberFunc, test_tensor_constructor) { tensor_constructor(); #endif -#ifdef USE_BM_PLACE +#ifdef USE_BM_PLACE Env::env_init(); Env::env_init(); LOG(INFO) << "test BM FP32 tensor"; diff --git a/test/saber/test_saber_topk_avg_pooling.cpp b/test/saber/test_saber_topk_avg_pooling.cpp index 3a230b830..37a534edb 100644 --- a/test/saber/test_saber_topk_avg_pooling.cpp +++ b/test/saber/test_saber_topk_avg_pooling.cpp @@ -120,7 +120,6 @@ void topk_avg_pooling_basic(const std::vector*>& inputs, st } } - return SaberSuccess; } diff --git a/test/saber/test_saber_yolo_box.cpp b/test/saber/test_saber_yolo_box.cpp new file mode 100644 index 000000000..8dc9d9f13 --- /dev/null +++ b/test/saber/test_saber_yolo_box.cpp @@ -0,0 +1,185 
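The soft-sign reference above is the element-wise map y = x / (1 + |x|); a quick standalone check of the formula with a few illustrative values:

#include <cmath>
#include <cstdio>

static float soft_sign(float x) { return x / (1.f + std::fabs(x)); }

int main() {
    printf("%.3f %.3f %.3f\n", soft_sign(3.f), soft_sign(0.f), soft_sign(-1.f));
    // expected: 0.750 0.000 -0.500
    return 0;
}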
@@ + +#include "saber/core/context.h" +#include "saber/funcs/yolo_box.h" +#include "test_saber_func.h" +#include "test_saber_base.h" +#include "saber/core/tensor_op.h" +#include "saber/saber_types.h" +#include + +using namespace anakin::saber; + +namespace { + +inline float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +inline void get_yolo_box(float* box, const float* x, const int* anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride, + int img_height, int img_width) { + + box[0] = (i + sigmoid(x[index])) * img_width / grid_size; + box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size; +} + +inline int get_entry_index(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +inline void calc_detection_box(float* boxes, float* box, const int box_idx, + const int img_height, + const int img_width) { + + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : static_cast(img_height - 1); +} + +inline void calc_label_score(float* scores, const float* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} +} + +template +void yolo_box_cpu(const std::vector*>& input, + std::vector*>& output,\ + YoloBoxParam& param) { + + auto* in = input[0]; + auto* imgsize = input[1]; + auto* boxes = output[0]; + auto* scores = output[1]; + auto anchors = param.anchors; + int class_num = param.class_num; + float conf_thresh = param.conf_thresh; + int downsample_ratio = param.downsample_ratio; + + const int n = in->num(); + const int h = in->height(); + const int w = in->width(); + const int box_num = boxes->valid_shape()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + auto anchors_data = anchors.data(); + + const float* input_data = (const float*)in->data(); + const float* imgsize_data = (const float*)imgsize->data(); + + float* boxes_data = (float*)boxes->mutable_data(); + float* scores_data = (float*)scores->mutable_data(); + + float box[4]; + for (int i = 0; i < n; i++) { + int img_height = imgsize_data[2 * i]; + int img_width = imgsize_data[2 * i + 1]; + + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + int obj_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 4); + float conf = sigmoid(input_data[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 0); + get_yolo_box(box, input_data, anchors_data, l, k, j, h, input_size, + box_idx, 
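get_yolo_box and calc_detection_box above decode one prediction per (anchor, cell): the x/y offsets pass through a sigmoid and are scaled from grid to image coordinates, w/h are exponentiated and scaled by the anchor size, and the centre/size box becomes clamped corner coordinates. A condensed sketch of that decoding for a single cell (the sample numbers are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdio>

static float sigmoid_f(float x) { return 1.f / (1.f + std::exp(-x)); }

static void decode_cell(float tx, float ty, float tw, float th,
                        int i, int j, int grid, int input_size,
                        float anchor_w, float anchor_h,
                        int img_w, int img_h, float corners[4]) {
    float cx = (i + sigmoid_f(tx)) * img_w / grid;           // centre x in image coords
    float cy = (j + sigmoid_f(ty)) * img_h / grid;           // centre y in image coords
    float w  = std::exp(tw) * anchor_w * img_w / input_size; // anchor-scaled width
    float h  = std::exp(th) * anchor_h * img_h / input_size; // anchor-scaled height
    corners[0] = std::max(cx - w / 2, 0.f);                  // clamp to the image
    corners[1] = std::max(cy - h / 2, 0.f);
    corners[2] = std::min(cx + w / 2, (float)(img_w - 1));
    corners[3] = std::min(cy + h / 2, (float)(img_h - 1));
}

int main() {
    float box[4];
    decode_cell(0.f, 0.f, 0.f, 0.f, 3, 4, 13, 416, 1.f, 2.f, 416, 416, box);
    printf("%.1f %.1f %.1f %.1f\n", box[0], box[1], box[2], box[3]);
    return 0;
}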
stride, img_height, img_width); + box_idx = (i * box_num + j * stride + k * w + l) * 4; + calc_detection_box(boxes_data, box, box_idx, img_height, + img_width); + + int label_idx = + get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 5); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + calc_label_score(scores_data, input_data, label_idx, score_idx, + class_num, conf, stride); + } + } + } + } +} + +template +void test_yolo() { + //Init the test_base + TestSaberBase testbase(2, 2); + YoloBoxParam param({1, 2, 3, 4}, 5, 0.5, 5); + for (int w_in : {16, 20, 32, 64}) { + for (int h_in : {16, 20, 32, 64}) { + for (int ch_in : {20}) { + for (int num_in:{1, 3, 5}) { + Shape shape0({num_in, ch_in, h_in, w_in}); + Shape shape1({num_in, 2, 4}, Layout_NHW); + + Tensor input0; + Tensor input1; + + testbase.set_param(param); + + input0.re_alloc(shape0, AK_FLOAT); + input1.re_alloc(shape1, AK_FLOAT); + + std::vector*> ins{&input0, &input1}; + fill_tensor_rand(input0, -10, 10); + fill_tensor_rand(input1, -10, 10); + testbase.add_custom_input(ins); + testbase.run_test(yolo_box_cpu); + } + } + } + } +} + +TEST(TestSaberFunc, test_func_yolo_box) { + +#ifdef USE_CUDA + test_yolo(); +#endif + +#ifdef USE_X86_PLACE + test_yolo(); +#endif + +#ifdef USE_ARM_PLACE + test_yolo(); +#endif + +} +int main(int argc, const char** argv) { + // initial logger + //logger::init(argv[0]); + InitTest(); + RUN_ALL_TESTS(argv[0]); + return 0; +} diff --git a/third-party/.gitignore b/third-party/.gitignore index 232d776af..6eca2178c 100644 --- a/third-party/.gitignore +++ b/third-party/.gitignore @@ -32,10 +32,11 @@ *.app # dir +mkl-patched mklml mkldnn_source mkldnn xbyak_source xbyak -sass -tensorrt5 +#sass +nanopb diff --git a/third-party/hash/include/bloomfilter/bloomfilter.h b/third-party/hash/include/bloomfilter/bloomfilter.h new file mode 100644 index 000000000..ff7d3f770 --- /dev/null +++ b/third-party/hash/include/bloomfilter/bloomfilter.h @@ -0,0 +1,44 @@ +#ifndef THIRD_PARTY_BLOOMFILTER_BLOOMFILTER_H +#define THIRD_PARTY_BLOOMFILTER_BLOOMFILTER_H + +#include +#include + +struct bloomfilter { + uint64_t magic_num; + uint64_t m; + uint64_t k; + uint64_t count; + unsigned char bit_vector[1]; +}; + +int bloomfilter_check(struct bloomfilter* filter); + +void +bloomfilter_init(struct bloomfilter *bloomfilter, uint64_t m, uint64_t k); + +int +bloomfilter_set(struct bloomfilter *bloomfilter, const void *key, size_t len); + +int +bloomfilter_set_nocheck(struct bloomfilter *bloomfilter, const void *key, size_t len); + +int +bloomfilter_get(struct bloomfilter *bloomfilter, const void *key, size_t len); + +int +bloomfilter_dump(struct bloomfilter *bloomfilter, const void *path); + +int +bloomfilter_load(struct bloomfilter **bloomfilter, const void *path); + +int +bloomfilter_get_hash(struct bloomfilter *bloomfilter, const void *key, size_t len, char *dst); + +uint64_t +char_to_little_endian_64bits(unsigned char *bytes); + +uint32_t +char_to_little_endian_32bits(unsigned char *bytes); + +#endif /* __BLOOMFILTER_H__ */ diff --git a/third-party/hash/include/bloomfilter/murmur3.h b/third-party/hash/include/bloomfilter/murmur3.h new file mode 100644 index 000000000..12a8d5749 --- /dev/null +++ b/third-party/hash/include/bloomfilter/murmur3.h @@ -0,0 +1,12 @@ +#ifndef THIRD_PARTY_BLOOMFILTER_MURMUR3_H +#define THIRD_PARTY_BLOOMFILTER_MURMUR3_H + +#include +#include + +void +murmur3_hash32(const void *key, size_t len, uint32_t seed, void *out); +void +murmurhash3_x64_128(const void * key, const int len, 
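A hedged usage sketch for the bloom filter API declared above. The struct ends in a one-element bit_vector, so the caller is assumed to allocate sizeof(struct bloomfilter) plus m/8 extra bytes before calling bloomfilter_init (m is the bit count, k the number of hash rounds); that allocation convention is inferred from the implementation later in this patch, and an extern "C" wrapper may be needed when the header is consumed from C++:

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "bloomfilter/bloomfilter.h"

int main() {
    const uint64_t m = 1 << 20;   // 1M bits -> 128 KiB bit vector
    const uint64_t k = 3;
    // bit_vector is a trailing array: allocate header + m/8 bytes ourselves.
    struct bloomfilter* bf =
        (struct bloomfilter*)malloc(sizeof(struct bloomfilter) + (m >> 3));
    bloomfilter_init(bf, m, k);

    const char key[] = "anakin";
    bloomfilter_set(bf, key, strlen(key));
    printf("hit: %d, miss: %d\n",
           bloomfilter_get(bf, key, strlen(key)),   // 1: the key was inserted
           bloomfilter_get(bf, "other", 5));        // 0 unless a false positive
    free(bf);
    return 0;
}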
const uint32_t seed, void * out); + +#endif diff --git a/third-party/hash/include/xxHash/xxhash.h b/third-party/hash/include/xxHash/xxhash.h new file mode 100644 index 000000000..2419ebd55 --- /dev/null +++ b/third-party/hash/include/xxHash/xxhash.h @@ -0,0 +1,235 @@ +/* + xxHash - Extremely Fast Hash algorithm + Header File + Copyright (C) 2012-2016, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +A 64-bits version, named XXH64, is available since r35. +It offers much better speed, but for 64-bits applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#ifndef THIRD_PARTH_XXHASH_XXHASH_H +#define THIRD_PARTH_XXHASH_XXHASH_H + + +/* **************************** +* Definitions +******************************/ +#include /* size_t */ +typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode; + + +/* **************************** +* API modifier +******************************/ +/*!XXH_PRIVATE_API +* Transforms all publics symbols within `xxhash.c` into private ones. 
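Direct usage of the two MurmurHash3 entry points declared above; per the implementation later in this patch, the 32-bit variant writes 4 bytes to out and the x64 128-bit variant writes 16:

#include <cstdint>
#include <cstdio>
#include <cstring>

#include "bloomfilter/murmur3.h"

int main() {
    const char key[] = "anakin";
    uint32_t h32 = 0;
    uint64_t h128[2] = {0, 0};
    murmur3_hash32(key, strlen(key), /*seed=*/42, &h32);
    murmurhash3_x64_128(key, (int)strlen(key), /*seed=*/42, h128);
    printf("murmur3_32 = %08x, murmur3_128 = %016llx%016llx\n",
           h32, (unsigned long long)h128[0], (unsigned long long)h128[1]);
    return 0;
}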
+* Methodology : +* instead of : #include "xxhash.h" +* do : +* #define XXH_PRIVATE_API +* #include "xxhash.c" // note the .c , instead of .h +* also : don't compile and link xxhash.c separately +*/ +#ifdef XXH_PRIVATE_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __attribute__((unused)) +# elif defined (__cplusplus) || \ + (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else +# define XXH_PUBLIC_API static +# endif +#else +# define XXH_PUBLIC_API /* do nothing */ +#endif + +/*!XXH_NAMESPACE, aka Namespace Emulation : + +If you want to include _and expose_ xxHash functions from within your own library, +but also want to avoid symbol collisions with another library which also includes xxHash, + +you can use XXH_NAMESPACE, to automatically prefix any public symbol from `xxhash.c` +with the value of XXH_NAMESPACE (so avoid to keep it NULL and avoid numeric values). + +Note that no change is required within the calling program as long as it also includes +`xxhash.h` : +regular symbol name will be automatically translated by this header. +*/ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A, B) A##B +# define XXH_NAME2(A, B) XXH_CAT(A, B) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +#endif + + +/* ************************************* +* Version +***************************************/ +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/* **************************** +* Simple Hash Functions +******************************/ + +XXH_PUBLIC_API unsigned int XXH32 (const void* input, + size_t length, + unsigned int seed); +XXH_PUBLIC_API unsigned long long XXH64 (const void* input, + size_t length, + unsigned long long seed); + +/*! +XXH32() : + Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s +XXH64() : + Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs faster on 64-bits systems, but slower on 32-bits systems (see benchmark). +*/ + + +/* **************************** +* Advanced Hash Functions +******************************/ +typedef struct XXH32_state_s XXH32_state_t; /* incomplete */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete */ + + +/*!Static allocation + For static linking only, do not use in the context of DLL ! 
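One-shot usage of the simple hash functions declared in this header; the seed just perturbs the result deterministically:

#include <cstdio>
#include <cstring>

#include "xxHash/xxhash.h"

int main() {
    const char msg[] = "anakin";
    unsigned int h32 = XXH32(msg, strlen(msg), 0);          // 32-bit digest
    unsigned long long h64 = XXH64(msg, strlen(msg), 0);    // 64-bit digest
    printf("XXH32 = %08x, XXH64 = %016llx\n", h32, h64);
    return 0;
}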
*/ +typedef struct { long long ll[ 6]; } XXH32_stateBody_t; +typedef struct { long long ll[11]; } XXH64_stateBody_t; + +#define XXH32_CREATESTATE_STATIC(name) \ + XXH32_stateBody_t name##xxhbody; \ + void* name##xxhvoid = &(name##xxhbody); \ + XXH32_state_t* name = (XXH32_state_t*) \ + (name##xxhvoid) +#define XXH64_CREATESTATE_STATIC(name) \ + XXH64_stateBody_t name##xxhbody; \ + void* name##xxhvoid = &(name##xxhbody); \ + XXH64_state_t* name = (XXH64_state_t*)(name##xxhvoid) + + +/*!Dynamic allocation + To be preferred in the context of DLL */ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + + +/* hash streaming */ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, + const void* input, + size_t length); +XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, + const void* input, + size_t length); +XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* statePtr); + +/*! +These functions generate the xxHash of an input provided in multiple segments, +as opposed to provided as a single block. + +XXH state must first be allocated, using either static or dynamic method provided above. + +Start a new hash by initializing state with a seed, using XXHnn_reset(). + +Then, feed the hash state by calling XXHnn_update() as many times as necessary. +Obviously, input must be valid, hence allocated and read accessible. +The function returns an error code, with 0 meaning OK, +and any other value meaning there is an error. + +Finally, a hash value can be produced anytime, by using XXHnn_digest(). +This function returns the nn-bits hash. +It's nonetheless possible to continue inserting input into the hash state +and later on generate some new hashes, by calling again XXHnn_digest(). + +When done, free XXH state space if it was allocated dynamically. 
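The streaming variant of the same API, as documented below: allocate a state, reset it with a seed, feed the input in segments, then digest. The result should match a single XXH32() call over the concatenated buffers:

#include <cstdio>
#include <cstring>

#include "xxHash/xxhash.h"

int main() {
    XXH32_state_t* st = XXH32_createState();      // dynamic allocation path
    XXH32_reset(st, 0);
    const char* parts[] = {"ana", "kin"};
    for (const char* p : parts) {
        if (XXH32_update(st, p, strlen(p)) != XXH_OK) return 1;
    }
    printf("streamed XXH32 = %08x\n", XXH32_digest(st));
    XXH32_freeState(st);
    return 0;
}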
+*/ + +#endif + diff --git a/third-party/hash/src/bloomfilter/bloomfilter.c b/third-party/hash/src/bloomfilter/bloomfilter.c new file mode 100644 index 000000000..fa920a30b --- /dev/null +++ b/third-party/hash/src/bloomfilter/bloomfilter.c @@ -0,0 +1,240 @@ +#include "bloomfilter/bloomfilter.h" + +#include +#include +#include + +#include "bloomfilter/murmur3.h" + +#define bit_set(v, n) ((v)[(n) >> 3] |= (0x1 << (0x7 - ((n) & 0x7)))) +#define bit_get(v, n) ((v)[(n) >> 3] & (0x1 << (0x7 - ((n) & 0x7)))) +#define bit_clr(v, n) ((v)[(n) >> 3] &=~(0x1 << (0x7 - ((n) & 0x7)))) + +unsigned int G_BLOOMFILTER_HEADER_SIZE = 32; +unsigned int G_BLOOMFILTER_MAGIC_NUM_OLD = 17062621; +unsigned int G_BLOOMFILTER_MAGIC_NUM_NEW = 17070416; + +void +bloomfilter_init(struct bloomfilter *bloomfilter, uint64_t m, uint64_t k) +{ + memset(bloomfilter, 0, sizeof(*bloomfilter)); + bloomfilter->m = m; + bloomfilter->k = k; + bloomfilter->magic_num = G_BLOOMFILTER_MAGIC_NUM_NEW; + bloomfilter->count = 0; + memset(bloomfilter->bit_vector, 0, bloomfilter->m >> 3); +} + +int bloomfilter_check(struct bloomfilter* filter){ + if( filter->magic_num == G_BLOOMFILTER_MAGIC_NUM_NEW){ + return 1; + }else{ + fprintf(stderr, "error magic_num %d\n", filter->magic_num); + return 0; + } +} + +int +bloomfilter_load_32bits(struct bloomfilter **bloomfilter, FILE *fp) { + if(fp == NULL) { + return 0; + } + unsigned char bytes[4]; + struct bloomfilter* t; + fread(bytes, 4, 1, fp); + uint32_t magic_num = char_to_little_endian_32bits(bytes); + if(magic_num != G_BLOOMFILTER_MAGIC_NUM_OLD) { + return 0; + } + fread(bytes, 4, 1, fp); + uint32_t m = char_to_little_endian_32bits(bytes); + if(m % 8 != 0) { + return 0; + } + fread(bytes, 4, 1, fp); + uint32_t k = char_to_little_endian_32bits(bytes); + + fread(bytes, 4, 1, fp); + uint32_t count = char_to_little_endian_32bits(bytes); + t = (struct bloomfilter*)malloc(sizeof(struct bloomfilter)+(m>>3)); + memset(t, 0, sizeof(struct bloomfilter) + (m >> 3)); + t->m = m; + t->k = k; + t->magic_num = magic_num; + t->count = count; + fseek(fp, G_BLOOMFILTER_HEADER_SIZE - 16, SEEK_CUR); + fread(t->bit_vector, m >> 3, 1, fp); + fseek(fp, 0, SEEK_END); // seek to end of file + unsigned int filesize = ftell(fp); + if (filesize != m / 8 + G_BLOOMFILTER_HEADER_SIZE) { + free(t); + return 0; + } + *bloomfilter = t; + return 1; +} + +int +bloomfilter_load(struct bloomfilter **bloomfilter, const void *path) +{ + struct bloomfilter* t; + unsigned char bytes[8]; + FILE * file = fopen(path, "rb"); + if (file != NULL) { + if(bloomfilter_load_32bits(bloomfilter, file) > 0) { + fclose(file); + return 1; + } + //back to beginning of file + fseek(file, 0, SEEK_SET); + fread(bytes, 8, 1, file); + uint64_t magic_num = char_to_little_endian_64bits(bytes); + if(magic_num != G_BLOOMFILTER_MAGIC_NUM_NEW) { + fclose(file); + return 0; + } + fread(bytes, 8, 1, file); + uint64_t m = char_to_little_endian_64bits(bytes); + if(m % 8 != 0) { + fclose(file); + return 0; + } + fread(bytes, 8, 1, file); + uint64_t k = char_to_little_endian_64bits(bytes); + + fread(bytes, 8, 1, file); + uint64_t count = char_to_little_endian_64bits(bytes); + + t = (struct bloomfilter*)malloc(sizeof(struct bloomfilter)+(m>>3)); + memset(t, 0, sizeof(struct bloomfilter) + (m >> 3)); + t->m = m; + t->k = k; + t->magic_num = magic_num; + t->count = count; + fread(t->bit_vector, m >> 3, 1, file); + fseek(file, 0, SEEK_END); // seek to end of file + unsigned int filesize = ftell(file); + fclose(file); + if(filesize != m / 8 + G_BLOOMFILTER_HEADER_SIZE) { + 
free(t); + return 0; + } + *bloomfilter = t; + return 1; + } + fprintf(stderr, "file %s not exist\n", path); + return 0; +} + +int +bloomfilter_set(struct bloomfilter *bloomfilter, const void *key, size_t len) +{ + if(bloomfilter_get(bloomfilter, key, len) > 0) { + return 0; + } + uint32_t i; + uint64_t result[2]; + for (i = 0; i < bloomfilter->k; i++) { + murmurhash3_x64_128(key, len, i, &result); + result[0] %= bloomfilter->m; + result[1] %= bloomfilter->m; + bit_set(bloomfilter->bit_vector, result[0]); + bit_set(bloomfilter->bit_vector, result[1]); + } + bloomfilter->count++; + return 1; +} + +int +bloomfilter_set_nocheck(struct bloomfilter *bloomfilter, const void *key, size_t len) +{ + uint32_t i; + uint64_t result[2]; + for (i = 0; i < bloomfilter->k; i++) { + murmurhash3_x64_128(key, len, i, &result); + result[0] %= bloomfilter->m; + result[1] %= bloomfilter->m; + bit_set(bloomfilter->bit_vector, result[0]); + bit_set(bloomfilter->bit_vector, result[1]); + } + bloomfilter->count++; + return 1; +} + +int +bloomfilter_get(struct bloomfilter *bloomfilter, const void *key, size_t len) +{ + uint32_t i; + uint64_t result[2]; + + for (i = 0; i < bloomfilter->k; i++) { + murmurhash3_x64_128(key, len, i, &result); + result[0] %= bloomfilter->m; + result[1] %= bloomfilter->m; + if (!bit_get(bloomfilter->bit_vector, result[0])){ + return 0; + } + if (!bit_get(bloomfilter->bit_vector, result[1])){ + return 0; + } + } + return 1; +} + +int +bloomfilter_get_hash(struct bloomfilter *bloomfilter, const void *key, size_t len, char *dst) +{ +#define SIZEOF_MIN(X, Y) ((X) < (Y) ? (X) : (Y)) + uint32_t i; + uint64_t result[2]; + char hash[255] = ""; + char valstr[32]; + for (i = 0; i < bloomfilter->k; i++) { + murmurhash3_x64_128(key, len, i, &result); + snprintf(valstr, sizeof(valstr), "%lu,", result[0]); + strncat(hash, valstr, SIZEOF_MIN(sizeof(valstr), sizeof(hash))); + snprintf(valstr, sizeof(valstr), "%lu,", result[1]); + strncat(hash, valstr, SIZEOF_MIN(sizeof(valstr), sizeof(hash))); + } + strncpy(dst, hash, SIZEOF_MIN(len, sizeof(hash))); + return 1; +#undef SIZEOF_MIN +} + +int +bloomfilter_dump(struct bloomfilter *bloomfilter, const void *path) +{ + FILE * file = fopen(path, "wb"); + if (file != NULL) { + fwrite(&bloomfilter->magic_num, sizeof(bloomfilter->magic_num), 1, file); + fwrite(&bloomfilter->m, sizeof(bloomfilter->m), 1, file); + fwrite(&bloomfilter->k, sizeof(bloomfilter->k), 1, file); + fwrite(&bloomfilter->count, sizeof(bloomfilter->count), 1, file); + fwrite(bloomfilter->bit_vector, (bloomfilter->m >> 3), 1, file); + fclose(file); + return 1; + } + return 0; +} + +/** + * works either big-endian or little-endian architectures + */ +uint32_t +char_to_little_endian_32bits(unsigned char *bytes) { + return bytes[0] | (bytes[1] << 8) | (bytes[2] << 16) | (bytes[3] << 24); +} + +/** + * works either big-endian or little-endian architectures + */ +uint64_t +char_to_little_endian_64bits(unsigned char *bytes) { + uint64_t bytes_ull[8]; + int i; + for(i = 0; i < 8; i++) { + bytes_ull[i] = bytes[i]; + } + return bytes_ull[0] | (bytes_ull[1] << 8) | (bytes_ull[2] << 16) | (bytes_ull[3] << 24) | + (bytes_ull[4] << 32) | (bytes_ull[5] << 40) | (bytes_ull[6] << 48) | (bytes_ull[7] << 56); +} diff --git a/third-party/hash/src/bloomfilter/murmur3.c b/third-party/hash/src/bloomfilter/murmur3.c new file mode 100644 index 000000000..5904188c1 --- /dev/null +++ b/third-party/hash/src/bloomfilter/murmur3.c @@ -0,0 +1,184 @@ +#include "bloomfilter/murmur3.h" + +#define ROTL32(x, r) (((x) << (r)) | 
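bloomfilter_dump and the two load paths above imply a simple on-disk layout: a 32-byte header (magic, m, k, count; four 8-byte fields in the current format, four 4-byte fields padded to 32 bytes in the legacy one) followed by m/8 bytes of bit vector, and both loaders reject files whose size is not m/8 + 32. A small sketch of that size check:

#include <cstdint>
#include <cstdio>

static uint64_t expected_file_size(uint64_t m_bits) {
    const uint64_t header = 32;            // G_BLOOMFILTER_HEADER_SIZE
    return header + (m_bits >> 3);         // header plus the packed bit vector
}

int main() {
    printf("%llu bytes for m = 1<<20 bits\n",
           (unsigned long long)expected_file_size(1ull << 20));  // 131104
    return 0;
}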
((x) >> (32 - (r)))) +#define ROTL64(x, r) (((x) << (r)) | ((x) >> (64 - (r)))) +#define BIG_CONSTANT(x) (x##LLU) + +uint32_t fmix32(uint32_t h) { + return h; +} + +//uint64_t getblock64(const uint64_t * p, int i) { +// return p[i]; +//} + +uint64_t fmix64(uint64_t k) { + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + return k; +} + +void murmur3_hash32(const void *key, size_t len, uint32_t seed, void *out) { + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + int i = 0; + uint32_t k1 = 0; + uint32_t h1 = seed; + + const uint8_t *data = (const uint8_t *) key; + const int nblocks = len >> 2; + + const uint32_t *blocks = (const uint32_t *) (data + nblocks * 4); + const uint8_t *tail = (const uint8_t *) (data + nblocks * 4); + + for (i = -nblocks; i; i++) { + uint32_t k1 = blocks[i]; + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; + break; + case 2: + k1 ^= tail[1] << 8; + break; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + break; + }; + + h1 ^= len; + + h1 ^= h1 >> 16; + h1 *= 0x85ebca6b; + h1 ^= h1 >> 13; + h1 *= 0xc2b2ae35; + h1 ^= h1 >> 16; + + *(uint32_t*) out = h1; +} + +void murmurhash3_x64_128(const void * key, const int len, const uint32_t seed, void * out) { + const uint8_t * data = (const uint8_t*) key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + int i = 0; + + const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *) (data); + + uint64_t k1; + uint64_t k2; + + for (i = 0; i < nblocks; i++) { + k1 = blocks[i * 2 + 0]; + k2 = blocks[i * 2 + 1]; + + k1 *= c1; + k1 = ROTL64(k1, 31); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = ROTL64(k2, 33); + k2 *= c1; + h2 ^= k2; + + h2 = ROTL64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*) (data + nblocks * 16); + uint64_t nk1 = 0; + uint64_t nk2 = 0; + //no break here!!! 
+ switch (len & 15) { + case 15: + nk2 ^= ((uint64_t) tail[14]) << 48; + case 14: + nk2 ^= ((uint64_t) tail[13]) << 40; + case 13: + nk2 ^= ((uint64_t) tail[12]) << 32; + case 12: + nk2 ^= ((uint64_t) tail[11]) << 24; + case 11: + nk2 ^= ((uint64_t) tail[10]) << 16; + case 10: + nk2 ^= ((uint64_t) tail[9]) << 8; + case 9: + nk2 ^= ((uint64_t) tail[8]) << 0; + nk2 *= c2; + nk2 = ROTL64(nk2, 33); + nk2 *= c1; + h2 ^= nk2; + case 8: + nk1 ^= ((uint64_t) tail[7]) << 56; + case 7: + nk1 ^= ((uint64_t) tail[6]) << 48; + case 6: + nk1 ^= ((uint64_t) tail[5]) << 40; + case 5: + nk1 ^= ((uint64_t) tail[4]) << 32; + case 4: + nk1 ^= ((uint64_t) tail[3]) << 24; + case 3: + nk1 ^= ((uint64_t) tail[2]) << 16; + case 2: + nk1 ^= ((uint64_t) tail[1]) << 8; + case 1: + nk1 ^= ((uint64_t) tail[0]) << 0; + nk1 *= c1; + nk1 = ROTL64(nk1, 31); + nk1 *= c2; + h1 ^= nk1; + }; + + //---------- + // finalization + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*) out)[0] = h1; + ((uint64_t*) out)[1] = h2; +} diff --git a/third-party/hash/src/xxHash/xxhash.c b/third-party/hash/src/xxHash/xxhash.c new file mode 100755 index 000000000..bacab8332 --- /dev/null +++ b/third-party/hash/src/xxHash/xxhash.c @@ -0,0 +1,975 @@ +/* +xxHash - Fast Hash algorithm +Copyright (C) 2012-2016, Yann Collet + +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You can contact the author at : +- xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. 
This method doesn't depend on compiler but violate C standard. + * It can generate buggy code on targets which do not support unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. + * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : + * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. + * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. + * By default, this option is disabled. To enable it, uncomment below define : + */ +/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */ + +/*!XXH_FORCE_NATIVE_FORMAT : + * By default, xxHash library provides endian-independant Hash values, based on little-endian convention. + * Results are therefore identical for little-endian and big-endian CPU. + * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. + * Should endian-independance be of no importance for your application, you may set the #define below to 1, + * to improve speed for Big-endian CPU. + * This option has no impact on Little_Endian CPU. + */ +#define XXH_FORCE_NATIVE_FORMAT 0 + +/*!XXH_USELESS_ALIGN_BRANCH : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : don't check for aligned/unaligned input, because performance will be the same. + * It saves one initial branch per hash. 
+ */ +#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_USELESS_ALIGN_BRANCH 1 +#endif + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# define FORCE_INLINE static __forceinline +#else +# if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +/* Modify the local functions below should you wish to use some other memory routines */ +/* for malloc(), free() */ +#include +static void* XXH_malloc(size_t s) { return malloc(s); } +static void XXH_free (void* p) { free(p); } +/* for memcpy() */ +#include +static void* XXH_memcpy(void* dest, const void* src, size_t size) + { return memcpy(dest, src, size); } + +unsigned int XXH_VERSION_MAJOR = 0; +unsigned int XXH_VERSION_MINOR = 5; +unsigned int XXH_VERSION_RELEASE = 0; +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + \ + XXH_VERSION_MINOR *100 + \ + XXH_VERSION_RELEASE) +#include "xxHash/xxhash.h" + + +/* ************************************* +* Basic Types +***************************************/ +#ifndef MEM_MODULE +# define MEM_MODULE +# if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +# else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +# endif +#endif + + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } +static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign; + +static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. 
+ * see : http://stackoverflow.com/a/32095106/646947 + */ + +static U32 XXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static U64 XXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#if defined(_MSC_VER) +# define XXH_rotl32(x, r) _rotl(x, r) +# define XXH_rotl64(x, r) _rotl64(x, r) +#else +# define XXH_rotl32(x, r) ((x << r) | (x >> (32 - r))) +# define XXH_rotl64(x, r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +# define XXH_swap64 _byteswap_uint64 +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +# define XXH_swap64 __builtin_bswap64 +#else +static U32 XXH_swap32 (U32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +static U64 XXH_swap64 (U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN + static const int g_one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&g_one)) +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); + else + return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); +} + +FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); + else + return endian==XXH_littleEndian ? 
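Two portable tricks used in this part of xxhash.c, shown stand-alone: runtime endianness detection by inspecting the first byte of the integer 1, and an unaligned 32-bit load through memcpy (the default XXH_FORCE_MEMORY_ACCESS == 0 path):

#include <cstdint>
#include <cstdio>
#include <cstring>

static int is_little_endian() {
    static const int one = 1;
    return *(const char*)&one;            // 1 on little-endian, 0 on big-endian
}

static uint32_t read32(const void* p) {
    uint32_t v;
    memcpy(&v, p, sizeof(v));             // safe even if p is not 4-byte aligned
    return v;
}

int main() {
    unsigned char buf[5] = {0, 0x78, 0x56, 0x34, 0x12};
    // on a little-endian CPU this prints: little-endian: 1, read32: 12345678
    printf("little-endian: %d, read32: %08x\n", is_little_endian(), read32(buf + 1));
    return 0;
}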
*(const U64*)ptr : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */ + + +/* ************************************* +* Constants +***************************************/ +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + +#define PRIME64_1 11400714785074694791ULL +#define PRIME64_2 14029467366897019727ULL +#define PRIME64_3 1609587929392839161ULL +#define PRIME64_4 9650029242287828579ULL +#define PRIME64_5 2870177450012600261ULL + +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* *************************** +* Simple Hash Functions +*****************************/ +FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) + { + len=0; + bEnd=p=(const BYTE*)(size_t)16; + } +#endif + + if (len>=16) + { + const BYTE* const limit = bEnd - 16; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do + { + v1 += XXH_get32bits(p) * PRIME32_2; + v1 = XXH_rotl32(v1, 13); + v1 *= PRIME32_1; + p+=4; + v2 += XXH_get32bits(p) * PRIME32_2; + v2 = XXH_rotl32(v2, 13); + v2 *= PRIME32_1; + p+=4; + v3 += XXH_get32bits(p) * PRIME32_2; + v3 = XXH_rotl32(v3, 13); + v3 *= PRIME32_1; + p+=4; + v4 += XXH_get32bits(p) * PRIME32_2; + v4 = XXH_rotl32(v4, 13); + v4 *= PRIME32_1; + p+=4; + } + while (p<=limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } + else + { + h32 = seed + PRIME32_5; + } + + h32 += (U32) len; + + while (p+4<=bEnd) + { + h32 += XXH_get32bits(p) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + p+=4; + } + + while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, input, len); + return XXH32_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + +# if !defined(XXH_USELESS_ALIGN_BRANCH) + if ((((size_t)input) & 3) == 0) /* Input is 4-bytes aligned, leverage the speed benefit */ + { + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } +# endif + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) +{ 
+ const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) + { + len=0; + bEnd=p=(const BYTE*)(size_t)32; + } +#endif + + if (len>=32) + { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do + { + v1 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v1 = XXH_rotl64(v1, 31); + v1 *= PRIME64_1; + v2 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + v3 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + v4 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v4 = XXH_rotl64(v4, 31); + v4 *= PRIME64_1; + } + while (p<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + + v1 *= PRIME64_2; + v1 = XXH_rotl64(v1, 31); + v1 *= PRIME64_1; + h64 ^= v1; + h64 = h64 * PRIME64_1 + PRIME64_4; + + v2 *= PRIME64_2; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + h64 ^= v2; + h64 = h64 * PRIME64_1 + PRIME64_4; + + v3 *= PRIME64_2; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + h64 ^= v3; + h64 = h64 * PRIME64_1 + PRIME64_4; + + v4 *= PRIME64_2; + v4 = XXH_rotl64(v4, 31); + v4 *= PRIME64_1; + h64 ^= v4; + h64 = h64 * PRIME64_1 + PRIME64_4; + } + else + { + h64 = seed + PRIME64_5; + } + + h64 += (U64) len; + + while (p+8<=bEnd) + { + U64 k1 = XXH_get64bits(p); + k1 *= PRIME64_2; + k1 = XXH_rotl64(k1,31); + k1 *= PRIME64_1; + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) + { + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + while (p> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + +# if !defined(XXH_USELESS_ALIGN_BRANCH) + if ((((size_t)input) & 7)==0) /* Input is aligned, let's leverage the speed advantage */ + { + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } +# endif + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/* ************************************************** +* Advanced Hash Functions +****************************************************/ + +/*** Allocation ***/ +struct XXH32_state_s +{ + U64 total_len; + U32 seed; + U32 v1; + U32 v2; + U32 v3; + U32 v4; + U32 mem32[4]; /* defined as U32 for alignment */ + U32 memsize; +}; /* typedef'd to XXH32_state_t within xxhash.h */ + +struct XXH64_state_s +{ + U64 total_len; + U64 seed; + U64 v1; + U64 v2; + U64 v3; + U64 v4; + U64 mem64[4]; /* defined as U64 for alignment */ + U32 memsize; +}; /* typedef'd to XXH64_state_t within xxhash.h */ + + +XXH_PUBLIC_API 
XXH32_state_t* XXH32_createState(void) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_stateBody_t) >= sizeof(XXH32_state_t)); /* A compilation error here means XXH32_state_t is not large enough */ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_stateBody_t) >= sizeof(XXH64_state_t)); /* A compilation error here means XXH64_state_t is not large enough */ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + + +/*** Hash feed ***/ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.seed = seed; + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.seed = seed; + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 16) /* fill in tmp buffer */ + { + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) /* some data left from previous update */ + { + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { + const U32* p32 = state->mem32; + state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v1 = XXH_rotl32(state->v1, 13); + state->v1 *= PRIME32_1; + p32++; + state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v2 = XXH_rotl32(state->v2, 13); + state->v2 *= PRIME32_1; + p32++; + state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v3 = XXH_rotl32(state->v3, 13); + state->v3 *= PRIME32_1; + p32++; + state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v4 = XXH_rotl32(state->v4, 13); + state->v4 *= PRIME32_1; + p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) + { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do + { + v1 += XXH_readLE32(p, endian) * PRIME32_2; + v1 = XXH_rotl32(v1, 13); + v1 *= PRIME32_1; + p+=4; + v2 += XXH_readLE32(p, endian) * PRIME32_2; + v2 = XXH_rotl32(v2, 13); + v2 *= PRIME32_1; + p+=4; + v3 += XXH_readLE32(p, endian) * PRIME32_2; + v3 = XXH_rotl32(v3, 13); + v3 *= PRIME32_1; + p+=4; + v4 += XXH_readLE32(p, endian) * PRIME32_2; + v4 = XXH_rotl32(v4, 13); + v4 *= PRIME32_1; + p+=4; + } + while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + 
state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + XXH_memcpy(state->mem32, p, bEnd-p); + state->memsize = (int)(bEnd-p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) +{ + const BYTE * p = (const BYTE*)state->mem32; + const BYTE* bEnd = (const BYTE*)(state->mem32) + state->memsize; + U32 h32; + + if (state->total_len >= 16) + { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } + else + { + h32 = state->seed + PRIME32_5; + } + + h32 += (U32) state->total_len; + + while (p+4<=bEnd) + { + h32 += XXH_readLE32(p, endian) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4; + p+=4; + } + + while (p<bEnd) + { + h32 += (*p) * PRIME32_5; + h32 = XXH_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + +
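/* Editor's sketch (not part of the patch): typical use of the streaming XXH32 API
 * above -- createState -> reset -> update ... -> digest -> freeState.
 * Buffer names and the zero seed are placeholders; error handling is elided.
 * The XXH64_* functions that follow are driven the same way, with a 64-bit seed and result. */
static unsigned int example_xxh32_streaming(const void* chunk1, size_t len1,
                                            const void* chunk2, size_t len2)
{
    XXH32_state_t* const st = XXH32_createState();
    unsigned int h32;
    XXH32_reset(st, 0);              /* seed must be set before the first update */
    XXH32_update(st, chunk1, len1);  /* input may arrive in arbitrary-sized chunks */
    XXH32_update(st, chunk2, len2);
    h32 = XXH32_digest(st);          /* digest can be read without ending the stream */
    XXH32_freeState(st);
    return h32;
}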
+FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 32) /* fill in tmp buffer */ + { + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) /* some data left from previous update */ + { + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); + { + const U64* p64 = state->mem64; + state->v1 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v1 = XXH_rotl64(state->v1, 31); + state->v1 *= PRIME64_1; + p64++; + state->v2 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v2 = XXH_rotl64(state->v2, 31); + state->v2 *= PRIME64_1; + p64++; + state->v3 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v3 = XXH_rotl64(state->v3, 31); + state->v3 *= PRIME64_1; + p64++; + state->v4 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v4 = XXH_rotl64(state->v4, 31); + state->v4 *= PRIME64_1; + p64++; + } + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) + { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do + { + v1 += XXH_readLE64(p, endian) * PRIME64_2; + v1 = XXH_rotl64(v1, 31); + v1 *= PRIME64_1; + p+=8; + v2 += XXH_readLE64(p, endian) * PRIME64_2; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + p+=8; + v3 += XXH_readLE64(p, endian) * PRIME64_2; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + p+=8; + v4 += XXH_readLE64(p, endian) * PRIME64_2; + v4 = XXH_rotl64(v4, 31); + v4 *= PRIME64_1; + p+=8; + } + while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + XXH_memcpy(state->mem64, p, bEnd-p); + state->memsize = (int)(bEnd-p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) +{ + const BYTE * p = (const BYTE*)state->mem64; + const BYTE* bEnd = (const BYTE*)state->mem64 + state->memsize; + U64 h64; + + if (state->total_len >= 32) + { + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + + v1 *= PRIME64_2; + v1 = XXH_rotl64(v1, 31); + v1 *= PRIME64_1; + h64 ^= v1; + h64 = h64*PRIME64_1 + PRIME64_4; + + v2 *= PRIME64_2; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + h64 ^= v2; + h64 = h64*PRIME64_1 + PRIME64_4; + + v3 *= PRIME64_2; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + h64 ^= v3; + h64 = h64*PRIME64_1 + PRIME64_4; + + v4 *= PRIME64_2; + v4 = XXH_rotl64(v4, 31); + v4 *= PRIME64_1; + h64 ^= v4; + h64 = h64*PRIME64_1 + PRIME64_4; + } + else + { + h64 = state->seed + PRIME64_5; + } + + h64 += (U64) state->total_len; + + while (p+8<=bEnd) + { + U64 k1 = XXH_readLE64(p, endian); + k1 *= PRIME64_2; + k1 = XXH_rotl64(k1,31); + k1 *= PRIME64_1; + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) + { + h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + while (p<bEnd) + { + h64 ^= (*p) * PRIME64_5; + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + + diff --git a/third-party/sass/.DS_Store b/third-party/sass/.DS_Store new file mode 100644 index 000000000..10f4d4306 Binary files /dev/null and b/third-party/sass/.DS_Store differ diff --git a/third-party/sass/include/sass_funcs.h b/third-party/sass/include/sass_funcs.h index 8d171824c..b316054d8 100644 --- a/third-party/sass/include/sass_funcs.h +++ b/third-party/sass/include/sass_funcs.h @@ -15,39 +15,35 @@ namespace anakin { namespace saber { //Round a / b to nearest higher integer value -inline int i_div_up(int a, int b) -{ +inline int i_div_up(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } //Align a to nearest higher multiple of b -inline int i_align_up(int a, int b) -{ +inline int i_align_up(int a, int b) { return (a % b != 0) ? (a - a % b + b) : a; } -inline int bin(int var){ +inline int bin(int var) { int x = (var >= 0) ?
var : -var; int bits; - for (bits = 0; x != 0; ++bits){ + for (bits = 0; x != 0; ++bits) { x >>= 1; } return bits; } -inline std::pair -magic_32_div(long long int nmax, int div) -{ +inline std::pair +magic_32_div(long long int nmax, int div) { unsigned m = -1; unsigned int p; long long int nc = ((nmax + 1) / div) * div - 1; int nbits = bin(nmax); int range = 2 * nbits + 1; - for (p = 0; p < range; p++){ + for (p = 0; p < range; p++) { long long int exp = 1 << p; long long int mod = div - 1 - (exp - 1) % div; - if (exp > nc * mod) - { + if (exp > nc * mod) { m = (unsigned) ((exp + mod) / div); return std::make_pair(m, p); } @@ -55,396 +51,524 @@ magic_32_div(long long int nmax, int div) return std::make_pair(-1, -1); } -template -void winograd_conv(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void winograd_conv_relu(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void winograd_conv_relu_pooling(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void winograd_conv_eltwise(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - EltwiseType elt_type, - cudaStream_t cuda_stream); - -template -void direct_conv_Kdivis4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int 
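// Editor's note (illustration, not from the original header): the (m, p) pair returned by
// magic_32_div above is the standard "magic number" for strength-reducing integer division
// by a fixed divisor inside the sass kernels. Assuming the usual Granlund-Montgomery
// consumption and a std::pair<unsigned int, unsigned int> return type (the exact template
// arguments are not visible in this diff), q = n / div for any 0 <= n <= nmax would be
// computed as:
//
//     std::pair<unsigned int, unsigned int> mp = magic_32_div(nmax, div);
//     unsigned int q = (unsigned int)(((unsigned long long)n * mp.first) >> mp.second);
//
// The convention actually used by the kernels is assumed here, not shown in this patch.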
img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void direct_conv_Kindiv4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void direct_conv_bias_Kdivis4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void direct_conv_bias_Kindiv4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void direct_conv_bias_relu_Kdivis4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void direct_conv_bias_relu_Kindiv4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - 
float beta, - cudaStream_t cuda_stream); - - -template -void direct_conv_bias_relu_maxpool2k2s0p_Kdivis4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); - -template -void direct_conv_bias_relu_maxpool2k2s0p_Kindiv4(const DataType* src, - DataType* dst, - const OpType* weight, - const DataType* bias, - int img_num, - int img_in_channel, - int img_in_height, - int img_in_width, - int img_out_channel, - int img_out_height, - int img_out_width, - int img_in_channel_stride, - int img_in_height_stride, - int img_in_width_stride, - int img_out_channel_stride, - int img_out_height_stride, - int img_out_width_stride, - int kernel_h, - int kernel_w, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - int dilation_h, - int dilation_w, - int group, - float alpha, - float beta, - cudaStream_t cuda_stream); +template +void winograd_conv(const DataType *src, + DataType *dst, + const OpType *weight, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void winograd_conv_relu(const DataType *src, + DataType *dst, + const OpType *weight, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void winograd_conv_relu_pooling(const DataType *src, + DataType *dst, + const OpType *weight, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void winograd_conv_eltwise(const DataType *src, + DataType *dst, + const OpType *weight, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int 
img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + EltwiseType elt_type, + cudaStream_t cuda_stream); + +template +void direct_conv_Kdivis4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void direct_conv_Kindiv4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void direct_conv_bias_Kdivis4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void direct_conv_bias_Kindiv4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void direct_conv_bias_relu_Kdivis4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + 
float beta, + cudaStream_t cuda_stream); + +template +void direct_conv_bias_relu_Kindiv4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + + +template +void direct_conv_bias_relu_maxpool2k2s0p_Kdivis4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void direct_conv_bias_relu_maxpool2k2s0p_Kindiv4(const DataType *weights, + DataType *dst, + const OpType *src, + const DataType *bias, + int img_num, + int img_in_channel, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int img_in_channel_stride, + int img_in_height_stride, + int img_in_width_stride, + int img_out_channel_stride, + int img_out_height_stride, + int img_out_width_stride, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +// [zs] int8 kernels +template +void direct_conv_Kdivis4_s8_to_f32( + const void *weights, + void *dst, + const void *src, + const void *bias, + int img_num, + int img_in_channel_4, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilate_h, + int dilate_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +template +void direct_conv_Kdivis4_s8_to_s8( + const void *weights, + void *dst, + const void *src, + const void *bias, + int img_num, + int img_in_channel_4, + int img_in_height, + int img_in_width, + int img_out_channel, + int img_out_height, + int img_out_width, + int kernel_h, + int kernel_w, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilate_h, + int dilate_w, + int group, + float alpha, + float beta, + cudaStream_t cuda_stream); + +void ker_igemm_32x32x32_NN_bias(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_32x32x32_NN_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_32x32x32_NN_vec_bias(const int M, 
const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_32x32x32_NN_vec_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_bias(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_vec_bias(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_vec_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_scale_bias(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *scale, + const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_scale_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *scale, + const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_scale_vec_bias(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *scale, + const void *bias, cudaStream_t cuda_stream); + +void ker_igemm_s8s8_32x32x32_NN_scale_vec_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const void *A, + const float beta, const void *B, + void *C, const void *scale, + const void *bias, cudaStream_t cuda_stream); void ker_deconv_implicit_gemm_k4_s2_p1_16x64( - float* dout, const float *din, - const float* weights, const float* bias, + float *dout, const float *din, + const float *weights, const float *bias, int num, int hin, int win, int hout, int wout, int ch_in, int ch_out, cudaStream_t &stream); void ker_deconv_implicit_gemm_k4_s2_p1_32x32_relu( - float* dout, const float *din, - const float* weights, const float* bias, + float *dout, const float *din, + const float *weights, const float *bias, int num, int hin, int win, int hout, int wout, int ch_in, int ch_out, cudaStream_t &stream); __inline__ bool ifVec(int m, int n, int k, - int lda, int ldb, int ldc) -{ + int lda, int ldb, int ldc) { bool vec_a = false; bool vec_b = false; bool 
vec_c = false; @@ -457,138 +581,359 @@ bool ifVec(int m, int n, int k, } void ker_gemm_32x32x32_NN_bias_relu(const int M, const int N, const int K, - const float alpha, const float* A, - const float beta, const float* B, - float* C, const float* bias, cudaStream_t cuda_stream); + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); void ker_gemm_32x32x32_NN_vec_bias_relu(const int M, const int N, const int K, - const float alpha, const float* A, - const float beta, const float* B, - float* C, const float* bias, cudaStream_t cuda_stream); + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); void ker_gemm_32x32x32_NN_bias(const int M, const int N, const int K, - const float alpha, const float* A, - const float beta, const float* B, - float* C, const float* bias, cudaStream_t cuda_stream); + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); void ker_gemm_32x32x32_NN_vec_bias(const int M, const int N, const int K, - const float alpha, const float* A, - const float beta, const float* B, - float* C, const float* bias, cudaStream_t cuda_stream); - -template + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); + + +void ker_gemm_128x128x8_NN_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); + +void ker_gemm_128x128x8_NN_vec_bias_relu(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); + +void ker_gemm_128x128x8_NN_bias(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); + +void ker_gemm_128x128x8_NN_vec_bias(const int M, const int N, const int K, + const int batch_num, const int in_stride, const int out_stride, + const float alpha, const float *B, + const float beta, const float *A, + float *C, const float *bias, cudaStream_t cuda_stream); + +template void ker_sgemm_nn(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); -template + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); + +template void ker_sgemm_nt(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); -template + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); + +template void ker_sgemm_tn(const int M, const int N, 
const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); -template + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); + +template void ker_sgemm_tt(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); -template + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); + +template void ker_sgemm_nn_vec(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); -template + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); + +template void ker_sgemm_nt_vec(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); -template + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); + +template void ker_sgemm_tn_vec(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); -template + const float alpha, const float *A, const float beta, const float *B, + float *C, cudaStream_t cuda_stream); + +template void ker_sgemm_tt_vec(const int M, const int N, const int K, const int lda, const int ldb, const int ldc, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); -template +template void ker_sgemm_sass(const int M, const int N, const int K, - const float alpha, const float* A, - const float beta, const float* B, - float* C, cudaStream_t cuda_stream); + const float alpha, const float *A, + const float beta, const float *B, + float *C, cudaStream_t cuda_stream); std::function + const float, const float *, const float, + const float *, float *, cudaStream_t)> saber_find_fast_sass_gemm(const bool TransA, const bool TransB, const int M, const int N, const int K); -template +template void conv_gemm_k1s1p0(int num, int in_stride, int out_stride, - float* out, const float* img, - const float* weights, int out_channel, + float *out, const float *weights, + const float *src, int out_channel, int in_channel, int img_h, int img_w, - const float* bias, cudaStream_t cuda_stream, - float a = 1.f, float b = 0.f) { + const float *bias, cudaStream_t cuda_stream, + float a = 1.f, float b = 0.f, int tile = 32) { float alpha = a; float beta = b; int m = out_channel; int k = in_channel; int n = img_h * img_w; - if (ifVec(m, n, k, k, n, n)) { - if (with_relu) { - for (int i = 0; i < num; ++i) { + if (tile == 32) { + if (ifVec(m, n, k, k, n, n)) { + if (with_relu) { ker_gemm_32x32x32_NN_vec_bias_relu(m, n, k, - alpha, weights, - beta, img + i * in_stride, - out + i * out_stride, bias, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, cuda_stream); - } - } else { - for (int i = 0; i < num; ++i) { + } else { ker_gemm_32x32x32_NN_vec_bias(m, n, k, - 
alpha, weights, - beta, img + i * in_stride, - out + i * out_stride, bias, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, cuda_stream); } + } else { + if (with_relu) { + ker_gemm_32x32x32_NN_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } else { + ker_gemm_32x32x32_NN_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } } } else { - if (with_relu) { - for (int i = 0; i < num; ++i) { - ker_gemm_32x32x32_NN_bias_relu(m, n, k, - alpha, weights, - beta, img + i * in_stride, - out + i * out_stride, bias, + if (ifVec(m, n, k, k, n, n)) { + if (with_relu) { + ker_gemm_128x128x8_NN_vec_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } else { + ker_gemm_128x128x8_NN_vec_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, cuda_stream); } } else { - for (int i = 0; i < num; ++i) { - ker_gemm_32x32x32_NN_bias(m, n, k, - alpha, weights, - beta, img + i * in_stride, - out + i * out_stride, bias, - cuda_stream); + if (with_relu) { + ker_gemm_128x128x8_NN_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } else { + ker_gemm_128x128x8_NN_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); } } } } +template +void conv_igemm_k1s1p0(int num, int in_stride, int out_stride, + void *out, const void *weights, + const void *src, int out_channel, + int in_channel_4, int img_h, int img_w, + const void *bias, cudaStream_t cuda_stream, + float a = 1.f, float b = 0.f, int tile = 32) { + + float alpha = a; + float beta = b; + int m = out_channel; + int k = in_channel_4; + int n = img_h * img_w; +// if (tile == 32) { + if (ifVec(m, n, k, k, n, n)) { + if (with_relu) { + ker_igemm_32x32x32_NN_vec_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } else { + ker_igemm_32x32x32_NN_vec_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } + } else { + if (with_relu) { + ker_igemm_32x32x32_NN_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } else { + ker_igemm_32x32x32_NN_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } + } +// } else { +// } +} + +template +void conv_igemm_s8s8_k1s1p0(int num, int in_stride, int out_stride, + void *out, const void *weights, + const void *src, int out_channel, + int in_channel_4, int img_h, int img_w, + const void *bias, cudaStream_t cuda_stream, + float a = 1.f, float b = 0.f, int tile = 32) { + + float alpha = a; + float beta = b; + int m = out_channel; + int k = in_channel_4; + int n = img_h * img_w; +// if (tile == 32) { + if (ifVec(m, n, k, k, n, n)) { + if (with_relu) { + ker_igemm_s8s8_32x32x32_NN_vec_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } else { + ker_igemm_s8s8_32x32x32_NN_vec_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } + } else { + if (with_relu) { + ker_igemm_s8s8_32x32x32_NN_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } else { + ker_igemm_s8s8_32x32x32_NN_bias(m, n, k, + num, in_stride, 
out_stride, + alpha, src, + beta, weights, + out, bias, + cuda_stream); + } + } +// } else { +// } +} + +template +void conv_igemm_s8s8_scale_k1s1p0(int num, int in_stride, int out_stride, + void *out, const void *weights, + const void *src, int out_channel, + int in_channel_4, int img_h, int img_w, + const void *scale, const void *bias, + cudaStream_t cuda_stream, + float a = 1.f, float b = 0.f, int tile = 32) { + + float alpha = a; + float beta = b; + int m = out_channel; + int k = in_channel_4; + int n = img_h * img_w; +// if (tile == 32) { + if (ifVec(m, n, k, k, n, n)) { + if (with_relu) { + ker_igemm_s8s8_32x32x32_NN_scale_vec_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, scale, bias, + cuda_stream); + } else { + ker_igemm_s8s8_32x32x32_NN_scale_vec_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, scale, bias, + cuda_stream); + } + } else { + if (with_relu) { + ker_igemm_s8s8_32x32x32_NN_scale_bias_relu(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, scale, bias, + cuda_stream); + } else { + ker_igemm_s8s8_32x32x32_NN_scale_bias(m, n, k, + num, in_stride, out_stride, + alpha, src, + beta, weights, + out, scale, bias, + cuda_stream); + } + } +// } else { +// } +} + } // namespace saber } // namespace anakin diff --git a/third-party/sass/lib/libanakin_saber_sass.a b/third-party/sass/lib/libanakin_saber_sass.a index 216df2006..91daeb219 100644 Binary files a/third-party/sass/lib/libanakin_saber_sass.a and b/third-party/sass/lib/libanakin_saber_sass.a differ diff --git a/tools/anakin-lite/.gitignore b/tools/anakin-lite/.gitignore deleted file mode 100644 index 15a1cdf39..000000000 --- a/tools/anakin-lite/.gitignore +++ /dev/null @@ -1,46 +0,0 @@ -# Prerequisites -*.d - -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai -*.la -#*.a -*.lib - -# Executables -*.exe -*.out -*.app - -# generate code -*.bin -*.h -*.cpp - -# dir -log -bin -lite -saber -utils -build -output diff --git a/tools/anakin-lite/CMakeLists.txt b/tools/anakin-lite/CMakeLists.txt deleted file mode 100644 index 048a210ec..000000000 --- a/tools/anakin-lite/CMakeLists.txt +++ /dev/null @@ -1,242 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2016 Baidu.com, Inc. All Rights Reserved -# ---------------------------------------------------------------------------- - -# options -option(USE_ARM_PLACE "Select the build mode for ARM place." YES) -option(USE_ARMV8 "build armv8" NO) -option(USE_ANDROID "using android place." YES) -option(USE_IOS "using android place." NO) -option(TARGET_IOS "using ios" NO) -option(USE_OPENMP "using openmp for lite." YES) -option(ENABLE_DEBUG "Enable DEBUG(default) mode." NO) -option(BUILD_LITE_UNIT_TEST "build unit test for lite." 
YES) -option(USE_OPENCV "use opencv in unit test" NO) -option(ENABLE_OP_TIMER "get time consumption of each op" NO) -option(USE_ANDROID_LOG "use build-in android logger" NO) - -if (USE_ARMV8) - set(ANDROID_ABI "arm64-v8a") -else() - set(ANDROID_ABI "armeabi-v7a with NEON") -endif() - -if(CMAKE_TOOLCHAIN_FILE) - set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") - # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( - get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) - find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) - message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}") -endif() - -if(NOT DEFINED CMAKE_INSTALL_PREFIX) - set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/" CACHE PATH "Installation Directory") -endif() -message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}") - -cmake_minimum_required(VERSION 2.8) -project(ANAKIN-lite C CXX) - -configure_file ( - "../../cmake/config/anakin_config.h.in" - "${PROJECT_BINARY_DIR}/anakin_config.h" -) - -if(ENABLE_DEBUG) - set(CMAKE_BUILD_TYPE Debug FORCE) -else() - set(CMAKE_BUILD_TYPE Release FORCE) - add_compile_options(-Ofast) - add_compile_options(-ffast-math) - add_compile_options(-Os) - add_compile_options(-DNDEBUG) -endif() - -set(anakin_lite_lib_so "anakin_lite") -set(anakin_lite_lib_static "anakin_lite_static") - -# disable shared library on xcode ios -if(USE_IOS) - set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE) -endif() -add_compile_options(-std=c++11) -add_compile_options(-fPIC) -if (USE_ANDROID) - #add_compile_options(-ffunction-sections) - #add_compile_options(-fdata-sections) - #add_compile_options(-fvisibility=hidden) - #add_compile_options(-fvisibility-inlines-hidden) - add_compile_options(-ldl) - add_compile_options(-Os) - add_compile_options(-Ofast) - if(USE_ARMV8) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections") - set(MAKE_STATIC_LINKER_FLAGS "${MAKE_STATIC_LINKER_FLAGS} -Wl,--gc-sections") - else() - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections -Wl,--icf=safe") - set(MAKE_STATIC_LINKER_FLAGS "${MAKE_STATIC_LINKER_FLAGS} -Wl,--gc-sections -Wl,--icf=safe") - endif() -endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions") - -#anakin_add_compile_option(-fstrict-aliasing) -add_compile_options(-W) -add_compile_options(-Wall) -add_compile_options(-Wno-unused-variable) # no unused-variable -add_compile_options(-Wformat) -add_compile_options(-Wmissing-declarations) -add_compile_options(-Winit-self) -add_compile_options(-Wpointer-arith) -add_compile_options(-Wno-shadow) -add_compile_options(-fpermissive) -add_compile_options(-Wsign-promo) -add_compile_options(-fdiagnostics-show-option) -add_compile_options(-Wno-undef) -add_compile_options(-Wno-narrowing) -add_compile_options(-Wno-unknown-pragmas) -add_compile_options(-Wno-delete-non-virtual-dtor) -add_compile_options(-Wno-comment) -add_compile_options(-Wno-sign-compare) -add_compile_options(-Wno-ignored-qualifiers) -add_compile_options(-Wno-enum-compare) -add_compile_options(-Wno-unused-parameter) -add_compile_options(-Wno-unused-function) - -if(USE_ANDROID) - add_compile_options(-pie) - if(USE_ARMV8) - else() - add_compile_options(-mfloat-abi=softfp) - add_compile_options(-mfpu=neon) - endif() - add_compile_options(-ffast-math) - 
add_compile_options(-lc) -endif() - -if(USE_OPENMP) - find_package(OpenMP REQUIRED) - if(OPENMP_FOUND OR OpenMP_CXX_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - message(STATUS "Found openmp in ${OPENMP_INCLUDE_DIR}") - message(STATUS " |-- openmp c flags: ${OpenMP_C_FLAGS}") - message(STATUS " |-- openmp cxx flags: ${OpenMP_CXX_FLAGS}") - message(STATUS " `-- openmp link flags: ${OpenMP_EXE_LINKER_FLAGS}") - include_directories(${OPENMP_INCLUDE_DIR}) - else() - message(FATAL_ERROR "Could not found openmp !") - endif() -endif() - -set(ANAKIN_LITE_SABER ${CMAKE_CURRENT_SOURCE_DIR}/../../saber/lite) -set(UNIT_TEST_LITE ${CMAKE_CURRENT_SOURCE_DIR}/../../test/lite) - -include_directories(${ANAKIN_LITE_SABER}/../) -include_directories(${ANAKIN_LITE_SABER}/../../) -include_directories(${PROJECT_BINARY_DIR}/) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/) - -FILE(GLOB BUILD_SRC_FILES1 "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") -FILE(GLOB BUILD_SRC_FILES2 "${ANAKIN_LITE_SABER}/core/*.cpp") -FILE(GLOB BUILD_SRC_FILES3 "${ANAKIN_LITE_SABER}/funcs/*.cpp") -FILE(GLOB BUILD_SRC_FILES4 "${ANAKIN_LITE_SABER}/funcs/neon/*.cpp") -FILE(GLOB BUILD_SRC_FILES5 "${ANAKIN_LITE_SABER}/funcs/neon/impl/*.cpp") -FILE(GLOB BUILD_SRC_FILES6 "${ANAKIN_LITE_SABER}/net/*.cpp") -FILE(GLOB BUILD_SRC_FILES7 "${ANAKIN_LITE_SABER}/utils/*.cpp") -FILE(GLOB HEADER_NET "${ANAKIN_LITE_SABER}/net/*.h") -FILE(GLOB HEADER_UTILS "${ANAKIN_LITE_SABER}/utils/*.h") - -if(USE_ANDROID) - FILE(GLOB UNIT_TEST_LITE_SRC "${UNIT_TEST_LITE}/*.cpp") -endif() - -if(USE_ANDROID_LOG) - find_library(log-lib log) -endif() - -add_library(ANAKIN_LITE_OBJS OBJECT ${BUILD_SRC_FILES1} ${BUILD_SRC_FILES2} ${BUILD_SRC_FILES3} ${BUILD_SRC_FILES4} ${BUILD_SRC_FILES5} - ${BUILD_SRC_FILES6} ${HEADER_NET} - ${BUILD_SRC_FILES7} ${HEADER_UTILS} - ) - -if(USE_ANDROID) - add_library(${anakin_lite_lib_so} SHARED $) - set_target_properties(${anakin_lite_lib_so} PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_INSTALL_PREFIX}/) - target_link_libraries(${anakin_lite_lib_so} ${log-lib}) -endif() -add_library(${anakin_lite_lib_static} STATIC $) -set_target_properties(${anakin_lite_lib_static} PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_INSTALL_PREFIX}/) - -if(USE_OPENCV) - # set your opencv path here - # for android - include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/arm-android/opencv/include/) - LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/arm-android/opencv/lib/armeabi-v7a/) -endif() - -if(BUILD_LITE_UNIT_TEST) - add_compile_options(-fexceptions) - # build test cases - foreach(SRC_NAME ${UNIT_TEST_LITE_SRC}) - #unpack the dir "/" - string(REPLACE "/" ";" SEXY_LIST ${SRC_NAME}) - list(GET SEXY_LIST -1 TEST_CASE_NAME) - #get the file name without suffix - string(REPLACE "." 
";" SEXY_LIST ${TEST_CASE_NAME}) - list(GET SEXY_LIST 0 TEST_CASE_NAME) - add_executable(${TEST_CASE_NAME} ${SRC_NAME}) - if(NO)#BUILD_SHARED) - target_link_libraries(${TEST_CASE_NAME} ${anakin_lite_lib_so}) - else() - target_link_libraries(${TEST_CASE_NAME} -Wl,--whole-archive ${anakin_lite_lib_static} -Wl,--no-whole-archive) - endif() - if(USE_OPENCV) - target_link_libraries(${TEST_CASE_NAME} -lopencv_core -lopencv_highgui -lopencv_imgproc - -ltbb -llibtiff -llibpng -llibjpeg -llibjasper -lIlmImf -lc -lz -llog -ldl) - endif() - if(USE_ANDROID_LOG) - target_link_libraries(${TEST_CASE_NAME} ${log-lib}) - endif() - set_target_properties(${TEST_CASE_NAME} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${CMAKE_INSTALL_PREFIX}/unit_test) - endforeach() -endif() - -FILE(GLOB MODEL_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/*.h") - -install(DIRECTORY ${ANAKIN_LITE_SABER}/../../saber/lite/core - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/saber/lite - FILES_MATCHING - PATTERN "*.h") -install(DIRECTORY ${ANAKIN_LITE_SABER}/../../saber/lite/net - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/saber/lite - FILES_MATCHING - PATTERN "*.h") -install(DIRECTORY ${ANAKIN_LITE_SABER}/../../saber/lite/utils - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/saber/lite - FILES_MATCHING - PATTERN "*.h") -install(FILES ${ANAKIN_LITE_SABER}/../../saber/lite/funcs/timer_lite.h - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/saber/lite/funcs) -install(FILES ${ANAKIN_LITE_SABER}/../../saber/lite/funcs/op_base.h - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/saber/lite/funcs) -install(FILES ${ANAKIN_LITE_SABER}/../../saber/lite/funcs/op_param.h - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/saber/lite/funcs) -install(FILES ${ANAKIN_LITE_SABER}/../../saber/saber_types.h - DESTINATION ${CMAKE_INSTALL_PREFIX}/include/saber) -install(FILES ${MODEL_HEADER} - DESTINATION ${CMAKE_INSTALL_PREFIX}/include) -install(FILES ${PROJECT_BINARY_DIR}/anakin_config.h - DESTINATION ${CMAKE_INSTALL_PREFIX}/include) - -install(TARGETS ${anakin_lite_lib_static} - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -if(USE_ANDROID) - install(TARGETS ${anakin_lite_lib_static} ${anakin_lite_lib_so} - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -endif() diff --git a/tools/anakin-lite/README.md b/tools/anakin-lite/README.md deleted file mode 100644 index 638fc8a05..000000000 --- a/tools/anakin-lite/README.md +++ /dev/null @@ -1,247 +0,0 @@ -# Anakin Lite -Anakin Lite是Anakin为移动端打造的轻量化前向计算库,支持AOT和通用两种模式。 -AOT模式是使用模型转换器根据具体一个模型生成与模型相关的`*.h`, `*.cpp`和模型文件`*.bin`,然后编译生成模型对应的库。 -通用模式是直接编译生成库,库是通用的,所需的模型文件只需通过模型转换器转换为`*.lite.bin)`(融合模型)或者`*.info, *.bin`(分立模型)即可使用。 -其中`*.info`表示模型的描述文件;`*.bin`表示模型的weights;`*.lite.bin`融合模型包含了模型的weights和模型描述文件。 -Anakin Lite 的特性包括: -* 支持ARMv7/v8架构 -* 支持Android和ios系统 -* 无第三方依赖 -* 支持openmp多线程 -* 支持大小核调度机制 -* 支持从memory加载模型 -* 简单易用的API -## 编译模型转换器 -1. 为宿主机编译安装protobuf -protobuf3.4.0 源码从这里[下载](https://github.com/google/protobuf/releases/tag/v3.4.0) -```bash -$ tar -xzf protobuf-3.4.0.tar.gz -$ cd protobuf-3.4.0 -$ ./autogen.sh -$ ./configure -$ make -$ make check -$ make install -``` -2. 编译模型转换器 -运行tools目录下build_lite.sh,编译完成后,会在output目录下生成generator文件夹 - -## AOT模式 -#### 一、使用模型转换器转换为`*.bin`模型和生成相应`*.h`, `*.cpp` #### -1. 
运行generator目录下的gen_code.sh,转换`*.anakin.bin`模型,输出目录选择到`tools/anakin_lite`, -'-a'参数为1,表示AOT模式。该命令会输出3个文件,`*.h`, `*.cpp`和`*.bin`。 -‘-m’参数为模型(model)所在路径,如“/home/Anakin/mobilenet.anakin.bin” -'-n'参数为生成三个文件的名字(name) -'-o'参数为生成文件的路径,一般设置在tools/anakin-lite目录 -‘-d’参数为Debug模式,默认为0,不开启Debug -```bash -$ sh gen_code.sh -a 1 -m /home/Anakin/mobilenet.anakin.bin -n mobilenet -o ../../tools/anakin-lite -d 0 -``` -2. 如果有多个模型,重复1的操作即可。 - -#### 二、使用脚本编译Anakin Lite库 #### -1. 编辑tools/anakin_lite目录下的脚本lite_android_build_armv7/8.sh,设置ANDROID_NDK路径。 -2. 运行脚本即可生成模型对应的库。 - -#### 三、测试模型(可选) #### -1. 根据具体的测试模型修改`test/lite/`目录下的`test_lite_aot_model.cpp`,编译完成后,使用adb push将tools/anakin_lite/output/unit_test目录下生成的test_lite_aot_model和模型`*.bin`拷贝到手机目录data/local/tmp -```bash -$ adb push tools/anakin_lite/output/unit_test/test_lite_model data/local/tmp -$ adb push tools/anakin_lite/*.bin data/local/tmp -``` -2. 使用adb shell命令运行test_lite_aot_model,用法为 -./test_lite_aot_model <模型文件> <预热次数> <执行次数> <大小核> <线程数> -大小核参数:0代表使用大核,1代表使用小核心。 -如测试model.bin,batch_size=1,预热十次,测试二十次,使用大核,四线程 -```bash -$ adb shell -$ cd data/local/tmp -$ ./test_lite_aot_model model.bin 1 10 20 0 4 -``` - -## 通用模式 - -#### 一、使用脚本编译Anakin Lite通用库 #### -1. 如使用过AOT模式,请删除tools/anakin_lite目录下的`.h`和`.cpp`文件。注释掉`test/lite/test_lite_model.cpp`AOT模式下添加的模型,如果没有编辑过该文件,则不需要修改。 -2. 编译Android库:编辑tools/anakin_lite目录下的脚本lite_android_build_armv7/8.sh,设置ANDROID_NDK路径。 -3. 编译IOS库:直接运行lite_ios_build_armv7/8.sh。 -4. 运行脚本即可生成通用库。 - -#### 二、使用模型转换器把模型转换为Lite版(已有Lite版模型文件可跳过) #### -1. 运行generator目录下的gen_code.sh,转换`*.anakin.bin`模型,输出目录选择到`tools/anakin_lite`, -'-a'参数为0,表示通用模式。该命令会输出3个模型文件`*.lite.bin`,`*.bin`, `*.info`,可以选择用融合的模型`*.lite.bin`或者同时使用`*.bin`和`*.info`。 -‘-m’参数为模型(model)所在路径,如“/home/Anakin/mobilenet.anakin.bin” -'-n'参数为生成模型文件的名字(name) -'-o'参数为生成文件的路径,一般设置在tools/anakin-lite目录 -‘-d’参数为Debug模式,默认为0,不开启Debug -```bash -$ sh gen_code.sh -a 0 -m /home/Anakin/mobilenet.anakin.bin -n mobilenet -o ../../tools/anakin-lite -d 0 -``` - -#### 三、测试模型(可选) #### -1. 使用adb push将tools/anakin_lite/output/unit_test目录下生成的test_lite_model或者test_lite_merged_model和模型`*.info, *.bin`或者`*.lite.bin`拷贝到手机目录data/local/tmp。内存加载模式可以参考test_lite_model_from_mem或者test_lite_merged_model_from_mem。 -```bash -$ adb push tools/anakin_lite/output/unit_test/test_lite_net data/local/tmp -$ adb push tools/anakin_lite/*.lite.bin data/local/tmp -``` -2. 使用adb shell命令运行test_lite_net,用法为 -./test_lite_net <模型文件> <预热次数> <执行次数> <大小核> <线程数> -大小核参数:0代表使用大核,1代表使用小核心 -如测试model.lite.bin,batch_size=1,预热十次,测试二十次,使用大核,四线程 -```bash -$ adb shell -$ cd data/local/tmp -$ ./test_lite_model model.lite.bin 1 10 20 0 4 -``` - -## API 使用说明 - -### Net -Net类是Anakin预测库对外的接口。 -1. 构造函数`Net(PowerMode mode = SABER_POWER_HIGH, int threads = 1)`: -说明:构造一个net,net可以加载模型,获取输入输出,并做预测。 -参数: -* `mode`:可以指定Android端大小核调度。默认参数`SABER_POWER_HIGH`:使用大核; -`SABER_POWER_LOW`:使用小核;`SABER_POWER_FULL`:可以同时使用大小核,优先使用大核;`SABER_POWER_NO_BIND`:不绑定大小核。 -* `threads`:指定前向计算的线程数(Android,Openmp),默认1个线程。当指定大小核时,线程数若超过核的数量,则线程数会设置为相应处理器核的数量。 -当模式是`SABER_POWER_FULL`或者`SABER_POWER_NO_BIND`时,输入线程数若超过总的处理器核数量时,线程数量会被设置为总核数。 - -2. 运行模式设置`set_run_mode(PowerMode mode, int threads)`: -说明:设置模型运行模式,支持Android系统,可以指定大小核和线程数量。 -参数:参考构造函数。 - -3. 从文件路径加载融合模型`load_model(const char* lite_model_path)`: -说明: 从文件路径加载模型,模型为`*.lite.bin`融合模型,包含网络信息和参数; -参数: `const char* lite_model_path`: 模型路径 -返回: 若加载成功,则返回`SaberSuccess`,否则返回错误代码; - -4. 
从文件路径加载分立模型`load_model(const char* info_path, const char* weights_path)`: -说明: 从文件路径加载分立模型,分别为网络信息和参数信息; -参数: -* `const char* info_path`: 模型网络信息 -* `const char* weights_path`:网络参数信息 -返回: 若加载成功,则返回`SaberSuccess`,否则返回错误代码; - -5. 从内存加载融合模型`load_model(const void* merged_memory, size_t mem_size)`: -说明: 从内存加载融合模型,包含网络信息和参数; -参数: -* `const void* merged_memory`: 融合模型 -* `size_t mem_size`:数据长度,单位bytes -返回: 若加载成功,则返回`SaberSuccess`,否则返回错误代码; - -6. 从内存加载分立模型`load_model(const void* info_memory, size_t info_size, const void* weights_memory, size_t weights_size)`: -说明: 从内存加载分立模型,分别为网络信息和参数信息; -参数: -* `const void* info_memory`: 模型网络信息 -* `size_t info_size`:数据长度,单位bytes -* `const void* weights_memory`:网络参数信息 -* `size_t weights_size`:数据长度,单位bytes -返回: 若加载成功,则返回`SaberSuccess`,否则返回错误枚举类型; - -7. 获取网络输入`std::vector*> get_input()`: -说明:获取net所有的输入tensor的指针,可以进行赋值和reshape操作 -返回:返回一个vector存放所有输入tensor的指针,tensor已经分配好空间。 - -8. 获取网络指定的输入`Tensor* get_input(std::string name)`: -说明:根据输入的名称,获取指定输入tensor指针 -参数:`std::string name`:输入tensor的名称,可以在网络图中获取 -返回:如果存在名字为`name`的tensor,则返回该tensor的指针,否则返回`nullptr` - -9. 获取网络全部输出`std::vector*> get_output()`: -说明: 获取网络所有输出tensor的指针 -返回:返回一个vector存放所有输出tensor的指针。 - -10. 获取网络指定输出`Tensor* get_output(std::string name)`: -说明:根据输入的名称,获取指定输出tensor指针 -参数:`std::string name`:输出tensor的名称,可以在网络图中获取 -返回:如果存在名字为`name`的tensor,则返回该tensor的指针,否则返回`nullptr` - -11. 网络前向计算`prediction()`: -说明: 网络前向计算 -返回: 如果成功返回`SaberSuccess`,如果有错误返回相应错误枚举类型。 - -### Tensor -`Tensor`类是Anakin lite的基础数据类型,Tensor是一个模板类, -支持移动端CPU,GPU,DSP等,支持数据类型有float,int8等。目前lite版仅支持CPU数据, -数据类型为float,即声明Tensor对象时需要指定模板为`Tensor` -Tensor支持内存的复用,因此Tensor包含当前有效维度信息`valid_shape`和总维度信息`Shape`, -在取数据时,需要注意用`valid_shape`和`valid_size`接口。 -1. 构造函数 -Tensor包含4个构造函数: -* `Tensor()`:空构造,声明一个空的tensor,没有分配数据空间; -* `Tensor(Shape shape)`:构造一个维度信息为`shape`的tensor,分配`shape`维度信息的数据空间; -* `Tensor(Dtype* data_ptr, Shape shape)`:从已有的数据构造一个tensor,不分配数据空间; -* `Tensor(const Tensor& tensor)`:拷贝构造函数,数据为浅拷贝 - -2. 设置tensor维度信息`set_shape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape())`: -说明:设置tensor的维度信息,不分配数据空间。 -参数: -* `valid_shape`:当前tensor有效数据维度信息 -* `shape`:当前tensor真正维度信息。默认为空,表示与valid_shape一致,shape始终要大于等于valid_shape -* `offset`:表示valid_shape偏移shape的维度信息,默认为空,只有在share_sub_buffer的情况下用到(该参数暂时没有用)。 -返回:如果成功返回`SaberSuccess`,否则返回错误枚举类型。 - -3. 重新分配空间`re_alloc(Shape shape)`: -说明:重新分配tensor内存空间,如果tensor已经分配了内存空间,则先释放该内存,重新申请一块内存。 -如果当前tensor是从别的tensor共享的(调用share_from),在调用此接口时会返回错误。 -参数:`shape`: tensor维度信息,调用该接口后,tensor内部的`valid_shape`和`shape`都变成输入的`shape`. -返回:如果成功返回`SaberSuccess`,否则返回错误枚举类型。 - -4. 调整内存空间`reshape(Shape valid_shape, Shape shape = Shape(), Shape offset = Shape())`: -说明:调整tensor内存空间和有效数据维度信息。该接口可以用于对网络(net)输入维度进行调整。如果tensor是通过`share_from`共享的, -则输入`shape`的大小不能超过原有tensor的`shape`的大小。 -参数: -* `valid_shape`:当前tensor有效数据维度信息 -* `shape`:当前tensor真正维度信息。默认为空,表示与valid_shape一致,shape始终要大于等于valid_shape -* `offset`:表示valid_shape偏移shape的维度信息,默认为空,只有在share_sub_buffer的情况下用到(该参数暂时没有用)。 -返回:如果成功返回`SaberSuccess`,否则返回错误枚举类型。 - -5. 获取有效维度信息`valid_shape()`: -说明:获取当前tensor有效的数据维度信息。 -返回:维度信息Shape - -6. 获取真实维度信息`shape()`: -说明:获取当前tensor真实的数据维度信息。 -返回:维度信息Shape - -7. 获取有效数据长度`valid_size()`: -说明:获取有效数据的长度 -返回:有效数据长度 - -8. 获取真实数据长度`size()`: -说明:获取有效数据的长度 -返回:有效数据长度 - -9. 获取可修改数据的指针`mutable_data(int index = 0)`: -说明:获取tensor的数据指针,可读写 -参数:`index`:数据起始地址,默认为0 -返回:数据指针 - -10. 获取只读数据的指针`data(int index = 0)`: -说明:获取tensor的数据指针,只读 -参数:`index`:数据起始地址,默认为0 -返回:数据指针 - -11. 
diff --git a/tools/anakin-lite/build_ios_merge.sh b/tools/anakin-lite/build_ios_merge.sh deleted file mode 100755 index 3b2d3ff0d..000000000 --- a/tools/anakin-lite/build_ios_merge.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# This script shows how one can build a merged ios lib. -ANAKIN_LITE_ROOT="$( cd "$(dirname "$0")" ; pwd -P)" -echo "-- Anakin lite root dir is: $ANAKIN_LITE_ROOT" - -BUILD_ROOT=$ANAKIN_LITE_ROOT -sh lite_ios_build_armv7.sh -sh lite_ios_build_armv8.sh -lipo -create build-ios-armv7/lib/libanakin_lite_static.a build-ios-armv8/lib/libanakin_lite_static.a -output libanakin_lite_static.a -OUT_DIR=$BUILD_ROOT/../../output -if [ -d $OUT_DIR/ios_merge ];then - rm -rf $OUT_DIR/ios_merge - mkdir -p $OUT_DIR/ios_merge/include - mkdir -p $OUT_DIR/ios_merge/lib -else - mkdir -p $OUT_DIR/ios_merge/include - mkdir -p $OUT_DIR/ios_merge/lib -fi - -cp -r $ANAKIN_LITE_ROOT/build-ios-armv8/include/ $OUT_DIR/ios_merge/include -cp $ANAKIN_LITE_ROOT/libanakin_lite_static.a $OUT_DIR/ios_merge/lib \ No newline at end of file diff --git a/tools/anakin-lite/lite_android_build_armv7.sh b/tools/anakin-lite/lite_android_build_armv7.sh deleted file mode 100755 index 42da593e1..000000000 --- a/tools/anakin-lite/lite_android_build_armv7.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# This script shows how one can build a anakin for the Android platform using android-tool-chain. -# IMPORTANT!!!!!!!!!!!!!! -# remove "-g" compile flags in "$ANDROID_NDK/build/cmake/android.toolchain.cmake" -# to remove debug info -export ANDROID_NDK=/home/public/android-ndk-r14b/ - -ANAKIN_LITE_ROOT="$( cd "$(dirname "$0")" ; pwd -P)" -echo "-- Anakin lite root dir is: $ANAKIN_LITE_ROOT" - -if [ -z "$ANDROID_NDK" ]; then - echo "-- Did you set ANDROID_NDK variable?" - exit 1 -fi - -if [ -d "$ANDROID_NDK" ]; then - echo "-- Using Android ndk at $ANDROID_NDK" -else - echo "-- Cannot find ndk: did you install it under $ANDROID_NDK ?" - exit 1 -fi - -# build the target into build_android. -BUILD_ROOT=$ANAKIN_LITE_ROOT/build-android-v7 - -#if [ -d $BUILD_ROOT ];then -# rm -rf $BUILD_ROOT -#fi - -mkdir -p $BUILD_ROOT -echo "-- Build anakin lite Android into: $BUILD_ROOT" - -# Now, actually build the android target. -echo "-- Building anakin lite ..." -cd $BUILD_ROOT -#-DCMAKE_TOOLCHAIN_FILE=../../../cmake/android/android.toolchain.cmake \ # set toolchain file to file in this project -#-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ # set toolchain file to NDK default -#-DANDROID_STL=gnustl_static \ # set stl lib -#-DANDROID_TOOLCHAIN=clang \ # set compile to gcc or clang -cmake .. 
\ - -DCMAKE_TOOLCHAIN_FILE=../../../cmake/android/android.toolchain.cmake \ - -DANDROID_NDK=$ANDROID_NDK \ - -DANDROID_NATIVE_API_LEVEL=19 \ - -DANDROID_ABI="armeabi-v7a with NEON" \ - -DENABLE_DEBUG=NO \ - -DUSE_ARMV8=NO \ - -DUSE_ANDROID=YES \ - -DTARGET_IOS=NO \ - -DUSE_OPENMP=YES \ - -DBUILD_LITE_UNIT_TEST=YES \ - -DUSE_OPENCV=NO \ - -DENABLE_OP_TIMER=NO \ - -DUSE_ANDROID_LOG=NO - -# build target lib or unit test. -if [ "$(uname)" = 'Darwin' ]; then - make "-j$(sysctl -n hw.ncpu)" && make install -else - make "-j$(nproc)" && make install -fi - -OUT_DIR=$BUILD_ROOT/../../../output -if [ -d $OUT_DIR/android_armv7 ];then - rm -rf $OUT_DIR/android_armv7 - mkdir -p $OUT_DIR/android_armv7/include - mkdir -p $OUT_DIR/android_armv7/lib -else - mkdir -p $OUT_DIR/android_armv7/include - mkdir -p $OUT_DIR/android_armv7/lib -fi - -cp -r include/ $OUT_DIR/android_armv7/include -cp -r lib/ $OUT_DIR/android_armv7/lib -cp -r unit_test/ $OUT_DIR/android_armv7/unit_test \ No newline at end of file diff --git a/tools/anakin-lite/lite_android_build_armv8.sh b/tools/anakin-lite/lite_android_build_armv8.sh deleted file mode 100755 index 620646036..000000000 --- a/tools/anakin-lite/lite_android_build_armv8.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# This script shows how one can build a anakin for the Android platform using android-tool-chain. -# IMPORTANT!!!!!!!!!!!!!! -# remove "-g" compile flags in "$ANDROID_NDK/build/cmake/android.toolchain.cmake" -# to remove debug info -export ANDROID_NDK=/home/public/android-ndk-r14b/ - -ANAKIN_LITE_ROOT="$( cd "$(dirname "$0")" ; pwd -P)" -echo "-- Anakin lite root dir is: $ANAKIN_LITE_ROOT" - -if [ -z "$ANDROID_NDK" ]; then - echo "-- Did you set ANDROID_NDK variable?" - exit 1 -fi - -if [ -d "$ANDROID_NDK" ]; then - echo "-- Using Android ndk at $ANDROID_NDK" -else - echo "-- Cannot find ndk: did you install it under $ANDROID_NDK ?" - exit 1 -fi - -# build the target into build_android. -BUILD_ROOT=$ANAKIN_LITE_ROOT/build-android-v8 - -#if [ -d $BUILD_ROOT ];then -# rm -rf $BUILD_ROOT -#fi - -mkdir -p $BUILD_ROOT -echo "-- Build anakin lite Android into: $BUILD_ROOT" - -# Now, actually build the android target. -echo "-- Building anakin lite ..." -cd $BUILD_ROOT -#-DCMAKE_TOOLCHAIN_FILE=../../../cmake/android/android.toolchain.cmake \ # set toolchain file to file in this project -#-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ # set toolchain file to NDK default -#-DANDROID_STL=gnustl_static \ # set stl lib -#-DANDROID_TOOLCHAIN=clang \ # set compile to gcc or clang -cmake .. \ - -DCMAKE_TOOLCHAIN_FILE=../../../cmake/android/android.toolchain.cmake \ - -DANDROID_NDK=$ANDROID_NDK \ - -DANDROID_NATIVE_API_LEVEL=21 \ - -DANDROID_ABI="arm64-v8a" \ - -DENABLE_DEBUG=NO \ - -DUSE_ARMV8=YES \ - -DUSE_ANDROID=YES \ - -DTARGET_IOS=NO \ - -DUSE_OPENMP=YES \ - -DBUILD_LITE_UNIT_TEST=YES \ - -DUSE_OPENCV=NO \ - -DENABLE_OP_TIMER=NO \ - -DUSE_ANDROID_LOG=NO - -# build target lib or unit test. 
-if [ "$(uname)" = 'Darwin' ]; then - make "-j$(sysctl -n hw.ncpu)" && make install -else - make "-j$(nproc)" && make install -fi - -OUT_DIR=$BUILD_ROOT/../../../output -if [ -d $OUT_DIR/android_armv8 ];then - rm -rf $OUT_DIR/android_armv8 - mkdir -p $OUT_DIR/android_armv8/include - mkdir -p $OUT_DIR/android_armv8/lib -else - mkdir -p $OUT_DIR/android_armv8/include - mkdir -p $OUT_DIR/android_armv8/lib -fi - -cp -r include/ $OUT_DIR/android_armv8/include -cp -r lib/ $OUT_DIR/android_armv8/lib -cp -r unit_test/ $OUT_DIR/android_armv8/unit_test \ No newline at end of file diff --git a/tools/anakin-lite/lite_ios_build_armv7.sh b/tools/anakin-lite/lite_ios_build_armv7.sh deleted file mode 100755 index b4a264e15..000000000 --- a/tools/anakin-lite/lite_ios_build_armv7.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# This script shows how one can build a anakin for the Android platform using android-tool-chain. - -ANAKIN_LITE_ROOT="$( cd "$(dirname "$0")" ; pwd -P)" -echo "-- Anakin lite root dir is: $ANAKIN_LITE_ROOT" - -# build the target into build_android. -BUILD_ROOT=$ANAKIN_LITE_ROOT/build-ios-armv7 - -#if [ -d $BUILD_ROOT ];then -# rm -rf $BUILD_ROOT -#fi - -mkdir -p $BUILD_ROOT -echo "-- Build anakin lite ios into: $BUILD_ROOT" - -# Now, actually build the android target. -echo "-- Building anakin lite ..." -cd $BUILD_ROOT - -cmake .. \ - -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios/ios.toolchain.cmake \ - -DENABLE_DEBUG=NO \ - -DIOS_PLATFORM=iPhoneOS \ - -DUSE_ARMV8=NO \ - -DCMAKE_OSX_ARCHITECTURES=armv7 \ - -DUSE_IOS=YES \ - -DUSE_ANDROID=NO \ - -DTARGET_IOS=YES \ - -DUSE_OPENMP=NO \ - -DBUILD_LITE_UNIT_TEST=NO \ - -DUSE_OPENCV=NO \ - -DENABLE_OP_TIMER=NO \ - -DUSE_ANDROID_LOG=NO - -# build target lib or unit test. -if [ "$(uname)" = 'Darwin' ]; then - make "-j$(sysctl -n hw.ncpu)" && make install -else - make "-j$(nproc)" && make install -fi -OUT_DIR=$BUILD_ROOT/../../../output -if [ -d $OUT_DIR/ios_armv7 ];then - rm -rf $OUT_DIR/ios_armv7 - mkdir -p $OUT_DIR/ios_armv7/include - mkdir -p $OUT_DIR/ios_armv7/lib -else - mkdir -p $OUT_DIR/ios_armv7/include - mkdir -p $OUT_DIR/ios_armv7/lib -fi - -cp -r include/ $OUT_DIR/ios_armv7/include -cp -r lib/ $OUT_DIR/ios_armv7/lib \ No newline at end of file diff --git a/tools/anakin-lite/lite_ios_build_armv8.sh b/tools/anakin-lite/lite_ios_build_armv8.sh deleted file mode 100755 index dccc015e5..000000000 --- a/tools/anakin-lite/lite_ios_build_armv8.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# This script shows how one can build a anakin for the Android platform using android-tool-chain. - -ANAKIN_LITE_ROOT="$( cd "$(dirname "$0")" ; pwd -P)" -echo "-- Anakin lite root dir is: $ANAKIN_LITE_ROOT" - -# build the target into build_android. -BUILD_ROOT=$ANAKIN_LITE_ROOT/build-ios-armv8 - -#if [ -d $BUILD_ROOT ];then -# rm -rf $BUILD_ROOT -#fi - -mkdir -p $BUILD_ROOT -echo "-- Build anakin lite ios into: $BUILD_ROOT" - -# Now, actually build the android target. -echo "-- Building anakin lite ..." -cd $BUILD_ROOT - -cmake .. \ - -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios/ios.toolchain.cmake \ - -DENABLE_DEBUG=NO \ - -DIOS_PLATFORM=iPhoneOS \ - -DUSE_ARMV8=YES \ - -DCMAKE_OSX_ARCHITECTURES=arm64 \ - -DUSE_IOS=YES \ - -DUSE_ANDROID=NO \ - -DTARGET_IOS=YES \ - -DUSE_OPENMP=NO \ - -DBUILD_LITE_UNIT_TEST=NO \ - -DUSE_OPENCV=NO \ - -DENABLE_OP_TIMER=NO \ - -DUSE_ANDROID_LOG=NO - -# build target lib or unit test. 
-if [ "$(uname)" = 'Darwin' ]; then - make "-j$(sysctl -n hw.ncpu)" && make install -else - make "-j$(nproc)" && make install -fi - -OUT_DIR=$BUILD_ROOT/../../../output -if [ -d $OUT_DIR/ios_armv8 ];then - rm -rf $OUT_DIR/ios_armv8 - mkdir -p $OUT_DIR/ios_armv8/include - mkdir -p $OUT_DIR/ios_armv8/lib -else - mkdir -p $OUT_DIR/ios_armv8/include - mkdir -p $OUT_DIR/ios_armv8/lib -fi - -cp -r include/ $OUT_DIR/ios_armv8/include -cp -r lib/ $OUT_DIR/ios_armv8/lib \ No newline at end of file diff --git a/tools/android_build_v7.sh b/tools/android_build_v7.sh new file mode 100755 index 000000000..12e6df580 --- /dev/null +++ b/tools/android_build_v7.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# This script shows how one can build a anakin for the Android platform using android-tool-chain. +export ANDROID_NDK=/Users/chenjiao04/Documents/android-ndk-r16b/ +export ARM_PROTOBUF_ROOT=/home/public/arm-android/protobuf + +ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" +echo "-- Anakin root dir is: $ANAKIN_ROOT" + +if [ -z "$ANDROID_NDK" ]; then + echo "-- Did you set ANDROID_NDK variable?" + exit 1 +fi + +if [ -d "$ANDROID_NDK" ]; then + echo "-- Using Android ndk at $ANDROID_NDK" +else + echo "-- Cannot find ndk: did you install it under $ANDROID_NDK ?" + exit 1 +fi + +# build the target into build_android. +BUILD_ROOT=$ANAKIN_ROOT/android_build_armv7 + +# if [ -d $BUILD_ROOT ];then +# rm -rf $BUILD_ROOT +# fi + +mkdir -p $BUILD_ROOT +echo "-- Build anakin Android into: $BUILD_ROOT" + +# Now, actually build the android target. +#../cmake/android/android.toolchain.cmake \ +#"armeabi-v7a with NEON" \ "arm64-v8a" \ +# -DANDROID_STL=c++_static \ +echo "-- Building anakin ..." +cd $BUILD_ROOT +# rm -rf * +cmake .. \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_NDK=$ANDROID_NDK \ + -DCMAKE_BUILD_TYPE=Release \ + -DANDROID_ABI="armeabi-v7a with NEON" \ + -DANDROID_TOOLCHAIN=gcc \ + -DANDROID_NATIVE_API_LEVEL=21 \ + -DUSE_ARM_PLACE=YES \ + -DUSE_GPU_PLACE=NO \ + -DUSE_X86_PLACE=NO \ + -DTARGET_ANDROID=YES \ + -DBUILD_WITH_UNIT_TEST=YES \ + -DUSE_PYTHON=OFF \ + -DENABLE_DEBUG=NO \ + -DENABLE_VERBOSE_MSG=NO \ + -DDISABLE_ALL_WARNINGS=YES \ + -DENABLE_NOISY_WARNINGS=NO \ + -DUSE_OPENMP=YES \ + -DENABLE_OP_TIMER=NO \ + -DBUILD_SHARED=NO\ + -DBUILD_EXAMPLES=NO \ + -DBUILD_WITH_FRAMEWORK=YES \ + -DUSE_OPENCV=NO + +# build target lib or unit test. +if [ "$(uname)" = 'Darwin' ]; then + make -j4 # && make install +else + make -j4 # && make install +fi + diff --git a/tools/andrid_build.sh b/tools/android_build_v8.sh old mode 100755 new mode 100644 similarity index 52% rename from tools/andrid_build.sh rename to tools/android_build_v8.sh index f34aa7fbe..c17eb0d1e --- a/tools/andrid_build.sh +++ b/tools/android_build_v8.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script shows how one can build a anakin for the Android platform using android-tool-chain. -export ANDROID_NDK=/home/public/android-ndk-r14b +# This script shows how one can build a anakin for the Android platform using android-tool-chain. +export ANDROID_NDK=/Users/zhangxi20/Downloads/android-ndk-r16b/ export ARM_PROTOBUF_ROOT=/home/public/arm-android/protobuf ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" @@ -19,41 +19,44 @@ else fi # build the target into build_android. 
-BUILD_ROOT=$ANAKIN_ROOT/android_build +BUILD_ROOT=$ANAKIN_ROOT/android_build_armv8 -#if [ -d $BUILD_ROOT ];then -# rm -rf $BUILD_ROOT -#fi +# if [ -d $BUILD_ROOT ];then +# rm -rf $BUILD_ROOT +# fi mkdir -p $BUILD_ROOT echo "-- Build anakin Android into: $BUILD_ROOT" # Now, actually build the android target. +#../cmake/android/android.toolchain.cmake \ +#"armeabi-v7a with NEON" \ "arm64-v8a" \ +# -DANDROID_STL=c++_static \ echo "-- Building anakin ..." cd $BUILD_ROOT - +rm -rf * cmake .. \ - -DCMAKE_TOOLCHAIN_FILE=../cmake/android/android.toolchain.cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_NDK=$ANDROID_NDK \ -DCMAKE_BUILD_TYPE=Release \ - -DANDROID_ABI="armeabi-v7a with NEON" \ - -DANDROID_NATIVE_API_LEVEL=21 \ - -DUSE_ARM_PLACE=YES \ - -DUSE_GPU_PLACE=NO \ - -DUSE_X86_PLACE=NO \ - -DUSE_BM_PLACE=NO \ - -DTARGET_ANDROID=YES \ - -DBUILD_WITH_UNIT_TEST=YES \ + -DANDROID_ABI="arm64-v8a" \ + -DANDROID_TOOLCHAIN=gcc \ + -DANDROID_NATIVE_API_LEVEL=21 \ + -DUSE_ARM_PLACE=YES \ + -DUSE_GPU_PLACE=NO \ + -DUSE_X86_PLACE=NO \ + -DTARGET_ANDROID=YES \ + -DBUILD_WITH_UNIT_TEST=YES \ -DUSE_PYTHON=OFF \ - -DENABLE_DEBUG=NO \ - -DENABLE_VERBOSE_MSG=NO \ - -DDISABLE_ALL_WARNINGS=YES \ - -DENABLE_NOISY_WARNINGS=NO \ + -DENABLE_DEBUG=NO \ + -DENABLE_VERBOSE_MSG=NO \ + -DDISABLE_ALL_WARNINGS=YES \ + -DENABLE_NOISY_WARNINGS=NO \ -DUSE_OPENMP=YES\ - -DBUILD_SHARED=NO\ - -DBUILD_WITH_UNIT_TEST=YES\ - -DBUILD_EXAMPLES=NO\ - -DUSE_OPENCV=NO + -DBUILD_SHARED=NO\ + -DBUILD_EXAMPLES=NO \ + -DBUILD_WITH_FRAMEWORK=NO \ + -DUSE_OPENCV=NO # build target lib or unit test. if [ "$(uname)" = 'Darwin' ]; then diff --git a/tools/build_android_protobuf_gcc_armv7.sh b/tools/build_android_protobuf_gcc_armv7.sh new file mode 100644 index 000000000..1c2bfcbd8 --- /dev/null +++ b/tools/build_android_protobuf_gcc_armv7.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# This script shows how one can build protobuf for the Android platform using android-tool-chain. +# IMPORTANT!!!!!!!!!!!!!! +# remove "-g" compile flags in "$ANDROID_NDK/build/cmake/android.toolchain.cmake" +# to remove debug info +# set your ndk path to ANDROID_NDK +# NDK version is up to r16b, the latest version(r18b) remove gcc from toolchain +# firstly, download the release version of protobuf or git clone the protobuf project, recoment version v3.5.0 +# copy this script to protobuf_path/cmake/ +# run this script by: sh build_android_protobuf_gcc_armv7.sh +set -e +export ANDROID_NDK=/home/public/android-ndk-r16b + +protobuf_ROOT="$( cd "$(dirname "$0")" ; pwd -P)" +echo "-- protobuf root dir is: $protobuf_ROOT" + +if [ -z "$ANDROID_NDK" ]; then + echo "-- Did you set ANDROID_NDK variable?" + exit 1 +fi + +if [ -d "$ANDROID_NDK" ]; then + echo "-- Using Android ndk at $ANDROID_NDK" +else + echo "-- Cannot find ndk: did you install it under $ANDROID_NDK ?" + exit 1 +fi + +# remove protoc in CMakeList.txt and install.cmake +sed -i "s/include(libprotoc.cmake)/#/g" CMakeLists.txt +sed -i "s/include(protoc.cmake)/#/g" CMakeLists.txt +sed -i "s/libprotoc)/)/g" install.cmake +sed -i "s/install(TARGETS protoc EXPORT protobuf-targets/#/g" install.cmake +sed -i "s/RUNTIME DESTINATION \${CMAKE_INSTALL_BINDIR} COMPONENT protoc)/#/g" install.cmake +sed -i "s/export(TARGETS libprotobuf-lite libprotobuf libprotoc protoc/export(TARGETS libprotobuf-lite libprotobuf/g" install.cmake + +# build the target into build_android. 
+BUILD_ROOT=$protobuf_ROOT/build-protobuf-android-v7-gcc +mkdir -p $BUILD_ROOT +echo "-- Build protobuf Android into: $BUILD_ROOT" + +# Now, actually build the android target. +echo "-- Building anakin lite ..." +cd $BUILD_ROOT +cmake .. \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_NDK=$ANDROID_NDK \ + -DANDROID_NATIVE_API_LEVEL=17 \ + -DANDROID_ABI="armeabi-v7a with NEON" \ + -DANDROID_TOOLCHAIN=gcc \ + -DCMAKE_BUILD_TYPE=Release \ + -Dprotobuf_BUILD_EXAMPLES=OFF \ + -Dprotobuf_BUILD_TESTS=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=OFF \ + -Dprotobuf_BUILD_STATIC_LIBS=ON \ + -Dprotobuf_BUILD_SHARED_LIBS=OFF \ + -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ + -DANDROID_STL=c++_shared \ + -DANDROID_LINKER_FLAGS="-landroid -llog" \ + -DANDROID_CPP_FEATURES="rtti exceptions" \ + + +# build target lib or unit test. +if [ "$(uname)" = 'Darwin' ]; then + make "-j$(sysctl -n hw.ncpu)" && make install +else + make "-j$(nproc)" && make install +fi diff --git a/tools/build_android_protobuf_gcc_armv8.sh b/tools/build_android_protobuf_gcc_armv8.sh new file mode 100644 index 000000000..d44bf9342 --- /dev/null +++ b/tools/build_android_protobuf_gcc_armv8.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# This script shows how one can build protobuf for the Android platform using android-tool-chain. +# IMPORTANT!!!!!!!!!!!!!! +# remove "-g" compile flags in "$ANDROID_NDK/build/cmake/android.toolchain.cmake" +# to remove debug info +# set your ndk path to ANDROID_NDK +# NDK version is up to r16b, the latest version(r18b) remove gcc from toolchain +# firstly, download the release version of protobuf or git clone the protobuf project, recoment version v3.5.0 +# copy this script to protobuf_path/cmake/ +# run this script by: sh build_android_protobuf_gcc_armv8.sh +set -e +export ANDROID_NDK=/home/public/android-ndk-r16b + +protobuf_ROOT="$( cd "$(dirname "$0")" ; pwd -P)" +echo "-- protobuf root dir is: $protobuf_ROOT" + +if [ -z "$ANDROID_NDK" ]; then + echo "-- Did you set ANDROID_NDK variable?" + exit 1 +fi + +if [ -d "$ANDROID_NDK" ]; then + echo "-- Using Android ndk at $ANDROID_NDK" +else + echo "-- Cannot find ndk: did you install it under $ANDROID_NDK ?" + exit 1 +fi + +# remove protoc in CMakeList.txt and install.cmake +sed -i "s/include(libprotoc.cmake)/#/g" CMakeLists.txt +sed -i "s/include(protoc.cmake)/#/g" CMakeLists.txt +sed -i "s/libprotoc)/)/g" install.cmake +sed -i "s/install(TARGETS protoc EXPORT protobuf-targets/#/g" install.cmake +sed -i "s/RUNTIME DESTINATION \${CMAKE_INSTALL_BINDIR} COMPONENT protoc)/#/g" install.cmake +sed -i "s/export(TARGETS libprotobuf-lite libprotobuf libprotoc protoc/export(TARGETS libprotobuf-lite libprotobuf/g" install.cmake + +# build the target into build_android. +BUILD_ROOT=$protobuf_ROOT/build-protobuf-android-v8-gcc +mkdir -p $BUILD_ROOT +echo "-- Build protobuf Android into: $BUILD_ROOT" + +# Now, actually build the android target. +echo "-- Building anakin lite ..." +cd $BUILD_ROOT +cmake .. 
\ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_NDK=$ANDROID_NDK \ + -DANDROID_NATIVE_API_LEVEL=21 \ + -DANDROID_ABI="arm64-v8a" \ + -DANDROID_TOOLCHAIN=gcc \ + -DCMAKE_BUILD_TYPE=Release \ + -Dprotobuf_BUILD_EXAMPLES=OFF \ + -Dprotobuf_BUILD_TESTS=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=OFF \ + -Dprotobuf_BUILD_STATIC_LIBS=ON \ + -Dprotobuf_BUILD_SHARED_LIBS=OFF \ + -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ + -DANDROID_STL=c++_shared \ + -DANDROID_LINKER_FLAGS="-landroid -llog" \ + -DANDROID_CPP_FEATURES="rtti exceptions" \ + + +# build target lib or unit test. +if [ "$(uname)" = 'Darwin' ]; then + make "-j$(sysctl -n hw.ncpu)" && make install +else + make "-j$(nproc)" && make install +fi diff --git a/tools/build_lite.sh b/tools/build_lite.sh index d67e20921..5b5c2c5ca 100755 --- a/tools/build_lite.sh +++ b/tools/build_lite.sh @@ -1,5 +1,7 @@ #!/bin/bash # This script shows how one can build a anakin for the gpu platform +set -e + ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" echo "-- Anakin root dir is: $ANAKIN_ROOT" @@ -8,7 +10,7 @@ BUILD_ROOT=$ANAKIN_ROOT/lite_build mkdir -p $BUILD_ROOT echo "-- Build anakin lite into: $BUILD_ROOT" - +export PATH=/Users/scmtools/buildkit/cmake/cmake-3.8.2/bin:$PATH # Now, actually build the gpu target. echo "-- Building anakin ..." cd $BUILD_ROOT @@ -22,6 +24,8 @@ cmake .. \ -DUSE_PYTHON=OFF \ -DENABLE_DEBUG=NO \ -DENABLE_VERBOSE_MSG=NO \ + -DENABLE_MIN_DEPENDENCY=YES \ + -DPROTOBUF_ROOT=/Users/scmbuild/workspaces_cluster/baidu.sys-hic-gpu.Anakin-2.0/baidu/sys-hic-gpu/Anakin-2.0/protobuf/ \ -DDISABLE_ALL_WARNINGS=YES \ -DENABLE_NOISY_WARNINGS=NO \ -DUSE_OPENMP=NO \ @@ -31,8 +35,8 @@ cmake .. \ # build target lib or unit test. if [ "$(uname)" = 'Darwin' ]; then - make "-j$(sysctl -n hw.ncpu)" && make install + make "-j$(sysctl -n hw.ncpu)" install else - make "-j$(nproc)" && make install + make "-j$(nproc)" install fi diff --git a/tools/build_lite_arm.sh b/tools/build_lite_arm.sh new file mode 100755 index 000000000..0c5e78c7f --- /dev/null +++ b/tools/build_lite_arm.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# This script shows how one can build a anakin for the gpu platform +set -e + +ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" +echo "-- Anakin root dir is: $ANAKIN_ROOT" + +# build the target into gpu_build. +BUILD_ROOT=$ANAKIN_ROOT/lite_build + +mkdir -p $BUILD_ROOT +echo "-- Build anakin lite into: $BUILD_ROOT" +export PATH=/Users/scmtools/buildkit/cmake/cmake-3.8.2/bin:$PATH +# Now, actually build the gpu target. +echo "-- Building anakin ..." +cd $BUILD_ROOT + +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_ARM_PLACE=NO \ + -DUSE_GPU_PLACE=NO \ + -DUSE_X86_PLACE=NO \ + -DBUILD_WITH_UNIT_TEST=NO \ + -DUSE_PYTHON=OFF \ + -DENABLE_DEBUG=NO \ + -DENABLE_VERBOSE_MSG=NO \ + -DDISABLE_ALL_WARNINGS=YES \ + -DENABLE_NOISY_WARNINGS=NO \ + -DUSE_OPENMP=NO \ + -DBUILD_SHARED=YES \ + -DBUILD_EXAMPLES=NO \ + -DBUILD_LITE=YES + +# build target lib or unit test. +if [ "$(uname)" = 'Darwin' ]; then + make "-j$(sysctl -n hw.ncpu)" install +else + make "-j$(nproc)" install +fi + diff --git a/tools/external_converter_v2/config.py b/tools/external_converter_v2/config.py index 623023cbb..395573299 100644 --- a/tools/external_converter_v2/config.py +++ b/tools/external_converter_v2/config.py @@ -2,6 +2,8 @@ # Copyright (c) 2017, Cuichaowen. All rights reserved. # -*- coding: utf-8 -*- +import argparse +import enum import os import sys import subprocess @@ -19,9 +21,12 @@ class Configuration: Parse the config.yaml file. 
Configuration holds all the params defined in configfile. """ - def __init__(self, argv, config_file_path=ConfigFilePath): + def __init__(self, args, config_file_path=ConfigFilePath): data = load(open(config_file_path, 'r').read()) + self.fill_config_from_args(args, data) + # parse Options from config file. + self.DebugConfig = data['DEBUG'] if 'DEBUG' in data else None self.framework = data['OPTIONS']['Framework'] self.SavePath = data['OPTIONS']['SavePath'] \ if data['OPTIONS']['SavePath'][-1] == '/' \ @@ -38,89 +43,86 @@ def __init__(self, argv, config_file_path=ConfigFilePath): self.logger_dict = data['OPTIONS']['LOGGER'] self.framework_config_dict = data['TARGET'][self.framework] self.check_protobuf_version() - if len(argv) > 1: - self.config_from_cmd(argv) if 'ProtoPaths' in data['TARGET'][self.framework].keys(): proto_list = data['TARGET'][self.framework]['ProtoPaths'] self.__refresh_pbs(proto_list) self.generate_pbs_of_anakin() - def config_from_cmd(self, argv): - """ - Read configuration information from the command line. + def fill_config_from_args(self, args, data): + """Fill config from args """ - cmd = { - 'CAFFE': { - 'proto': ['ProtoPaths', list()], - 'prototxt': ['PrototxtPath', str()], - 'caffemodel': ['ModelPath', str()], - }, - 'FLUID': { - 'modelpath': ['ModelPath', str()], - 'type': ['NetType', str()], - }, - } - err_note = '\nUsage1: python ./converter.py ' \ - + 'CAFFE --proto=/path/to/filename1.proto ' \ - + '--prototxt=/path/to/filename.prototxt ' \ - + '--caffemodel=/path/to/filename.caffemodel\n' \ - + 'Usage2: python ./converter.py ' \ - + 'FLUID --modelpath=/model/path/ --type=OCR' - def splitter(arg, key_delim='--', val_delim='='): - """ - Extract the valid content of the parameter string to form a [key, val] list. - """ - if (key_delim in arg) and (val_delim in arg): - element = arg.split(key_delim)[1].split(val_delim) - return element - else: - raise NameError(err_note) - def filler(arg, dic, val_idx=1): - """ - Extract the valid content of the parameter string to form a [key, val] list. - """ - element = splitter(arg) - key = element[0] - val = element[1] - assert key in dic.keys(), \ - "Param %s in cmd is wrong." % (key) - if type(dic[key][val_idx]) == str: dic[key][val_idx] = val - elif type(dic[key][val_idx]) == list: dic[key][val_idx].append(val) - def null_scanner(dic, val_idx=1): - """ - Make sure the parameters are complete. - """ - for key in dic: - assert (bool(dic[key][val_idx])), 'Key [%s] should not be null.' % (key) - def arg_transmit(dic, target, key_idx=0, val_idx=1): - """ - Match the command line to yaml. - """ - if target == 'CAFFE': - self.ResultName = dic['caffemodel'][val_idx].split("/")[-1].split('.caffemodel')[0] - elif target == 'FLUID': - if dic['modelpath'][-1] == '/': - self.ResultName = dic['modelpath'][val_idx].split("/")[-2] - else: - self.ResultName = dic['modelpath'][val_idx].split("/")[-1] - else: - raise NameError(err_note) - for cmd_key in cmd[target].keys(): - key = dic[cmd_key][key_idx] - val = dic[cmd_key][val_idx] - self.framework_config_dict[key] = val - self.LaunchBoard = False - target = argv[1] - assert target in cmd.keys(), "Framework [%s] is not yet supported." 
% (target) - for arg in argv[2:]: - filler(arg, cmd[target]) - null_scanner(cmd[target]) - arg_transmit(cmd[target], target) + # set common args + if args.debug is not None: + data['DEBUG'] = args.debug + if args.framework is not None: + data['OPTIONS']['Framework'] = str(args.framework) + if args.save_path is not None: + data['OPTIONS']['SavePath'] = args.save_path + if args.result_name is not None: + data['OPTIONS']['ResultName'] = args.result_name + if args.open_launch_board is not None: + data['OPTIONS']['Config']['LaunchBoard'] = True if args.open_launch_board != 0 else False + if args.board_server_ip is not None: + data['OPTIONS']['Config']['Server']['ip'] = args.board_server_ip + if args.board_server_port is not None: + data['OPTIONS']['Config']['Server']['port'] = args.board_server_port + if args.optimized_graph_enable is not None: + data['OPTIONS']['Config']['OptimizedGraph']['enable'] = True if args.optimized_graph_enable != 0 else False + if args.optimized_graph_path is not None: + data['OPTIONS']['Config']['OptimizedGraph']['path'] = args.optimized_graph_path + if args.log_path is not None: + data['OPTIONS']['LOGGER']['LogToPath'] = args.log_path + if args.log_with_color is not None: + data['OPTIONS']['LOGGER']['WithColor'] = args.log_with_color + + # set framwork specific args + # caffe + if args.caffe_proto_paths is not None: + data['TARGET']['CAFFE']['ProtoPaths'] = args.caffe_proto_paths + if args.caffe_proto_txt_path is not None: + data['TARGET']['CAFFE']['PrototxtPath'] = args.caffe_proto_txt_path + if args.caffe_model_path is not None: + data['TARGET']['CAFFE']['ModelPath'] = args.caffe_model_path + if args.caffe_remark is not None: + data['TARGET']['CAFFE']['Remark'] = args.caffe_remark + + # fluid + if args.fluid_debug is not None: + data['TARGET']['FLUID']['Debug'] = args.fluid_debug + if args.fluid_model_path is not None: + data['TARGET']['FLUID']['ModelPath'] = args.fluid_model_path + if args.fluid_net_type is not None: + data['TARGET']['FLUID']['NetType'] = args.fluid_net_type + + # lego + if args.lego_proto_path is not None: + data['TARGET']['LEGO']['ProtoPath'] = args.lego_proto_path + if args.lego_prototxt_path is not None: + data['TARGET']['LEGO']['PrototxtPath'] = args.lego_prototxt_path + if args.lego_model_path is not None: + data['TARGET']['LEGO']['ModelPath'] = args.lego_model_path + + # tensorflow + if args.tensorflow_model_path is not None: + data['TARGET']['TENSORFLOW']['ModelPath'] = args.tensorflow_model_path + if args.tensorflow_outputs is not None: + data['TARGET']['TENSORFLOW']['OutPuts'] = args.tensorflow_outputs + + # onnx + if args.onnx_model_path is not None: + data['TARGET']['ONNX']['ModelPath'] = args.onnx_model_path + + # houyi + if args.houyi_model_path is not None: + data['TARGET']['HOUYI']['ModelPath'] = args.houyi_model_path + if args.houyi_weights_path is not None: + data['TARGET']['HOUYI']['WeightsPath'] = args.houyi_weights_path def check_protobuf_version(self): """ Check if the pip-protoc version is equal to sys-protoc version. """ + assert sys.version_info[0] == 2 for path in sys.path: module_path = os.path.join(path, 'google', 'protobuf', '__init__.py') if os.path.exists(module_path): @@ -177,6 +179,7 @@ def __refresh_pbs(self, proto_list, default_save_path="parser/pbs/"): "The ProtoPaths format maybe incorrect, please check if there is any HORIZONTAL LINE." 
for pFile in proto_list: assert os.path.exists(pFile), "%s does not exist.\n" % (pFile) - subprocess.check_call(['protoc', '-I', + subprocess.check_call(['protoc', '-I', os.path.dirname(pFile) + "/", '--python_out', os.path.dirname(default_save_path) + "/", pFile]) + diff --git a/tools/external_converter_v2/config.yaml b/tools/external_converter_v2/config.yaml index 058100311..fe6cf17d2 100644 --- a/tools/external_converter_v2/config.yaml +++ b/tools/external_converter_v2/config.yaml @@ -1,5 +1,5 @@ #--------------------------------------------------------------- -## configuration file of external model convert to anakin +## configuration file of external model convert to anakin ##--------------------------------------------------------------- # ##--------------------------------------------------------------- @@ -19,7 +19,7 @@ ## Anakin graph dash board server ip ( local boardcast ip or real ip) ## @Param port ## Anakin graph dash board server port ( you need to set os open the port ) -## @Param OptimizedGraph: +## @Param OptimizedGraph: ## |- enable (OFF/ON) : Whether to visualize the necessary compute and optimization analysis of graph ## `- path: This place the optimized anakin model path generated by anakin framework's api graph::save ## @Param LogToPath @@ -28,7 +28,7 @@ ## Wether to usecolorful log ## ## @Param TARGET::CAFFE ... -## You only need to fill in the framework config +## You only need to fill in the framework config ## you need to convert ## @Param ProtoPaths: ## Protobuf define files, maybe a list. @@ -36,21 +36,26 @@ ## Json define prototxt file path of you model ## @Param ModelPath: ## Path of you binary model. +## @Param DEBUG: +## NET: +## LoadPaths: +## SavePath: +## SaveFormat: text ## ##-------------------------------------------------------------- # OPTIONS: Framework: CAFFE SavePath: ./output - ResultName: life_feature1 + ResultName: googlenet Config: LaunchBoard: ON Server: ip: 0.0.0.0 - port: 8000 - OptimizedGraph: - enable: ON - path: /home/chengyujuan/baidu/sys-hic-gpu/anakin-models/map/anakin-models/route-dnn/route-dnn.anakin2.bin.saved + port: 8888 + OptimizedGraph: + enable: OFF + path: /path/to/anakin_optimized/googlenet.anakin.bin.saved LOGGER: LogToPath: ./log/ WithColor: ON @@ -59,11 +64,9 @@ TARGET: CAFFE: # path to proto files ProtoPaths: - - /home/chengyujuan/baidu/sys-hic-gpu/anakin-models/face/caffe.proto - #PrototxtPath: /home/chengyujuan/baidu/sys-hic-gpu/anakin-models/face/face_detect/multiscale-sgnet13R2_no_inception.prototxt - #ModelPath: /home/chengyujuan/baidu/sys-hic-gpu/anakin-models/face/face_detect/sgnet13R2_iter_480000.caffemodel - PrototxtPath: /home/chengyujuan/baidu/sys-hic-gpu/anakin-models/face/life_feature/caffe_life_feature1.prototxt - ModelPath: /home/chengyujuan/baidu/sys-hic-gpu/anakin-models/face/life_feature/caffe_life_feature1.caffemodel + - /path/to/caffe.proto + PrototxtPath: /path/to/your/googlenet.prototxt + ModelPath: /path/to/your/googlenet.caffemodel Remark: # Generally no need to modify. FLUID: @@ -71,20 +74,20 @@ TARGET: Debug: NULL # Generally no need to modify. ModelPath: /path/to/your/model/ # The upper path of a fluid inference model. NetType: # Generally no need to modify. 
- + LEGO: # path to proto files ProtoPath: PrototxtPath: ModelPath: - + TENSORFLOW: - ProtoPaths: / - PrototxtPath: / - ModelPath: / + ModelPath: /path/to/your/model/ OutPuts: - + ONNX: - ProtoPath: - PrototxtPath: ModelPath: + + HOUYI: + ModelPath: /Users/chenjiao04/Downloads/for_sys/train.conf + WeightsPath: /Users/chenjiao04/Downloads/for_sys/model diff --git a/tools/external_converter_v2/converter.py b/tools/external_converter_v2/converter.py index b1b895661..b43a861b3 100644 --- a/tools/external_converter_v2/converter.py +++ b/tools/external_converter_v2/converter.py @@ -2,6 +2,7 @@ # Copyright (c) 2017, Cuichaowen. All rights reserved. # -*- coding: utf-8 -*- +import argparse import os import sys from config import * @@ -10,16 +11,113 @@ def launch(config, graph): logger(verbose.WARNING).feed("anakin parser dash board will be launch in site: ") graph.run_with_server(config.ip, config.port) + +class DeepLearningFramework(enum.Enum): + """Anakin parser supported deep learning framework enum + """ + caffe = 'CAFFE' + fluid = 'FLUID' + lego = 'LEGO' + tensorflow = 'TENSORFLOW' + onnx = 'ONNX' + houyi = 'HOUYI' + + def __str__(self): + return self.value + + +def parse_args(): + """parse command args + """ + arg_parser = argparse.ArgumentParser('Anakin Parser') + + # common args + arg_parser.add_argument( + '--debug', type=str, help='debug') + arg_parser.add_argument( + '--framework', type=DeepLearningFramework, choices=list(DeepLearningFramework), help='input framework') + arg_parser.add_argument( + '--save_path', type=str, help='output save directory') + arg_parser.add_argument( + '--result_name', type=str, help='id of output filename') + arg_parser.add_argument( + '--open_launch_board', type=int, help='open Anakin net display board') + arg_parser.add_argument( + '--board_server_ip', type=str, help='display board server ip') + arg_parser.add_argument( + '--board_server_port', type=int, help='display board server port') + arg_parser.add_argument( + '--optimized_graph_enable', type=int, help='OptimizedGraph enable') + arg_parser.add_argument( + '--optimized_graph_path', type=str, help='OptimizedGraph path') + arg_parser.add_argument( + '--log_path', type=str, help='log dir') + arg_parser.add_argument( + '--log_with_color', type=str, help='use color log') + + # framwork specific args + # CAFFE + arg_parser.add_argument( + '--caffe_proto_paths', nargs='*', help='caffe ProtoPaths') + arg_parser.add_argument( + '--caffe_proto_txt_path', type=str, help='caffe PrototxtPath') + arg_parser.add_argument( + '--caffe_model_path', type=str, help='caffe ModelPath') + arg_parser.add_argument( + '--caffe_remark', type=str, help='caffe Remark') + + # FLUID + arg_parser.add_argument( + '--fluid_debug', type=str, help='fluid debug switch') + arg_parser.add_argument( + '--fluid_model_path', type=str, help='fluid ModelPath') + arg_parser.add_argument( + '--fluid_net_type', type=str, help='fluid NetType') + + # LEGO + arg_parser.add_argument( + '--lego_proto_path', type=str, help='lego ProtoPath') + arg_parser.add_argument( + '--lego_prototxt_path', type=str, help='lego PrototxtPath') + arg_parser.add_argument( + '--lego_model_path', type=str, help='lego ModelPath') + + # TENSORFLOW + arg_parser.add_argument( + '--tensorflow_model_path', type=str, help='tensorflow ModelPath') + arg_parser.add_argument( + '--tensorflow_outputs', type=str, help='tensorflow OutPuts') + + # ONNX + arg_parser.add_argument( + '--onnx_model_path', type=str, help='onnx ModelPath') + + # HOUYI + arg_parser.add_argument( + 
'--houyi_model_path', type=str, help='houyi ModelPath') + arg_parser.add_argument( + '--houyi_weights_path', type=str, help='houyi WeightsPath') + + args = arg_parser.parse_args() + + return args + + if __name__ == "__main__": - config = Configuration(sys.argv) - # import parser + args = parse_args() + config = Configuration(args) from parser import * - # init logger logger.init(config.logger_dict) - graph = Graph(config) - graph.info_table() - graph.serialization() + if config.DebugConfig is None: + graph = Graph(config) + graph.info_table() + graph.serialization() + + if config.LaunchBoard: + launch(config, graph) + else: + import utils + net = utils.net.net_parser.NetHolder(config) + net.parse() - if config.LaunchBoard: - launch(config, graph) diff --git a/tools/external_converter_v2/parser/caffe/caffe_helper.py b/tools/external_converter_v2/parser/caffe/caffe_helper.py index e42a3a993..f656f342d 100644 --- a/tools/external_converter_v2/parser/caffe/caffe_helper.py +++ b/tools/external_converter_v2/parser/caffe/caffe_helper.py @@ -2,10 +2,90 @@ # Copyright (c) 2017, Cuichaowen. All rights reserved. # -*- coding: utf-8 -*- +import copy +import math +from .. import graph_io from ..utils import * from ..pbs import * +from ..logger import logger, verbose +def FillerCaffeBlob(filler, raw_blob): + """caffe filler effective + """ + filler_blob = copy.deepcopy(raw_blob) + + if filler.type == 'constant': + filler_blob.data[:] = [filler.value, ] * len(raw_blob.data) + else: + logger(verbose.WARNING).feed('filler.type={} not support yet'.format(filler.type)) + # TODO handle + + return filler_blob + + +def MergeCaffeLayer(rlayer, mlayer): + """merge caffe caffemodel layer(mlayer) in prototxt layer(rlayer) + """ + # if no mlayer, give rlayer directly + if mlayer is None: + return rlayer + + assert rlayer.name == mlayer.name, 'assert rlayer.name({0}) == mlayer.name({1})'.format(rlayer.name, mlayer.name) + + layer = copy.deepcopy(rlayer) + + # merge rlayer & mlayer blobs + if len(layer.blobs) == 0: + layer.blobs.extend(mlayer.blobs) + + # if layer.type == 'BatchNorm' + if layer.type == 'BatchNorm': + layer.batch_norm_param.MergeFrom(mlayer.batch_norm_param) + + return layer + + +def GetTensorsFromCaffeLayer(layer): + """(caffe.LayerParameter or caffe.V1LayerParameter) => anakin graph_io.TensorProtoIO + """ + # filler blob first + if layer.type == 'PReLU': + if layer.prelu_param.HasField('filler') \ + and layer.blobs[0].num == 0 \ + and layer.prelu_param.channel_shared: + # 1. filler only when layer.blobs[0] empty(layer.blobs[0].num == 0) + # 2. 
PReLU must filler a [1, 1, 1, 1, ] blob + layer.blobs[0].shape.dim[:] = [1, 1, 1, 1,] + (layer.blobs[0].num, + layer.blobs[0].channels, + layer.blobs[0].height, + layer.blobs[0].width) = layer.blobs[0].shape.dim + layer.blobs[0].data[:] = [.0, ] + layer.blobs[0].CopyFrom( + FillerCaffeBlob(layer.prelu_param.filler, layer.blobs[0])) + + # layer.blobs => tensors + tensors = [] + for blob in layer.blobs: + tensor = graph_io.TensorProtoIO() + if len(blob.shape.dim): + n, c, h, w = map(int, [1] * (4 - len(blob.shape.dim)) + list(blob.shape.dim)) + if len(blob.shape.dim) == 1: + c = w + w = 1 + else: + n, c, h, w = blob.num, blob.channels, blob.height, blob.width + tensor.set_data_type(graph_io.FLOAT) # default float + if layer.type == "Deconvolution": # deconv is different in caffe + tensor.set_shape([c, n, h, w]) + else: + tensor.set_shape([n, c, h, w]) # set shape (n c h w) + tensor.set_data(blob.data, "float") + tensors.append(tensor) + + return tensors + def SplitBlobName(layer_name, blob_name, blob_idx, split_idx): """ Used for caffe parser. diff --git a/tools/external_converter_v2/parser/caffe/caffe_layer_param_transmit.py b/tools/external_converter_v2/parser/caffe/caffe_layer_param_transmit.py index 271cfd6f0..9cfc4e0ff 100755 --- a/tools/external_converter_v2/parser/caffe/caffe_layer_param_transmit.py +++ b/tools/external_converter_v2/parser/caffe/caffe_layer_param_transmit.py @@ -6,7 +6,7 @@ except ImportError: pass try: - from google.protobuf.pyext._message import RepeatedScalarContainer as repeat_container # 3.5.1 + + from google.protobuf.pyext._message import RepeatedScalarContainer as repeat_container # 3.5.1 + except ImportError: pass from ..operations import OpsParam, OpsRegister @@ -14,13 +14,13 @@ from ..pbs import * -def is_has_proto_key(param_pkg, key_name): +def is_has_proto_key(param_pkg, key_name): """ Judge if param_pkg has field key_name """ - for field in param_pkg.DESCRIPTOR.fields: - if field.name == key_name: - return True + for field in param_pkg.DESCRIPTOR.fields: + if field.name == key_name: + return True return False @@ -57,7 +57,7 @@ def warpper_args(args): return warpper_args return warpper -# common +# common def NotNeededInInference(args): @@ -93,11 +93,19 @@ def Parser_resize(args): layer = args[1] # parser caffe parameter resize_param = layer.resize_param - if resize_param.HasField("out_width_scale"): - OpsRegister()["Resize"].width_scale = resize_param.out_width_scale - if resize_param.HasField("out_height_scale"): - OpsRegister()["Resize"].height_scale = resize_param.out_height_scale - + OpsRegister()["Resize"].width_scale = resize_param.out_width_scale + OpsRegister()["Resize"].height_scale = resize_param.out_height_scale + OpsRegister()["Resize"].out_width = resize_param.out_width + OpsRegister()["Resize"].out_height = resize_param.out_height + method = "" + if resize_param.type == ResizeParameter.BILINEAR_ALIGN: + method = "BILINEAR_ALIGN" + elif resize_param.type == ResizeParameter.BILINEAR_NO_ALIGN: + method = "BILINEAR_NO_ALIGN" + else: + method = "RESIZE_CUSTOM" + OpsRegister()["Resize"].method = method + @ParserFeedDecorator("DeformConvolution") @@ -152,7 +160,7 @@ def Parser_deformable_convolution(args): paddings = [convolution_param.pad_h, convolution_param.pad_w] OpsRegister()["DeformConvolution"].padding = paddings if is_has_proto_key(convolution_param, "dilation"): - if len(convolution_param.dilation) == 0: + if len(convolution_param.dilation) == 0: OpsRegister()["DeformConvolution"].dilation_rate = list([1, 1]) elif 
len(convolution_param.dilation) == 1: OpsRegister()["DeformConvolution"].dilation_rate = list([convolution_param.dilation[0], convolution_param.dilation[0]]) @@ -220,7 +228,7 @@ def Parser_deconvolution(args): paddings = [convolution_param.pad_h, convolution_param.pad_w] OpsRegister()["Deconvolution"].padding = paddings if is_has_proto_key(convolution_param, "dilation"): - if len(convolution_param.dilation) == 0: + if len(convolution_param.dilation) == 0: OpsRegister()["Deconvolution"].dilation_rate = list([1, 1]) elif len(convolution_param.dilation) == 1: OpsRegister()["Deconvolution"].dilation_rate = list([convolution_param.dilation[0], convolution_param.dilation[0]]) @@ -288,7 +296,7 @@ def Parser_convolution(args): paddings = [convolution_param.pad_h, convolution_param.pad_w] OpsRegister()["Convolution"].padding = paddings if is_has_proto_key(convolution_param, "dilation"): - if len(convolution_param.dilation) == 0: + if len(convolution_param.dilation) == 0: OpsRegister()["Convolution"].dilation_rate = list([1, 1]) elif len(convolution_param.dilation) == 1: OpsRegister()["Convolution"].dilation_rate = list([convolution_param.dilation[0], convolution_param.dilation[0]]) @@ -370,22 +378,26 @@ def Parser_convolutiondepthwise(args): OpsRegister()["Convolution"].axis = 1 OpsRegister()["Convolution"].bias_term = convolution_param.bias_term -@ParserFeedDecorator("Cropping") +@ParserFeedDecorator("Crop") def Parser_crop(args): layer = args[1] # parser caffe parameter crop_param = layer.crop_param - OpsRegister()["Cropping"].cropping = list(crop_param.offset) - OpsRegister()["Cropping"].axis = crop_param.axis + OpsRegister()["Crop"].cropping = list(crop_param.offset) + OpsRegister()["Crop"].axis = crop_param.axis -@ParserFeedDecorator("Dropout") +@ParserFeedDecorator("Scale") def Parser_dropout(args): layer = args[1] # parser caffe parameter dropout_param = layer.dropout_param - OpsRegister()["Dropout"].ratio = dropout_param.dropout_ratio - + scale_val = 1 - dropout_param.dropout_ratio + tensor = TensorProtoIO() + tensor.set_data_type(FLOAT) + tensor.set_data([scale_val], "float") + tensor.set_shape([1, 1, 1, 1]) + OpsRegister()["Scale"].weight_1 = tensor @ParserFeedDecorator("Eltwise") def Parser_eltwise(args): @@ -447,7 +459,7 @@ def Parser_innerproduct(args): # parser caffe parameter tensors = args[2] weight = tensors[0] - inner_product_param = layer.inner_product_param + inner_product_param = layer.inner_product_param OpsRegister()["Dense"].axis = inner_product_param.axis # weight().shape.dim.value[2] OpsRegister()["Dense"].out_dim = inner_product_param.num_output # weight().shape.dim.value[3] OpsRegister()["Dense"].bias_term = inner_product_param.bias_term @@ -644,6 +656,16 @@ def Parser_input(args): #for shape in input_param.shape: # OpsRegister()["Input"].input_shape.append(list(shape.dim)) +@ParserFeedDecorator("Input") +def Parser_dummydata(args): + logger(verbose.INFO).feed(str(args)) + layer = args[1] + input_param = layer.dummy_data_param + OpsRegister()["Input"].input_shape = list(input_param.shape[0].dim) + args[3].set_name("Input") + logger(verbose.INFO).feed(str(layer)) + logger(verbose.INFO).feed(str(OpsRegister()["Input"].input_shape)) + @ParserFeedDecorator("Permute") def Parser_permute(args): @@ -676,6 +698,8 @@ def Parser_reshape(args): layout = 'NCHW' elif len(shape) == 3: layout = 'NHW' + elif len(shape) == 2: + layout = 'NW' OpsRegister()["Reshape"].layout = layout @ParserFeedDecorator("Split") @@ -692,6 +716,14 @@ def Parser_ShuffleChannel(args): shufflechannel_param = 
layer.shuffle_channel_param OpsRegister()["ShuffleChannel"].group = shufflechannel_param.group +@ParserFeedDecorator("Coord2Patch") +def Parser_Coord2Patch(args): + layer = args[1] + # parser caffe parameter + coord2patch_param = layer.coord2patch_param + OpsRegister()["Coord2Patch"].img_h = coord2patch_param.img_h + OpsRegister()["Coord2Patch"].output_h = coord2patch_param.output_h + OpsRegister()["Coord2Patch"].output_w = coord2patch_param.output_w @ParserFeedDecorator("RPNProposalSSD") def Parser_rpn_proposal_ssd(args): @@ -1113,7 +1145,8 @@ def Parser_priorbox(args): len(prior_box_param.density): OpsRegister()["PriorBox"].fixed_size = list(prior_box_param.fixed_size) OpsRegister()["PriorBox"].fixed_ratio = list(prior_box_param.fixed_ratio) - OpsRegister()["PriorBox"].density = list(prior_box_param.density) + density_list = list(prior_box_param.density) + OpsRegister()["PriorBox"].density = map(float, density_list) OpsRegister()["PriorBox"].is_flip = prior_box_param.flip OpsRegister()["PriorBox"].is_clip = prior_box_param.clip OpsRegister()["PriorBox"].variance = list(prior_box_param.variance) @@ -1180,7 +1213,7 @@ def Parser_normalize(args): OpsRegister()["Normalize"].begin_norm_axis = -1 OpsRegister()["Normalize"].is_across_spatial = False OpsRegister()["Normalize"].is_shared_channel = False - OpsRegister()["Normalize"].eps = 1e-6 + OpsRegister()["Normalize"].eps = 1e-5 OpsRegister()["Normalize"].p = 2 @ParserFeedDecorator("Activation") @@ -1201,6 +1234,58 @@ def Parser_interp(args): OpsRegister()["Interp"].pad_beg = interp_param.pad_beg OpsRegister()["Interp"].pad_end = interp_param.pad_end +@ParserFeedDecorator("RoiPool") +def Parser_roi_pool(args): + layer = args[1] + roi_pool_param = layer.roi_pool_param + OpsRegister()["RoiPool"].pooled_h = roi_pool_param.pooled_h + OpsRegister()["RoiPool"].pooled_w = roi_pool_param.pooled_w + OpsRegister()["RoiPool"].spatial_scale = roi_pool_param.spatial_scale + +@ParserFeedDecorator("Pad2D") +def Parser_pad2d(args): + layer = args[1] + pad2d_param = layer.pad2d_param + mode = "" + if pad2d_param.mode == Pad2DParameter.EDGE: + mode = "edge" + elif pad2d_param.mode == Pad2DParameter.REFLECT: + mode = "reflect" + elif pad2d_param.mode == Pad2DParameter.CONSTANT: + mode = "constant" + else: + mode = "constant" + OpsRegister()["Pad2D"].mode = mode + value = 0.0 + if pad2d_param.HasField("value"): + value = pad2d_param.value + OpsRegister()["Pad2D"].value = value + pad_h = [pad2d_param.pad_top, pad2d_param.pad_bottom] + OpsRegister()["Pad2D"].pad_h = pad_h + pad_w = [pad2d_param.pad_left, pad2d_param.pad_right] + OpsRegister()["Pad2D"].pad_w = pad_w + +@ParserFeedDecorator("SRoiAlign") +def Parser_sroiAlign(args): + layer = args[1] + sroi_align_param = layer.sun_roi_align_param + OpsRegister()["SRoiAlign"].pooled_h = sroi_align_param.pooled_h + OpsRegister()["SRoiAlign"].pooled_w = sroi_align_param.pooled_w + OpsRegister()["SRoiAlign"].spatial_scale = sroi_align_param.spatial_scale + +@ParserFeedDecorator("SProposal") +def Parser_sproposal(args): + layer = args[1] + proposal_param = layer.proposal_param + OpsRegister()["SProposal"].feat_stride = proposal_param.feat_stride + OpsRegister()["SProposal"].basesize = proposal_param.basesize + OpsRegister()["SProposal"].scale = list(proposal_param.scale) + OpsRegister()["SProposal"].ratio = list(proposal_param.ratio) + OpsRegister()["SProposal"].boxminsize = proposal_param.boxminsize + OpsRegister()["SProposal"].pre_nms_topn = proposal_param.pre_nms_topn + OpsRegister()["SProposal"].post_nms_topn = 
proposal_param.post_nms_topn + OpsRegister()["SProposal"].nms_thresh = proposal_param.nms_thresh + # caffe layer parameter parser map CAFFE_LAYER_PARSER = { "Split": OpsParam().set_parser(Parser_split), @@ -1218,7 +1303,7 @@ def Parser_interp(args): "Crop": OpsParam().set_parser(Parser_crop), "Data": OpsParam().set_parser(NotNeededInInference), "Dropout": OpsParam().set_parser(Parser_dropout), - "DummyData": OpsParam().set_parser(NotNeededInInference), + "DummyData": OpsParam().set_parser(Parser_dummydata), "Eltwise": OpsParam().set_parser(Parser_eltwise), "ELU": OpsParam().set_parser(Parser_elu), "Embed": OpsParam().set_parser(Parser_embed), @@ -1270,6 +1355,11 @@ def Parser_interp(args): "ReLU6": OpsParam().set_parser(Parser_relu6), "Normalization": OpsParam().set_parser(Parser_normalize), "ShuffleChannel": OpsParam().set_parser(Parser_ShuffleChannel), + "Coord2Patch": OpsParam().set_parser(Parser_Coord2Patch), "RoisAnchorFeature": OpsParam().set_parser(Parser_rois_anchor_feature), - "Interp": OpsParam().set_parser(Parser_interp) + "Interp": OpsParam().set_parser(Parser_interp), + "ROIPooling": OpsParam().set_parser(Parser_roi_pool), + "Pad2D": OpsParam().set_parser(Parser_pad2d), + "SUNROIAlign": OpsParam().set_parser(Parser_sroiAlign), + "Proposal": OpsParam().set_parser(Parser_sproposal) } diff --git a/tools/external_converter_v2/parser/caffe/parser_caffe.py b/tools/external_converter_v2/parser/caffe/parser_caffe.py index ae15003c6..ac0152941 100644 --- a/tools/external_converter_v2/parser/caffe/parser_caffe.py +++ b/tools/external_converter_v2/parser/caffe/parser_caffe.py @@ -43,6 +43,7 @@ def _DetectionArch(self): self._InsSplitBtwSliceConcat() self._InsSplitBtwSliceEltwise() self._InsertSplits() + self._InsSplitBtwSplitConcat() self._ScatterInputLayer() # create input node #self._CreateInputNode() maybe not need @@ -216,6 +217,38 @@ def _UpgradeNetAsNeeded(self): UpgradeNetBatchNorm(self.net_parameter) logger(verbose.INFO).feed("[ Upgrade Level 5 ] Details: need BatchNorm upgrade [ ... ]") + def _InsSplitBtwSplitConcat(self): + ''' + Currently, the connection between Slice and Concat must be implemented via Split. + ''' + layers = self.net_parameter.layer or self.net_parameter.layers + top_blobs_of_splits = list() + btm_blobs_of_concats = list() + for layer in layers: + if layer.type == 'Split': + top_blobs_of_splits.extend(layer.top) + elif layer.type == 'Concat': + btm_blobs_of_concats.extend(layer.bottom) + intersection_blobs = list(set(top_blobs_of_splits).intersection(set(btm_blobs_of_concats))) + new_param = NetParameter() + for layer in layers: + new_layer = new_param.layer.add() + new_layer.CopyFrom(layer) + if layer.type == 'Split': + for top_blob in layer.top: + if top_blob in intersection_blobs: + split_param = new_param.layer.add() + split_param.bottom.append(top_blob) + split_param.top.append(top_blob) + split_param.name = 'Split_' + top_blob + split_param.type = 'Split' + if self.net_parameter.layer: + del self.net_parameter.layer[:] + self.net_parameter.layer.extend(new_param.layer) + else: + del self.net_parameter.layers[:] + self.net_parameter.layers.extend(new_param.layer) + def _InsSplitBtwSliceConcat(self): ''' Currently, the connection between Slice and Concat must be implemented via Split. 
@@ -254,13 +287,13 @@ def _InsSplitBtwSliceEltwise(self): ''' layers = self.net_parameter.layer or self.net_parameter.layers top_blobs_of_slices = list() - btm_blobs_of_concats = list() + btm_blobs_of_eltwises = list() for layer in layers: if layer.type == 'Slice': top_blobs_of_slices.extend(layer.top) elif layer.type == 'Eltwise': - btm_blobs_of_concats.extend(layer.bottom) - intersection_blobs = list(set(top_blobs_of_slices).intersection(set(btm_blobs_of_concats))) + btm_blobs_of_eltwises.extend(layer.bottom) + intersection_blobs = list(set(top_blobs_of_slices).intersection(set(btm_blobs_of_eltwises))) new_param = NetParameter() for layer in layers: new_layer = new_param.layer.add() @@ -474,6 +507,43 @@ def _CreateInputNode(self): self.graphIO.add_node(node_io()) self.graphIO.add_in(in_name) + def _UpdateScaleModelLayer(self): + """ + """ + rlayers = self.net_parameter.layer or self.net_parameter.layers + mlayers = self.net_param_weights.layers or self.net_param_weights.layer + def search_filler(rlayers): + scale_dict = dict() + for rlayer in rlayers: + if rlayer.type == "Scale" and rlayer.scale_param.HasField("filler"): + scale_dict[rlayer.name] = rlayer.scale_param.filler.value + return scale_dict + def all_names(layers): + name_list = list() + for layer in layers: + name_list.append(layer.name) + return name_list + def pick_layer(layer_name, layers): + assert layer_name in all_names(layers) + for layer in layers: + if layer_name == layer.name: + return layer + def add_scale_model_layer(rlayers, mlayers): + scale_dict = search_filler(rlayers) + mlayer_names = all_names(mlayers) + for layer_name in scale_dict.keys(): + if layer_name not in mlayer_names: + mlayer = pick_layer(layer_name, rlayers) + blob = BlobProto() + blob.num = 1 + blob.channels = 1 + blob.height = 1 + blob.width = 1 + blob.data.append(scale_dict[mlayer.name]) + mlayer.blobs.extend([blob]) + mlayers.extend([mlayer]) + add_scale_model_layer(rlayers, mlayers) + def _DealWithRemark(self, layer_type, nodeIO, mlayer, rlayer, tensors, opIO): if self.Remark == 'FaceUniqueBatchNorm': if len(tensors) > 3 and layer_type == "BatchNorm": # this is for Face unique Batchnorm layer(batchnorm + scale) @@ -523,6 +593,7 @@ def _Parsing_new(self): logger(verbose.INFO).feed(" [CAFFE] Model Parameter Parsing ...") self._ParserModel() self._SplitInception(True) + self._UpdateScaleModelLayer() model_layers = self.net_param_weights.layers or self.net_param_weights.layer # we must setting graph edge first @@ -559,48 +630,26 @@ def _Parsing_new(self): opIO.set_out_num(len(rlayer.top)) opIO.set_in_num(len(rlayer.bottom)) - match_in_model_layer = False # find corresponding model layer - for mlayer in model_layers: - if rlayer.name == mlayer.name: # find - #assert source_layer_type == mlayer.type, " real layer type(%s) must be equal to that(%s) of model layer." 
% (source_layer_type, mlayer.type) - logger(verbose.INFO).feed(" `--[ Match ]Parsing [%s:\t%s] " % (source_layer_type, source_layer_name)) + mlayers = filter(lambda mlayer: mlayer.name == rlayer.name, model_layers) + if len(mlayers) == 0: + mlayer = None + elif len(mlayers) == 1: + logger(verbose.INFO).feed(" `--[ Match ]Parsing [%s:\t%s] " % (source_layer_type, source_layer_name)) + mlayer = mlayers[0] + else: + logger(verbose.FATAL).feed("len(mlayers) == {}".format(len(mlayers))) + exit() + + # merge prototxt layer(rlayer) & caffemodel layer(mlayer) + layer = MergeCaffeLayer(rlayer, mlayer) + tensors = GetTensorsFromCaffeLayer(layer) + # filled nodeIO + if mlayer and self.Remark: + self._DealWithRemark(source_layer_type, nodeIO, mlayer, rlayer, tensors, opIO) + else: + CAFFE_LAYER_PARSER[source_layer_type](nodeIO, layer, tensors, opIO) - # fill node with blobs parameter, such as filter and weights - tensors = [] - if mlayer.blobs: - for blob in mlayer.blobs: - if blob in mlayer.blobs: - tensor = TensorProtoIO() - if len(blob.shape.dim): - n, c, h, w = map(int, [1] * (4 - len(blob.shape.dim)) + list(blob.shape.dim)) - if len(blob.shape.dim) == 1: - c = w - w = 1 - else: - n, c, h, w = blob.num, blob.channels, blob.height, blob.width - #data = np.array(blob.data, dtype=np.float32).reshape(n, c, h, w) - tensor.set_data_type(FLOAT) # default float - if source_layer_type == "Deconvolution": # deconv is different in caffe - tensor.set_shape([c, n, h, w]) - else: - tensor.set_shape([n, c, h, w]) # set shape (n c h w) - tensor.set_data(blob.data, "float") - tensors.append(tensor) - # fill node with layerparameter, such as axis kernel_size... and tensors - if self.Remark is None: - # besides, set the name of opIO - CAFFE_LAYER_PARSER[source_layer_type](nodeIO, rlayer, tensors, opIO) # call parser automatically - else: - self._DealWithRemark(source_layer_type, nodeIO, mlayer, rlayer, tensors, opIO) - match_in_model_layer = True - # TODO... over! - else: # not find - pass - if not match_in_model_layer: - # fill node with layerparameter, such as axis kernel_size... 
but with [ ] tensors (empty) - # besides, set the name of opIO - CAFFE_LAYER_PARSER[source_layer_type](nodeIO, rlayer, [], opIO) # call parser automatically # add node to graph io self.graphIO.add_node(nodeIO()) diff --git a/tools/external_converter_v2/parser/fluid/fluid_helper.py b/tools/external_converter_v2/parser/fluid/fluid_helper.py index 0bd1d5048..15e3d087b 100644 --- a/tools/external_converter_v2/parser/fluid/fluid_helper.py +++ b/tools/external_converter_v2/parser/fluid/fluid_helper.py @@ -1,8 +1,10 @@ from ..proto import * from ..graph_io import * +from ..logger import * import paddle.fluid as fluid import numpy as np from paddle.fluid.core import VarDesc, AttrType +from ..proto import helper def union(list_a, list_b): @@ -18,33 +20,41 @@ def difference(list_a, list_b): class Edge_for_fluid: - def __init__(self, param, target, var): + def __init__(self, param, target, var, scale): ''' ''' self.param = param self.target = target self.var = var + self.scale = scale + + def __str__(self): + return ''.format( + self.param, self.target, self.var, self.scale) class Fluid_edger: - def __init__(self, param = None, target = None, var = None): + def __init__(self, param=None, target=None, var=None, scale=None): ''' ''' self.edges = [] if param is not None and target is not None: - edge = Edge_for_fluid(param, target, var) + edge = Edge_for_fluid(param, target, var, scale) self.edges.append(edge) + def __str__(self): + return ''.format(self.edges) + def __call__(self): ''' ''' return self.all_targets() - def add(self, param, target, var = None): + def add(self, param, target, var=None, scale=None): ''' ''' - edge = Edge_for_fluid(param, target, var) + edge = Edge_for_fluid(param, target, var, scale) self.edges.append(edge) def rm_edges_by_param(self, param): @@ -67,17 +77,26 @@ def rm(self, target): if res != 0: pass - def mv(self, old_target, new_target): + def mv(self, old_target, new_target, new_scale=None): ''' ''' res = -1 for edge in self.edges: if old_target == edge.target: edge.target = new_target + if new_scale is not None: + edge.scale = new_scale res = res + 1 if res != 0: pass + def reset_target_by_param(self, param, new_target): + ''' + ''' + for edge in self.edges: + if edge.param == param: + edge.target = new_target + def all_params(self): ''' ''' @@ -95,6 +114,28 @@ def all_targets(self): targets.append(edge.target) return targets + def all_scales(self): + ''' + ''' + scales = [] + for edge in self.edges: + scales.append(edge.scale) + return scales + + def set_scale(self, target, scale): + ''' + ''' + for edge in self.edges: + if edge.target == target: + edge.scale = scale + + def get_scale(self, target): + ''' + ''' + for edge in self.edges: + if edge.target == target: + return edge.scale + def targets(self, param): ''' ''' @@ -145,11 +186,12 @@ def __getitem__(self, idx): class Fluid_helper: ''' ''' - def __init__(self, scope, block): + def __init__(self, scope, block, program): ''' ''' self.scope = scope self.block = block + self.program = program def args_by_input_param(self, op, param_name): ''' @@ -171,14 +213,21 @@ def var_by_input_param(self, op, param_name, var_idx = 0): ''' ''' var_name = self.args_by_input_param(op, param_name)[var_idx] - var = self.block.var(var_name) + var = self.get_var(var_name) + return var + + def get_var(self, var_name): + try: + var = self.block.var(var_name) + except: + var = self.program.global_block().var(var_name) return var def var_by_output_param(self, op, param_name, var_idx = 0): ''' ''' var_name = self.args_by_output_param(op, 
param_name)[var_idx] - var = self.block.var(var_name) + var = self.get_var(var_name) return var def var_name_by_param(self, op, param_name, var_idx = 0): @@ -196,7 +245,8 @@ def var_name_by_param(self, op, param_name, var_idx = 0): var_name_unicode = op.output(param_name)[var_idx] else: raise NameError('ERROR: param %s has not var.' % (param_name)) - var = self.block.var(var_name_unicode) + + var = self.get_var(var_name_unicode) var_name = var.name if isinstance(var_name, unicode): var_name = str(var_name) @@ -206,13 +256,13 @@ def var_by_param(self, op, param_name, var_idx = 0): ''' ''' var_name = self.var_name_by_param(op, param_name, var_idx) - var = self.block.var(var_name) + var = self.get_var(var_name) return var def shape_by_var_name(self, var_name, layout = 'NCHW'): ''' ''' - var = self.block.var(var_name) + var = self.get_var(var_name) long_tuple = var.shape long_list = list(long_tuple) if layout == 'NCHW': @@ -227,17 +277,26 @@ def np_data_by_var_name(self, var_name): ''' ''' if hasattr(fluid.executor, '_fetch_var'): - numpy_array = fluid.executor._fetch_var(str(var_name), self.scope, True) + np_data = fluid.executor._fetch_var(str(var_name), self.scope, True) elif hasattr(fluid.executor, 'fetch_var'): - numpy_array = fluid.executor.fetch_var(var_name, self.scope, True) + np_data = fluid.executor.fetch_var(var_name, self.scope, True) else: raise NameError('ERROR: Unknown Fluid version.') - return numpy_array + + var = self.get_var(var_name) + if var.shape != np_data.shape: + logger(verbose.INFO).feed('NOTICE: var.shape != np_data.shape, var.shape={0}, np_data.shape={1}'.format( + var.shape, np_data.shape)) + # np_data need reshape to var.shape + size = reduce(lambda x, y: x * y, var.shape) + np_data = np_data.flatten()[:size].reshape(var.shape) + + return np_data def dtype_by_var_name(self, var_name): ''' ''' - var = self.block.var(var_name) + var = self.get_var(var_name) fluid_var_type = var.dtype dtype = ANAKIN_TENSOR_DTYPE[fluid_var_type] return dtype @@ -257,6 +316,7 @@ def var_shape_by_param(self, transpose, op, param_name, var_idx = 0, layout = 'N else: var_name = self.var_name_by_param(op, param_name, var_idx) shape = self.shape_by_var_name(var_name, layout) + return shape def data_with_shape_by_param(self, @@ -354,23 +414,33 @@ def attr_data(self, op, attr_name, default_value = 0, type = None): def param_tensor_sh(self, op, param_name, - transpose = False, - axes = None, - reshape = None, - var_idx = 0, - layout = 'NCHW'): + dtype=None, + transpose=False, + axes=None, + reshape=None, + var_idx=0, + layout='NCHW'): ''' ''' tensor = TensorProtoIO() - [flat_data, shape] = self.data_with_shape_by_param(op, param_name, transpose, \ - axes, var_idx, True, layout) - dtype = self.dtype_by_param(op, param_name, var_idx) - tensor.set_data_type(dtype) - if dtype in ANAKIN_TENSOR_DTYPESTR.keys(): - tensor.set_data(flat_data, ANAKIN_TENSOR_DTYPESTR[dtype]) - #pass #debug + [np_data, shape] = self.data_with_shape_by_param(op, param_name, transpose, \ + axes, var_idx, False, layout) + np_dtype = self.dtype_by_param(op, param_name, var_idx) + tensor.set_data_type(np_dtype) + if np_dtype is INT8: + tensor.set_data(np_data.flatten().tobytes(), ANAKIN_TENSOR_DTYPESTR[np_dtype]) + elif np_dtype in ANAKIN_TENSOR_DTYPESTR.keys(): + if dtype is None: + tensor.set_data(np_data.flatten().tolist(), ANAKIN_TENSOR_DTYPESTR[np_dtype]) + #pass #debug + elif dtype == "int8": + np_data = np_data.astype(np.int8) + tensor.set_data(np_data.flatten().tobytes(), "int8") + #pass #debug + else: + raise 
NameError('ERROR: Unknown data type (%s)' % (dtype)) else: - raise NameError('ERROR: Unknown data type (%s)' % (dtype)) + raise NameError('ERROR: Unknown data type (%s)' % (np_dtype)) if reshape is not None: tensor.set_shape(reshape) else: @@ -380,6 +450,7 @@ def param_tensor_sh(self, def param_tensor(self, op, param_name, + dtype=None, transpose = False, axes = None, reshape = None, @@ -387,19 +458,70 @@ def param_tensor(self, layout = 'NCHW'): ''' ''' - [tensor, shape] = self.param_tensor_sh(op, param_name, transpose, axes, \ + [tensor, shape] = self.param_tensor_sh(op, param_name, dtype, transpose, axes, \ reshape, var_idx, layout) return tensor - def create_tensor(self, data_list, data_shape, dtype): + def create_tensor(self, data_list, data_shape, dtype, scale=None): ''' ''' tensor = TensorProtoIO() tensor.set_data_type(dtype) tensor.set_data(data_list, ANAKIN_TENSOR_DTYPESTR[dtype]) tensor.set_shape(data_shape) + if scale is not None: + tensor.set_scale(scale, FLOAT) return tensor + def fill_tensor(self, op, var): + """fill tensor by fill_constant op & var + """ + if op.type == 'fill_constant': + # prepare fill tensor param. preference selected param from fill_constant_op + shape = var.shape + if op.has_attr('shape'): + shape = self.attr_data(op, 'shape') + dtype = var.dtype + if op.has_attr('dtype'): + dtype = ANAKIN_TENSOR_DTYPE[self.attr_data(op, 'dtype')] + value = self.attr_data(op, 'value') + + if len(shape) < 4: + shape = (4 - len(shape)) * [1] + shape + + # fill tensor + tensor = TensorProtoIO() + tensor.set_data_type(dtype) + tensor.set_shape(shape) + data_size = reduce(lambda x, y: x * y, shape) + + # int8 use bytes + if dtype is INT8: + tensor.set_data( + np.array(data_size * [value,], dtype=np.int8).flatten().tobytes(), + ANAKIN_TENSOR_DTYPESTR[dtype]) + else: + if dtype in [INT32,]: + value = int(value) + tensor.set_data( + np.array(data_size * [value,]).flatten().tolist(), + ANAKIN_TENSOR_DTYPESTR[dtype]) + + return tensor + else: + raise Exception('unexpected op.type={}'.format(op.type)) + + def broad_param_tensor(self, op, param_name, private_data={}): + var = self.var_by_param(op, param_name) + + if var.persistable: + return self.param_tensor(op, param_name) + elif 'fill_constant' in private_data and var.name in private_data['fill_constant']: + fill_constant_op = private_data['fill_constant'][var.name] + return self.fill_tensor(fill_constant_op, var) + else: + return self.create_tensor([1], [1, 1, 1, 1], FLOAT) + def gru_tensor_convert(self, origin_h2h, origin_i2h, origin_b, offset=[2, 1, 0]): ''' ''' @@ -558,8 +680,8 @@ def brothers(self, op_list): else: raise NameError('ERROR: Members of op_list must be greater than 2.') - ANAKIN_TENSOR_DTYPE = { + VarDesc.VarType.INT8: INT8, VarDesc.VarType.BOOL: BOOLEN, VarDesc.VarType.INT32: INT32, VarDesc.VarType.FP16: FLOAT16, @@ -569,14 +691,17 @@ def brothers(self, op_list): ANAKIN_TENSOR_DTYPESTR = { STR: "string", - INT32: "int", + INT8: "int8", + INT32: "int32", FLOAT: "float", - BOOLEN: "bool", + BOOLEN: "bool" } ANAKIN_ATTR_DTYPE = { AttrType.INT: INT32, AttrType.INTS: INT32, + AttrType.LONG: INT32, + AttrType.LONGS: INT32, AttrType.FLOAT: FLOAT, AttrType.FLOATS: FLOAT, AttrType.STRING: STR, @@ -588,6 +713,8 @@ def brothers(self, op_list): ANAKIN_ATTR_IS_LIST = { AttrType.INT: False, AttrType.INTS: True, + AttrType.LONG: False, + AttrType.LONGS: True, AttrType.FLOAT: False, AttrType.FLOATS: True, AttrType.STRING: False, @@ -617,3 +744,31 @@ def brothers(self, op_list): 'row_conv', 'reshape', ] + +FLUID_QUANTIZE_LAYERS = 
[ + 'fake_quantize_abs_max', + 'fake_quantize_range_abs_max', + 'fake_quantize_moving_average_abs_max', + 'quantize', + 'dequantize_max_abs_rowwise', +] + +FLUID_DEQUANTIZE_LAYERS = [ + 'fake_dequantize_max_abs', + 'fake_dequantize_range_max_abs', + 'dequantize', + 'quantize_abs_max_rowwise', +] + +FLUID_SCALE_WEIGHT_OP = [ + 'conv2d', + 'depthwise_conv2d', + 'mul', +] + +FLUID_SLICE_LAYERS = [ + 'split', +] + + + diff --git a/tools/external_converter_v2/parser/fluid/fluid_layer_param_transmit.py b/tools/external_converter_v2/parser/fluid/fluid_layer_param_transmit.py index 8ff0b5c15..5d841147c 100644 --- a/tools/external_converter_v2/parser/fluid/fluid_layer_param_transmit.py +++ b/tools/external_converter_v2/parser/fluid/fluid_layer_param_transmit.py @@ -1,6 +1,7 @@ from ..operations import OpsParam, OpsRegister from ..logger import * from ..proto import * +from ..proto import helper from fluid_helper import * @@ -14,26 +15,43 @@ def warpper_args(args): return warpper_args return warpper -# common +# common def NotNeededInInference(args): # args is tuple object - node_io = args[0] - layer = args[1] + pass + @ParserFeedDecorator("Input") def Parser_feed(args): + layout_dict = { + 2: "NC", + 3: "NHW", + 4: "NCHW", + } private_data = args[4] input_shape = private_data['input_shape'] alias = private_data['alias'] OpsRegister()["Input"].input_shape = input_shape OpsRegister()["Input"].alias = alias + OpsRegister()["Input"].layout = layout_dict[len(input_shape)] + @ParserFeedDecorator("Convolution") def Parser_conv2d(args): + node = args[0] op = args[1] helper = args[3] private_data = args[4] - [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Filter') + weights_tensor = None + weights_shape = None + + if 'scale_1' in private_data: + node.set_bit_type(INT8) + [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Filter', "int8") + weights_tensor.set_scale(private_data['scale_1'], 'float') + else: + node.set_bit_type(FLOAT) + [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Filter') OpsRegister()["Convolution"].weight_1 = weights_tensor OpsRegister()["Convolution"].filter_num = weights_shape[0] OpsRegister()["Convolution"].kernel_size = weights_shape[-2:] @@ -42,19 +60,29 @@ def Parser_conv2d(args): OpsRegister()["Convolution"].dilation_rate = helper.attr_data(op, 'dilations') OpsRegister()["Convolution"].group = helper.attr_data(op, 'groups') OpsRegister()["Convolution"].axis = 1 + if 'bias' in private_data.keys(): OpsRegister()["Convolution"].bias_term = True OpsRegister()["Convolution"].weight_2 = private_data['bias'] else: OpsRegister()["Convolution"].bias_term = False + @ParserFeedDecorator("Deconvolution") def Parser_conv2d_transpose(args): + node = args[0] op = args[1] helper = args[3] private_data = args[4] - [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Filter') - weights_tensor.set_shape([weights_shape[1], weights_shape[0], weights_shape[2], weights_shape[3]]) + weights_tensor = None + weights_shape = None + if 'scale_1' in private_data: + node.set_bit_type(INT8) + [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Filter', "int8") + weights_tensor.set_scale(private_data['scale_1'], 'float') + else: + node.set_bit_type(FLOAT) + [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Filter') OpsRegister()["Deconvolution"].weight_1 = weights_tensor OpsRegister()["Deconvolution"].filter_num = weights_shape[1] OpsRegister()["Deconvolution"].kernel_size = weights_shape[-2:] @@ -77,6 +105,7 @@ def Parser_relu(args): def 
Parser_pool2d(args): op = args[1] helper = args[3] + OpsRegister()["Pooling"].pool_size = helper.attr_data(op, 'ksize') OpsRegister()["Pooling"].strides = helper.attr_data(op, 'strides') OpsRegister()["Pooling"].padding = helper.attr_data(op, 'paddings') @@ -84,19 +113,32 @@ def Parser_pool2d(args): if helper.attr_data(op, 'pooling_type') == 'max': OpsRegister()["Pooling"].method = "MAX" elif helper.attr_data(op, 'pooling_type') in ['average', 'avg']: - OpsRegister()["Pooling"].method = "AVG" + if helper.attr_data(op, 'exclusive', True) is True: + OpsRegister()["Pooling"].method = 'AVGEXC' + else: + OpsRegister()["Pooling"].method = "AVG" if helper.attr_data(op, 'ceil_mode') == False: OpsRegister()["Pooling"].cmp_out_shape_floor_as_conv = True else: OpsRegister()["Pooling"].cmp_out_shape_floor_as_conv = False + @ParserFeedDecorator("Dense") def Parser_mul(args): + node = args[0] op = args[1] helper = args[3] private_data = args[4] weights_needs_trans = True - [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Y', weights_needs_trans) + weights_tensor = None + weights_shape = None + if 'scale_1' in private_data: + node.set_bit_type(INT8) + [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Y', "int8", weights_needs_trans) + weights_tensor.set_scale(private_data['scale_1'], 'float') + else: + node.set_bit_type(FLOAT) + [weights_tensor, weights_shape] = helper.param_tensor_sh(op, 'Y', None, weights_needs_trans) OpsRegister()["Dense"].weight_1 = weights_tensor OpsRegister()["Dense"].out_dim = weights_shape[2] OpsRegister()["Dense"].axis = helper.attr_data(op, 'x_num_col_dims') @@ -155,6 +197,7 @@ def Parser_scale_disc_bn(args): OpsRegister()["Scale"].axis = 1 OpsRegister()["Scale"].num_axes = 1 + @ParserFeedDecorator("Scale") def Parser_scale_of_bn(args): op = args[1] @@ -169,6 +212,7 @@ def Parser_scale_of_bn(args): else: OpsRegister()["Scale"].bias_term = False + @ParserFeedDecorator("Split") def Parser_split_ins(args): op = args[1] @@ -184,16 +228,20 @@ def Parser_split_ins(args): def Parser_slice(args): op = args[1] helper = args[3] - OpsRegister()["Slice"].slice_point = [-1] + sections = list(helper.attr_data(op, 'sections')) + slice_point = list() + for i in range(len(sections) - 1): + slice_point.append(sum(sections[:i + 1])) + OpsRegister()["Slice"].slice_point = slice_point OpsRegister()["Slice"].num = helper.attr_data(op, 'num') OpsRegister()["Slice"].axis = helper.attr_data(op, 'axis') - OpsRegister()["Slice"].sections = helper.attr_data(op, 'sections') @ParserFeedDecorator("Reshape") def Parser_reshape(args): op = args[1] helper = args[3] private_data = args[4] + layout = str() if 'new_shape' in private_data.keys(): shape = private_data['new_shape'] @@ -203,6 +251,8 @@ def Parser_reshape(args): layout = 'NCHW' elif len(shape) == 3: layout = 'NHW' + elif len(shape) == 2: + layout = 'NW' OpsRegister()["Reshape"].dims = shape OpsRegister()["Reshape"].layout = layout @@ -224,10 +274,14 @@ def Parser_transpose(args): op = args[1] helper = args[3] fluid_dims = helper.attr_data(op, 'axis') - n = 4 - len(fluid_dims) - dims = range(0, n) - tail_dims = [i + n for i in fluid_dims] - dims.extend(tail_dims) + dims = 0 + if fluid_dims < 4: + n = 4 - len(fluid_dims) + dims = range(0, n) + tail_dims = [i + n for i in fluid_dims] + dims.extend(tail_dims) + else: + dims = fluid_dims OpsRegister()["Permute"].dims = dims @@ -250,16 +304,45 @@ def Parser_prior_box(args): OpsRegister()["PriorBox"].offset = helper.attr_data(op, 'offset') OpsRegister()["PriorBox"].order = ['MIN', 
'COM', 'MAX'] +@ParserFeedDecorator("PriorBox") +def Parser_density_prior_box(args): + op = args[1] + helper = args[3] + + OpsRegister()["PriorBox"].fixed_size = helper.attr_data(op, 'fixed_sizes') + OpsRegister()["PriorBox"].fixed_ratio = helper.attr_data(op, 'fixed_ratios') + OpsRegister()["PriorBox"].density = map(float, helper.attr_data(op, 'densities')) + OpsRegister()["PriorBox"].is_clip = helper.attr_data(op, 'clip') + OpsRegister()["PriorBox"].variance = helper.attr_data(op, 'variances') + OpsRegister()["PriorBox"].img_h = 0 + OpsRegister()["PriorBox"].img_w = 0 + OpsRegister()["PriorBox"].step_h = helper.attr_data(op, 'step_h') + OpsRegister()["PriorBox"].step_w = helper.attr_data(op, 'step_w') + OpsRegister()["PriorBox"].offset = helper.attr_data(op, 'offset') + OpsRegister()["PriorBox"].order = ['MIN', 'COM', 'MAX'] + @ParserFeedDecorator("box_coder") def Parser_box_coder(args): - pass + op = args[1] + helper = args[3] + axis = helper.attr_data(op, 'axis') + box_normalized = helper.attr_data(op, 'box_normalized') + variance = helper.attr_data(op, 'variance') + + OpsRegister()["box_coder"].axis = axis + OpsRegister()["box_coder"].box_normalized = box_normalized + if type(variance) is int: + OpsRegister()["box_coder"].variance = helper.create_tensor([variance,], [1, 1, 1, 1,], FLOAT) + else: + OpsRegister()["box_coder"].variance = helper.create_tensor(variance, [1, len(variance), 1, 1,], FLOAT) @ParserFeedDecorator("DetectionOutput") def Parser_multiclass_nms(args): op = args[1] helper = args[3] private_data = args[4] - OpsRegister()["DetectionOutput"].share_location = True + + OpsRegister()["DetectionOutput"].share_location = True if private_data['net_type'] == 'SSD' else False OpsRegister()["DetectionOutput"].variance_encode_in_target = False OpsRegister()["DetectionOutput"].class_num = 0 OpsRegister()["DetectionOutput"].background_id = helper.attr_data(op, 'background_label') @@ -445,6 +528,7 @@ def Parser_matmul(args): OpsRegister()["MatMul"].transpose_y = helper.attr_data(op, 'transpose_Y') OpsRegister()["MatMul"].coeff = coeff + @ParserFeedDecorator("Scale") def Parser_scale(args): op = args[1] @@ -454,6 +538,8 @@ def Parser_scale(args): OpsRegister()["Scale"].num_axes = 0 OpsRegister()["Scale"].bias_term = False OpsRegister()["Scale"].weight_1 = helper.create_tensor([scale_val], [1, 1, 1, 1], FLOAT) + OpsRegister()["Scale"].weight_2 = helper.create_tensor([], [0, 0, 0, 0], FLOAT) + @ParserFeedDecorator("LayerNorm") def Parser_layer_norm(args): @@ -479,10 +565,16 @@ def Parser_elementwise_mul(args): op = args[1] helper = args[3] private_data = args[4] - if helper.is_persistable_param(op, 'Y'): + + Y = helper.var_by_param(op, 'Y') + if Y.persistable: OpsRegister()["Scale"].weight_1 = helper.param_tensor(op, 'Y') + elif 'fill_constant' in private_data and Y.name in private_data['fill_constant']: + fill_constant_op = private_data['fill_constant'][Y.name] + OpsRegister()["Scale"].weight_1 = helper.fill_tensor(fill_constant_op, Y) else: OpsRegister()["Scale"].weight_1 = helper.create_tensor([1], [1, 1, 1, 1], FLOAT) # developing + OpsRegister()["Scale"].axis = helper.attr_data(op, 'axis') OpsRegister()["Scale"].num_axes = 1 if 'bias' in private_data.keys(): @@ -491,12 +583,13 @@ def Parser_elementwise_mul(args): else: OpsRegister()["Scale"].bias_term = False + @ParserFeedDecorator("Activation") def Parser_relu6(args): op = args[1] helper = args[3] OpsRegister()["Activation"].type = "ClippedRelu" - OpsRegister()["Activation"].clip_relu_num = helper.attr_data(op, 'threshold') + 
OpsRegister()["Activation"].clip_relu_num = float(helper.attr_data(op, 'threshold')) @ParserFeedDecorator("ReLU") def Parser_leaky_relu(args): @@ -525,14 +618,409 @@ def Parser_flatten(args): OpsRegister()["Flatten"].start_axis = helper.attr_data(op, 'axis') OpsRegister()["Flatten"].end_axis = -1 +@ParserFeedDecorator("PixelShuffle") +def Parser_pixel_shuffle(args): + private_data = args[4] + OpsRegister()["PixelShuffle"].upscale_factor = private_data['factor'] + + @ParserFeedDecorator("assign_value") def Parser_assign_value(args): pass + @ParserFeedDecorator("shape") def Parser_shape(args): pass +@ParserFeedDecorator("fake_quantize_abs_max") +def Parser_fake_quantize_abs_max(args): + """ + A placeholder for an empty function. + """ + pass + +@ParserFeedDecorator("fake_dequantize_max_abs") +def Parser_fake_dequantize_max_abs(args): + """ + A placeholder for an empty function. + """ + pass + + +@ParserFeedDecorator("fake_dequantize_range_max_abs") +def Parser_fake_dequantize_range_max_abs(args): + """ + A placeholder for an empty function. + """ + pass + +@ParserFeedDecorator("fake_quantize_range_abs_max") +def Parser_fake_quantize_range_abs_max(args): + """ + A placeholder for an empty function. + """ + pass + +@ParserFeedDecorator("dequantize") +def Parser_dequantize(args): + """ + A placeholder for an empty function. + """ + pass + +@ParserFeedDecorator("quantize") +def Parser_quantize(args): + """ + A placeholder for an empty function. + """ + pass + +@ParserFeedDecorator("increment") +def Parser_increment(args): + """ + A placeholder for an empty function. + """ + pass + +@ParserFeedDecorator("ShuffleChannel") +def Parser_shuffle_channel(args): + private_data = args[4] + OpsRegister()["ShuffleChannel"].group = private_data['group'] + + +@ParserFeedDecorator("Scale") +def Parser_affine_channel(args): + op = args[1] + helper = args[3] + OpsRegister()["Scale"].bias_term = True + OpsRegister()["Scale"].weight_1 = helper.param_tensor(op, 'Scale') + OpsRegister()["Scale"].weight_2 = helper.param_tensor(op, 'Bias') + + +@ParserFeedDecorator("RoiAlign") +def Parser_roi_align(args): + op = args[1] + helper = args[3] + OpsRegister()["RoiAlign"].spatial_scale = helper.attr_data(op, 'spatial_scale') + OpsRegister()["RoiAlign"].pooled_height = helper.attr_data(op, 'pooled_height') + OpsRegister()["RoiAlign"].pooled_width = helper.attr_data(op, 'pooled_width') + OpsRegister()["RoiAlign"].sampling_ratio = helper.attr_data(op, 'sampling_ratio') + +@ParserFeedDecorator("AnchorGenerator") +def Parser_anchor_generator(args): + op = args[1] + helper = args[3] + OpsRegister()["AnchorGenerator"].anchor_sizes = helper.attr_data(op, 'anchor_sizes') + OpsRegister()["AnchorGenerator"].aspect_ratios = helper.attr_data(op, 'aspect_ratios') + OpsRegister()["AnchorGenerator"].variances = helper.attr_data(op, 'variances') + OpsRegister()["AnchorGenerator"].stride = helper.attr_data(op, 'stride') + OpsRegister()["AnchorGenerator"].offset = helper.attr_data(op, 'offset') + +@ParserFeedDecorator("GenerateProposals") +def Parser_generate_proposals(args): + op = args[1] + helper = args[3] + + OpsRegister()["GenerateProposals"].pre_nms_top_n = helper.attr_data(op, 'pre_nms_topN') + OpsRegister()["GenerateProposals"].post_nms_top_n = helper.attr_data(op, 'post_nms_topN') + OpsRegister()["GenerateProposals"].nms_thresh = helper.attr_data(op, 'nms_thresh') + OpsRegister()["GenerateProposals"].min_size = helper.attr_data(op, 'min_size') + OpsRegister()["GenerateProposals"].eta = helper.attr_data(op, 'eta') + 
+@ParserFeedDecorator("Normalize") +def Parser_norm(args): + op = args[1] + helper = args[3] + OpsRegister()["Normalize"].is_across_spatial = False + OpsRegister()["Normalize"].is_shared_channel = False + OpsRegister()["Normalize"].eps = helper.attr_data(op, 'epsilon') + OpsRegister()["Normalize"].p = 2 + + +@ParserFeedDecorator("Resize") +def Parser_bilinear_interp(args): + op = args[1] + helper = args[3] + OpsRegister()["Resize"].out_width = helper.attr_data(op, 'out_w') + OpsRegister()["Resize"].out_height = helper.attr_data(op, 'out_h') + OpsRegister()["Resize"].method = "BILINEAR_ALIGN" + + +@ParserFeedDecorator("SequencePoolConcat") +def Parser_seqpool_concat(args): + op = args[1] + helper = args[3] + private_data = args[4] + OpsRegister()["SequencePoolConcat"].pooltype = helper.attr_data(op, 'pooltype') + OpsRegister()["SequencePoolConcat"].axis = private_data['axis'] + OpsRegister()["SequencePoolConcat"].slot_num = private_data['slot_num'] + +@ParserFeedDecorator("Scale") +def Parser_data_norm(args): + op = args[1] + helper = args[3] + batch_size = helper.np_param(op, 'BatchSize') + batch_square_sum = helper.np_param(op, 'BatchSquareSum') + batch_sum = helper.np_param(op, 'BatchSum') + np_means = batch_sum / batch_size + np_scales = np.sqrt(batch_size / batch_square_sum) + np_bias = - (np_scales * np_means) + np_scale_shape = map(int, [1] * (4 - len(np_scales.shape)) + list(np_scales.shape)) + np_bias_shape = map(int, [1] * (4 - len(np_bias.shape)) + list(np_bias.shape)) + np_weight_tensor = helper.create_tensor(np_scales.flatten().tolist(), np_scale_shape, FLOAT) + np_bias_tensor = helper.create_tensor(np_bias.flatten().tolist(), np_bias_shape, FLOAT) + OpsRegister()["Scale"].axis = 1 + OpsRegister()["Scale"].num_axes = 1 + OpsRegister()["Scale"].bias_term = True + OpsRegister()["Scale"].weight_1 = np_weight_tensor + OpsRegister()["Scale"].weight_2 = np_bias_tensor + + +@ParserFeedDecorator("fusion_dropout_add_ln_quant") +def Parser_fusion_dropout_add_ln_quant(args): + pass + +@ParserFeedDecorator("dequantize_max_abs_rowwise") +def Parser_dequantize_max_abs_rowwise(args): + pass + +@ParserFeedDecorator("quantize_abs_max_rowwise") +def Parser_quantize_abs_max_rowwise(args): + pass + +@ParserFeedDecorator("fusion_add_relu_dropout_quant") +def Parser_fusion_add_relu_dropout_quant(args): + pass + +@ParserFeedDecorator("fill_constant") +def Parser_fill_constant(args): + pass + +@ParserFeedDecorator("less_than") +def Parser_less_than(args): + pass + +@ParserFeedDecorator("write_to_array") +def Parser_write_to_array(args): + pass + +@ParserFeedDecorator("fill_constant_batch_size_like") +def Parser_fill_constant_batch_size_like(args): + pass + +@ParserFeedDecorator("assign") +def Parser_assign(args): + op = args[1] + helper = args[3] + +@ParserFeedDecorator("while") +def Parser_while(args): + pass + +@ParserFeedDecorator("beam_search_decode") +def Parser_beam_search_decode(args): + pass + + +@ParserFeedDecorator("Resize") +def Parser_nearest_interp(args): + #pass + op = args[1] + helper = args[3] + + out_h = helper.attr_data(op, 'out_h') + out_w = helper.attr_data(op, 'out_w') + interp_method = helper.attr_data(op, 'interp_method') + align_corners = helper.attr_data(op, 'align_corners', False) + align_mode = helper.attr_data(op, 'align_mode', 0) + + if interp_method == 'nearest': + if align_corners: + OpsRegister()["Resize"].method = 'BILINEAR_ALIGN' + else: + OpsRegister()["Resize"].method = 'BILINEAR_NO_ALIGN' + OpsRegister()["Resize"].out_height = out_h + 
OpsRegister()["Resize"].out_width = out_w + else: + raise Exception('unexpected interp_method={}'.format(interp_method)) + +@ParserFeedDecorator("yolo_box") +def Parser_yolo_box(args): + op = args[1] + helper = args[3] + OpsRegister()["yolo_box"].class_num = helper.attr_data(op, 'class_num') + OpsRegister()["yolo_box"].anchors = list(helper.attr_data(op, 'anchors')) + OpsRegister()["yolo_box"].downsample_ratio = helper.attr_data(op, 'downsample_ratio') + OpsRegister()["yolo_box"].conf_thresh = helper.attr_data(op, 'conf_thresh') + + +@ParserFeedDecorator("slice_v2") +def Parser_slice2(args): + op = args[1] + helper = args[3] + OpsRegister()["slice_v2"].ends = list(helper.attr_data(op, 'ends')) + OpsRegister()["slice_v2"].starts = list(helper.attr_data(op, 'starts')) + OpsRegister()["slice_v2"].axes = list(helper.attr_data(op, 'axes')) + + +@ParserFeedDecorator("reduce") +def Parser_reduce_mean(args): + op = args[1] + helper = args[3] + dim = helper.attr_data(op, 'dim') + keep_dim = helper.attr_data(op, 'keep_dim') + + OpsRegister()['reduce'].reduce_type = 'Reduce_avg' + OpsRegister()['reduce'].keep_dim = keep_dim + if dim is None: + OpsRegister()['reduce'].reduce_all = True + elif type(dim) is list: + OpsRegister()['reduce'].reduce_all = False + OpsRegister()['reduce'].reduce_dim = dim + elif type(dim) is int: + OpsRegister()['reduce'].reduce_all = False + OpsRegister()['reduce'].reduce_dim = [dim,] + else: + raise Exception('unexpected type(dim)={0}'.format(type(dim))) + + +@ParserFeedDecorator("Argmax") +def Parser_arg_max(args): + op = args[1] + helper = args[3] + + OpsRegister()["Argmax"].top_k = 1 + OpsRegister()["Argmax"].axis_term = True + OpsRegister()["Argmax"].out_max_value = False + OpsRegister()["Argmax"].axis = helper.attr_data(op, 'axis') + +@ParserFeedDecorator("sequence_expand") +def Parser_sequence_expand(args): + op = args[1] + helper = args[3] + ref_level = helper.attr_data(op, 'ref_level') + + OpsRegister()['sequence_expand'].ref_level = ref_level + + +@ParserFeedDecorator("Scale") +def Parser_elementwise_div(args): + op = args[1] + helper = args[3] + private_data = args[4] + + axis = helper.attr_data(op, 'axis', -1) + Y = helper.var_by_param(op, 'Y') + if Y.persistable: + weight_1 = helper.param_tensor(op, 'Y') + elif 'fill_constant' in private_data and Y.name in private_data['fill_constant']: + fill_constant_op = private_data['fill_constant'][Y.name] + weight_1 = helper.fill_tensor(fill_constant_op, Y) + else: + weight_1 = helper.create_tensor([1], [1, 1, 1, 1], FLOAT) # developing + # reverse cache_data + helper.reverse_cache_data(weight_1.tensor_proto.data) + + OpsRegister()["Scale"].axis = axis + OpsRegister()["Scale"].num_axes = 1 + OpsRegister()["Scale"].weight_1 = weight_1 + + +@ParserFeedDecorator("box_clip") +def Parser_box_clip(args): + pass + + +@ParserFeedDecorator("Reduce") +def Parser_reduce_prod(args): + op = args[1] + helper = args[3] + dim = helper.attr_data(op, 'dim') + keep_dim = helper.attr_data(op, 'keep_dim') + + OpsRegister()['reduce'].reduce_type = 'Reduce_prod' + OpsRegister()['reduce'].keep_dim = keep_dim + if dim is None: + OpsRegister()['reduce'].reduce_all = True + elif type(dim) is list: + OpsRegister()['reduce'].reduce_all = False + OpsRegister()['reduce'].reduce_dim = dim + elif type(dim) is int: + OpsRegister()['reduce'].reduce_all = False + OpsRegister()['reduce'].reduce_dim = [dim,] + else: + raise Exception('unexpected type(dim)={0}'.format(type(dim))) + + +@ParserFeedDecorator("equal") +def Parser_equal(args): + pass + + 
+@ParserFeedDecorator("split_lod_tensor") +def Parser_split_lod_tensor(args): + pass + + +@ParserFeedDecorator("conditional_block") +def Parser_conditional_block(args): + pass + + +@ParserFeedDecorator("merge_lod_tensor") +def Parser_merge_lod_tensor(args): + pass + + +@ParserFeedDecorator('lod_reset') +def Parser_lod_reset(args): + """fluid.layers.lod_reset parser + """ + pass + + +@ParserFeedDecorator('GroupNormal') +def Parser_group_norm(args): + """fluid.layers.group_norm parser + """ + op = args[1] + helper = args[3] + private_data = args[4] + + Bias = helper.broad_param_tensor(op, 'Bias', private_data) + Scale = helper.broad_param_tensor(op, 'Scale', private_data) + epsilon = helper.attr_data(op, 'epsilon', 0.0) + groups = helper.attr_data(op, 'groups', 0) + + OpsRegister()['GroupNormal'].has_scale = True + OpsRegister()['GroupNormal'].scale = Scale + OpsRegister()['GroupNormal'].has_bias = True + OpsRegister()['GroupNormal'].bias = Bias + OpsRegister()['GroupNormal'].eps = epsilon + OpsRegister()['GroupNormal'].group = groups + + +@ParserFeedDecorator('fake_quantize_moving_average_abs_max') +def Parser_fake_quantize_moving_average_abs_max(args): + """fluid.layers.fake_quantize_moving_average_abs_max parser + """ + pass + + +@ParserFeedDecorator('Activation') +def Parser_swish(args): + """fluid.layers.swish parser + """ + op = args[1] + helper = args[3] + + beta = helper.attr_data(op, 'beta', 1.0) + + OpsRegister()['Activation'].type = 'Swish' + OpsRegister()['Activation'].clip_relu_num = beta + + FLUID_NODE_FILLER = { "feed":OpsParam().set_parser(Parser_feed), "conv2d":OpsParam().set_parser(Parser_conv2d), @@ -551,9 +1039,12 @@ def Parser_shape(args): "split_ins":OpsParam().set_parser(Parser_split_ins), "depthwise_conv2d":OpsParam().set_parser(Parser_conv2d), "reshape":OpsParam().set_parser(Parser_reshape), + "reshape2":OpsParam().set_parser(Parser_reshape), "concat":OpsParam().set_parser(Parser_concat), "transpose":OpsParam().set_parser(Parser_transpose), + "transpose2":OpsParam().set_parser(Parser_transpose), "prior_box":OpsParam().set_parser(Parser_prior_box), + "density_prior_box":OpsParam().set_parser(Parser_density_prior_box), "box_coder":OpsParam().set_parser(Parser_box_coder), "multiclass_nms":OpsParam().set_parser(Parser_multiclass_nms), "concat_btw_priorbox_boxcoder":OpsParam().set_parser(Parser_concat_btw_priorbox_boxcoder), @@ -575,10 +1066,60 @@ def Parser_shape(args): "dropout":OpsParam().set_parser(Parser_dropout), "scale":OpsParam().set_parser(Parser_scale), "flatten":OpsParam().set_parser(Parser_flatten), + "flatten2":OpsParam().set_parser(Parser_flatten), "assign_value":OpsParam().set_parser(Parser_assign_value), "shape":OpsParam().set_parser(Parser_shape), "relu6":OpsParam().set_parser(Parser_relu6), "leaky_relu":OpsParam().set_parser(Parser_leaky_relu), "prelu":OpsParam().set_parser(Parser_prelu), "split":OpsParam().set_parser(Parser_slice), + "quantize":OpsParam().set_parser(Parser_quantize), + "dequantize":OpsParam().set_parser(Parser_dequantize), + "fake_quantize_abs_max":OpsParam().set_parser(Parser_fake_quantize_abs_max), + "fake_quantize_range_abs_max":OpsParam().set_parser(Parser_fake_quantize_range_abs_max), + "fake_dequantize_max_abs":OpsParam().set_parser(Parser_fake_dequantize_max_abs), + "fake_dequantize_range_max_abs":OpsParam().set_parser(Parser_fake_dequantize_range_max_abs), + "pixel_shuffle":OpsParam().set_parser(Parser_pixel_shuffle), + "shuffle_channel":OpsParam().set_parser(Parser_shuffle_channel), + # FastRCNN start + 
"affine_channel":OpsParam().set_parser(Parser_affine_channel), + "anchor_generator":OpsParam().set_parser(Parser_anchor_generator), + "generate_proposals":OpsParam().set_parser(Parser_generate_proposals), + "roi_align":OpsParam().set_parser(Parser_roi_align), + # FastRCNN end + "norm":OpsParam().set_parser(Parser_norm), + "increment":OpsParam().set_parser(Parser_increment), + "bilinear_interp":OpsParam().set_parser(Parser_bilinear_interp), + # feed + "data_norm":OpsParam().set_parser(Parser_data_norm), + "seqpool_concat":OpsParam().set_parser(Parser_seqpool_concat), + # capi + "fusion_dropout_add_ln_quant":OpsParam().set_parser(Parser_fusion_dropout_add_ln_quant), + "dequantize_max_abs_rowwise":OpsParam().set_parser(Parser_dequantize_max_abs_rowwise), + "quantize_abs_max_rowwise":OpsParam().set_parser(Parser_quantize_abs_max_rowwise), + "fusion_add_relu_dropout_quant":OpsParam().set_parser(Parser_fusion_add_relu_dropout_quant), + "fill_constant":OpsParam().set_parser(Parser_fill_constant), + "less_than":OpsParam().set_parser(Parser_less_than), + "write_to_array":OpsParam().set_parser(Parser_write_to_array), + "fill_constant_batch_size_like":OpsParam().set_parser(Parser_fill_constant_batch_size_like), + "assign":OpsParam().set_parser(Parser_assign), + "while":OpsParam().set_parser(Parser_while), + "beam_search_decode":OpsParam().set_parser(Parser_beam_search_decode), + "slice":OpsParam().set_parser(Parser_slice2), + "nearest_interp":OpsParam().set_parser(Parser_nearest_interp), + "yolo_box":OpsParam().set_parser(Parser_yolo_box), + "reduce_mean":OpsParam().set_parser(Parser_reduce_mean), + "arg_max":OpsParam().set_parser(Parser_arg_max), + "sequence_expand":OpsParam().set_parser(Parser_sequence_expand), + "elementwise_div":OpsParam().set_parser(Parser_elementwise_div), + "box_clip":OpsParam().set_parser(Parser_box_clip), + "reduce_prod":OpsParam().set_parser(Parser_reduce_prod), + "equal":OpsParam().set_parser(Parser_equal), + "split_lod_tensor":OpsParam().set_parser(Parser_split_lod_tensor), + "conditional_block":OpsParam().set_parser(Parser_conditional_block), + "merge_lod_tensor": OpsParam().set_parser(Parser_merge_lod_tensor), + 'lod_reset': OpsParam().set_parser(Parser_lod_reset), + 'group_norm': OpsParam().set_parser(Parser_group_norm), + 'fake_quantize_moving_average_abs_max': OpsParam().set_parser(Parser_fake_quantize_moving_average_abs_max), + 'swish': OpsParam().set_parser(Parser_swish), } diff --git a/tools/external_converter_v2/parser/fluid/parser_fluid.py b/tools/external_converter_v2/parser/fluid/parser_fluid.py index 476583fb4..b5ad21e1e 100644 --- a/tools/external_converter_v2/parser/fluid/parser_fluid.py +++ b/tools/external_converter_v2/parser/fluid/parser_fluid.py @@ -5,6 +5,7 @@ from ..logger import * from ..proto import * from fluid_layer_param_transmit import * +import proto_helper class FluidParser: @@ -20,12 +21,13 @@ def __init__(self, fluid_config_dict): self.exe = fluid.Executor(self.place) self.scope = fluid.core.Scope() # in and out edges of node - self.ins = {} - self.outs = {} + self.ins = dict() + self.outs = dict() # inplaced main node - self.inplace_nodes = {} - self.graph_ins = [] - self.graph_outs = [] + self.inplace_nodes = dict() + self.graph_ins = list() + self.graph_outs = list() + self.scale_dict = dict() def __call__(self): return self._Parsing() @@ -84,6 +86,10 @@ def _AddProtoNode(self, node_name, op_of_node, helper, private_data, op_type=Non nodeIO.set_name(node_name) if op_type is None: op_type = op_of_node.type + if private_data is None: + 
private_data = {} + private_data['net_type'] = self.NetType + FLUID_NODE_FILLER[op_type](nodeIO, op_of_node, opIO, helper, private_data) self.graphIO.add_node(nodeIO()) @@ -151,53 +157,81 @@ def _GetDebugOuts(self, source_ops, helper): return [] def _ParseBase(self, source_ops, helper, sub_graph_nodes=None): + # Create the original base graph as described in fluid program. if sub_graph_nodes is None: sub_graph_nodes = list() self.graphIO = GraphProtoIO() self.graphIO.set_name('default_graph_name') + debug_fetch_list = self._GetDebugOuts(source_ops, helper) self._EdgeInplace(source_ops, helper) + + # add fill_constant private_data + private_data = { + 'fill_constant': {}, + } + # record every fill_constant op for affected args + fill_constant_ops = filter(lambda op: op.type == 'fill_constant', source_ops) + for op in fill_constant_ops: + for arg_name in op.output_arg_names: + private_data['fill_constant'][arg_name] = op + for source_op in source_ops: - if source_op.type not in ['feed', 'fetch']: + if source_op.type in ['feed', 'fetch', 'fill_constant']: + pass + else: main_node_name = self._NameNodeMid(source_op) in_edges = Fluid_edger() out_edges = Fluid_edger() for param in source_op.input_names: - for idx in range(0, len(helper.args_by_input_param(source_op, param))): - arg = helper.var_name_by_param(source_op, param, idx) - for tmp_op in source_ops: - if tmp_op.idx != source_op.idx and arg in tmp_op.output_arg_names: - if tmp_op.type == 'feed': - if arg not in self.graph_ins: - self.graph_ins.append(arg) - self.graphIO.add_in(self._NameNodeIn(arg)) - in_edges.add(param, self._NameNodeIn(arg), arg) - else: - tmp_node_name = self._NameNodeMid(tmp_op) - if tmp_node_name in self.inplace_nodes.keys(): - inplace_node_name = self.inplace_nodes[tmp_node_name][-1] - in_edges.add(param, inplace_node_name, arg) - elif tmp_node_name not in self._InplaceNodes('All'): - in_edges.add(param, tmp_node_name, arg) + if param not in ['InScale']: + for idx in range(0, len(helper.args_by_input_param(source_op, param))): + arg = helper.var_name_by_param(source_op, param, idx) + for tmp_op in source_ops: + if tmp_op.idx != source_op.idx and arg in tmp_op.output_arg_names: + if tmp_op.type == 'feed': + if arg not in self.graph_ins: + self.graph_ins.append(arg) + self.graphIO.add_in(self._NameNodeIn(arg)) + in_edges.add(param, self._NameNodeIn(arg), arg) + else: + tmp_node_name = self._NameNodeMid(tmp_op) + if tmp_node_name in self.inplace_nodes.keys(): + inplace_node_name = self.inplace_nodes[tmp_node_name][-1] + in_edges.add(param, inplace_node_name, arg) + elif tmp_node_name not in self._InplaceNodes('All'): + in_edges.add(param, tmp_node_name, arg) for param in source_op.output_names: - for idx in range(0, len(helper.args_by_output_param(source_op, param))): - arg = helper.var_name_by_param(source_op, param, idx) - for tmp_op in source_ops: - if tmp_op.idx != source_op.idx and arg in tmp_op.input_arg_names: - if tmp_op.type == 'fetch': - if arg not in debug_fetch_list: - arg_node_name = self._NameNodeOut(arg) - if arg not in self.graph_outs: - self.graph_outs.append(arg) - self.graphIO.add_out_fluid(arg_node_name, \ + if param not in ['OutScale']: + for idx in range(0, len(helper.args_by_output_param(source_op, param))): + extra_out = True + arg = helper.var_name_by_param(source_op, param, idx) + for tmp_op in source_ops: + if tmp_op.idx != source_op.idx and arg in tmp_op.input_arg_names: + extra_out = False + if tmp_op.type == 'fetch': + if arg not in debug_fetch_list: + arg_node_name = 
self._NameNodeOut(arg) + if arg not in self.graph_outs: + self.graph_outs.append(arg) + self.graphIO.add_out_fluid(arg_node_name, \ + main_node_name) + out_edges.add(param, arg_node_name, arg) + self.ins[arg_node_name] = Fluid_edger(bytes(source_op.idx), \ main_node_name) - out_edges.add(param, arg_node_name, arg) - self.ins[arg_node_name] = Fluid_edger(bytes(source_op.idx), \ - main_node_name) - else: - out_edges.add(param, self._NameNodeMid(tmp_op), arg) - self._AddProtoNode(main_node_name, source_op, helper, {}) + else: + out_edges.add(param, self._NameNodeMid(tmp_op), arg) + if extra_out is True and source_op.type in ['split']: + arg_node_name = self._NameNodeOut(arg) + if arg not in self.graph_outs: + self.graph_outs.append(arg) + self.graphIO.add_out_fluid(arg_node_name, \ + main_node_name) + out_edges.add(param, arg_node_name, arg) + self.ins[arg_node_name] = Fluid_edger(bytes(source_op.idx), \ + main_node_name) + self._AddProtoNode(main_node_name, source_op, helper, private_data) if main_node_name not in self._InplaceNodes('Mid'): if main_node_name not in self._InplaceNodes('End'): self.ins[main_node_name] = in_edges @@ -210,6 +244,8 @@ def _ParseBase(self, source_ops, helper, sub_graph_nodes=None): for redundant_target in self.inplace_nodes[main_node_name][1:]: self.outs[inplace_node_name].rm(redundant_target) + self.outs + def _PrintEdge(self, node, target, direction): var_name = 'Unknown' if direction == 'in': @@ -220,19 +256,29 @@ def _PrintEdge(self, node, target, direction): var_name = var[0] print node + ",\t" + target + ",\t" + var_name - def _Graph(self, need_print=False): + def _Graph(self, reverse=False, need_print=False): for node in self.ins.keys(): targets_list = self.ins[node]() - for target in targets_list: - self.graphIO.add_in_edge(target, node) + targets_scale = self.ins[node].all_scales() + for idx, target in enumerate(targets_list): + scale = targets_scale[idx] + if reverse is False: + self.graphIO.add_in_edge(target, node, scale) + else: + self.graphIO.add_out_edge(target, node, scale) for node in self.outs.keys(): targets_list = self.outs[node]() - for target in targets_list: - self.graphIO.add_out_edge(node, target) + targets_scale = self.outs[node].all_scales() + for idx, target in enumerate(targets_list): + scale = targets_scale[idx] + if reverse is False: + self.graphIO.add_out_edge(node, target, scale) + else: + self.graphIO.add_in_edge(node, target, scale) if need_print is True: self._PrintEdge(node, target, 'out') - def _ReplaceInputs(self, source_ops, helper, reshape_dict=None, layout='NCHW'): + def _ReplaceInputs(self, source_ops, helper, reshape_dict=None, layout='NCHW', quantized=False): if reshape_dict is None: reshape_dict = dict() for source_op in source_ops: @@ -251,7 +297,7 @@ def _ReplaceInputs(self, source_ops, helper, reshape_dict=None, layout='NCHW'): if shape[0] == -1: shape[0] = 1 if layout == 'NCHW': - shape = map(int, [1] * (4 - len(shape)) + shape) + shape = map(int, shape + [1] * (4 - len(shape))) if input_node_name in reshape_dict.keys(): shape = reshape_dict[input_node_name] private_data['input_shape'] = shape @@ -259,10 +305,11 @@ def _ReplaceInputs(self, source_ops, helper, reshape_dict=None, layout='NCHW'): self.outs[input_node_name] = out_edges self._AddProtoNode(input_node_name, source_op, helper, private_data) - def _InsertSplit(self, source_ops, helper): + def _InsertSplit(self, source_ops, helper, quantized=False): # If a layer has two identical output tensors, add a split layer. 
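        # Only ordinary nodes are examined below: names beginning with
        # 'split#' are split layers already inserted by this pass, and names
        # beginning with 'increment#' (nodes built from fluid `increment` ops)
        # are skipped as well.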
for node in self.outs.keys(): - if node.startswith('split#') is False: + if node.startswith('split#') is False and \ + node.startswith('increment#') is False: out_edges = self.outs[node] for param in out_edges.all_params(): out_targets_list = out_edges.targets(param) @@ -321,7 +368,7 @@ def next_out(node): cache.pop() return results - def _CropGraph(self, ins_of_subgraph, outs_of_subgraph, helper, need_io = True): + def _CropGraph(self, ins_of_subgraph, outs_of_subgraph, helper, need_io=True, quantized=False): ''' ''' def all_nodes(): @@ -364,7 +411,8 @@ def all_nodes(): self.outs[in_node_name] = Fluid_edger('_Out', node_name) self._AddProtoNode(in_node_name, None, helper, private_data, 'feed') - def _IntegrateNodes(self, main_op, main_node_name, sec_node_name, helper, private_data): + def _IntegrateNodes(self, main_op, main_node_name, sec_node_name, \ + helper, private_data, quantized=False): # Merge secondary nodes to the primary node and process the edges. self._RmProtoNode(main_node_name) self._RmProtoNode(sec_node_name) @@ -378,7 +426,7 @@ def _IntegrateNodes(self, main_op, main_node_name, sec_node_name, helper, privat self.outs[main_node_name].rm(sec_node_name) self._AddProtoNode(main_node_name, main_op, helper, private_data) - def _DealWithBias(self, source_ops, helper): + def _DealWithBias(self, source_ops, helper, quantized=False): # In fluid, the bias parameter of the conv2d is split into elementwise_add. for source_op in source_ops: if source_op.type in APPEND_BIAS_OP_TYPE: @@ -396,10 +444,12 @@ def _DealWithBias(self, source_ops, helper): new_shape = [1, shape[3], 1, 1] elt_tensor.set_shape(new_shape) private_data['bias'] = elt_tensor + if main_node_name in self.scale_dict.keys(): + private_data['scale_1'] = self.scale_dict[main_node_name] self._IntegrateNodes(source_op, main_node_name, \ elt_node_name, helper, private_data) - def _DealWithBatchnorm(self, source_ops, helper): + def _DealWithBatchnorm(self, source_ops, helper, quantized=False): # In anakin, the scale part of batchnorm layer is independent. 
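        # A fluid `batch_norm` op carries both the normalization and the
        # affine scale/shift, so the loop below keeps the original node for
        # the normalization and appends a separate 'scale_of_bn' node behind
        # it (see the '_Ins' edge added to append_node_name further down).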
for source_op in source_ops: if source_op.type == 'batch_norm': @@ -432,7 +482,7 @@ def _DealWithBatchnorm(self, source_ops, helper): self.ins[append_node_name].add('_Ins', main_node_name) self._AddProtoNode(append_node_name, source_op, helper, {}, 'scale_of_bn') - def _DealWithAxpy(self, source_ops, helper): + def _DealWithAxpy(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'elementwise_mul': mul_node_name = self._NameNodeMid(source_op) @@ -453,7 +503,7 @@ def _DealWithAxpy(self, source_ops, helper): self._RmProtoNode(mul_node_name) self._AddProtoNode(add_node_name, None, helper, {}, 'axpy') - def _DealWithPriorBox(self, source_ops, helper, is_dev_v2=True): + def _DealWithPriorBox(self, source_ops, helper, is_dev_v2=True, quantized=False): nodes_to_del = [] for source_op in source_ops: if source_op.type == 'prior_box': @@ -484,11 +534,39 @@ def _DealWithPriorBox(self, source_ops, helper, is_dev_v2=True): self._RmProtoNode(bc_node_name) self._AddProtoNode(bc_node_name, None, helper, private_data, \ 'concat_btw_priorbox_boxcoder') + elif source_op.type == 'density_prior_box': + if is_dev_v2 is True: + axis = 2 + else: + axis = 3 + private_data = {"axis": axis} + pb_node_name = self._NameNodeMid(source_op) + br_node_name = self.outs[pb_node_name].target('Boxes') + vr_node_name = self.outs[pb_node_name].target('Variances') + bc_node_name = self.outs[br_node_name].target('Out') + vc_node_name = self.outs[vr_node_name].target('Out') + boxcoder_node_name = self.outs[bc_node_name].target('Out') + self.outs[pb_node_name].mv(br_node_name, bc_node_name) + self.outs[pb_node_name].rm(vr_node_name) + self.ins[bc_node_name].mv(br_node_name, pb_node_name) + self.ins[boxcoder_node_name].rm(vc_node_name) + for node_name in [br_node_name, vr_node_name, vc_node_name]: + if node_name not in nodes_to_del: + nodes_to_del.append(node_name) + input_node_name = self.ins[pb_node_name].target('Input') + image_node_name = self.ins[pb_node_name].target('Image') + self.ins[pb_node_name].rm(input_node_name) + self.ins[pb_node_name].rm(image_node_name) + self.ins[pb_node_name].add('Input', input_node_name) + self.ins[pb_node_name].add('Image', image_node_name) + self._RmProtoNode(bc_node_name) + self._AddProtoNode(bc_node_name, None, helper, private_data, \ + 'concat_btw_priorbox_boxcoder') for node_name in nodes_to_del: self._RmProtoNode(node_name) self._ClearEdges(node_name) - def _DealWithDetectionOutput(self, source_ops, helper): + def _DealWithDetectionOutput(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'box_coder': bc_node_name = self._NameNodeMid(source_op) @@ -516,7 +594,7 @@ def _DealWithDetectionOutput(self, source_ops, helper): self._AddProtoNode(nms_node_name, nms_op, helper, \ private_data, 'multiclass_nms') - def _DealWithMultiFC(self, source_ops, helper): + def _DealWithMultiFC(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'sum': sum_node_name = self._NameNodeMid(source_op) @@ -546,7 +624,7 @@ def _DealWithMultiFC(self, source_ops, helper): self._RmProtoNode(first_mul_name) self._AddProtoNode(first_mul_name, first_mul_op, helper, private_data) - def _DealWithGru(self, source_ops, helper): + def _DealWithGru(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'gru': private_data = {} @@ -593,7 +671,7 @@ def _DealWithGru(self, source_ops, helper): if node_to_del_name is not gru_node_name: self._ClearEdges(node_to_del_name) - 
def _SearchBilstm(self, source_ops, helper): + def _SearchBilstm(self, source_ops, helper, quantized=False): comp = Fluid_comparator(helper) lstm_ops = [] for source_op in source_ops: @@ -611,7 +689,7 @@ def _SearchBilstm(self, source_ops, helper): else: return False - def _DealWithLstm(self, source_ops, helper): + def _DealWithLstm(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'lstm': private_data = {} @@ -661,7 +739,7 @@ def _DealWithLstm(self, source_ops, helper): self._ClearEdges(node_to_del_name) self._AddProtoNode(lstm_node_name, lstm_op, helper, private_data) - def _DealWithCast(self, source_ops, helper): + def _DealWithCast(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'cast': if helper.attr_data(source_op, 'out_dtype') == 5: @@ -679,7 +757,7 @@ def _DealWithCast(self, source_ops, helper): else: raise NameError('The out type of cast must be float32.') - def _DealWithArgmax(self, source_ops, helper): + def _DealWithArgmax(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'top_k': private_data = {} @@ -719,9 +797,9 @@ def _DealWithArgmax(self, source_ops, helper): self._RmProtoNode(topk_node_name) self._AddProtoNode(topk_node_name, source_op, helper, private_data) - def _RefreshReshape(self, source_ops, helper, need_assign=False): + def _RefreshReshape(self, source_ops, helper, need_assign=False, quantized=False): for source_op in source_ops: - if source_op.type == 'reshape': + if source_op.type in ['reshape', 'reshape2']: reshape_node_name = self._NameNodeMid(source_op) # Make sure this node exists in this graph. if reshape_node_name in self.ins: @@ -729,14 +807,13 @@ def _RefreshReshape(self, source_ops, helper, need_assign=False): tensor_inputs = self.ins[reshape_node_name].targets('X') if len(shape_inputs) == 1 and len(tensor_inputs) == 1: self.ins[reshape_node_name].rm(shape_inputs[0]) - if shape_inputs[0].split('#')[0] != 'assign_value' \ - or need_assign is True: + if shape_inputs[0].split('#')[0] != 'assign_value' or need_assign is True: self.ins[reshape_node_name].add('Shape', shape_inputs[0]) else: self._RmProtoNode(shape_inputs[0]) self._ClearEdges(shape_inputs[0]) - def _CutReshape(self, reshape_node_name): + def _CutReshape(self, reshape_node_name, quantized=False): branch = [] branch.append(reshape_node_name) shape_inputs = self.ins[reshape_node_name].targets('Shape') @@ -779,7 +856,7 @@ def _CutReshape(self, reshape_node_name): self._RmProtoNode(input_node_name) self._ClearEdges(input_node_name) - def _RefreshSplit(self, split_node_name, helper): + def _RefreshSplit(self, split_node_name, helper, quantized=False): outputs_of_split = self.outs[split_node_name].targets('_Out') inputs_of_split = self.ins[split_node_name].targets('_In') assert len(inputs_of_split) < 2 @@ -796,15 +873,15 @@ def _RefreshSplit(self, split_node_name, helper): self._RmProtoNode(split_node_name) self._AddProtoNode(split_node_name, None, helper, private_data, 'split_ins') - def _DealWithSoftmax(self, source_ops, helper): + def _DealWithSoftmax(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'softmax': softmax_node_name = self._NameNodeMid(source_op) outs_of_softmax = self.outs[softmax_node_name].targets('Out') ins_of_softmax = self.ins[softmax_node_name].targets('X') - if outs_of_softmax[0].split('#')[0] == 'reshape': - if ins_of_softmax[0].split('#')[0] == 'reshape' or \ - ins_of_softmax[0].split('#')[0] == 
'flatten': + if outs_of_softmax[0].split('#')[0] in ['reshape', 'reshape2']: + if ins_of_softmax[0].split('#')[0] in ['reshape', 'reshape2'] or \ + ins_of_softmax[0].split('#')[0] in ['flatten', 'flatten2']: private_data = {} private_data['axis'] = 3 self._CutReshape(outs_of_softmax[0]) @@ -816,7 +893,7 @@ def _DealWithSoftmax(self, source_ops, helper): if ins_of_softmax[0].startswith('split'): self._RefreshSplit(ins_of_softmax[0], helper) - def _DealWithMatmal(self, source_ops, helper): + def _DealWithMatmal(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'matmul': matmul_node_name = self._NameNodeMid(source_op) @@ -844,7 +921,7 @@ def _DealWithMatmal(self, source_ops, helper): self._RmProtoNode(matmul_node_name) self._AddProtoNode(matmul_node_name, source_op, helper, private_data) - def _DealWithDiscBatchNorm(self, source_ops, helper): + def _DealWithDiscBatchNorm(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'batch_norm': discrete_flag = True @@ -860,7 +937,7 @@ def _DealWithDiscBatchNorm(self, source_ops, helper): self._RmProtoNode(bn_node_name) self._AddProtoNode(bn_node_name, source_op, helper, {}, 'disc_bn') - def _DealWithSSD(self, source_ops, helper): + def _DealWithSSD(self, source_ops, helper, quantized=False): for source_op in source_ops: if source_op.type == 'softmax': private_data = dict() @@ -877,6 +954,209 @@ def _DealWithSSD(self, source_ops, helper): self._RmProtoNode(sm_node_name) self._AddProtoNode(sm_node_name, source_op, helper, private_data, 'softmax') + + def _DealWithPixelShuffle(self, source_ops, helper, quantized=False): + for source_op in source_ops: + if source_op.type in ['transpose', 'transpose2']: + axis = helper.attr_data(source_op, 'axis') + if axis == [0, 1, 4, 2, 5, 3]: + private_data = dict() + ts_node_name = self._NameNodeMid(source_op) + in_of_transpose = self.ins[ts_node_name].target('X') + out_of_transpose = self.outs[ts_node_name].target('Out') + if in_of_transpose.startswith('reshape') and \ + out_of_transpose.startswith('reshape'): + in_reshape_op = self._GetOp(source_ops, in_of_transpose) + out_reshape_op = self._GetOp(source_ops, out_of_transpose) + in_shape = helper.attr_data(in_reshape_op, 'shape') + out_shape = helper.attr_data(out_reshape_op, 'shape') + private_data['factor'] = out_shape[-1] / in_shape[-1] + in_first_reshape = self.ins[in_of_transpose].target('X') + out_last_reshape = self.outs[out_of_transpose].target('Out') + self.outs[in_first_reshape].mv(in_of_transpose, ts_node_name) + self.outs[ts_node_name].mv(out_of_transpose, out_last_reshape) + self.ins[out_last_reshape].mv(out_of_transpose, ts_node_name) + self.ins[ts_node_name].mv(in_of_transpose, in_first_reshape) + self._RmProtoNode(in_of_transpose) + self._RmProtoNode(out_of_transpose) + self._ClearEdges(in_of_transpose) + self._ClearEdges(out_of_transpose) + self._RmProtoNode(ts_node_name) + self._AddProtoNode(ts_node_name, None, helper, \ + private_data, 'pixel_shuffle') + + def _DealWithShuffleChannel(self, source_ops, helper, quantized=False): + for source_op in source_ops: + if source_op.type in ['transpose', 'transpose2']: + axis = helper.attr_data(source_op, 'axis') + if axis == [0, 2, 1, 3, 4]: + private_data = dict() + ts_node_name = self._NameNodeMid(source_op) + in_of_transpose = self.ins[ts_node_name].target('X') + out_of_transpose = self.outs[ts_node_name].target('Out') + if in_of_transpose.startswith('reshape') and \ + out_of_transpose.startswith('reshape'): + 
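                        # reshape -> transpose(axis=[0, 2, 1, 3, 4]) -> reshape
                        # is the channel-shuffle pattern: `group` is recovered
                        # below as out_shape[-3] / in_shape[-3] of the two
                        # reshape ops, and the three fluid nodes are collapsed
                        # into a single ShuffleChannel node.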
in_reshape_op = self._GetOp(source_ops, in_of_transpose) + out_reshape_op = self._GetOp(source_ops, out_of_transpose) + in_shape = helper.attr_data(in_reshape_op, 'shape') + out_shape = helper.attr_data(out_reshape_op, 'shape') + private_data['group'] = out_shape[-3] / in_shape[-3] + in_first_reshape = self.ins[in_of_transpose].target('X') + out_last_reshape = self.outs[out_of_transpose].target('Out') + self.outs[in_first_reshape].mv(in_of_transpose, ts_node_name) + self.outs[ts_node_name].mv(out_of_transpose, out_last_reshape) + self.ins[out_last_reshape].mv(out_of_transpose, ts_node_name) + self.ins[ts_node_name].mv(in_of_transpose, in_first_reshape) + self._RmProtoNode(in_of_transpose) + self._RmProtoNode(out_of_transpose) + self._ClearEdges(in_of_transpose) + self._ClearEdges(out_of_transpose) + self._RmProtoNode(ts_node_name) + self._AddProtoNode(ts_node_name, None, helper, \ + private_data, 'shuffle_channel') + + def _DealWithAnchorGenerator(self, source_ops, helper, quantized=False): + for source_op in source_ops: + if source_op.type == 'anchor_generator': + private_data = dict() + ag_node_name = self._NameNodeMid(source_op) + out_edges = self.outs[ag_node_name] + for param in out_edges.all_params(): + arg = helper.args_by_output_param(source_op, param) + out_target = out_edges.target(param) + if out_target.startswith('generate_proposals') is False: + raise NameError('ERROR: Unknown output of AnchorGenerator.') + private_data['split_num'] = 1 + split_node_name = 'split#' + \ + bytes(out_edges.all_params().index(param)) + '#' + ag_node_name + self._InitEdges(split_node_name) + self.outs[ag_node_name].reset_target_by_param(param, split_node_name) + in_edges = self.ins[out_target] + in_op = self._GetOp(source_ops, out_target) + for in_param in in_edges.all_params(): + in_arg = helper.args_by_input_param(in_op, in_param) + if in_arg == arg: + self.ins[out_target].reset_target_by_param(in_param, split_node_name) + self.outs[split_node_name].add('_Out', out_target) + self._AddPairEdges(ag_node_name, split_node_name, param, '_In') + self._AddProtoNode(split_node_name, None, helper, private_data, 'split_ins') + + def _DealWithGenerateProposals(self, source_ops, helper, quantized=False): + for source_op in source_ops: + if source_op.type == 'generate_proposals': + gp_node_name = self._NameNodeMid(source_op) + targets = self.outs[gp_node_name].all_targets() + if len(targets) == 1 is True or targets[0].startswith('split#') is True: + arg_node_name = 'temp_out_of_generate_proposals' + self.graph_outs.append(arg_node_name) + self.graphIO.add_out_fluid(arg_node_name, \ + gp_node_name) + self.outs[gp_node_name].add('temp_out', arg_node_name) + self.ins[arg_node_name] = Fluid_edger(bytes(source_op.idx), \ + gp_node_name) + ''' + anchors_in = self.ins[gp_node_name].target('Anchors') + bboxdeltas_in = self.ins[gp_node_name].target('BboxDeltas') + iminfo_in = self.ins[gp_node_name].target('ImInfo') + scores_in = self.ins[gp_node_name].target('Scores') + variances_in = self.ins[gp_node_name].target('Variances') + targets_in = [anchors_in, bboxdeltas_in, iminfo_in, \ + scores_in, variances_in] + for target_in in targets_in: + self.ins[gp_node_name].rm(target_in) + self.ins[gp_node_name].add('Anchors', anchors_in) + self.ins[gp_node_name].add('BboxDeltas', bboxdeltas_in) + self.ins[gp_node_name].add('ImInfo', iminfo_in) + self.ins[gp_node_name].add('Scores', scores_in) + self.ins[gp_node_name].add('Variances', variances_in) + ''' + + def _DelIncInQuantize(self, source_ops, helper, quantized=False): + for 
source_op in source_ops: + if source_op.type in ['increment']: + inc_node_name = self._NameNodeMid(source_op) + self._RmProtoNode(inc_node_name) + self._ClearEdges(inc_node_name) + + def _DealWithQuantize(self, source_ops, helper, quantized=False): + for source_op in source_ops: + if source_op.type in FLUID_QUANTIZE_LAYERS: + qt_node_name = self._NameNodeMid(source_op) + in_of_qt = self.ins[qt_node_name].target('X') + out_param_of_in = self.outs[in_of_qt].all_params()[0] + outs_of_qt = self.outs[qt_node_name].targets('Out') + qt_node = self._GetOp(source_ops, qt_node_name) + in_scale = helper.data_with_shape_by_param(qt_node, 'InScale')[0][0] + in_scale = in_scale / 127 + self.outs[in_of_qt].rm(qt_node_name) + for out_of_qt in outs_of_qt: + op_out_q = self._GetOp(source_ops, out_of_qt) + param_name = out_param_of_in + self.outs[in_of_qt].add(param_name, out_of_qt, None, in_scale) + self.ins[out_of_qt].mv(qt_node_name, in_of_qt) + self.ins[out_of_qt].set_scale(in_of_qt, in_scale) + self._RmProtoNode(qt_node_name) + self._ClearEdges(qt_node_name) + self._DelIncInQuantize(source_ops, helper, quantized) + + def _DealWithDequantize(self, source_ops, helper, quantized=False): + for source_op in source_ops: + if source_op.type in FLUID_DEQUANTIZE_LAYERS: + private_data = dict() + qt_node_name = self._NameNodeMid(source_op) + qt_node = self._GetOp(source_ops, qt_node_name) + in_of_qt = self.ins[qt_node_name].target('X') + out_of_qt = self.outs[qt_node_name].target('Out') + op_in_q = self._GetOp(source_ops, in_of_qt) + scale_of_weight = helper.attr_data(source_op, 'max_range') + scale_of_weight = 127 / scale_of_weight + self.scale_dict[in_of_qt] = [scale_of_weight] + private_data['scale_1'] = self.scale_dict[in_of_qt] + scale = helper.data_with_shape_by_param(qt_node, 'Scale')[0][0] + scale = scale / 127 + self.outs[in_of_qt].mv(qt_node_name, out_of_qt) + self.outs[in_of_qt].set_scale(out_of_qt, scale) + self.ins[out_of_qt].mv(qt_node_name, in_of_qt) + self.ins[out_of_qt].set_scale(in_of_qt, scale) + self._RmProtoNode(qt_node_name) + self._ClearEdges(qt_node_name) + self._RmProtoNode(in_of_qt) + self._AddProtoNode(in_of_qt, op_in_q, helper, private_data) + + def _DealWithRoiAlign(self, source_ops, helper, quantized=False): + for source_op in source_ops: + if source_op.type == 'roi_align': + ra_node_name = self._NameNodeMid(source_op) + x_in_of_ra = self.ins[ra_node_name].target('X') + rois_in_of_ra = self.ins[ra_node_name].target('ROIs') + self.ins[ra_node_name].rm(x_in_of_ra) + self.ins[ra_node_name].rm(rois_in_of_ra) + self.ins[ra_node_name].add('X', x_in_of_ra, None) + self.ins[ra_node_name].add('ROIs', rois_in_of_ra, None) + + def _FusionSequencePoolConcat(self, source_ops, helper, slot_num=1, quantized=False): + for source_op in source_ops: + if source_op.type == 'sequence_pool': + seqpool_node_name = self._NameNodeMid(source_op) + if seqpool_node_name in self.ins: + op_seqpool = self._GetOp(source_ops, seqpool_node_name) + in_of_sp = self.ins[seqpool_node_name].target('X') + concat_node_name = self.outs[seqpool_node_name].target('Out') + out_of_concat = self.outs[concat_node_name].target('Out') + private_data = {'axis': 1, + 'slot_num': slot_num} + self.outs[seqpool_node_name].mv(concat_node_name, out_of_concat) + self.ins[out_of_concat].mv(concat_node_name, seqpool_node_name) + self._RmProtoNode(concat_node_name) + self._ClearEdges(concat_node_name) + self._RmProtoNode(seqpool_node_name) + self._AddProtoNode(seqpool_node_name, op_seqpool, helper, \ + private_data, 'seqpool_concat') + + def 
_DealWithFeedSequencePool(self, source_ops, helper, quantized=False): + self._CropGraph(['input_0'], ['fc_5.tmp_2_gout'], helper) + self._FusionSequencePoolConcat(source_ops, helper, 176) + def _NewCommonLayer(self, source_ops, in_target, @@ -886,7 +1166,8 @@ def _NewCommonLayer(self, layer_type, private_data, helper, - insert_mode=True): + insert_mode=True, + quantized=False): main_layer = layer_type + '_after_' + in_target if insert_mode is True: if in_target in self.ins[out_target].all_targets() and \ @@ -902,7 +1183,7 @@ def _NewCommonLayer(self, self.outs[main_layer] = Fluid_edger(out_param, out_target) self._AddProtoNode(main_layer, None, helper, private_data, layer_type) - def _ParseNetwork(self, source_ops, helper): + def _ParseNetwork(self, source_ops, helper, quantized=False): self._ParseBase(source_ops, helper) if self.NetType == "FLUIDBASE": pass @@ -913,21 +1194,34 @@ def _ParseNetwork(self, source_ops, helper): elif self.NetType == "ROUTEDNN": reshape_dict['input_0'] = [1, 37, 1, 1] self._ReplaceInputs(source_ops, helper, reshape_dict) + self._DealWithQuantize(source_ops, helper) + self._DealWithDequantize(source_ops, helper) self._InsertSplit(source_ops, helper) + self._DealWithBias(source_ops, helper) self._DealWithGru(source_ops, helper) self._DealWithLstm(source_ops, helper) - self._DealWithBias(source_ops, helper) self._DealWithBatchnorm(source_ops, helper) self._DealWithMultiFC(source_ops, helper) self._DealWithArgmax(source_ops, helper) self._DealWithAxpy(source_ops, helper) + self._DealWithPixelShuffle(source_ops, helper) + self._DealWithShuffleChannel(source_ops, helper) + if self.NetType == "FASTRCNN": + self._DealWithAnchorGenerator(source_ops, helper) + self._DealWithGenerateProposals(source_ops, helper) + self._DealWithRoiAlign(source_ops, helper) if self.NetType == "SSD": self._DealWithPriorBox(source_ops, helper) self._DealWithDetectionOutput(source_ops, helper) self._DealWithSoftmax(source_ops, helper) self._DealWithSSD(source_ops, helper) self._RefreshReshape(source_ops, helper) - self._Graph() + if self.NetType == "FEED": + self._DealWithFeedSequencePool(source_ops, helper) + if self.Debug == 'IN': + self._Graph(True, False) + else: + self._Graph(False, False) def _Parsing(self): with fluid.scope_guard(self.scope): @@ -941,8 +1235,26 @@ def _Parsing(self): fluid.io.load_inference_model(self.ModelPath, self.exe) global_block = self.net_program.global_block() + source_ops = list(global_block.ops) - helper = Fluid_helper(self.scope, global_block) + helper = Fluid_helper(self.scope, global_block, self.net_program) self._ParseNetwork(source_ops, helper) + + self._hard_decode() + return self.graphIO + + def _hard_decode(self): + """deeplabv3 hard decode + """ + if self.NetType == 'deeplabv3': + # deeplab_v3 hard decode + drop_list = [ + 'cast#700(tmp_22)', + ] + proto_helper.drop_nodes(self.graphIO.graph_proto, drop_list) + proto_helper.add_edge( + self.graphIO.graph_proto, + 'arg_max#699(arg_max_0)', + 'scale#701(save_infer_model/scale_0)') diff --git a/tools/external_converter_v2/parser/fluid/proto_helper.py b/tools/external_converter_v2/parser/fluid/proto_helper.py new file mode 100644 index 000000000..409d55647 --- /dev/null +++ b/tools/external_converter_v2/parser/fluid/proto_helper.py @@ -0,0 +1,39 @@ +from .. 
import proto + +def add_edge(graph_proto, bottom, top): + """add_edge in graph_proto + """ + bottom_target = proto.TargetProto() + bottom_target.node = top + graph_proto.edges_out[bottom].target.extend([bottom_target]) + top_target = proto.TargetProto() + top_target.node = bottom + graph_proto.edges_in[top].target.extend([top_target]) + + +def drop_nodes(graph_proto, drop_list): + """drop nodes of graph_proto + """ + tmp_nodes = filter(lambda node: node.name not in drop_list, graph_proto.nodes) + del graph_proto.nodes[:] + graph_proto.nodes.extend(tmp_nodes) + + for drop_node in drop_list: + if drop_node in graph_proto.edges_in: + del graph_proto.edges_in[drop_node] + if drop_node in graph_proto.edges_out: + del graph_proto.edges_out[drop_node] + if drop_node in graph_proto.edges_info: + del graph_proto.edges_info[drop_node] + + for edge_name in graph_proto.edges_in: + targets = graph_proto.edges_in[edge_name].target + tmp_targets = filter(lambda target: target.node not in drop_list, targets) + del targets[:] + targets.extend(tmp_targets) + + for edge_name in graph_proto.edges_out: + targets = graph_proto.edges_out[edge_name].target + tmp_targets = filter(lambda target: target.node not in drop_list, targets) + del targets[:] + targets.extend(tmp_targets) diff --git a/tools/external_converter_v2/parser/fluid/tools/feed_ones.py b/tools/external_converter_v2/parser/fluid/tools/feed_ones.py index aa1fbab58..d0fced3ce 100644 --- a/tools/external_converter_v2/parser/fluid/tools/feed_ones.py +++ b/tools/external_converter_v2/parser/fluid/tools/feed_ones.py @@ -17,6 +17,7 @@ GLB_arg_name = '' GLB_batch_size = 1 + def load_inference_model(model_path, exe): ''' ''' @@ -27,6 +28,7 @@ def load_inference_model(model_path, exe): else: return fluid.io.load_inference_model(model_path, exe) + def feed_ones(block, feed_target_names, batch_size=1): """ """ @@ -52,6 +54,36 @@ def fill_ones(var_name, batch_size): feed_dict[feed_target_name] = fill_ones(feed_target_name, batch_size) return feed_dict + +def feed_randn(block, feed_target_names, batch_size=1, need_save=True): + """ + """ + feed_dict = dict() + def set_batch_size(shape, batch_size): + if shape[0] == -1: + shape[0] = batch_size + return shape + def fill_randn(var_name, batch_size, need_save): + var = block.var(var_name) + np_shape = set_batch_size(list(var.shape), 1) + var_np = { + core.VarDesc.VarType.BOOL: np.bool_, + core.VarDesc.VarType.INT32: np.int32, + core.VarDesc.VarType.INT64: np.int64, + core.VarDesc.VarType.FP16: np.float16, + core.VarDesc.VarType.FP32: np.float32, + core.VarDesc.VarType.FP64: np.float64, + } + np_dtype = var_np[var.dtype] + numpy_array = np.random.random(np_shape).astype(np.float32) + if need_save is True: + numpy_to_txt(numpy_array, 'feed_' + var_name + '.txt', True) + return numpy_array + for feed_target_name in feed_target_names: + feed_dict[feed_target_name] = fill_randn(feed_target_name, batch_size, need_save) + return feed_dict + + def draw(block, filename='debug'): """ """ @@ -61,6 +93,7 @@ def draw(block, filename='debug'): cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path] subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + def fetch_tmp_vars(block, fetch_targets, var_names_list=None): """ """ @@ -91,6 +124,28 @@ def var_names_of_fetch(fetch_targets): i = i + 1 return new_fetch_vars + +def numpy_var(scope, var_name): + """ + get numpy data by the name of var. 
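+ Uses fluid.executor._fetch_var when it is available and otherwise falls back to fluid.executor.fetch_var, so both Fluid API generations are handled.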
+ """ + if hasattr(fluid.executor, '_fetch_var'): + numpy_array = fluid.executor._fetch_var(var_name, scope, True) + elif hasattr(fluid.executor, 'fetch_var'): + numpy_array = fluid.executor.fetch_var(var_name, scope, True) + else: + raise NameError('ERROR: Unknown Fluid version.') + return numpy_array + + +def var_dtype(block, var_name): + """ + get dtype of fluid var. + """ + var = block.var(var_name) + return var.dtype + + def print_ops_type(block): """ """ @@ -106,7 +161,8 @@ def ops_type(block): for op_type in type_cache: print op_type -def print_results(results, fetch_targets, need_save=True): + +def print_results(results, fetch_targets, need_save=False): """ """ for result in results: @@ -114,11 +170,25 @@ def print_results(results, fetch_targets, need_save=True): print fetch_targets[idx] print np.array(result) if need_save is True: - fluid_fetch_list = list(np.array(result).flatten()) - fetch_txt_fp = open('result_' + fetch_targets[idx].name + '.txt', 'w') - for num in fluid_fetch_list: - fetch_txt_fp.write(str(num) + '\n') - fetch_txt_fp.close() + numpy_to_txt(result, 'result_' + fetch_targets[idx].name, True) + + +def numpy_to_txt(numpy_array, save_name, print_shape=True): + """ + transform numpy to txt. + """ + np_array = np.array(numpy_array) + fluid_fetch_list = list(np_array.flatten()) + fetch_txt_fp = open(save_name + '.txt', 'w') + for num in fluid_fetch_list: + fetch_txt_fp.write(str(num) + '\n') + if print_shape is True: + fetch_txt_fp.write('Shape: (') + for val in np_array.shape: + fetch_txt_fp.write(str(val) + ', ') + fetch_txt_fp.write(')\n') + fetch_txt_fp.close() + def fluid_inference_test(model_path): """ @@ -132,13 +202,15 @@ def fluid_inference_test(model_path): fetch_targets] = load_inference_model(model_path, exe) global_block = net_program.global_block() draw(global_block) - feed_list = feed_ones(global_block, feed_target_names) + feed_list = feed_ones(global_block, feed_target_names, 1) + #feed_list = feed_randn(global_block, feed_target_names, 1, need_save=True) fetch_targets = fetch_tmp_vars(global_block, fetch_targets, [GLB_arg_name]) results = exe.run(program=net_program, feed=feed_list, fetch_list=fetch_targets, return_numpy=False) - print_results(results, fetch_targets) + print_results(results, fetch_targets, need_save=False) + if __name__ == "__main__": if len(sys.argv) == 1: diff --git a/tools/external_converter_v2/parser/graph.py b/tools/external_converter_v2/parser/graph.py index f6738f981..fde28582a 100644 --- a/tools/external_converter_v2/parser/graph.py +++ b/tools/external_converter_v2/parser/graph.py @@ -35,6 +35,12 @@ def __init__(self, config): elif config.framework == 'FLUID': from fluid import FluidParser self.parser = FluidParser(config.framework_config_dict) + elif config.framework == 'ONNX': + from onnx import OnnxParser + self.parser = OnnxParser(config.framework_config_dict) + elif config.framework == 'HOUYI': + from houyi import HouyiParser + self.parser = HouyiParser(config.framework_config_dict) else: raise NameError('ERROR: GrapProtoIO not support %s model.' 
% (config.framework)) self.graph_io = self.parser() @@ -96,7 +102,7 @@ def run_with_server(self, ip="0.0.0.0", port=8888): """ return self.graph_io, self.config - def serialization(self): + def serialization(self): """ serialize to disk """ diff --git a/tools/external_converter_v2/parser/graph_io.py b/tools/external_converter_v2/parser/graph_io.py index af463c3d1..271fe5524 100644 --- a/tools/external_converter_v2/parser/graph_io.py +++ b/tools/external_converter_v2/parser/graph_io.py @@ -7,35 +7,37 @@ from utils import * from proto import * + class NodeAttrWrapper(object): """ """ + def __init__(self): self.value_data = valueType() def __call__(self, data, data_type_str): """ """ - if data_type_str == type(""): # type string + if data_type_str == type(""): # type string self.value_data.s = data self.value_data.type = STR - elif data_type_str == type(int()): # type int + elif data_type_str == type(int()): # type int self.value_data.i = data self.value_data.type = INT32 - elif data_type_str == type(float()): # type float + elif data_type_str == type(float()): # type float self.value_data.f = data self.value_data.type = FLOAT - elif data_type_str == type(bool()): # type bool + elif data_type_str == type(bool()): # type bool self.value_data.b = data self.value_data.type = BOOLEN - elif data_type_str == type(TensorProtoIO()): # type tensor + elif data_type_str == type(TensorProtoIO()): # type tensor self.value_data.tensor.CopyFrom(data()) self.value_data.type = TENSOR - elif data_type_str == type(unicode()): # not used + elif data_type_str == type(unicode()): # not used return self.value_data - elif data_type_str == type(list()): # type shape + elif data_type_str == type(list()): # type shape self.value_data.type = CACHE_LIST - if len(data): # in case of error(empty data list): index out of range + if len(data): # in case of error(empty data list): index out of range if type(data[0]) == type(float()): self.value_data.cache_list.f[:] = data self.value_data.cache_list.type = FLOAT @@ -52,7 +54,7 @@ def __call__(self, data, data_type_str): self.value_data.cache_list.s[:] = data self.value_data.cache_list.type = STR self.value_data.cache_list.size = len(data) - elif type(data[0]) == type(data): # Recursive Structures of list..[list...] (deep num is only 2) + elif type(data[0]) == type(data): # Recursive Structures of list..[list...] (deep num is only 2) self.value_data.cache_list.type = CACHE_LIST self.value_data.cache_list.size = len(data) for idx, list_one in enumerate(data): @@ -63,7 +65,10 @@ def __call__(self, data, data_type_str): data_cache.size = len(list_one) self.value_data.cache_list.l.extend([data_cache]) else: - raise NameError('ERROR: UnSupport Recursive list data type(%s) in list ' % (str(type(list_one[0])))) + raise NameError( + 'ERROR: UnSupport Recursive list data type(%s) in list ' + % (str(type(list_one[0]))) + ) else: raise NameError('ERROR: UnSupport data type(%s) in list ' % (str(type(data[0])))) else: @@ -78,16 +83,28 @@ def __call__(self, data, data_type_str): class TensorProtoIO(object): """ """ - def __init__(self): + + def __init__(self, proto=None): """ """ - self.tensor_proto = TensorProto() - + self.tensor_proto = None + if proto is None: + self.tensor_proto = TensorProto() + else: + self.tensor_proto = proto + + def set_shared(self, is_shared): + self.tensor_proto.shared = is_shared + + def set_shared_from(self, shared_node_name): + # current tensor is shared from the node shared_node_name if it needs. 
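+ # used together with set_shared(True) above: 'shared' marks the tensor as shared, while 'share_from' records which node's tensor it reuses.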
+ self.tensor_proto.share_from = shared_node_name + def set_data_type(self, data_type): - self.tensor_proto.data.type = data_type + self.tensor_proto.data.type = data_type def get_shape(self): - return self.tensor_proto.shape.dim.value + return self.tensor_proto.shape.dim.value def set_shape(self, shape_list): """ @@ -116,9 +133,13 @@ def set_data(self, data_list, data_type): if data_type == "string": self.tensor_proto.data.s[:] = data_list self.tensor_proto.data.type = STR - elif data_type == "int": + elif data_type == "int32": self.tensor_proto.data.i[:] = data_list - self.tensor_proto.data.type = INT + self.tensor_proto.data.type = INT32 + elif data_type == "int8": + assert type(data_list) is str + self.tensor_proto.data.c = data_list + self.tensor_proto.data.type = INT8 elif data_type == "float": self.tensor_proto.data.f[:] = data_list self.tensor_proto.data.type = FLOAT @@ -129,6 +150,16 @@ def set_data(self, data_list, data_type): raise NameError('ERROR: Unknown data type (%s) in message CacheDate' % (data_type)) self.tensor_proto.data.size = len(data_list) + def set_scale(self, data_list, data_type): + """ + """ + if data_type == "float": + self.tensor_proto.scale.f[:] = data_list + self.tensor_proto.scale.type = FLOAT + else: + raise NameError('ERROR: Unknown data type (%s) in message CacheDate' % (data_type)) + self.tensor_proto.scale.size = len(data_list) + def __call__(self): return self.tensor_proto @@ -136,10 +167,14 @@ def __call__(self): class OpsProtoIO(object): """ """ - def __init__(self): + def __init__(self, proto=None): """ """ - self.op_proto = OpsProto() + self.op_proto = None + if proto is None: + self.op_proto = OpsProto() + else: + self.op_proto = proto def set_name(self, op_name): self.op_proto.name = op_name @@ -159,15 +194,19 @@ def set_desc(self, description): def __call__(self): return self.op_proto - class NodeProtoIO(object): """ Node io class of NodeProto """ - def __init__(self): + + def __init__(self, proto=None): """ """ - self.node_proto = NodeProto() + self.node_proto = None + if proto is None: + self.node_proto = NodeProto() + else: + self.node_proto = proto self.attr_warpper = NodeAttrWrapper() def set_name(self, node_name): @@ -182,6 +221,12 @@ def add_out(self, node_name): def set_op(self, operator=OpsProto()): self.node_proto.Op.CopyFrom(operator) + def set_bit_type(self, bit_type): + """ + Bit width setting. + """ + self.node_proto.bit_type = bit_type + def add_attr(self, value_name, data, data_type_str): """ set tensor data: @@ -206,16 +251,21 @@ class GraphProtoIO(object): """ Graph io class of GraphProto. """ - def __init__(self): + + def __init__(self, proto=None): """ """ - self.graph_proto = GraphProto() + self.graph_proto = None + if proto is None: + self.graph_proto = GraphProto() + else: + self.graph_proto = proto def serialization(self, file_path): """ Serialize to disk. """ - #self._get_graph_proto(); + # self._get_graph_proto(); with open(file_path, "wb") as f: f.write(self.graph_proto.SerializeToString()) f.close() @@ -245,56 +295,85 @@ def rm_node(self, node): del self.graph_proto.nodes[index] else: raise NameError('ERROR: (%s) node not exist.' 
% (node)) - + def find_node_proto(self, node_name): for node in self.graph_proto.nodes: if node.name == node_name: return node - def get_edge_nexts(self, node_name_0): + + def get_node_io(self, node_name): + """ + get node's io by name + """ + node_proto = self.find_node_proto(node_name) + assert node_proto is not None + node_io = NodeProtoIO(node_proto) + return node_io + + def get_edge_nexts(self, node_name, with_info=False): """ get edge's next node_name """ - if node_name_0 in self.graph_proto.edges_out: - return list(self.graph_proto.edges_out[node_name_0].val[:]) - else: - return [] + edges_out = self.graph_proto.edges_out + nexts = list() + if node_name in edges_out: + if with_info is False: + for target in edges_out[node_name].target: + nexts.append(target.node) + else: + nexts = edges_out[node_name].target[:] + return nexts def rm_edge(self, node_name_0, node_name_1): """ remove edge is directive from node_name_0 to node_name_1 """ if node_name_0 in self.graph_proto.edges_out: - index = -1 - for idx, node_name in enumerate(self.graph_proto.edges_out[node_name_0].val): - if node_name == node_name_1: + index = -1 + for idx, target in enumerate(self.graph_proto.edges_out[node_name_0].target): + if target.node == node_name_1: index = idx break if index >= 0: - #print "suc in " + node_name_0 + " -> " + node_name_1 + " idx: " + str(index) - del self.graph_proto.edges_out[node_name_0].val[index] + # print "suc in " + node_name_0 + " -> " + node_name_1 + " idx: " + str(index) + del self.graph_proto.edges_out[node_name_0].target[index] if node_name_1 in self.graph_proto.edges_in: index = -1 - for idx, node_name in enumerate(self.graph_proto.edges_in[node_name_1].val): - if node_name == node_name_0: + for idx, target in enumerate(self.graph_proto.edges_in[node_name_1].target): + if target.node == node_name_0: index = idx break if index >= 0: - #print "suc in " + node_name_0 + " -> " + node_name_1 + " idx: " + str(index) - del self.graph_proto.edges_in[node_name_1].val[index] + # print "suc in " + node_name_0 + " -> " + node_name_1 + " idx: " + str(index) + del self.graph_proto.edges_in[node_name_1].target[index] - def add_in_edge(self, node_name_0, node_name_1): + def add_in_edge(self, node_name_0, node_name_1, scale=None): """ add_in_edge is directive from node_name_0 to node_name_1 """ - if node_name_0 not in self.graph_proto.edges_in[node_name_1].val: - self.graph_proto.edges_in[node_name_1].val.append(node_name_0) + edges_in = self.graph_proto.edges_in + nexts = list() + for target in edges_in[node_name_1].target: + nexts.append(target.node) + if node_name_0 not in nexts: + target = edges_in[node_name_1].target.add() + if scale is not None: + target.scale.append(scale) + target.node = node_name_0 - def add_out_edge(self, node_name_0, node_name_1): + def add_out_edge(self, node_name_0, node_name_1, scale=None): """ add_out_edge is directive from node_name_0 to node_name_1 """ - if node_name_1 not in self.graph_proto.edges_out[node_name_0].val: - self.graph_proto.edges_out[node_name_0].val.append(node_name_1) + edges_out = self.graph_proto.edges_out + nexts = list() + for target in edges_out[node_name_0].target: + nexts.append(target.node) + if node_name_1 not in nexts: + target = edges_out[node_name_0].target.add() + if scale is not None: + target.scale.append(scale) + target.node = node_name_1 def add_in(self, node_name): self.graph_proto.ins.append(node_name) @@ -306,8 +385,6 @@ def rm_in(self, node_name): idx = graph_ins.index(in_name) del graph_ins[idx] self.graph_proto.ins[:] = 
graph_ins - print 'self.graph_proto.ins[:]' - print self.graph_proto.ins[:] def ins(self): return list(self.graph_proto.ins) @@ -352,6 +429,10 @@ def rm_out(self, node_name): self.graph_proto.outs[:] = graph_outs def format_edge_from_nodes(self): + """ + format edge from nodes with input and output list + :return: + """ in_set = set() out_set = set() for node in self.graph_proto.nodes: @@ -373,4 +454,3 @@ def format_edge_from_nodes(self): def __call__(self): return self.graph_proto - diff --git a/tools/external_converter_v2/parser/graph_to_json.py b/tools/external_converter_v2/parser/graph_to_json.py index e4306e1d3..64c09103a 100644 --- a/tools/external_converter_v2/parser/graph_to_json.py +++ b/tools/external_converter_v2/parser/graph_to_json.py @@ -31,6 +31,20 @@ def __init__(self, graph_io=GraphProtoIO()): # decide layout #self.get_layout_coordinate() + def get_edge_nexts(self, node_name, with_info=False): + """ + get edge's next node_name + """ + edges_out = self.graph_proto.edges_out + nexts = list() + if node_name in edges_out: + if with_info is False: + for target in edges_out[node_name].target: + nexts.append(target.node) + else: + nexts = edges_out[node_name].target[:] + return nexts + def get_layout_coordinate(self): """ get layout coordinate of node in graph board @@ -53,7 +67,7 @@ def get_layout_coordinate(self): x = self.map_node_to_coordinate[node_proto.name][0] y = self.map_node_to_coordinate[node_proto.name][1] inc_step = 0 - for next_node_name in self.graph_proto.edges_out[node_proto.name].val: + for next_node_name in self.get_edge_nexts(node_proto.name): self.map_node_to_coordinate[next_node_name] = [0, 0] self.map_node_to_coordinate[next_node_name][0] = x + inc_step inc_step = inc_step + horizon_step @@ -91,7 +105,7 @@ def create_edges(self): new_color = lambda: ("#%02X%02X%02X" % (r(), r(), r())) for node_proto in self.graph_proto.nodes: if node_proto.name in self.graph_proto.edges_out: - for node_name in self.graph_proto.edges_out[node_proto.name].val: + for node_name in self.get_edge_nexts(node_proto.name): edge_name = node_proto.name + '_' + node_name if edge_name in self.graph_proto.edges_info: tensor_proto = self.graph_proto.edges_info[edge_name] @@ -104,7 +118,7 @@ def create_edges(self): edges = [] for node_proto in self.graph_proto.nodes: if node_proto.name in self.graph_proto.edges_out: - for node_name in self.graph_proto.edges_out[node_proto.name].val: + for node_name in self.get_edge_nexts(node_proto.name): edge_name = node_proto.name + '_' + node_name tensor_name = "" shared = "" @@ -182,6 +196,22 @@ def create_attr(self): type=type_str, value=str(value)) node_attrs.append(target_attr()) + # Quantitative information + name = 'bit_mode' + type_str = 'type' + if node_proto.bit_type == FLOAT: + value = 'FLOAT32' + elif node_proto.bit_type == INT8: + value = 'INT8' + elif node_proto.bit_type == STR: + value = 'UNKNOWN' + else: + raise NameError('ERROR: Unknown data type (%d) in message valueType' \ + % (node_proto.bit_type)) + target_attr = CreateJson(id=name, + type=type_str, + value=str(value)) + node_attrs.append(target_attr()) node_map = CreateJson(key_name=key_id, key_attrs=node_attrs) attrs.append(node_map()) diff --git a/tools/external_converter_v2/parser/lego/parser_lego_test.py b/tools/external_converter_v2/parser/lego/parser_lego_test.py index c3a087911..053cf5eac 100644 --- a/tools/external_converter_v2/parser/lego/parser_lego_test.py +++ b/tools/external_converter_v2/parser/lego/parser_lego_test.py @@ -43,6 +43,7 @@ def _Parsing(self): 
expect_model_size = os.path.getsize(self.ModelPath) sum_s = 0 layer_cache = {} + shared_layer = {} # deal with each layer for source_layer in source_layers: source_layer_name = source_layer.name @@ -64,17 +65,32 @@ def _Parsing(self): #get weights from lego. tensors = [] size_list = blob_size_of_layer(source_layer) - if len(size_list): - for size in size_list: - data = np.fromfile(f, '=2 + :param med_node: + :param med_graph: + :return: + """ + output = med_node['output'] + #print 'output!!:', output + if len(output) > 1: + split_node = MedNodeUtil.new_med_node() + split_node['name'] = med_node['name'] + '_split#' + str(len(output)) + split_node['ak_type'] = 'Split' + split_node['type'] = 'Split' + split_node['ak_attr']['split_num'] = len(output) + # print ('-------------') + # print ('split', split_node['name']) + MedGraphUtil.append_node(med_node, split_node, graph=med_graph) + pass + + @staticmethod + def _auto_input_name(med_node, med_graph): + """ + gen input name + :param med_node: + :param med_graph: + :return: + """ + assert med_node['ak_type'] == 'Input' + old_name = med_node['name'] + med_node['name'] = 'input_' + str(MedGraph_Input_Cnt) + for i in med_node['output']: + out_node = med_graph[i] + out_node['input'] = MedNodeUtil.replace_name_with_list(out_node['input'], old_name, + [[med_node['name']]]) + @staticmethod + def _fusionPermute(med_node, med_graph): + """ + when permute param >= 5, fusion Permute node to pixelshuffle + :param med_node: + :param med_graph: + :return: + """ + if len(med_node['ak_attr']['shape']) >= 5: + ins = med_node['input'] + outs = med_node['output'] + if len(ins) == 1 and len(outs) == 1: + in_node = med_graph[ins[0]] + out_node = med_graph[outs[0]] + if in_node['ak_type'] == 'Reshape' and out_node['ak_type'] == 'Reshape': + # print(in_node) + rw = in_node['ak_attr']['shape'][1] + rh = in_node['ak_attr']['shape'][2] + in_node['type'] = 'PixelShuffle' + in_node['ak_type'] = 'PixelShuffle' + in_node['ak_attr']['type'] = 'PixelShuffle' + in_node['ak_attr']['rw'] = int(rw) + in_node['ak_attr']['rh'] = int(rh) + in_node['ak_attr']['channel_first'] = True + #delete med_node and out_node + in_node['output']=out_node['output'] + for i in out_node['output']: + in_node_node = med_graph[i] + in_node_node['input'] = MedNodeUtil.replace_name_with_list(in_node_node['input'], + out_node['name'], + [in_node['name']]) + # print(in_node_node['input']) + # print(in_node) + med_graph.pop(med_node['name']) + med_graph.pop(out_node['name']) + + @staticmethod + def _fusionScale(med_node, med_graph): + """ + fusion scale node after convolution node + :param med_node: + :param med_graph: + :return: + """ + if len(med_node['input']) == 1: + input_node = med_graph[med_node['input'][0]] + med_ak_attr = med_node['ak_attr'] + if input_node['ak_type'] == 'Convolution': + input_attr = input_node['ak_attr'] + conv_weights = input_attr['weights'] + scale_weights = med_ak_attr['weights'] + + assert (conv_weights['shape'][0] == scale_weights['shape'][-1]) \ + or (conv_weights['shape'][0] == scale_weights['shape'][0]) + shape = conv_weights['shape'] + new_conv_weights = {} + new_conv_weights['shape'] = conv_weights['shape'] + new_conv_weights['dtype'] = 'float32' + new_conv_weights['data'] = np.zeros(shape) + tmp = scale_weights['data'].flatten() + conv_weights['data'] = conv_weights['data'].reshape(shape) + for i in range(shape[0]): + new_conv_weights['data'][i] = conv_weights['data'][i] * tmp[i] + input_attr['weights'] = new_conv_weights + if input_attr.get('bias') is not None: + 
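# the convolution already has a bias; if the scale also carries one, the two are added element-wise below +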
bias_val = input_attr['bias'] + if 'bias' in med_ak_attr: + new_conv_bias = {} + new_conv_bias['shape'] = bias_val['shape'] + new_conv_bias['dtype'] = 'float32' + new_conv_bias['data'] = np.zeros(bias_val['shape']) + med_val = med_ak_attr['bias'] + for i in range(bias_val['shape'][0]): + new_conv_bias['data'][i] = bias_val['data'][i] + med_val['data'][i] + input_attr['bias'] = new_conv_bias + else: + input_attr['bias'] = bias_val + elif med_ak_attr.get('bias') is not None: + bias_val = med_ak_attr['bias'] + input_attr['bias'] = bias_val + else: + print ('conv+scale does not have bias') + # input_attr['bias'] = med_ak_attr['bias'] + med_node['ak_type'] = None + input_node['output'] = MedNodeUtil.replace_name_with_list(input_node['output'], + med_node['name'], + med_node['output']) + MedNodeUtil.redirecto_outputs_input_to_this(med_node, med_graph, input_node['name']) + input_node['fusion_out_name'] = med_node['name'] + # conv+scale+scale * n, bias_n1 = bias_n0 * weights + bias_n1 + if len(input_node['output']) == 1: + tmp_node = med_graph[input_node['output'][0]] + while tmp_node['ak_type'] == 'Scale': + input_attr = input_node['ak_attr'] + conv_weights = input_attr['weights'] + scale_weights = tmp_node['ak_attr']['weights'] + assert (conv_weights['shape'][0] == scale_weights['shape'][-1]) or (conv_weights['shape'][0] == scale_weights['shape'][0]) + shape = conv_weights['shape'] + new_conv_weights = {} + new_conv_weights['shape'] = conv_weights['shape'] + new_conv_weights['dtype'] = 'float32' + new_conv_weights['data'] = np.zeros(shape) + tmp = scale_weights['data'].flatten() + conv_weights['data'] = conv_weights['data'].reshape(shape) + for i in range(shape[0]): + new_conv_weights['data'][i] = conv_weights['data'][i] * tmp[i] + input_attr['weights'] = new_conv_weights + if input_attr.get('bias') is not None: + bias_val = input_attr['bias'] + if 'bias' in tmp_node['ak_attr']: + new_conv_bias = {} + new_conv_bias['shape'] = bias_val['shape'] + new_conv_bias['dtype'] = 'float32' + new_conv_bias['data'] = np.zeros(bias_val['shape']) + med_val = tmp_node['ak_attr']['bias'] + for i in range(bias_val['shape'][0]): + new_conv_bias['data'][i] = bias_val['data'][i] * scale_weights['data'][i] + med_val['data'][i] + input_attr['bias'] = new_conv_bias + else: + input_attr['bias'] = bias_val + elif med_ak_attr.get('bias') is not None: + bias_val = tmp_node['ak_attr']['bias'] + input_attr['bias'] = bias_val + else: + print ('conv+scale does not have bias') + tmp_node['ak_type'] = None + input_node['output'] = MedNodeUtil.replace_name_with_list(input_node['output'], + tmp_node['name'], + tmp_node['output']) + MedNodeUtil.redirecto_outputs_input_to_this(tmp_node, med_graph, input_node['name']) + input_node['fusion_out_name'] = tmp_node['name'] + if len(input_node['output']) == 1: + tmp_node = med_graph[input_node['output'][0]] + else: + break + + pass + + @staticmethod + def _deleteScale(med_node, med_graph): + """ + delete dropout node when is_test = 0 + :param med_node: + :param med_graph: + :return: + """ + ak_attr = med_node['ak_attr'] + if 'drop' in ak_attr.keys() and ak_attr['drop'] == 0: + #not do scale, delete node + input = med_node['input'] + output = med_node['output'] + # print ('name: ', med_node['name']) + # print ('inputs: ', input) + # print ('outputs: ', output) + #replace node + for node in input: + for out in med_graph.keys(): + if out == node: + out_node = med_graph[out]['output'] + # print 'name: ', out + # print 'input: ', med_graph[out]['input'] + # print 'output: ', out_node + for i 
in range(0, len(out_node)): + if out_node[i] == med_node['name']: + out_node.pop(i) + out_node += output + # print 'name: ', out + # print 'input: ', med_graph[out]['input'] + # print 'output: ', out_node + break + for node in output: + for out in med_graph.keys(): + if out == node: + in_node = med_graph[out]['input'] + # print 'name: ', out + # print 'input: ', in_node + # print 'output: ', med_graph[out]['output'] + for i in range(0, len(in_node)): + if in_node[i] == med_node['name']: + in_node.pop(i) + in_node += input + # print 'name: ', out + # print 'input: ', in_node + # print 'output: ', med_graph[out]['output'] + # print ('pop: ', med_node['name']) + med_graph.pop(med_node['name']) + # print ('graph: -----') + # for key in med_graph.keys(): + # node = med_graph[key] + # print(node['name'], node['ak_type'], node['input'], node['output']) + #del med_graph[med_node] + pass + + @staticmethod + def _all_search_table(graph, table): + """ + search template for dict + :param graph: + :param table: + :return: + """ + for onnx_node in graph.values(): + if onnx_node['med_visted']: + continue + type_name = onnx_node['ak_type'] + if table.get(type_name) is not None: + table[type_name](onnx_node, graph) + + @staticmethod + def _all_search_fusion(graph, fusion_func): + """ + search template for func + :param graph: + :param fusion_func: + :return: + """ + for onnx_node in graph.values(): + if onnx_node['med_visted']: + continue + if onnx_node['ak_type'] is not None: + fusion_func(onnx_node, graph) + + @staticmethod + def solve(med_graph): + """ + do fusion and adjust for med graph that we can convert med graph to ak graph + :param med_graph: + :return: + """ + for node in med_graph.values(): + node['med_visted'] = False + + print ('********split***********') + MedGraphUtil._all_search_fusion(med_graph, MedGraphUtil._auto_split) + print ('********scale***********') + MedGraphUtil._all_search_table(med_graph, {'Scale': MedGraphUtil._deleteScale}) + print ('********pixelShuffle***********') + MedGraphUtil._all_search_table(med_graph, {'Permute': MedGraphUtil._fusionPermute}) + print ('********fusion scale***********') + MedGraphUtil._all_search_table(med_graph, {'Scale': MedGraphUtil._fusionScale}) + print ('********finish***********') + # MedGraphUtil._all_search_table(med_graph, {'Input': MedGraphUtil._auto_input_name}) + + @staticmethod + def search_output_list(graph): + """ + search output list in recursive method + :param graph: + :return: + """ + output_list = set() + graph_cp = graph.copy() + + def recursive_search(node): + if node.get('out_search_flag') is not None: + return set() + node['out_search_flag'] = True + outputs = node['output'] + result = set() + if len(outputs) == 0: + result.add(node['name']) + else: + for i in outputs: + result |= recursive_search(graph[i['name']]) + return result + + for i in graph_cp.values(): + output_list |= recursive_search(i) + return list(output_list) diff --git a/tools/external_converter_v2/parser/onnx/med_trans_util.py b/tools/external_converter_v2/parser/onnx/med_trans_util.py new file mode 100644 index 000000000..31c479efe --- /dev/null +++ b/tools/external_converter_v2/parser/onnx/med_trans_util.py @@ -0,0 +1,360 @@ +import numpy as np +from ..graph_io import TensorProtoIO, OpsProtoIO +from ..operations import OpsParam + +def shape_2_ak_shape(shape): + """ + onnx shape to anakin shape + :param shape: + :return: + """ + mini_shape = [i for i in shape if (i is not None and i > 0)] + return map(int, [1] * (4 - len(mini_shape)) + list(mini_shape)) + 
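+# examples: None and non-positive dims are dropped, then the shape is left-padded with 1s to rank 4, e.g.
+#   shape_2_ak_shape([3, 224, 224])  ->  [1, 3, 224, 224]
+#   shape_2_ak_shape([-1, 128])      ->  [1, 1, 1, 128]
+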
+def np_2_ak_tensor(np_tensor): + """ + onnx np array to tensor + :param np_tensor: + :return: + """ + data_type_map2 ={ + np.dtype('float32'): 'float', + np.dtype('int32'): 'int', + np.dtype('bool'): 'bool' + } + data_type_map = { + 'float32': 'float', + 'int32': 'int', + 'bool': 'bool' + } + # print 'np_tensor: ', np_tensor['dtype'] + #exit() + type_str = data_type_map.get(np_tensor['dtype']) + #assert type_str != None + ak_tensor = TensorProtoIO() + ak_tensor.set_shape(shape_2_ak_shape(np_tensor['shape'])) + # ak_tensor.set_data(np_tensor['data'], type_str) + # print('type: ', type(np_tensor['data']), np_tensor['shape'], np_tensor['dtype'], type_str) + if (len(np_tensor['shape']) == 1): + ak_tensor.set_data(np_tensor['data'], type_str) + else: + ak_tensor.set_data(np_tensor['data'].flatten(), type_str) + return ak_tensor + + +class MedTransAK: + """ + tools on med graph to anakin graph + """ + def __init__(self): + self.input_count=0 + + def Convolution(self, med_attr, param): + """ + get Conv param + :param med_attr: + :param param: + :return: + """ + np_filters = med_attr['weights'] + param.weight_1 = np_2_ak_tensor(np_filters) + param.filter_num = np_filters['shape'][0] #? + param.kernel_size = med_attr['kernel'] + param.strides = med_attr['strides'] + param.padding = med_attr['padding'] #T L B R + param.dilation_rate = med_attr['dilations'] + # print('-------conv group----') + # print('filter_num: ', param.filter_num) + # print('group: ', med_attr['group']) + param.group = med_attr['group'] + param.axis = 1 + if med_attr.get('bias') is not None: + param.bias_term = True + bias_tensor = med_attr['bias'] + bias_tensor['shape'] = [1, 1, 1, bias_tensor['shape'][-1]] + param.weight_2 = np_2_ak_tensor(bias_tensor) + else: + param.bias_term = False + + def Normalize(self, med_attr, param): + """ + get Normalize param + :param med_attr: + :param param: + :return: + """ + np_filters = med_attr['weights'] + param.weight_1 = np_2_ak_tensor(np_filters) + param.begin_norm_axis = med_attr['begin_norm_axis'] + param.is_across_spatial = med_attr['is_across_spatial'] + param.is_shared_channel = med_attr['is_shared_channel'] #T L B R + param.eps = med_attr['eps'] + param.p = med_attr['p'] + + def Dense(self, med_attr, param): + """ + get dense param + :param med_attr: + :param param: + :return: + """ + param.axis = 1 + param.out_dim = 0 + if med_attr['Gemm'] == 1: + param.weight_1 = np_2_ak_tensor(med_attr['weights']) + # if med_attr.get('trans') is not None: + # param.out_dim = med_attr['weights']['shape'][1] + # print'trans out_dim', param.out_dim, type(param.out_dim) + # else: + # param.out_dim = med_attr['weights']['shape'][0] + # print'out_dim', param.out_dim + else: + param.weight_1 = TensorProtoIO() + + if med_attr.get('bias') is not None: + param.bias_term = True + param.weight_2 = np_2_ak_tensor(med_attr['bias']) + param.out_dim = len(med_attr['bias']['data'].flatten()) + else: + param.bias_term = False + #print 'shape: ', med_attr['weights']['shape'] + + def ReLU(self, med_attr, param): + """ + get relu param + :param med_attr: + :param param: + :return: + """ + if med_attr.get('alpha') is None: + param.alpha = 0.0 + else: + param.alpha = med_attr['type'] + + def PReLU(self, med_attr, param): + """ + get relu param + :param med_attr: + :param param: + :return: + """ + if med_attr.get('channel_shared') is None: + param.channel_shared = False + else: + param.channel_shared = med_attr['channel_shared'] + + def Concat(self, med_attr, param): + """ + get concat param + :param med_attr: + :param 
param: + :return: + """ + if med_attr.get('axis') is None: + param.axis = 0.0 + else: + param.axis = med_attr['axis'] + + def Activation(self, med_attr, param): + """ + grt act param + :param med_attr: + :param param: + :return: + """ + param.type = med_attr['type'] + if med_attr['type'] == 'PReLU': + if med_attr.get('channel_shared') is None: + param.channel_shared = False + else: + param.channel_shared = med_attr['channel_shared'] + param.weight_1 = np_2_ak_tensor(med_attr['weights']) + + def Reshape(self, med_attr, param): + """ + get reshape param + :param med_attr: + :param param: + :return: + """ + shape = med_attr['shape'] + if isinstance(shape, type(np.array([]))): + shape = [int(i) for i in shape] + # print('***Reshape:*** ', shape) + param.dims = shape_2_ak_shape(shape) + # print(param.dims) + pass + + def Permute(self, med_attr, param): + """ + get Permute param + :param med_attr: + :param param: + :return: + """ + shape = med_attr['shape'] + param.dims = shape + + def Pooling(self, med_attr, param): + """ + get pooling param + :param med_attr: + :param param: + :return: + """ + param.method = med_attr['type'] + param.pool_size = med_attr['window'] + param.strides = med_attr['strides'] + param.padding = med_attr['padding'] # T L B R + if med_attr.get('global_pooling') is None: + param.global_pooling = False + else: + param.global_pooling = med_attr['global_pooling'] + # if med_attr['padding'][0] == 0: + # param.cmp_out_shape_floor_as_conv = False + # else: + # param.cmp_out_shape_floor_as_conv = True + param.cmp_out_shape_floor_as_conv = True + pass + + def Input(self, med_attr, param): + """ + get input param + :param med_attr: + :param param: + :return: + """ + param.input_shape = shape_2_ak_shape(med_attr['shape']) + param.alias = 'input_' + str(self.input_count) + self.input_count += 1 + + def Dropout(self, med_attr, param): + """ + get dropoout param + :param med_attr: + :param param: + :return: + """ + param.ratio = med_attr['ratio'] + + def Split(self, med_attr, param): + """ + get split param + :param med_attr: + :param param: + :return: + """ + param.split_num = med_attr['split_num'] + + def Eltwise(self, med_attr, param): + """ + get eltwise param + :param med_attr: + :param param: + :return: + """ + assert med_attr['type'] == 'Add' + param.type = med_attr['type'] + param.coeff = [1.0, 1.0] + + def Scale(self, med_attr, param): + """ + get scale param + :param med_attr: + :param param: + :return: + """ + # print 'weights' + param.weight_1 = np_2_ak_tensor(med_attr['weights']) + # print 'bias' + if med_attr.get('bias') is not None: + param.weight_2 = np_2_ak_tensor(med_attr['bias']) + param.bias_term = True + else: + param.bias_term = False + + param.axis = 1 + param.num_axes = 1 + + def Flatten(self, med_attr, param): + """ + get flatten param + :param med_attr: + :param param: + :return: + """ + param.start_axis = med_attr['start_axis'] + param.end_axis = med_attr['end_axis'] + + def LRN(self, med_attr, param): + """ + get lrn param + :param med_attr: + :param param: + :return: + """ + param.local_size = med_attr['local_size'] + param.alpha = med_attr['alpha'] + param.beta = med_attr['beta'] + param.k = med_attr['k'] + param.norm_region = "ACROSS_CHANNELS" + + def Softmax(self, med_attr, param): + """ + get softmax param + :param med_attr: + :param param: + :return: + """ + if med_attr.get('axis') is None: + param.axis = 3 + else: + param.axis = med_attr['axis'] + pass + + def PixelShuffle(self, med_attr, param): + if med_attr.get('rw') is None: + param.rw = 2 + else: 
+ param.rw = med_attr['rw'] + if med_attr.get('rh') is None: + param.rh = 2 + else: + param.rh = med_attr['rh'] + if med_attr.get('channel_first') is None: + param.channel_first = True + else: + param.channel_first = med_attr['channel_first'] + # if med_attr.get('scale_factor') is None: + # param.scale_factor = 2 + # else: + # param.scale_factor = med_attr['scale_factor'] + + def map_med_2_ak(self, ak_node, med_node): + """ + med graph convert to anakin graph + :param ak_node: + :param med_node: + :return: + """ + type_name = med_node['ak_type'] + func = getattr(self, type_name, None) + param = OpsParam() + ak_op = OpsProtoIO() + med_attr = med_node['ak_attr'] + #print type_name + + # print med_node['name'], med_node['type'], med_node['ak_type'] + func(med_attr, param) + # print 'func success' + + param.feed_node_attr(ak_node) + ak_op.set_name(med_node['ak_type']) + ak_node.set_op(ak_op()) + + # print 'name', med_node['name'] + # print 'type', type(med_node['input']), med_node['input'] + # print 'type', type(med_node['output']), med_node['output'] + [ak_node.add_in(i) for i in med_node['input']] + [ak_node.add_out(i) for i in med_node['output']] + diff --git a/tools/external_converter_v2/parser/onnx/onnx_graph.py b/tools/external_converter_v2/parser/onnx/onnx_graph.py new file mode 100644 index 000000000..6a91877fa --- /dev/null +++ b/tools/external_converter_v2/parser/onnx/onnx_graph.py @@ -0,0 +1,509 @@ +import onnx +import numpy as np +import math +#from tensorflow.core.framework import types_pb2, tensor_pb2 +import logging as log +import collections +from onnx_trans_utils import * + +class ParseOnnxToMed: + def __init__(self, onnx_model_path, txt_path = None): + self.model_path = onnx_model_path + if txt_path is not None: + self.txt_path = txt_path + else: + self.txt_path = None + + def _parse_onnx_node(self, onnx_graph, shape_override): + """ + Load onnx graph and parse node + :param onnx_graph: + :param shape_override: + :return: + """ + + # ignore the following attributes + ignored_attr = ["unknown_rank", "_class", "Tidx", "Tshape", "use_cudnn_on_gpu", "Index", + "Tpaddings", "TI", "Tparams", "Tindices", "Tlen", "Tdim", + "dynamic_size", "element_shape", "Tmultiples", "output_dtype", + "Tblock_shape", "Tcrops", "index_type", "Taxis", "U", + "maxval", "Tout"] + # some stats + op_cnt = collections.Counter() + attr_cnt = collections.Counter() + anakin_nodes = {} + dtypes = {} + + # find ops + ops = onnx_graph.node + + # minimal conversion of attributes + # print '***********node*******' + for node in ops: + attr = {} + takeit = True + + for a in node.attribute: + attr_cnt[a.name] += 1 + if a.type == 1: ##FLAOT + attr[a.name] = a.f + elif a.type == 2: #INT + attr[a.name] = int(a.i) + elif a.type == 3: #String + attr[a.name] = a.s + elif a.type == 4: #tensor + val_list = onnx_to_anakin_tensor(a.t) + attr[a.name] = val_list + elif a.type == 5: #graph + attr[a.name] = a.t + elif a.type == 6: #FLOATS + val_list = [] + for val in a.floats: + val_list.append(val) + attr[a.name] = val_list + elif a.type == 7: #INTS + val_list = [] + #print 'type: ', a.name, type(a.ints[0]) + for val in a.ints: + val_list.append(int(val)) + attr[a.name] = val_list + else: + print 'Error type: ', a.type, a + # attr[a.name] = a.auto_pad + exit(0) + + if takeit: + try: + #input_names = [i for i in node.input] + #output_names = [i for i in node.output] + # if node.name == '': + # node.name = node.output[0] + name = node.name #name + '_' + + node.name = name + '_' + str(node.op_type) + '_' + 
str(op_cnt[node.op_type]) + op_cnt[node.op_type] += 1 + #print node_name + #node_name = node.output[0]; + anakin_nodes[node.name] = {'name': node.name, 'type': node.op_type, + 'input': [str(i) for i in node.input], + 'output': [str(i) for i in node.output], + 'onnx_attr': attr, 'visited': False, 'name:': False, + 'shape': None, 'ak_type': None, 'ak_attr': {}} + except Exception as ex: + log.error("pass1 convert failed for %s, ex=%s", node, ex) + raise + # print 'anakin_node', anakin_nodes + # exit() + #weights and bias + graph = onnx_graph.initializer + # print 'weights: ', graph + weights = {} + for init_ptr in graph: + # print 'init_ptr: ', init_ptr.name + # print ('onnx_to_anakin_tensor: ') + [data, shape, dtype] = onnx_to_anakin_tensor(init_ptr) + # print ('end') + anakin_tensor = {} + # print'before', shape + if len(shape) == 3: + # print'before', shape + shape.append(1) + a = shape[2] + shape[2] = 1 + shape[3] = a + anakin_tensor['shape'] = shape + anakin_tensor['data'] = data + anakin_tensor['dtype'] = dtype + + # print('**************initializer*******') + # print ('shape: ', shape) + # print('len: ', len(data)) + #attr[init_ptr.name] = anakin_tensor + #anakin_nodes[init_ptr.name] = {'name': init_ptr.name, 'onnx_attr': attr, 'visited': False, + # 'shape':None, 'ak_type': None, 'ak_attr': {}} + weights[init_ptr.name] = anakin_tensor + # if init_ptr.name == 'OC2_DUMMY_3': + # print (init_ptr, type(data), data, shape, dtype) + # exit(0) + # print 'name: ', init_ptr.name, dtype, shape, + + #print 'tensor: ', anakin_tensor + #exit() + input_name = onnx_graph.input + inputs = {} + input_shape = {} + in_cnt = 0 + # print '--------input---------' + # print input_name + for input_a in input_name: + shape = [] + for dim in input_a.type.tensor_type.shape.dim: + shape.append(dim.dim_value) + if len(shape) == 3: + # print 'before', shape + shape.append(1) + a = shape[2] + shape[2] = 1 + shape[3] = a + # print'after', shape + #attr["shape"] = shape + if input_a.name.startswith('data') or (input_a.name == ('gpu_0/data_0')) \ + or (input_a.name == '0') or (input_a.name == 'image'): + inputs[input_a.name] = shape + output_node = [] + print 'input: ', input_a.name + for node in anakin_nodes.values(): + for name in node['input']: + if name == input_a.name: + output_node.append(name) #(node_name) + #print 'out: ', output_node + node_name = str('input') + '_' + str(in_cnt) + # change inputs name in anakin nodes + ''' + for node in anakin_nodes.values(): + in_name = node['input'] + for i in range(len(in_name)): + if in_name[i] == input_a.name: + in_name[i] = node_name + ''' + + anakin_nodes[node_name] = {'name': node_name, 'type': 'Input', + 'input': [], 'output': output_node, + 'onnx_attr': {}, 'visited': True, + 'shape': shape, 'ak_type': 'Input', + 'ak_attr': {'shape': shape}} + + in_cnt += 1 + else: + # print 'name: ', input_a.name + input_shape[input_a.name] = shape + + output_name = onnx_graph.output + outputs = {} + for output_a in output_name: + shape = [] + for dim in output_a.type.tensor_type.shape.dim: + shape.append(dim.dim_value) + outputs[output_a.name] = shape + input_node = [] + for node in anakin_nodes.values(): + for name in node['output']: + if name == output_a.name: + input_node.append(name) + + anakin_nodes[output_a.name] = {'name': output_a.name, 'type': 'Output', + 'input': input_node, + 'output': [], 'onnx_attr': {}, 'visited': True, + 'shape': shape, 'ak_type': None, 'ak_attr': {}} + #print 'weights', len(weights) + #print 'weights', weights + ''' + for node_name in 
anakin_nodes.keys(): + for node_out in output_name: + if node_name == node_out: + anakin_nodes[node_name]['output'] = [] + ''' + # change inputs outputs name + self._change_inout_nodename(anakin_nodes, weights) + # print 'anakin_node', anakin_nodes + + output_node = {} + for node in anakin_nodes.values(): + for out_name in node['output']: + if out_name in outputs: + output_node[node['name']] = outputs[out_name] + # delete output + node['output'].pop() + outnode = node['output'] + for i in range(len(outnode)): + if outnode[i] in outputs: + outnode.pop(i) + + #print 'inputs', inputs + #print 'outputs', outputs + return [anakin_nodes, weights, outputs, output_node] + + def _change_inout_nodename(self, nodes, weights): + """ + convert tensor connection to op connection + :param nodes: + :param weights: + :return: + """ + out2nodename = {} + for node in nodes.values(): + for out_name in node['output']: + if out2nodename.get(out_name) is None: + out2nodename[out_name] = [node['name']] + else: + out2nodename[out_name].append(node['name']) + in2nodename = {} + for node in nodes.values(): + for in_name in node['input']: + if in2nodename.get(in_name) is None: + in2nodename[in_name] = [node['name']] + else: + in2nodename[in_name].append(node['name']) + + # print 'in2node_name', in2nodename + # print 'out2node_name', out2nodename + # print 'shape', shape + + for node in nodes.values(): + # print 'in:', node['input'] + # print 'out:', node['output'] + new_output = [] + new_input = [] + + for out_name in node['output']: + if in2nodename.get(out_name) is not None: + new_output += [op_name for op_name in in2nodename[out_name]] + for in_name in node['input']: + if out2nodename.get(in_name) is not None: + new_input += [op_name for op_name in out2nodename[in_name]] + # bias and weights + if weights.has_key(in_name): + new_input += [in_name] + + + node['input'] = new_input + node['output'] = new_output + # print 'node:', node['name'] + # print 'in:', node['input'] + # print 'out:', node['output'] + + def _parse_onnx_graph(self, nodes, weights): + """ + parse onnx + :param nodes: + :param weights: + :return: + """ + # out2nodename = {i['name']:[] for i in nodes} + #self._fix_self_output(nodes) + + for node in nodes.values(): + if node['type'] == 'Div': + parse_Div(node, weights, nodes) + + def all_search(graph, table): + """ + search the graph + :param graph: + :param table: + :return: + """ + for onnx_node in graph.values(): + if onnx_node['visited']: + continue + type_name = onnx_node['type'] + if table.get(type_name) != None: + table[type_name](onnx_node, weights, graph) + + all_search(nodes, {'Conv': parse_Conv, + 'Gemm': parse_Gemm, + 'Mul': parse_Mul, + 'BatchNormalization': parse_BatchNorm}) + + all_search(nodes, {'Concat': parse_Concat}) + + all_search(nodes, {'Add': parse_Add, + 'Sum': parse_Sum, + 'Transpose': parse_Transpose, + 'LRN': parse_Lrn, + 'Slice': parse_Slice, + 'Softmax': parse_Softmax, + 'Dropout': parse_Dropout, + 'Relu': parse_Act, + 'LeakyRelu': parse_Act, + 'ImageScaler': parse_ImageScaler, + 'MaxPool': parse_Pooling, + 'GlobalAveragePool': parse_Pooling, + 'AveragePool': parse_Pooling, + 'Reshape': parse_Reshape}) + #nodes = rm_weight_node(nodes, weights) + #print 'anakin_node: ', nodes + return nodes + + def _read_file(self): + fp = open(self.txt_path, mode='r') + lines = fp.readlines() + cnt = 0 + weights =[] + bias = [] + for l in lines: + l = l.rstrip('\n') + if 'Scales' in l: + st = l.split('[') + st2 = st[-1].split(']') + st3 = st2[0].split(' ') + # print st3, type(st3[1]) + 
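# parse only the middle tokens as floats; the first and last entries of the split are skipped (presumably empty strings around the bracketed values) +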
for i in range(1, len(st3)-1): + # print st3[i] + weights.append(float(st3[i])) + # print '---------------' + # print 'weights: ', weights + # print(len(st3), len(weights)) + if 'Offsets' in l: + st = l.split('[') + st2 = st[-1].split(']') + st3 = st2[0].split(' ') + # print st3 + + for i in range(1, len(st3) - 1): + # print st3[i] + bias.append(float(st3[i])) + # print '---------------' + # print 'bias: ', bias + # print(len(st3), len(bias)) + cnt = cnt + 1 + # print l + if cnt >= 2: + break + # print 'weights: ', weights + # print 'bias: ', bias + # print 'len: ', len(weights), len(bias) + weights_node = {} + bias_node = {} + weights_node['data'] = np.array(weights) + weights_node['shape'] = [len(weights), 1, 1] + weights_node['dtype'] = 'float32' + bias_node['data'] = np.array(bias) + bias_node['shape'] = [len(bias), 1, 1] + bias_node['dtype'] = 'float32' + self.weights_data = weights_node + self.bias_data = bias_node + + def _cal_shape(self, graph, weights): + """ + calculate shape + :param graph: + :param weights: + :return: + """ + input_node = graph['input_0'] + out_node = input_node['output'] + op_list = ['Relu', 'Add', 'Dropout', 'Mul', 'BatchNormalization', 'Sum', + 'Softmax', 'LRN', 'Div', 'ReduceL2', 'Unsqueeze', 'Shape', + 'ImageScaler', 'LeakyRelu', 'Slice', 'Squeeze', 'Transpose'] + while len(out_node) > 0: + # print ('out_node: ', out_node) + for out_name in out_node: + # print out_name + node = graph[out_name] + op_type = node['type'] + top_shape = [1, 1, 1, 1] + if graph[node['input'][0]]['shape'] is not None: + top_shape = graph[node['input'][0]]['shape'] + if op_type in op_list: + node['shape'] = top_shape + else: + ak_attr = node['onnx_attr'] + if op_type == 'Conv': + strides =[1, 1] + if 'strides' in ak_attr: + strides = ak_attr['strides'] + pads =[1, 1] + if 'pads' in ak_attr: + pads = ak_attr['pads'] + # dilations = ak_attr['dilations'] + kernel_shape = ak_attr['kernel_shape'] + out_ch = weights[node['input'][1]]['shape'][0] + w = (top_shape[-1] + 2 * pads[0] - kernel_shape[0]) / strides[0] + 1 + h = 1 + node['shape'] = [top_shape[0], out_ch, h, w] + elif op_type == 'Gemm': + if node['input'][1] in weights and node['input'][2] in weights: + wei_shape = weights[node['input'][1]]['shape'] + bia_shape = weights[node['input'][2]]['shape'] + # print top_shape, bia_shape, wei_shape + node['shape'] = [top_shape[0], bia_shape[-1], + top_shape[2], wei_shape[1]] + else: + node['shape'] = [1, 1, 1, 1] + elif op_type == 'MaxPool' or op_type == 'AveragePool': + strides =[1, 1] + if 'strides' in ak_attr: + strides = ak_attr['strides'] + pads =[1, 1] + if 'pads' in ak_attr: + pads = ak_attr['pads'] + # dilations = ak_attr['dilations'] + kernel_shape = ak_attr['kernel_shape'] + out_ch = top_shape[1] + w = (top_shape[-1] + 2 * pads[1] - kernel_shape[0] + + strides[0] - 1) / strides[0] + 1 + h = 1 + node['shape'] = [top_shape[0], out_ch, h, w] + elif op_type == 'GlobalMaxPool' or op_type == 'GlobalAveragePool': + node['shape'] = [top_shape[0], out_ch, 1, 1] + elif op_type == 'Reshape': + re_shape = [1, 128] + if node['input'][1] in weights: + re_shape = weights[node['input'][1]]['shape'] + if len(re_shape) < 4: + re_shape = map(int, [1] * (4 - len(re_shape)) + list(re_shape)) + node['shape'] = re_shape + elif op_type == 'Concat': + axis = ak_attr['axis'] + num = 0 + for i in node['input']: + if graph[i]['shape'] is not None: + num += graph[i]['shape'][axis] + node_shape = [1, 1, 1, 1] + # print axis, top_shape + for i in range(0, 4): + if i == axis: + node_shape[i] = num + else: + 
node_shape[i] = top_shape[i] + else: + print ('Error op_type: ', op_type) + exit(0) + out_node = graph[out_node[0]]['output'] + + def _delete_ConstantOP(self, graph): + """ + Delete constant op + :param graph: + :return: + """ + med_graph = {} + for name in graph: + val = graph[name] + if val['type'] == 'Unsqueeze' or val['type'] == 'Squeeze' \ + or val['type'] == 'Constant': + #graph.pop(name) + print('constant op name: ', name) + else: + med_graph[name] = graph[name] + return med_graph + def parse(self): + """ + parse onnx + :return: + """ + if self.txt_path is not None: + self._read_file() + else: + self.weights_data = None + self.bias_data = None + onnx_model = onnx.load(self.model_path) + onnx_graph = onnx_model.graph + [nodes, weights, outputs, output_node] = self._parse_onnx_node(onnx_graph, {}) + print ('onnx_node') + for node in nodes.values(): + print(node['name'], node['type'], node['input'], node['output']) + + print ('-------------------------------') + self._cal_shape(nodes, weights) + print('parse onnx graph') + med_mid_graph = self._parse_onnx_graph(nodes, weights) + #delete Unsqueeze Constant Squeeze op + print ('delete extra constant OP') + med_graph = self._delete_ConstantOP(med_mid_graph) + print ('med_graph') + for name in med_graph.keys(): + node = med_graph[name] + print(node['name'], node['type'], node['input'], node['output'], node['shape']) + print ('-------------------------------') + return med_graph, output_node #filter_graph, outputs diff --git a/tools/external_converter_v2/parser/onnx/onnx_trans_utils.py b/tools/external_converter_v2/parser/onnx/onnx_trans_utils.py new file mode 100644 index 000000000..4c71a85b5 --- /dev/null +++ b/tools/external_converter_v2/parser/onnx/onnx_trans_utils.py @@ -0,0 +1,1330 @@ +import onnx +import numpy as np +import math +from google.protobuf import text_format +from med_graph import MedNodeUtil, MedGraphUtil + +ONNX_TO_ANAKIN_DTYPE = { + 1: np.float32, + 6: np.int32, + 7: np.int64, + 11: np.float64, + 12: np.uint32, + 13: np.uint64, +} + +ANAKIN_VALID_ATTRIBUTES = { + 'p', 'bias', 'axes', 'pads', 'mean', 'activation_beta', + 'spatial_scale', 'broadcast', 'pooled_shape', 'high', 'activation_alpha', + 'is_test', 'hidden_size', 'activations', + 'beta', 'input_as_shape', 'drop_states', 'alpha', + 'momentum', 'scale', 'axis', 'dilations', 'transB', 'axis_w', 'blocksize', + 'output_sequence', 'mode', 'perm', + 'min', 'seed', 'ends', 'paddings', 'to', 'gamma', 'width_scale', + 'normalize_variance', 'group', 'ratio', 'values', + 'dtype', 'output_shape', 'spatial', 'split', 'input_forget', 'keepdims', 'transA', + 'auto_pad', 'border', 'low', 'linear_before_reset', 'height_scale', 'output_padding', + 'shape', 'kernel_shape', 'epsilon', 'size', 'starts', + 'direction', 'max', 'clip', 'across_channels', 'value', 'strides', + 'extra_shape', 'scales', 'k', 'sample_size', + 'blocksize', 'epsilon', 'momentum' +} + +def get_onnx_tensor_data(tensor): + """ + Get data from tensor + :param tensor: + :return: + """ + assert isinstance(tensor, onnx.TensorProto) + is_raw = False + # print 'tensor', tensor + # tensor has raw_data and other_data + if tensor.float_data is not None and len(tensor.float_data) > 0: + data = tensor.float_data + is_raw = False + elif tensor.int32_data is not None and len(tensor.int32_data) > 0: + data = tensor.int32_data + is_raw = False + elif tensor.string_data is not None and len(tensor.string_data) > 0: + data = tensor.string_data + is_raw = False + elif tensor.int64_data is not None and len(tensor.int64_data) > 0: + 
data = tensor.int64_data + is_raw = False + elif tensor.double_data is not None and len(tensor.double_data) > 0: + data = tensor.double_data + is_raw = False + elif tensor.uint64_data is not None and len(tensor.uint64_data) > 0: + data = tensor.uint64_data + is_raw = False + elif tensor.raw_data is not None and len(tensor.raw_data) > 0: + data = tensor.raw_data + is_raw = True + else: + print ('Error: ', tensor) + exit(0) + # da = np.array(data) + # print da + if tensor.data_type == 1: #FLOAT + dtype = 'float32' + elif tensor.data_type == 6: #INT32 + dtype = 'int32' + elif tensor.data_type == 7: #INT64 + dtype = 'int64' + elif tensor.data_type == 8: #string + dtype = 'string' + elif tensor.data_type == 11: # string + dtype = 'double' + elif tensor.data_type == 12: #uint32 + dtype = 'uint32' + elif tensor.data_type == 13: #uint32 + dtype = 'uint64' + else: + print ('Error: ', tensor.data_type) + exit(0) + return [is_raw, data, dtype] + +def map_onnx_dtype(dtype): + """ + :param dtype: + :return: + """ + return ONNX_TO_ANAKIN_DTYPE.get(dtype) + +def has_key(attr, key_name): + """ + dict key + :param attr: + :param key_name: + :return: + """ + for it in attr.keys(): + if it == key_name: + return True + + return False + +def onnx_to_anakin_tensor(tensor): + """ + Convert onnx tensor to anakin med tensor + :param tensor: + :return: + """ + # print tensor + shape = [] + for dim in tensor.dims: + shape.append(int(dim)) + # print('--shape: ', shape) + [is_raw, data, dtype] = get_onnx_tensor_data(tensor) + # print 'shape: ', shape + # print 'is_raw: ', is_raw + #print 'float_data: ', tensor.float_data + # print(type(data),data,tensor.dtype,is_raw) + if is_raw: + if len(shape) > 0: + # print 'type: ', tensor.data_type + # print 'data: ', len(data) + # print 'dtype: ', map_onnx_dtype(tensor.data_type) + anakin_tensor = np.frombuffer(data, map_onnx_dtype(tensor.data_type)) + # print 'last len: ', len(anakin_tensor), anakin_tensor.shape + # print 'shape: ', shape + anakin_tensor = anakin_tensor.reshape(shape) + # print 'last len: ', len(anakin_tensor), anakin_tensor.shape + # exit(0) + else: + anakin_tensor = np.zeros(0) + #print 'anakin_tensor: ', anakin_tensor + # print('dtype: ', tensor.name, dtype, anakin_tensor.dtype) + return anakin_tensor, shape, dtype + else: + #print 'data' + return np.array(data).astype(map_onnx_dtype(tensor.data_type)), shape, dtype + +def trans_const_node(node, weights): + """ + trans const input to weight tensor + :param node: + :param weights: + :return: + """ + if len(node['input']) > 0: + in_name = node['input'][0] + weights_data = {} + if in_name in weights: + weights_node = weights[in_name] + # print ('weights_node: ', node['name'], weights_node['shape'], weights_node['dtype']) + if node['type'] == 'Reshape': + shape_name = node['input'][1] + if shape_name in weights: + shape_node = weights[shape_name] + shape = shape_node['data'] + weights_data['shape'] = shape + weights_data['data'] = weights_node['data'].reshape(shape) + weights_data['dtype'] = weights_node['dtype'] + # print ('weights_data: ', node['name'], weights_data['shape'], weights_data['dtype']) + else: + print('Mul can not find shape_node', shape_name) + return None + elif node['type'] == 'Unsqueeze': # axes = [1,2] [64] -> [64, 1, 1] + axes = node['onnx_attr']['axes'] + shape = weights_node['shape'] # default nchw + new_shape = [] + new_shape += shape + num = len(shape) + for i in axes: + if i >= num: + new_shape.append(1) + # print ('shape: ', shape) + # print ('new_shape: ', new_shape) + 
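# --- Illustrative sketch (editor's note, not part of the patch) ---------------
# The decoding that get_onnx_tensor_data / onnx_to_anakin_tensor above implement
# by hand: an ONNX initializer stores either typed fields (float_data, int64_data,
# ...) or packed bytes in raw_data. onnx.numpy_helper.to_array handles both and
# is a convenient cross-check for the converter; tensor names here are made up.
import numpy as np
import onnx
from onnx import helper, numpy_helper

t_typed = helper.make_tensor('w_typed', onnx.TensorProto.FLOAT,
                             dims=[2, 3], vals=[1., 2., 3., 4., 5., 6.])
t_raw = helper.make_tensor('w_raw', onnx.TensorProto.FLOAT, dims=[2, 3],
                           vals=np.arange(6, dtype=np.float32).tobytes(), raw=True)

assert list(t_typed.float_data) == [1., 2., 3., 4., 5., 6.]    # typed path
decoded = np.frombuffer(t_raw.raw_data, dtype=np.float32).reshape(list(t_raw.dims))
assert np.array_equal(decoded, numpy_helper.to_array(t_raw))   # raw path
# -------------------------------------------------------------------------------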
weights_data['shape'] = new_shape + weights_data['data'] = weights_node['data'].reshape(new_shape) + weights_data['dtype'] = weights_node['dtype'] + elif node['type'] == 'Squeeze': # axes = [1,2] [1, 64, 1, 1] -> [1,64] + axes = node['onnx_attr']['axes'] + shape = weights_node['shape'] # default nchw + new_shape1 = shape + new_shape = [] + num = len(shape) + if num >= 1: + for i in range(0, num): + if i in axes: + new_shape1[i] = 0 + for i in range(0, num): + if new_shape1[i] is not 0: + new_shape.append(new_shape1[i]) + else: + return None + weights_data['shape'] = new_shape + weights_data['data'] = weights_node['data'].reshape(new_shape) + weights_data['dtype'] = weights_node['dtype'] + else: + weights_data = weight_node + node['visited'] = True + else: + print('Mul can not find input_node', in_name) + return None + # weights_data['shape'] = weights_data['shape'].astype(np.float32) + return weights_data + else: + print('this node does not have input', node['name']) + return None + +def get_bias(node, weights, graph): + """ + search graph find const input and the next op_type is Add, then convert the node to bias + :param node: + :param weights: + :param graph: + :return: + """ + outs = node['output'] + output0 = graph[outs[0]] + bias_node = None + if len(outs) == 1 and output0['type'] == 'Add': + ins = output0['input'] + for i in range(0, len(ins)): + optype = graph[ins[i]]['type'] + if optype == 'Reshape' or optype == 'Unsqueeze' or optype == 'Squezze': + bias_node = trans_const_node(graph[ins[i]], weights) + if bias_node is not None: + #delete Add node + MedNodeUtil.redirecto_outputs_input_to_this(output0, graph, node['name']) + node['output'] = output0['output'] + graph.pop(output0['name']) + #delete bias node + graph.pop(ins[i]) + return bias_node + +def fusion_normL2_node(node_a, node_b, node_c, node, graph): + """ + A->node_a->node->node_b->node_c->B + A->node_c->B + fusion: A->node->B + :param node_a: + :param node_b: + :param node_c: + :param node: + :param graph: + :return: + """ + # print("node delete before: ", node['input'], node['output']) + #first delete edge A->node_c + top_in = node_a['input'] + A = graph[top_in[0]] + # print('A delete before: ', A['output']) + for ou in A['output']: + if ou == node_c['name']: + A['output'].remove(ou) + break + # print('A delete after: ', A['output']) + B = node_c['output'] + #change node output + # print('B delete before: ', graph[B[0]]['input']) + node['output'] = B + ins = graph[B[0]]['input'] + for i in range(0, len(ins)): + if ins[i] == node_c['name']: + ins[i] = node['name'] + # graph[B[0]]['input'].remove(ins) + # graph[B[0]]['input'].append(node['name']) + # print('B delete after: ', graph[B[0]]['input']) + #delete node_b and node_c + graph.pop(node_b['name']) + graph.pop(node_c['name']) + #change node input + # print('A delete before: ', A['output']) + node['input'] = node_a['input'] + A['output'] = node_a['output'] + # print('A delete after: ', A['output']) + #delete node_a + graph.pop(node_a['name']) + # print("node delete after: ", node['input'], node['output']) + +def fusion_PixelShuffle(node, out_node, outs, weights, graph): + """ + node->out_node->transpose->reshape->B + node->outs[0]->...->reshape->B + node->outs[1]->...->reshape->B + fusion: node->op_pixelshuffle->B + :param node: + :param out_node: + :param outs: + :param weights: + :param graph: + :return: + """ + # print ('fusion_PixelShuffle begin: ') + # print('node: ', node['name'], node['type'], node['input'], node['output']) + # aaa = graph[node['output'][0]] + # 
print('output: ', aaa['name'], aaa['type'], aaa['input'], aaa['output']) + for ou in outs: + if ou is not out_node['name']: + if graph[ou]['type'] == 'Shape': + continue + else: + print('Error Pattern: ', outs) + return + out_a = out_node['output'] + if len(out_a) == 1: + out_b = graph[out_a[0]] + if out_b['type'] == 'Transpose' and len(out_b['output']) == 1: + out_name = out_b['output'][0] + out_data = graph[out_name] + if out_data['type'] == 'Reshape': + out_list = [out_node['name'], out_name] + for name in outs: + # print ('name: ', name) + if name not in out_list: + if graph.has_key(name) is not True: + continue + node1 = graph[name] + list_tmp = [] + for name_a in node1['output']: + if graph.has_key(name_a) is not True: + if len(node1['output']) == 1: + graph.pop(node1['name']) + break + graph[name_a]['input'] = [node['name']] + list_tmp.append(name_a) + #outs.remove(name) + out_list.append(name) + if graph.has_key(name): + graph.pop(name) + # print ('remove name: ', name) + outs += list_tmp + # print ('delete output: ', out_list) + node['output'] = [out_name] + #delete Transpose and out_node + graph.pop(out_b['name']) + graph.pop(out_node['name']) + out_data['type'] = 'PixelShuffle' + scale_factor = 1 + if node['input'][1] in weights: + wei_shape = weights[node['input'][1]]['shape'] + if len(wei_shape) == 4: + num = (wei_shape[0] / wei_shape[1]) + sq = int(math.sqrt(num)) + if num == sq * sq: + scale_factor = sq + else: + print('Error shape, it does not meet a*a = wei_shape[0] / wei_shape[1]', wei_shape[0], wei_shape[1]) + exit(0) + else: + print('input is not right', node['input']) + exit(0) + else: + print('weigths is not right', node['input'][1], wei_shape) + out_data['onnx_attr'] ['scale_factor'] = scale_factor + out_data['visited'] = True + out_data['ak_type'] = 'PixelShuffle' + out_data['ak_attr']['scale_factor'] = scale_factor + out_data['ak_attr']['rw'] = scale_factor + out_data['ak_attr']['rh'] = scale_factor + if 'channel_first' in out_data['onnx_attr']: + out_data['ak_attr']['channel_first'] = out_data['onnx_attr']['channel_first'] + else: + out_data['ak_attr']['channel_first'] = True + else: + print('Error type ', out_data['name'], out_data['type']) + exit() + else: + print('Error type ', out_b['name'], out_b['type']) + exit() + else: + print('Error output lists ', out_a) + exit() + # print ('fusion_PixelShuffle after: ') + # print('node: ', node['name'], node['type'], node['input'], node['output']) + # aaa = graph[node['output'][0]] + # print('output: ', aaa['name'], aaa['type'], aaa['input'], aaa['output']) + +def delete_extra_node(node_a, node_b, node_c, graph): + """ + A->node_a->..->B1->C1->node_c + A->node_b->..->B2->C1->node_c + A->node_c + delete extra node: A->node_c->D + :param node_a: + :param node_b: + :param node_c: + :param node: + :param graph: + :return: + """ + outs = node_c['name'] + while node_a['output'][0] is not outs: + out_node = node_a['output'][0] + a = graph[out_node]['output'] + if graph.has_key(a[0]) is not True: + graph.pop(out_node) + break + graph[a[0]]['input'] = [node_a['name']] + node_a['output'] = a + graph.pop(out_node) + # print('delete node: ', out_node) + while node_b['output'][0] is not outs: + # print (node_b['name'], node_b['output']) + out_node = node_b['output'][0] + b = graph[out_node]['output'] + if graph.has_key(b[0]) is not True: + graph.pop(out_node) + break + graph[b[0]]['input'] = [node_b['name']] + node_b['output'] = b + graph.pop(out_node) + graph.pop(node_a['name']) + graph.pop(node_b['name']) + +def 
parse_Div(onnx_node, weights, graph): + """ + # Compute Y = normal_l2 + parse Div to Normalize + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Normalize' + input_node = onnx_node['input'] + assert len(input_node) == 2 + input0 = input_node[0] + input1 = input_node[1] + + # print 'input0', input0 + # print 'graph', graph + in0_node = graph[input0] + in1_node = graph[input1] + const_node = {} + in_node = {} + # print in0_node + # print in1_node + if in0_node['type'] == 'Constant': + #find the top node + const_node = in0_node + in_node = in1_node + elif in1_node['type'] == 'Constant': + #find the top node + const_node = in1_node + in_node = in0_node + else: + return + top_node = in_node + bot_node = graph[onnx_node['output'][0]] + if top_node['type'] == 'ReduceL2': + op_type = bot_node['type'] + if op_type == 'Unsqueeze' or op_type == 'Constant': + bot_next_node = graph[bot_node['output'][0]] + if bot_next_node['type'] == 'Div': + ak_attr = onnx_node['ak_attr'] + ak_attr['begin_norm_axis'] = top_node['onnx_attr']['axes'][0] + ak_attr['is_across_spatial'] = False + ak_attr['is_shared_channel'] = True + ak_attr['eps'] = 1e-6 + ak_attr['p'] = 2 + weights_node = {} + weights_node['shape'] = [1] + weights_node['data'] = [np.sqrt(bot_next_node['shape'][1])]#np.array(np.sqrt(bot_next_node['shape'][1])).astype(np.float32) + weights_node['dtype'] = 'float32' + ak_attr['weights'] = weights_node + # delete node + fusion_normL2_node(top_node, bot_node, bot_next_node, onnx_node, graph) + ous = onnx_node['output'] + if len(ous) == 3: + if graph[ous[0]]['type'] == 'Reshape': + #change node + if graph[ous[1]]['type'] == 'Shape' and graph[ous[2]]['type'] == 'Shape': + #reshape + node_re = graph[ous[0]] + node_re['visited'] = 'True' + node_re['ak_type'] = 'Reshape' + ak_shape = node_re['ak_attr'] + ak_shape['shape'] = [1, 128] + node_next = graph[node_re['output'][0]] + if node_next['input'][0] == node_re['name']: + wei_name = node_next['input'][1] + wshape = weights[wei_name]['shape'] + ak_shape['shape'] = [1, wshape[0]] + else: + wei_name = node_next['input'][0] + wshape = weights[wei_name]['shape'] + ak_shape['shape'] = [1, wshape[0]] + # print 'Reshape------: ', node_re['name'], node_re['ak_type'], node_re['shape'] + #delete node + delete_extra_node(graph[ous[1]], graph[ous[2]], node_re, graph) + node_re['input'] = [onnx_node['name']] + onnx_node['output'] = [node_re['name']] + elif graph[ous[1]]['type'] == 'Reshape': + #change node + if graph[ous[0]]['type'] == 'Shape' and graph[ous[2]]['type'] == 'Shape': + #reshape + node_re = graph[ous[1]] + node_re['visited'] = 'True' + node_re['ak_type'] = 'Reshape' + ak_shape = node_re['ak_attr'] + ak_shape['shape'] = [1, 128] + node_next = graph[node_re['output'][0]] + if node_next['input'][0] == node_re['name']: + wei_name = node_next['input'][1] + wshape = weights[wei_name]['shape'] + ak_shape['shape'] = [1, wshape[0]] + else: + wei_name = node_next['input'][0] + wshape = weights[wei_name]['shape'] + ak_shape['shape'] = [1, wshape[0]] + #delete node + delete_extra_node(graph[ous[1]], graph[ous[2]], node_re, graph) + node_re['input'] = [onnx_node['name']] + onnx_node['output'] = [node_re['name']] + elif graph[ous[2]]['type'] == 'Reshape': + #change node + if graph[ous[0]]['type'] == 'Shape' and graph[ous[1]]['type'] == 'Shape': + #reshape + node_re = graph[ous[2]] + node_re['visited'] = 'True' + node_re['ak_type'] = 'Reshape' + ak_shape = node_re['ak_attr'] + ak_shape['shape'] = [1, 
128] + node_next = graph[node_re['output'][0]] + if node_next['input'][0] == node_re['name']: + wei_name = node_next['input'][1] + wshape = weights[wei_name]['shape'] + ak_shape['shape'] = [1, wshape[0]] + else: + wei_name = node_next['input'][0] + wshape = weights[wei_name]['shape'] + ak_shape['shape'] = [1, wshape[0]] + #delete node + delete_extra_node(graph[ous[1]], graph[ous[2]], node_re, graph) + node_re['input'] = [onnx_node['name']] + onnx_node['output'] = [node_re['name']] + + else: + print('Error: ', in_node['type']) + exit(0) + else: + print('Error Pattern: ', in_node['type']) + # exit(0) + else: + print('Error Pattern: ', in_node['type']) + # exit(0) + +def rm_weight_node(onnx_node, weights, graph): + """ + remove weights node + :param onnx_node: + :param weights: + :param graph: + :return: + """ + for node in onnx_node.keys(): + in_node = onnx_node[node]['input'] + for name in in_node: + if weights.has_key(name): + in_node.remove(name) + +def parse_Conv(onnx_node, weights, graph): + """ + parse conv + :param onnx_node: + :param weights: + :param graph: + :return: + """ + #print 'parse_Conv2D' + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Convolution' + wei_name = onnx_node['input'][1] + weights_node = weights[wei_name] + if weights.has_key(wei_name): + weights_node = weights[wei_name] + else: + print ('conv can not find weights', wei_name) + #assert weights_node['type'] == 'Const' + weights_data = weights_node + + #print 'weights: ', weights_data + #exit() + bias_node = None + if len(onnx_node['input']) > 2: + bias_name = onnx_node['input'][2] + bias_node = weights[bias_name] + if weights.has_key(bias_name): + bias_node = weights[bias_name] + else: + print ('conv can not find bias', bias_name) + ''' + print 'bias dtype', bias_node['dtype'] + print 'bias shape ', bias_node['shape'] + print 'bias data', bias_node['data'] + exit() + ''' + onnx_node['input'].remove(bias_name) + + onnx_attr = onnx_node['onnx_attr'] + group = 1 + if 'group' in onnx_attr.keys(): + group = onnx_attr['group'] + + padding_val = [] + if 'pads' in onnx_attr.keys(): + #print 'pads: ', type(onnx_attr['pads'][0]) + padding_val = onnx_attr['pads'] #T L B R + if len(onnx_attr['pads']) == 1: + padding_val = [0, onnx_attr['pads'][0]] + else: + padding_val = [0, 0] + + dilations = [] + if 'dilations' in onnx_attr.keys(): + dilations = onnx_attr['dilations'] + if len(onnx_attr['dilations']) == 1: + dilations = [1, onnx_attr['dilations'][0]] + else: + dilations = [1, 1] + + strides = [] + if 'strides' in onnx_attr.keys(): + strides = onnx_attr['strides'] + if len(onnx_attr['strides']) == 1: + strides = [1, onnx_attr['strides'][0]] + else: + strides = [1, 1] + + kernel_shape = onnx_attr['kernel_shape'] + + if len(onnx_attr['kernel_shape']) == 1: + chin = weights_data['shape'][1] + # print '**shape**', weights_data['shape'], type(chin), type(strides[0]) + kernel_shape = [1, onnx_attr['kernel_shape'][0]] + #padding deal include padding + if 'auto_pad' in onnx_attr.keys(): #onnx_attr['auto_pad'] == 'SAME_LOWER' or onnx_attr['auto_pad'] == 'SAME_UPPER': + #out_shape[2] = ceil((in_shape[2]- kernel_h) / stride_h) + #pad[0] = (out_shape[2] - 1) * stride_h + \ kernel_h - in_shape[2] + padding = [1, 1] + padding = [padding_val[0], padding_val[1]] + + ak_attr = onnx_node['ak_attr'] + ak_attr['weights'] = weights_data + ak_attr['padding'] = padding + ak_attr['dilations'] = dilations + ak_attr['strides'] = strides + ak_attr['kernel'] = kernel_shape + ak_attr['group'] = group + if bias_node is not None: + 
ak_attr['bias'] = bias_node + + # pixelShuffle + if len(onnx_node['output']) == 5: + outs = onnx_node['output'] + for i in range(0, len(outs)): + if graph[outs[i]]['type'] == 'Reshape': + fusion_PixelShuffle(onnx_node, graph[outs[i]], outs, weights, graph) + # refind_node_delete(onnx_node, graph) + break + + inputs = onnx_node['input'] + inputs.remove(wei_name) + ''' + for name in inputs: + if name == wei_name: + inputs.remove(name) + if name == bias_name: + inputs.remove(bias_name) + ''' + +def parse_Mul(onnx_node, weights, graph): + """ + # Compute Y = A * B + C + parse Mul to dense + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visted'] = True + onnx_node['ak_type'] = 'Scale' + input_node = onnx_node['input'] + input0 = input_node[0] + input1 = input_node[1] + in0_type = graph[input0]['type'] + in1_type = graph[input1]['type'] + weights_node = {} + if in0_type == 'Reshape' or in0_type == 'Unsqueeze' or in0_type == 'Squezze': + weights_node = trans_const_node(graph[input0], weights) + if weights_node is not None: + # remove the input node + graph.pop(input0) + onnx_node['input'].remove(input0) + # onnx_node['input'].remove(wei_name) + else: + print ('MUL can not find weights', input0) + exit(0) + elif in1_type == 'Reshape' or in1_type == 'Unsqueeze' or in1_type == 'Squezze': + weights_node = trans_const_node(graph[input1], weights) + if weights_node is not None: + # remove the input node + graph.pop(input1) + onnx_node['input'].remove(input1) + else: + print ('can not find weights', input1) + exit(0) + elif in0_type == 'Constant' or in1_type == 'Constant': + weights_node = {} + ''' + node = graph[onnx_node['input'][0]] + wei_name = node['input'][1] + a = weights[wei_name]['shape'][0] + ''' + weights_node['shape'] = [64] #[a] + data = np.ones(weights_node['shape']) + if 'broadcast' in onnx_node['onnx_attr']: + for i in range(0, weights_node['shape'][0]): + data[i] = onnx_node['onnx_attr']['broadcast'] # 1 + weights_node['data'] = data + weights_node['dtype'] = "float32" + if in0_type == 'Constant': + # print('input0: ', input0) + graph.pop(input0) + onnx_node['input'].remove(input0) + else: + # print('input1: ', input1) + graph.pop(input1) + onnx_node['input'].remove(input1) + else: + print ('Mul parse Error') + exit(0) + else: + print ('Mul parse Error Pattern: ', in0_type, in1_type) + # return + # exit(0) + ak_attr = onnx_node['ak_attr'] + ak_attr['weights'] = weights_node + bias_node = get_bias(onnx_node, weights, graph) + if bias_node is not None: + ak_attr['bias'] = bias_node + +def parse_Gemm(onnx_node, weights, graph): + """ + # Compute Y = alpha * A' * B' + beta * C + parse Gemm to dense + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Dense' + + onnx_attr = onnx_node['onnx_attr'] + alpha = 1.0 + if 'alpha' in onnx_attr.keys(): + alpha = onnx_attr['alpha'] + + beta = 1.0 + if 'beta' in onnx_attr.keys(): + beta = onnx_attr['beta'] + + transA = 0 + if 'transA' in onnx_attr.keys(): + transA = onnx_attr['transA'] + else: + transA = 0 + + transB = 0 + if 'transB' in onnx_attr.keys(): + transB = onnx_attr['transB'] + else: + transB = 0 + + wei_name = onnx_node['input'][1] + weights_node = {} + if weights.has_key(wei_name): + weights_node = weights[wei_name] + # onnx_node['input'].remove(wei_name) + else: + node = graph[wei_name] + weights_node = trans_const_node(node, weights) + if weights_node is not None: + # remove the input node + graph.pop(wei_name) + # 
onnx_node['input'].remove(wei_name) + else: + print ('Gemm can not find weights', wei_name) + exit(0) + #assert weights_node['type'] == 'Const' + # weights_data = weights_node + + ak_attr = onnx_node['ak_attr'] + if beta == 1: + if len(onnx_node['input']) > 2: + bias_name = onnx_node['input'][2] + # bias_node = weights[bias_name] + if weights.has_key(bias_name): + bias_node = weights[bias_name] + else: + bias_node = graph[bias_name] + print ('Gemm can not find bias', bias_name) + # print('Dense input: ', onnx_node['input']) + onnx_node['input'].remove(bias_name) + # print('Dense input: ', onnx_node['input']) + ak_attr['bias'] = bias_node + + #print 'name: ', onnx_node['name'] + #print 'shape', weights_data['shape'] + if alpha == 0 or transA == 1: + ak_attr['weights'] = None + ak_attr['Gemm'] = 0 + print ('Gemm Error, alpha, transA', alpha, transA) + exit(0) + else: + weights_data = {} + if transB == 1: + #print 'trans' + ak_attr['trans'] = 1 + # print ('trans before: ', weights_node['shape']) + # weights_data['data'] = np.transpose(weights_node['data']) + # weights_data['shape'] = [weights_node['shape'][1], weights_node['shape'][0]] + # weights_data['dtype'] = weights_node['dtype'] + # print ('trans after: ', weights_data['shape']) + else: + ak_attr['trans'] = 0 + # weights_data = weights_node + weights_data = weights_node + ak_attr['weights'] = weights_data + ak_attr['Gemm'] = 1 + #ak_attr['out_dim'] = weights_data + onnx_node['input'].remove(wei_name) + +def parse_Act(onnx_node, weights, graph): + """ + parse Act + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Activation' + if onnx_node['type'] == 'Relu': + onnx_node['ak_type'] = 'ReLU' + onnx_node['ak_attr']['type'] = 'Relu' + elif onnx_node['type'] == 'LeakyRelu': + # onnx_node['ak_type'] = 'PReLU' + onnx_node['ak_attr']['type'] = 'PReLU' + onnx_attr = onnx_node['onnx_attr'] + slope = 0.01 + if 'alpha' in onnx_attr: + slope = onnx_attr['alpha'] + weights_node = {} + weights_node['dtype'] = 'float32' + weights_node['shape'] = [1] + weights_node['data'] = [slope] + onnx_node['ak_attr']['weights'] = weights_node + onnx_node['ak_attr']['channel_shared'] = True + else: + raise Exception('un handel activation ' + str(onnx_node.op_type)) + +def parse_Concat(onnx_node, weights, graph): + """ + parse Concat + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Concat' + onnx_attr = onnx_node['onnx_attr'] + ak_attr = onnx_node['ak_attr'] + if 'axis' in onnx_attr.keys(): + ak_attr['axis'] = onnx_attr['axis'] + else: + ak_attr['axis'] = 0 + +def parse_Reshape(onnx_node, weights, graph): + """ + parse Reshape + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Reshape' + shape_name = onnx_node['input'][1] + shape_node = {} #weights[shape_name] + if weights.has_key(shape_name): + shape_node = weights[shape_name] + else: + if len(onnx_node['input']) == 2: + in_node0 = graph[onnx_node['input'][0]] + in_node1 = graph[onnx_node['input'][1]] + if in_node0['type'] == 'Constant': + shape_node['data'] = in_node1['onnx_attr']['value'][0] + elif in_node1['type'] == 'Constant': + shape_node['data'] = in_node1['onnx_attr']['value'][0] + # print shape_node, type(shape_node['data']) + else: + print ('Reshape can not find weights', shape_name) + exit(0) + else: + shape_node['shape'] = [1,1,1,1] + shape_node['data'] = [1] + print ('Reshape 
can not find weights', shape_name) + exit(0) + + ak_attr = onnx_node['ak_attr'] + # array = np.array(shape_node['shape']) + data = shape_node['data'] + + input_name = onnx_node['input'][0] + + shape = [] + if data[0] == 0: + onnx_node['ak_type'] = 'Flatten' + ak_attr['start_axis'] = 1 + ak_attr['end_axis'] = -1 + ak_attr['type'] = 'Flatten' + else: + if len(data) == 5: + if data[0] == 1: + shape = [data[1], data[2], data[3], data[4]] + else: + print ('Reshape does not support 5 dims ', data) + exit() + # elif len(data) > 5: + # print ('Reshape does not support >5 dims ', data) + # exit() + else: + shape = data + + ak_attr['type'] = 'Reshape' + # print ('***Reshape:*** ', shape) + ak_attr['shape'] = shape + + # print onnx_node['input'] + onnx_node['input'].pop(1) + # print onnx_node['input'] + +def parse_Transpose(onnx_node, weights, graph): + """ + parse Transpose to Permute + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Permute' + + ak_attr = onnx_node['ak_attr'] + data = onnx_node['onnx_attr']['perm'] + + shape = [] + + if len(data) == 5 and data[0] == 0: + shape = [data[1]-1, data[2]-1, data[3]-1, data[4]-1] + # elif len(data) >= 5: + # shape = data + # print ('Permute does not support 5 dims permute ', data) + # # exit(0) + else: + shape = data + # print('data: ', data) + # print('shape: ', shape) + ak_attr['shape'] = shape + +def parse_Add(onnx_node, weights, graph): + """ + parse Add to Eltwise + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + assert len(onnx_node['input']) == 2 + + ak_attr = onnx_node['ak_attr'] + onnx_node['ak_type'] = 'Eltwise' + ak_attr['type'] = 'Add' + +def parse_Sum(onnx_node, weights, graph): + """ + parse Sum to Eltwise + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + assert len(onnx_node['input']) == 2 + + ak_attr = onnx_node['ak_attr'] + onnx_node['ak_type'] = 'Eltwise' + ak_attr['type'] = 'Add' + +def parse_Pooling(onnx_node, weights, graph): + """ + parse Pooling + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Pooling' + ak_attr = onnx_node['ak_attr'] + onnx_attr = onnx_node['onnx_attr'] + + padding_val = [] + if 'pads' in onnx_attr.keys(): + padding_val = onnx_attr['pads'] + else: + padding_val = [0, 0] + + dilations = [] + if 'dilations' in onnx_attr.keys(): + dilations = onnx_attr['dilations'] + else: + dilations = [1, 1] + + strides = [] + if 'strides' in onnx_attr.keys(): + strides = onnx_attr['strides'] + else: + strides = [1, 1] + + kernel_shape = [] + if 'kernel_shape' in onnx_attr.keys(): + kernel_shape = onnx_attr['kernel_shape'] + else: + kernel_shape = [1, 1] + # padding deal inlcuding pading + if 'auto_pad' in onnx_attr.keys(): #onnx_attr['auto_pad'] == 'SAME_LOWER' or onnx_attr['auto_pad'] == 'SAME_UPPER': + #out_shape[2] = ceil((in_shape[2]- kernel_h) / stride_h) + #pad[0] = (out_shape[2] - 1) * stride_h + \ kernel_h - in_shape[2] + padding_val = [1, 1] + # padding = [1, 1, 1, 1] =[top, left, bottom, right] + # else: + padding = [padding_val[0], padding_val[1]] + if len(padding_val) == 4: + a = padding_val[0] + padding_val[2] + b = padding_val[1] + padding_val[3] + pad_val0 = a / 2 + pad_val1 = b / 2 + # print 'padding:', pad_val0, pad_val1 + padding = [pad_val0, pad_val1] + # inception v2 + # padding = [padding_val[2], padding_val[3]] + + + ak_attr['window'] = kernel_shape + 
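# --- Illustrative sketch (editor's note, not part of the patch) ---------------
# How the 4-value ONNX pads attribute [top, left, bottom, right] collapses to the
# 2-value [pad_h, pad_w] used just above, and the ceil-mode output size that
# _cal_shape uses for pooling. Averaging asymmetric pads loses a pixel whenever
# top != bottom (or left != right); the helper names here are made up.
def onnx_pads_to_hw(pads):
    if len(pads) == 4:                                  # [top, left, bottom, right]
        return [(pads[0] + pads[2]) // 2, (pads[1] + pads[3]) // 2]
    return [pads[0], pads[1]]

def pool_out_dim(in_dim, kernel, stride, pad):
    # ceil((in + 2*pad - kernel) / stride) + 1, written with integer arithmetic
    return (in_dim + 2 * pad - kernel + stride - 1) // stride + 1

assert onnx_pads_to_hw([0, 1, 0, 1]) == [0, 1]
assert pool_out_dim(112, kernel=3, stride=2, pad=1) == 57
# -------------------------------------------------------------------------------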
ak_attr['padding'] = padding + ak_attr['strides'] = strides + + if onnx_node['type'] == 'MaxPool': + ak_attr['type'] = 'MAX' + ak_attr['global_pooling'] = False + + if onnx_node['type'] == 'AveragePool': + if 'count_include_pad'in onnx_attr.keys(): + ak_attr['type'] = 'AVG' + else: + ak_attr['type'] = 'AVGEXC' + ak_attr['global_pooling'] = False + # padding deal + # if onnx_attr['atuo_pad'] == 'SAME_LOWER' or onnx_attr['atuo_pad'] == 'SAME_UPPER': + # padding = [0, 0] + # else: + # padding = [padding_val[1], padding_val[0]] + + if onnx_node['type'] == 'GlobalMaxPool': + ak_attr['type'] = 'MAX' + ak_attr['global_pooling'] = True + + padding_val = [0, 0] + strides = [0, 0] + kernel_shape = [1, 1] + + if onnx_node['type'] == 'GlobalAveragePool': + ak_attr['type'] = 'AVG' + ak_attr['global_pooling'] = True + + padding_val = [0, 0] + strides = [0, 0] + kernel_shape = [1, 1] + + ak_attr['window'] = kernel_shape + ak_attr['padding'] = padding #padding_val + ak_attr['strides'] = strides + +def parse_ImageScaler(onnx_node, weights, graph): + """ + parse ImageScaler + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Scale' + ak_attr = onnx_node['ak_attr'] + + scale_val = onnx_node['onnx_attr']['scale'] + shape = [1, 1, 1, 3] + scale_val = [1.0, 1.0, 1.0] + if 'scale' in onnx_node['onnx_attr']: + scale_val = onnx_node['onnx_attr']['scale'] + if type(scale_val) is 'float': + scale_val =[ scale_val, scale_val, scale_val] + scale_np = np.full(shape, scale_val) #np.arange([scale_val]) + weight_tensor = {} + weight_tensor['shape'] = shape + weight_tensor['data'] = scale_np + weight_tensor['dtype'] = 'float32' + ak_attr['weights'] = weight_tensor + + bias_val = [1.0] + if 'bias' in onnx_node['onnx_attr']: + bias_val = onnx_node['onnx_attr']['bias'] + # print 'bias: ', len(bias_val) + shape_b = [len(bias_val)] + # print 'shape_b: ', shape_b + bias_tensor = {} + bias_tensor['shape'] = shape_b + bias_tensor['data'] = bias_val + bias_tensor['dtype'] = 'float32' + ak_attr['bias'] = bias_tensor + + +def parse_Dropout(onnx_node, weights, graph): + """ + parse Dropout + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Scale' + ak_attr = onnx_node['ak_attr'] + ''' + ratio (float, default 0.5) the ratio of random dropout + is_test (int) if nonzero, run dropout in test mode where the output is simply Y = X. 
+ ''' + if 'is_test' in onnx_node['onnx_attr'].keys(): + if onnx_node['onnx_attr']['is_test'] == 0: + ak_attr['drop'] = 1 #Ydata[i] = Xdata[i] * scale * mask_data[i]; + else: + ak_attr['drop'] = 0 + onnx_node['output'].pop(len(onnx_node['output'])-1) #delete mask_node + print ('it not support, Error') + return + else: + ak_attr['drop'] = 0 + scale_val = onnx_node['onnx_attr']['ratio'] + shape = [1, 1, 1, 1] + scale_np = np.full(shape, scale_val) #np.arange([scale_val]) + weight_tensor = {} + weight_tensor['shape'] = shape + weight_tensor['data'] = scale_np + weight_tensor['dtype'] = 'float32' + ak_attr['weights'] = weight_tensor + ak_attr['axis'] = 0 + ak_attr['num_axes'] = 0 + +def parse_Softmax(onnx_node, weights, graph): + """ + parse sooftmax + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Softmax' + if 'axis' in onnx_node['onnx_attr']: + onnx_node['ak_attr']['axis'] = onnx_node['onnx_attr']['axis'] + else: + onnx_node['ak_attr']['axis'] = 1 + +def parse_Lrn(onnx_node, weights, graph): + """ + parse LRN + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'LRN' + ak_attr = onnx_node['ak_attr'] + onnx_attr = onnx_node['onnx_attr'] + local_size = 0 + if 'size' in onnx_attr.keys(): + local_size = onnx_attr['size'] + alpha = 0.0001 + if 'alpha' in onnx_attr.keys(): + alpha = onnx_attr['alpha'] + beta = 0.75 + if 'beta' in onnx_attr.keys(): + beta = onnx_attr['beta'] + k = 1 + if 'bias' in onnx_attr.keys(): + k = onnx_attr['bias'] + ak_attr['local_size'] = local_size + ak_attr['alpha'] = alpha / local_size + ak_attr['beta'] = beta + ak_attr['k'] = k + +def parse_BatchNorm(onnx_node, weights, graph): + """ + parse BatchNorm + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Scale' + ak_attr = onnx_node['ak_attr'] + assert len(onnx_node['input']) == 5 + + alpha_name = onnx_node['input'][1] + beta_name = onnx_node['input'][2] + mean_name = onnx_node['input'][3] + var_name = onnx_node['input'][4] + + alpha_node = weights[alpha_name] + if weights.has_key(alpha_name): + alpha_node = weights[alpha_name] + else: + print ('BatchNorm can not find alpha_name', alpha_name) + exit(0) + return + + beta_node = weights[beta_name] + if weights.has_key(beta_name): + beta_node = weights[beta_name] + else: + print ('BatchNorm can not find beta_name', beta_name) + exit(0) + return + + mean_node = weights[mean_name] + if weights.has_key(mean_name): + mean_node = weights[mean_name] + else: + print ('BatchNorm can not find mean_name', mean_name) + exit(0) + return + + var_node = weights[var_name] + if weights.has_key(var_name): + var_node = weights[var_name] + else: + print ('BatchNorm can not find var_name', var_name) + exit(0) + return + + onnx_attr = onnx_node['onnx_attr'] + eps = 1e-5 + if 'epsilon' in onnx_attr.keys(): + eps = onnx_attr['epsilon'] + momentum = 0.9 + if 'momentum' in onnx_attr.keys(): + momentum = onnx_attr['momentum'] + spatial = 1 + if 'spatial' in onnx_attr.keys(): + spatial = onnx_attr['spatial'] + + # print 'type: ', type(var_node['data']) + var_data = np.array(var_node['data']) + alpha_data = np.array(alpha_node['data']) + beta_data = np.array(beta_node['data']) + mean_data = np.array(mean_node['data']) + var = np.sqrt(var_data.flatten() + eps) + np_scale = alpha_data.flatten() / var + np_bias = beta_data.flatten() - (alpha_data.flatten() * mean_data.flatten() / 
var) + + # ak_attr['weights'] = np_scale.astype('float32') + # ak_attr['bias'] = np_bias.astype('float32') + scale_tensor = {} + bias_tensor = {} + scale_tensor['dtype'] = 'float32' + scale_tensor['data'] = np_scale + scale_tensor['shape'] = np_scale.shape + + # print 'parse_BatchNorm scale: ', np_scale.shape + + bias_tensor['dtype'] = 'float32' + bias_tensor['data'] = np_bias + bias_tensor['shape'] = np_bias.shape + + # print 'parse_BatchNorm bias: ', np_bias.shape + + ak_attr['weights'] = scale_tensor + ak_attr['bias'] = bias_tensor + + MedNodeUtil.retain_input(onnx_node, [onnx_node['input'][0]]) + +def parse_Slice(onnx_node, weights, graph): + """ + parse Slice [axes, starts, ends] + axes[0]==>[starts[0],ends[0]] + axes[1]==>[starts[1],ends[1]] + :param onnx_node: + :param weights: + :param graph: + :return: + """ + onnx_node['visited'] = True + onnx_node['ak_type'] = 'Slice' + ak_attr = onnx_node['ak_attr'] + onnx_attr = onnx_node['onnx_attr'] + ak_attr['axis'] = onnx_attr['axes'] + ak_attr['slice_point'] = onnx_attr['starts'] + ak_attr['slice_dim'] = onnx_attr['ends'] diff --git a/tools/external_converter_v2/parser/onnx/parser_onnx.py b/tools/external_converter_v2/parser/onnx/parser_onnx.py new file mode 100644 index 000000000..9eac269c0 --- /dev/null +++ b/tools/external_converter_v2/parser/onnx/parser_onnx.py @@ -0,0 +1,117 @@ +import numpy as np +import os +from ..graph_io import * +from ..logger import * +from ..proto import * +import onnx +from onnx_graph import ParseOnnxToMed +from med_trans_util import MedTransAK +from med_graph import MedGraphUtil, MedNodeUtil + +class OnnxParser: + """ + onnx parse begin + """ + def __init__(self, onnx_config_dict): + # anakin graph model io + # config info + # print 'onnx_config_dict', onnx_config_dict + + # self.ProtoPaths = onnx_config_dict['ProtoPaths'] + self.OnnxPaths = onnx_config_dict['ModelPath'] + if onnx_config_dict['TxtPath'] == '': + self.txtPaths = None + else: + self.txtPaths = onnx_config_dict['TxtPath'] + self.med_trans_tool = MedTransAK() + self.input_count = 0 + + def __call__(self): + [med_graph, outputs] = self._conver_onnx_2_med() + self.Output = outputs + MedGraphUtil.solve(med_graph) + anakin_graph = self._conver_med_2_anakin(med_graph) + return anakin_graph + + + def _conver_onnx_2_med(self): + """ + convert onnx to med graph + :return: + """ + parser = ParseOnnxToMed(self.OnnxPaths, self.txtPaths) + return parser.parse() + + def _add_protonode(self, ak_graph, med_node): + """ + add med node to anakin graph + :param ak_graph: + :param med_node: + :return: + """ + ak_type = med_node['ak_type'] + # print '_add_protonode', med_node['name'], ak_type + if ak_type is None: + # print 'ak_type' + return + nodeIO = NodeProtoIO() + if med_node['ak_type'] == 'Input': + nodeIO.set_name('input_' + str(self.input_count)) + self.input_count += 1 + else: + nodeIO.set_name(med_node['name']) + self.med_trans_tool.map_med_2_ak(nodeIO, med_node) + ak_graph.add_node(nodeIO()) + if nodeIO().Op.name == 'Input': + ak_graph.add_in(nodeIO().name) + #print 'node: ', med_node['name'] + + def _search_output_list(self, graph): + """ + search output list + :param graph: + :return: + """ + output_list=set() + graph_cp=graph.copy() + + def recursive_search(node): + """ + recursive search + :param node: + :return: + """ + if node.get('out_search_flat') is not None: + return set() + node['out_search_flat']=True + outputs=node['output'] + result = set() + if len(outputs) == 0: + result.add(node['name']) + else: + for i in outputs: + result |= 
recursive_search(graph[i]) + return result + + + for i in graph_cp.values(): + output_list |= recursive_search(i) + return list(output_list) + + def _conver_med_2_anakin(self, med_graph): + """ + convert med graph too anakin graph + :param med_graph: + :return: + """ + anakin_graph = GraphProtoIO() + #print 'med_graph: ', med_graph + for node in med_graph.values(): + self._add_protonode(anakin_graph, node) + + print '*************anakin**************' + anakin_graph.format_edge_from_nodes() + for out_node_name in self.Output: + anakin_graph.add_out('output_' + out_node_name, out_node_name) + print 'out', out_node_name + return anakin_graph diff --git a/tools/external_converter_v2/parser/operations/ops.py b/tools/external_converter_v2/parser/operations/ops.py index 70d9274ad..73c644b4f 100755 --- a/tools/external_converter_v2/parser/operations/ops.py +++ b/tools/external_converter_v2/parser/operations/ops.py @@ -10,7 +10,8 @@ max_len = int(), max_batch = int(), alias="NULL", - data_type="NULL") + data_type="NULL", + layout="NCHW") # graph out , only hold place for edge OpsRegister.Register("Output").set_attr() @@ -18,81 +19,81 @@ OpsRegister.Register("Split").set_attr(split_num=int()) ############################# Basic Op define ############################## -# two input +# two input OpsRegister.Register("Dot").set_attr(axes=list()) # one or two input # enum type { -# Add, -# Subtract, -# Multiply, -# Avg, -# Max -# } +# Add, +# Subtract, +# Multiply, +# Avg, +# Max +# } # note : coeff only used by caffe for "Add" -OpsRegister.Register("Eltwise").set_attr(type="Add", +OpsRegister.Register("Eltwise").set_attr(type="Add", coeff=list()) # list input OpsRegister.Register("Concat").set_attr(axis=int()) # one input -OpsRegister.Register("Exp").set_attr(base=float(), - scale=float(), +OpsRegister.Register("Exp").set_attr(base=float(), + scale=float(), shift=float()) # one input # y = log(shift + scale * x) -OpsRegister.Register("Log").set_attr(base=float(), - scale=float(), +OpsRegister.Register("Log").set_attr(base=float(), + scale=float(), shift=float()) # one input # y = (shift + scale * x) ^ power -OpsRegister.Register("Power").set_attr(shift=float(), - scale=float(), +OpsRegister.Register("Power").set_attr(shift=float(), + scale=float(), power=float()) # one input OpsRegister.Register("Softmax").set_attr(axis=int()) # applies an activation parameter function to an output -# enum type: -# enum type { -# TanH, -# Sigmoid, -# } +# enum type: +# enum type { +# TanH, +# Sigmoid, +# } OpsRegister.Register("Activation").set_attr(type="", - clip_relu_num=int()) + clip_relu_num=float()) # Leaky version of a Rectified Linear Unit ( alpha != 0 ). -# f(x) = alpha * x : x < 0 -# f(x) = x : x >= 0 +# f(x) = alpha * x : x < 0 +# f(x) = x : x >= 0 # Standard ReLU ( alpha = 0 ) # f(x) = 0 * x : x < 0 # f(x) = x : x >= 0 # note: alpha is fixed value OpsRegister.Register("ReLU").set_attr(alpha=float()) # Parametric Rectified Linear Unit -# f(x) = alpha * x : x < 0 -# f(x) = x : x >= 0 +# f(x) = alpha * x : x < 0 +# f(x) = x : x >= 0 # note: alpha is learned array with the same shape as x. -# ref: Parametric ReLU described in K. He et al, Delving Deep into Rectifiers: -# <>, 2015. +# ref: Parametric ReLU described in K. He et al, Delving Deep into Rectifiers: +# <>, 2015. OpsRegister.Register("PReLU").set_attr(channel_shared=bool()) # Exponential Linear Unit. 
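# --- Illustrative sketch (editor's note, not part of the patch) ---------------
# Reference semantics for the rectifier variants registered around here, matching
# the comments: ReLU/LeakyReLU apply a fixed alpha on the negative side, PReLU
# learns one alpha per channel, ELU uses alpha * (exp(x) - 1). Plain NumPy; the
# function names are made up for the example.
import numpy as np

def leaky_relu(x, alpha=0.0):                 # alpha=0.0 gives the standard ReLU
    return np.where(x >= 0, x, alpha * x)

def prelu(x, alpha_per_channel):              # x is NCHW, alpha has shape [C]
    a = alpha_per_channel.reshape(1, -1, 1, 1)
    return np.where(x >= 0, x, a * x)

def elu(x, alpha=1.0):
    return np.where(x >= 0, x, alpha * (np.exp(x) - 1.0))

x = np.array([-2.0, -0.5, 0.0, 3.0])
assert np.allclose(leaky_relu(x), [0.0, 0.0, 0.0, 3.0])
assert np.allclose(leaky_relu(x, 0.1), [-0.2, -0.05, 0.0, 3.0])
assert np.allclose(elu(x), [np.exp(-2.0) - 1.0, np.exp(-0.5) - 1.0, 0.0, 3.0])
# -------------------------------------------------------------------------------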
-# f(x) = alpha * (exp(x) - 1.0) : x < 0 -# f(x) = x : x >= 0 +# f(x) = alpha * (exp(x) - 1.0) : x < 0 +# f(x) = x : x >= 0 OpsRegister.Register("ELU").set_attr(alpha=int()) # dense op parameter -OpsRegister.Register("Dense").set_attr(out_dim=int(), - axis=int(), +OpsRegister.Register("Dense").set_attr(out_dim=int(), + axis=int(), bias_term=bool()) # dropout parameter -OpsRegister.Register("Dropout").set_attr(ratio=float()) +OpsRegister.Register("Dropout").set_attr(ratio=float()) -OpsRegister.Register("Flatten").set_attr(start_axis=int(), +OpsRegister.Register("Flatten").set_attr(start_axis=int(), end_axis=int()) # caffe unique layer -OpsRegister.Register("Reshape").set_attr(dims=list(), - axis=int(), +OpsRegister.Register("Reshape").set_attr(dims=list(), + axis=int(), num_axes=int(), layout='') @@ -101,12 +102,12 @@ # Cropping op for cropping data of (1/2/3D) by using axis info # cropping is the same as tf cropping parameter, which saved as tuple or int. -OpsRegister.Register("Cropping").set_attr(cropping=list(), +OpsRegister.Register("Crop").set_attr(cropping=list(), axis=int()) # slices an input layer to multiple output layers along a given dimension with given slice indices -OpsRegister.Register("Slice").set_attr(axis=int(), - slice_point=list(), +OpsRegister.Register("Slice").set_attr(axis=int(), + slice_point=list(), slice_dim=int(), num=int(), sections=list()) @@ -114,126 +115,126 @@ ############################# Normalization Op define ############################## # Batch normalization op -# explanation: -# Normalize the activations of the previous layer at each batch, -# i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. -OpsRegister.Register("BatchNorm").set_attr(momentum=float(), +# explanation: +# Normalize the activations of the previous layer at each batch, +# i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. 
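# --- Illustrative sketch (editor's note, not part of the patch) ---------------
# How parse_BatchNorm in onnx_trans_utils.py folds BatchNormalization into the
# Scale op registered just below:
#   y = gamma * (x - mean) / sqrt(var + eps) + beta
# becomes y = scale * x + bias with
#   scale = gamma / sqrt(var + eps),  bias = beta - gamma * mean / sqrt(var + eps)
import numpy as np

def fold_batchnorm(gamma, beta, mean, var, eps=1e-5):
    std = np.sqrt(var + eps)
    return gamma / std, beta - gamma * mean / std

gamma = np.array([1.0, 0.5]); beta = np.array([0.1, -0.2])
mean = np.array([0.0, 2.0]);  var = np.array([1.0, 4.0])
scale, bias = fold_batchnorm(gamma, beta, mean, var)
x = np.array([3.0, 3.0])
assert np.allclose(scale * x + bias,
                   gamma * (x - mean) / np.sqrt(var + 1e-5) + beta)
# -------------------------------------------------------------------------------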
+OpsRegister.Register("BatchNorm").set_attr(momentum=float(), epsilon=float()) # caffe need may use scale layer after batchnorm layer which tf/mxnet/keras needn't -OpsRegister.Register("Scale").set_attr(axis=int(), - num_axes=int(), +OpsRegister.Register("Scale").set_attr(axis=int(), + num_axes=int(), bias_term=bool()) -# Local Response Normalization op same as caffe, +# Local Response Normalization op same as caffe, # which performs a kind of "lateral inhibition" by normalizing over local input regions # enum NormRegion { -# ACROSS_CHANNELS -# WITHIN_CHANNEL +# ACROSS_CHANNELS +# WITHIN_CHANNEL # } -OpsRegister.Register("LRN").set_attr(local_size=int(), - alpha=float(), - beta=float(), - norm_region="ACROSS_CHANNELS", +OpsRegister.Register("LRN").set_attr(local_size=int(), + alpha=float(), + beta=float(), + norm_region="ACROSS_CHANNELS", k=float()) # Mean-Variance Normalization -OpsRegister.Register("MVN").set_attr(normalize_variance=bool(), - across_channels=bool(), +OpsRegister.Register("MVN").set_attr(normalize_variance=bool(), + across_channels=bool(), epsilon=float()) ############################# Pooling (1D/2D/3D) Op define ############################## -# enum type: +# enum type: # enum method { -# MAX, // [default] -# AVG, +# MAX, // [default] +# AVG, # AVGEXC, average_exclude_padding_value -# STOCHASTIC, +# STOCHASTIC, # } -OpsRegister.Register("Pooling").set_attr(pool_size=list(), - strides=list(), - padding=list(), - method="MAX", - global_pooling=bool(), +OpsRegister.Register("Pooling").set_attr(pool_size=list(), + strides=list(), + padding=list(), + method="MAX", + global_pooling=bool(), cmp_out_shape_floor_as_conv=False) -# Spatial Pyramid Pooling -# enum type: +# Spatial Pyramid Pooling +# enum type: # enum method { -# MAX, // [default] -# AVG, -# STOCHASTIC, +# MAX, // [default] +# AVG, +# STOCHASTIC, # } -OpsRegister.Register("SPP").set_attr(pyramid_height=int(), +OpsRegister.Register("SPP").set_attr(pyramid_height=int(), method="MAX",) ############################# Convolution (1D/2D/3D) Op define ############################## # convolution parameter -OpsRegister.Register("Convolution").set_attr(filter_num=int(), - kernel_size=list(), - strides=list(), - padding=list(), - dilation_rate=list(), - group=int(), - axis=int(), +OpsRegister.Register("Convolution").set_attr(filter_num=int(), + kernel_size=list(), + strides=list(), + padding=list(), + dilation_rate=list(), + group=int(), + axis=int(), bias_term=bool()) # Depthwise separable convolution, commonly called "separable convolution" in tf -OpsRegister.Register("DeSepConvolution").set_attr(filter_num=int(), - kernel_size=list(), - strides=list(), - padding=list(), - dilation_rate=list(), - group=int(), - axis=int(), +OpsRegister.Register("DeSepConvolution").set_attr(filter_num=int(), + kernel_size=list(), + strides=list(), + padding=list(), + dilation_rate=list(), + group=int(), + axis=int(), depth_multiplier=int()) # also called transposed convolution -OpsRegister.Register("Deconvolution").set_attr(filter_num=int(), - kernel_size=list(), - strides=list(), - padding=list(), - dilation_rate=list(), - group=int(), - axis=int(), +OpsRegister.Register("Deconvolution").set_attr(filter_num=int(), + kernel_size=list(), + strides=list(), + padding=list(), + dilation_rate=list(), + group=int(), + axis=int(), bias_term=bool()) # DeformableConvolution -OpsRegister.Register("DeformConvolution").set_attr(filter_num=int(), - kernel_size=list(), - strides=list(), - padding=list(), - dilation_rate=list(), - group=int(), - 
axis=int(), +OpsRegister.Register("DeformConvolution").set_attr(filter_num=int(), + kernel_size=list(), + strides=list(), + padding=list(), + dilation_rate=list(), + group=int(), + axis=int(), bias_term=bool()) ############################# Rnn Op define ############################## # Standard RNN (LSTM/GRU) -# enum rnn type: -# enum type { -# TANH, // base -# SIGMOID, // base -# RELU, // base -# LSTM, -# GRU, -# } -OpsRegister.Register("RNN").set_attr(hidden_size=int(), - input_size=int(), - bias_term=bool(), - dropout=float(), +# enum rnn type: +# enum type { +# TANH, // base +# SIGMOID, // base +# RELU, // base +# LSTM, +# GRU, +# } +OpsRegister.Register("RNN").set_attr(hidden_size=int(), + input_size=int(), + bias_term=bool(), + dropout=float(), type="GRU") ############################# embedding Op define ############################## # embedding layer, input_dim in tf or caffe means the voc num and output_dim means the emb size -OpsRegister.Register("Embedding").set_attr(input_dim=int(), - output_dim=int(), +OpsRegister.Register("Embedding").set_attr(input_dim=int(), + output_dim=int(), bias_term=bool()) ############################# Accuracy Op define ############################## -# NULL +# NULL ########### Object track and detection (for adu(caffe layer type)) Op define ############# @@ -254,45 +255,45 @@ OpsRegister.Register("Axpy").set_attr() -OpsRegister.Register("PriorBox").set_attr(min_size=list(), - max_size=list(), +OpsRegister.Register("PriorBox").set_attr(min_size=list(), + max_size=list(), aspect_ratio=list(), - fixed_size=list(), - fixed_ratio=list(), - density=list(), - is_flip=bool(), - is_clip=bool(), - variance=list(), - img_h=int(), - img_w=int(), - step_h=float(), - step_w=float(), + fixed_size=list(), + fixed_ratio=list(), + density=list(), + is_flip=bool(), + is_clip=bool(), + variance=list(), + img_h=int(), + img_w=int(), + step_h=float(), + step_w=float(), offset=float(), order=list()) # enum code_type { -# CORNER, -# CENTER_SIZE, -# CORNER_SIZE, +# CORNER, +# CENTER_SIZE, +# CORNER_SIZE, # } -OpsRegister.Register("DetectionOutput").set_attr(share_location=bool(), - variance_encode_in_target=bool(), - class_num=int(), - background_id=int(), - keep_top_k=int(), - code_type="CORNER", - conf_thresh=float(), - nms_top_k=int(), - nms_thresh=float(), +OpsRegister.Register("DetectionOutput").set_attr(share_location=bool(), + variance_encode_in_target=bool(), + class_num=int(), + background_id=int(), + keep_top_k=int(), + code_type="CORNER", + conf_thresh=float(), + nms_top_k=int(), + nms_thresh=float(), nms_eta=float()) ########### ADU Op define ############# -OpsRegister.Register("Argmax").set_attr(out_max_val=bool(), - top_k=int(), +OpsRegister.Register("Argmax").set_attr(out_max_val=bool(), + top_k=int(), axis=int(), axis_term=bool()) @@ -330,7 +331,7 @@ OpsRegister.Register("SequenceConv").set_attr(filter_num=int(), - kernel_size=list(), + kernel_size=list(), padding_trainable=bool(), context_stride=int(), context_start=int(), @@ -349,6 +350,11 @@ num_layers=int(), input_activation="null") +OpsRegister.Register("LSTMP").set_attr(outDim=int(), + skipNum=int(), + reActType='tanh', + cellDim=int()) + OpsRegister.Register("MatMul").set_attr(transpose_x=bool(), transpose_y=bool(), @@ -360,8 +366,12 @@ begin_norm_axis=int(), eps=float()) -OpsRegister.Register("Resize").set_attr(height_scale=float(), - width_scale=float()) + +OpsRegister.Register("Resize").set_attr(method="BILINEAR_ALIGN", + height_scale=float(), + width_scale=float(), + out_width=int(), + 
out_height=int()) OpsRegister.Register("Normalize").set_attr(begin_norm_axis=int(), is_across_spatial=bool(), @@ -448,3 +458,58 @@ #####################################Unpadding_padding op define ############################ ######### ###### it is named UnpaddingPaddingLayer in lego, OpsRegister.Register("ConvUnpaddingPadding").set_attr() #no paras, no weights. +# Fast-RCNN +OpsRegister.Register("AffineChannel").set_attr() #no paras, no weights. + +OpsRegister.Register("AnchorGenerator").set_attr(anchor_sizes=list(), + aspect_ratios=list(), + variances=list(), + stride=list(), + offset=float()) + +OpsRegister.Register("GenerateProposals").set_attr(pre_nms_top_n=int(), + post_nms_top_n=int(), + nms_thresh=float(), + min_size=float(), + eta=float()) + +OpsRegister.Register("RoiAlign").set_attr(spatial_scale=float(), + pooled_height=int(), + pooled_width=int(), + sampling_ratio=int()) + +OpsRegister.Register("RoiPool").set_attr(spatial_scale=float(), + pooled_height=int(), + pooled_width=int()) + +##################################### pytorch edsr model PixelShuffle op define ################################ +# PixelShuffle in_shape = [n, r * r * c, h, w] scale_factor = r ==> out_shape = [n, c, r * h, r * w] +OpsRegister.Register("PixelShuffle").set_attr(scale_factor=int()) + +OpsRegister.Register("Coord2Patch").set_attr(img_h=int(), + output_h=int(), + output_w=int()) + +OpsRegister.Register("DataNorm").set_attr(epsilon=float()) + +OpsRegister.Register("Pad2D").set_attr(mode="constant", + value=float(), + pad_h=list(), + pad_w=list()) + +OpsRegister.Register("SequencePoolConcat").set_attr(pooltype=str(), + slot_num=int(), + axis=int()) + +OpsRegister.Register("SRoiAlign").set_attr(pooled_h=int(), + pooled_w=int(), + spatial_scale=float()) + +OpsRegister.Register("SProposal").set_attr(feat_stride=int(), + basesize=int(), + boxminsize=int(), + pre_nms_topn=int(), + post_nms_topn=int(), + nms_thresh=float(), + scale=list(), + ratio=list()) diff --git a/tools/external_converter_v2/parser/operations/ops_fluid.py b/tools/external_converter_v2/parser/operations/ops_fluid.py index 9025a1dd9..7680ae43b 100755 --- a/tools/external_converter_v2/parser/operations/ops_fluid.py +++ b/tools/external_converter_v2/parser/operations/ops_fluid.py @@ -44,5 +44,71 @@ OpsRegister.Register("while").set_attr() OpsRegister.Register("array_to_lod_tensor").set_attr() +OpsRegister.Register("assign").set_attr() OpsRegister.Register("assign_value").set_attr() OpsRegister.Register("shape").set_attr() + +OpsRegister.Register("fake_quantize_abs_max").set_attr() +OpsRegister.Register("fake_dequantize_max_abs").set_attr() +OpsRegister.Register("fake_quantize_range_abs_max").set_attr() +OpsRegister.Register("fake_dequantize_range_max_abs").set_attr() + +OpsRegister.Register("increment").set_attr() + +OpsRegister.Register("fusion_dropout_add_ln_quant").set_attr() +OpsRegister.Register("dequantize_max_abs_rowwise").set_attr() +OpsRegister.Register("quantize_abs_max_rowwise").set_attr() +OpsRegister.Register("fusion_add_relu_dropout_quant").set_attr() +OpsRegister.Register("fill_constant_batch_size_like").set_attr() +OpsRegister.Register("beam_search_decode").set_attr() + +OpsRegister.Register('reduce').set_attr( + reduce_type=str(), + keep_dim=bool(), + reduce_dim=list(), + reduce_all=bool(), + coeff=float(), +) +OpsRegister.Register('arg_max').set_attr( + out_max_val=bool(), + top_k=int(), + axis=int(), +) +OpsRegister.Register('sequence_expand').set_attr( + ref_level=int(), +) +OpsRegister.Register('eltwise').set_attr( + 
type=str(), + coeff=float(), +) +OpsRegister.Register('cast').set_attr( + int_type=int(), + out_type=int(), +) +OpsRegister.Register('yolo_box').set_attr( + anchors=list(), + class_num=int(), + conf_thresh=float(), + downsample_ratio=int(), +) +OpsRegister.Register('slice').set_attr( + slice_dim=int(), + slice_point=list(), + axis=int(), +) +OpsRegister.Register('box_coder').set_attr( + axis=int(), + box_normalized=bool(), + variance=list(), +) +OpsRegister.Register('GroupNormal').set_attr( + has_scale=bool(), + has_bias=bool(), + eps=float(), + group=int(), +) +OpsRegister.Register('slice_v2').set_attr( + starts=list(), + ends=list(), + axes=list(), +) diff --git a/tools/external_converter_v2/parser/proto/__init__.py b/tools/external_converter_v2/parser/proto/__init__.py index 5dfb5b8c9..4e496abce 100644 --- a/tools/external_converter_v2/parser/proto/__init__.py +++ b/tools/external_converter_v2/parser/proto/__init__.py @@ -6,3 +6,4 @@ from node_pb2 import * from operator_pb2 import * from tensor_pb2 import * +from net_pb2 import * diff --git a/tools/external_converter_v2/parser/proto/graph.proto b/tools/external_converter_v2/parser/proto/graph.proto index 21120a56d..82a9bb354 100644 --- a/tools/external_converter_v2/parser/proto/graph.proto +++ b/tools/external_converter_v2/parser/proto/graph.proto @@ -27,9 +27,15 @@ message Info { bool is_optimized = 10; }; +message TargetProto { + string node = 1; + repeated float scale = 2; +}; + // string list message List { - repeated string val = 1; + repeated string val = 1; // Will be deprecated + repeated TargetProto target = 2; }; // Anakin Graph define @@ -44,7 +50,7 @@ repeated NodeProto nodes = 2; // map: node name ---> node name // edges saves storage of anakin model. map edges_in = 3; -map edges_out =4; +map edges_out = 4; // edges info [optional] // map: node_name_0 + "_" + node_name_1 ---> edge tensor (tensor not hold data) diff --git a/tools/external_converter_v2/parser/proto/helper.py b/tools/external_converter_v2/parser/proto/helper.py new file mode 100644 index 000000000..73b27ccfa --- /dev/null +++ b/tools/external_converter_v2/parser/proto/helper.py @@ -0,0 +1,60 @@ +"""proto helper +""" + +import tensor_pb2 + +def make_tensor( + dims, # type: list(int) + data_type, # type: tensor_pb2.DateTypeProto + vals, # type: list(float, int...) 
or bytes + layout=None, # type: tensor_pb2.LayoutProto + scale=None, # type: list(float) +): + """make tensor_pb2.TensorProto + """ + t = tensor_pb2.TensorProto() + + t.shape.dims.size = len(dims) + t.shape.dims.value = dims[:] + + # set TensorProto.data + t.data.type = data_type + if t.data.type is tensor_pb2.STR: + t.data.s[:] = vals + elif t.data.type is tensor_pb2.INT32: + t.data.i[:] = vals + elif t.data.type is tensor_pb2.INT8: + assert type(t.data.c) is bytes + t.data.c = vals + elif t.data.type in [tensor_pb2.FLOAT16, tensor_pb2.FLOAT, tensor_pb2.DOUBLE]: + t.data.f[:] = vals + elif t.data.type is tensor_pb2.BOOLEN: + t.data.b[:] = vals + else: + raise Exception('unsupported data_type={}'.format(data_type)) + t.data.size = len(vals) + + if layout is not None: + t.shape.layout = layout + if scale is not None: + t.shape.scale.f[:] = scale + t.shape.scale.type = tensor_pb2.FLOAT + t.shape.scale.size = len(scale) + + return t + + +def reverse_cache_data(data): # type: tensor_pb2.CacheDate -> None + """tensor_pb2.CacheDate => 1.0 / tensor_pb2.CacheDate + """ + if data.type is tensor_pb2.INT8: + data.c[:] = map(lambda x: 1.0 / x, data.c) + elif data.type is tensor_pb2.INT32: + data.i[:] = map(lambda x: 1.0 / x, data.i) + elif data.type in [tensor_pb2.FLOAT, tensor_pb2.FLOAT16, tensor_pb2.DOUBLE]: + data.f[:] = map(lambda x: 1.0 / x, data.f) + elif data.type is tensor_pb2.CACHE_LIST: + for x in data.l: + reverse_cache_data(x) + else: + raise Exception('unsupported data.type={}'.format(data.type)) diff --git a/tools/external_converter_v2/parser/proto/net.proto b/tools/external_converter_v2/parser/proto/net.proto new file mode 100644 index 000000000..221b07eb2 --- /dev/null +++ b/tools/external_converter_v2/parser/proto/net.proto @@ -0,0 +1,31 @@ +syntax = "proto3"; + +import "node.proto"; +import "tensor.proto"; +import "graph.proto"; + +message CtxProto { + int32 device_id = 1; + int32 data_stream_id = 2; + int32 compute_stream_id = 3; +}; + +message FuncProto { + string name = 1; + string type = 2; + CtxProto context = 3; + repeated TensorProto tensor_ins = 6; + repeated TensorProto tensor_outs = 7; + repeated int32 lane_ins = 8; + repeated int32 lane_outs = 9; + int32 current_lane = 11; + bool need_sync = 12; + NodeProto node_info = 13; +}; + +message NetProto { + string name = 1; + GraphProto graph = 2; + repeated FuncProto funcs = 3; +}; + diff --git a/tools/external_converter_v2/parser/proto/node.proto b/tools/external_converter_v2/parser/proto/node.proto index fc26b874c..54e025e24 100644 --- a/tools/external_converter_v2/parser/proto/node.proto +++ b/tools/external_converter_v2/parser/proto/node.proto @@ -39,5 +39,8 @@ message NodeProto { // Operator of node. 
OpsProto Op = 15; + + // Quantitative information + DateTypeProto bit_type = 16; }; diff --git a/tools/external_converter_v2/parser/proto/tensor.proto b/tools/external_converter_v2/parser/proto/tensor.proto index 4f129cc59..58da9bb23 100644 --- a/tools/external_converter_v2/parser/proto/tensor.proto +++ b/tools/external_converter_v2/parser/proto/tensor.proto @@ -7,20 +7,42 @@ message TensorShape { int64 size = 2; } Dim dim = 3; + LayoutProto layout = 4; }; +enum LayoutProto { + LAYOUT_INVALID = 0; + LAYOUT_W = 1; + LAYOUT_HW = 2; + LAYOUT_WH = 3; + LAYOUT_NC = 4; + LAYOUT_NH = 5; + LAYOUT_NW = 6; + LAYOUT_NHW = 7; + LAYOUT_NCHW = 8; + LAYOUT_NHWC = 9; + LAYOUT_NCHW_C4 = 10; + LAYOUT_NCHW_C8 = 11; + LAYOUT_NCHW_C16 = 12; + LAYOUT_OIHW16I16O = 13; + LAYOUT_GOIHW16I16O = 14; + LAYOUT_NCHW_C8R = 15; + LAYOUT_NCHW_C16R = 16; +}; + + // anakin data type. // maybe need to be improved enum DateTypeProto { - STR = 0; - INT8 = 2; + STR = 0; // When used as bit type, enum 0 means invalid. + INT8 = 2; INT32 = 4; - FLOAT16 = 8; + FLOAT16 = 8; FLOAT = 13; DOUBLE = 14; BOOLEN = 20; - CACHE_LIST = 30; - TENSOR = 31; + CACHE_LIST = 30; + TENSOR = 31; }; // list data cache @@ -29,31 +51,38 @@ message CacheDate { repeated int32 i = 2; /// list int repeated float f = 3; /// list float repeated bool b = 4; /// list bool - repeated CacheDate l = 5; /// list list - DateTypeProto type = 6; + repeated CacheDate l = 5; /// list list + bytes c = 8; /// string for int8 + DateTypeProto type = 6; int64 size = 7; }; // anakin tensor define // it maybe need to improved to support sequence data. message TensorProto { - // tensor id [optional] - // ( only used when anakin generates optimized model ) - bytes name = 1; + // tensor id [optional] + // ( only used when anakin generates optimized model ) + bytes name = 1; - // whether shared from other [optional] - // ( anakin generates optimized model ) - bool shared = 2; + // whether shared from other [optional] + // ( anakin generates optimized model ) + bool shared = 2; - // share_from is not null if shared [optional] - // ( only used when anakin generates optimized model) - bytes share_from = 3; + // share_from is not null if shared [optional] + // ( only used when anakin generates optimized model) + bytes share_from = 3; - // tensor shape + // tensor real shape TensorShape shape = 8; + // tensor valid shape + TensorShape valid_shape = 9; + // tensor data cache. 
CacheDate data = 10; + + // scale for int8 + CacheDate scale = 11; }; diff --git a/tools/external_converter_v2/parser/tensorflow/freeze.py b/tools/external_converter_v2/parser/tensorflow/freeze.py index c45ccba51..b88517a05 100644 --- a/tools/external_converter_v2/parser/tensorflow/freeze.py +++ b/tools/external_converter_v2/parser/tensorflow/freeze.py @@ -45,6 +45,20 @@ def freeze_graph(model_folder, output_name): with tf.Session() as sess: saver.restore(sess, input_checkpoint) + #fix batch norm nodes + for node in input_graph_def.node: + if node.op == 'RefSwitch': + node.op = 'Switch' + for index in range(len(node.input)): + if 'moving_' in node.input[index] and 'biased' in node.input[index]: + node.input[index] = node.input[index] + '/read' + elif node.op == 'AssignSub': + node.op = 'Sub' + if 'use_locking' in node.attr: del node.attr['use_locking'] + elif node.op == 'AssignAdd': + node.op = 'Add' + if 'use_locking' in node.attr: del node.attr['use_locking'] + # We use a built-in TF helper to export variables to constant output_graph_def = graph_util.convert_variables_to_constants( sess, diff --git a/tools/external_converter_v2/parser/tensorflow/med_graph.py b/tools/external_converter_v2/parser/tensorflow/med_graph.py index d17cc8702..824c6379f 100644 --- a/tools/external_converter_v2/parser/tensorflow/med_graph.py +++ b/tools/external_converter_v2/parser/tensorflow/med_graph.py @@ -4,12 +4,13 @@ class MedNodeUtil: @staticmethod - def new_med_node(): + def new_med_node(name=None): ''' return instance of empty standard med graph node :return: ''' - return {'name': None, 'ak_type': None, 'input': [], 'output': [], 'ak_attr': {}, 'type': None, + return {'name': name, 'ak_type': None, 'input': [], 'output': [], + 'ak_attr': {}, 'tf_attr': {}, 'type': None, 'med_visted': False} @staticmethod @@ -60,8 +61,65 @@ def redirecto_outputs_input_to_this(node, graph, this_name, this_shape): ''' for i in node['output']: tar_node = graph[i['name']] - tar_node['input'] = MedNodeUtil.replace_name_with_list(tar_node['input'], node['name'], - [{'name': this_name, 'shape': this_shape}]) + tar_node['input'] = MedNodeUtil.replace_name_with_list( + tar_node['input'], node['name'], [{'name': this_name, 'shape': this_shape}]) + + @staticmethod + def redirecto_outputs_input_to_this_any(node, graph, ori_name, this_name, this_shape): + ''' + get node_x in node`s outputs + make node_x`s inputs reference to node + :param node: + :param graph: + :param this_name: + :param this_shape: + :return: + ''' + for i in node['output']: + tar_node = graph[i['name']] + tar_node['input'] = MedNodeUtil.replace_name_with_list( + tar_node['input'], ori_name, [{'name': this_name, 'shape': this_shape}]) + + @staticmethod + def redirecto_inputs_output_to_this(node, graph, this_name, this_shape): + ''' + get node_x in node`s inputs + make node_x`s output reference to node + :param node: + :param graph: + :param this_name: + :param this_shape: + :return: + ''' + for i in node['input']: + tar_node = graph[i['name']] + tar_node['output'] = MedNodeUtil.replace_name_with_list( + tar_node['output'], node['name'], [{'name': this_name, 'shape': this_shape}]) + + @staticmethod + def redirecto_inputs_output_to_this_any(node, graph, ori_name, this_name, this_shape): + ''' + get node_x in node`s inputs + make node_x`s output reference to node + :param node: + :param graph: + :param this_name: + :param this_shape: + :return: + ''' + for i in node['input']: + tar_node = graph[i['name']] + tar_node['output'] = MedNodeUtil.replace_name_with_list( + 
tar_node['output'], ori_name, [{'name': this_name, 'shape': this_shape}]) + + @staticmethod + def remove_node_in_series_graph(med_node, med_graph): + assert len(med_node['input']) == 1 and len(med_node['output']) == 1 + med_node['ak_type'] = None + MedNodeUtil.redirecto_outputs_input_to_this( + med_node, med_graph, med_node['input'][0]['name'], med_node['input'][0]['shape']) + MedNodeUtil.redirecto_inputs_output_to_this( + med_node, med_graph, med_node['output'][0]['name'], med_node['output'][0]['shape']) MedGraph_Input_Cnt = 0 @@ -84,8 +142,10 @@ def append_node(father_node, son_node, graph): father_node['output'] = [{'name': son_node['name'], 'shape': son_shape}] for i in output: out_node = graph[i['name']] - out_node['input'] = MedNodeUtil.replace_name_with_list(out_node['input'], father_node['name'], - [{'name': son_node['name'], 'shape': son_shape}]) + out_node['input'] = MedNodeUtil.replace_name_with_list( + out_node['input'], father_node['name'], + [{'name': son_node['name'], 'shape': son_shape}]) + graph[son_node['name']] = son_node @staticmethod @@ -131,8 +191,35 @@ def _auto_input_name(med_node, med_graph): med_node['name'] = 'input_' + str(MedGraph_Input_Cnt) for i in med_node['output']: out_node = med_graph[i['name']] - out_node['input'] = MedNodeUtil.replace_name_with_list(out_node['input'], old_name, - [{'name': med_node['name'], 'shape': i['shape']}]) + out_node['input'] = MedNodeUtil.replace_name_with_list( + out_node['input'], old_name, [{'name': med_node['name'], 'shape': i['shape']}]) + + @staticmethod + def _fusionFlatten(med_node, med_graph): + ''' + fusion flatten node after convolution node + :param med_node: + :param med_graph: + :return: + ''' + assert len(med_node['output']) == 1 + next_node = med_graph[med_node['output'][0]['name']] + assert next_node['ak_type'] == 'Dense' + + assert len(next_node['input']) == 1 + + next_node['ak_attr']['axis'] = 1 + MedNodeUtil.remove_node_in_series_graph(med_node, med_graph) + + @staticmethod + def _remove_op(med_node, med_graph): + ''' + fusion scale node after convolution node + :param med_node: + :param med_graph: + :return: + ''' + MedNodeUtil.remove_node_in_series_graph(med_node, med_graph) @staticmethod def _fusionScale(med_node, med_graph): @@ -160,10 +247,10 @@ def _fusionScale(med_node, med_graph): else: input_attr['bias_weights'] = med_ak_attr['bias_weights'] med_node['ak_type'] = None - input_node['output'] = MedNodeUtil.replace_name_with_list(input_node['output'], med_node['name'], - med_node['output']) - MedNodeUtil.redirecto_outputs_input_to_this(med_node, med_graph, input_node['name'], - med_node['input'][0]['shape']) + input_node['output'] = MedNodeUtil.replace_name_with_list( + input_node['output'], med_node['name'], med_node['output']) + MedNodeUtil.redirecto_outputs_input_to_this( + med_node, med_graph, input_node['name'], med_node['input'][0]['shape']) input_node['fusion_out_name'] = med_node['name'] pass @@ -206,7 +293,10 @@ def solve(med_graph): ''' for node in med_graph.values(): node['med_visted'] = False + + #MedGraphUtil._all_search_table(med_graph, {'Reshape': MedGraphUtil._remove_op}) MedGraphUtil._all_search_table(med_graph, {'Scale': MedGraphUtil._fusionScale}) + #MedGraphUtil._all_search_table(med_graph, {'Flatten': MedGraphUtil._fusionFlatten}) MedGraphUtil._all_search_fusion(med_graph, MedGraphUtil._auto_split) MedGraphUtil._all_search_table(med_graph, {'Input': MedGraphUtil._auto_input_name}) diff --git a/tools/external_converter_v2/parser/tensorflow/parse_med_2_ak.py 
b/tools/external_converter_v2/parser/tensorflow/parse_med_2_ak.py index 2cc53107e..eb9a19dc6 100644 --- a/tools/external_converter_v2/parser/tensorflow/parse_med_2_ak.py +++ b/tools/external_converter_v2/parser/tensorflow/parse_med_2_ak.py @@ -26,7 +26,6 @@ def np_2_ak_tensor(np_tensor): } type_str = data_type_map.get(np_tensor.dtype) - # print(np_tensor.dtype) assert type_str != None ak_tensor = TensorProtoIO() ak_tensor.set_shape(shape_2_ak_shape(np_tensor.shape)) @@ -70,16 +69,31 @@ def Dense(self, med_attr, param): :param param: :return: ''' + if med_attr.get('trans_weights', False): + med_attr['weights'] = np.transpose(med_attr['weights']) param.weight_1 = np_2_ak_tensor(med_attr['weights']) - param.axis = 1 + param.axis = med_attr.get('axis', 1) + param.out_dim = med_attr.get('out_dim', 0) + if med_attr.get('bias_weights') is not None: param.bias_term = True param.weight_2 = np_2_ak_tensor(med_attr['bias_weights']) + if param.out_dim == 0: + param.out_dim = len(med_attr['bias_weights'].flatten()) else: param.bias_term = False pass - def Relu(self, med_attr, param): + def Permute(self, med_attr, param): + """ + fill Relu param in ak graph + :param med_attr: + :param param: + :return: + """ + param.dims = med_attr['dims'] + + def ReLU(self, med_attr, param): ''' fill Relu param in ak graph :param med_attr: diff --git a/tools/external_converter_v2/parser/tensorflow/parse_tf_2_med.py b/tools/external_converter_v2/parser/tensorflow/parse_tf_2_med.py index 6f26d8a55..17119df13 100644 --- a/tools/external_converter_v2/parser/tensorflow/parse_tf_2_med.py +++ b/tools/external_converter_v2/parser/tensorflow/parse_tf_2_med.py @@ -153,9 +153,16 @@ def all_search(graph, table): if table.get(type_name) != None: table[type_name](tf_node, graph) + def all_search_fix(graph, table): + for tf_node in graph.values(): + type_name = tf_node['ak_type'] + if table.get(type_name) != None: + table[type_name](tf_node, graph) + all_search(nodes, {'Identity': parse_Identity, 'Placeholder': parse_Placeholder, - 'Shape': parse_Shape + 'Shape': parse_Shape, + 'StridedSlice': parse_slim_flatten }) all_search(nodes, {'Reshape': parse_fusionReshape, }) @@ -177,9 +184,11 @@ def all_search(graph, table): 'Reshape': parse_Reshape, 'Squeeze': parse_Squeeze, 'Softmax': parse_Softmax, - + 'Transpose': parse_Transpose }) + all_search_fix(nodes, {'Dense': fix_Dense}) + return nodes def parse(self): diff --git a/tools/external_converter_v2/parser/tensorflow/parser_tf.py b/tools/external_converter_v2/parser/tensorflow/parser_tf.py index 7e800f2a3..92e165c37 100644 --- a/tools/external_converter_v2/parser/tensorflow/parser_tf.py +++ b/tools/external_converter_v2/parser/tensorflow/parser_tf.py @@ -14,7 +14,7 @@ class TFParser: def __init__(self, fluid_config_dict): # anakin graph model io # config info - self.ProtoPaths = fluid_config_dict['ProtoPaths'] + self.ProtoPaths = fluid_config_dict['ModelPath'] self.OutPuts = fluid_config_dict['OutPuts'] if self.OutPuts is not None: diff --git a/tools/external_converter_v2/parser/tensorflow/run_pb.py b/tools/external_converter_v2/parser/tensorflow/run_pb.py index 5b44bc23e..21cda7ead 100644 --- a/tools/external_converter_v2/parser/tensorflow/run_pb.py +++ b/tools/external_converter_v2/parser/tensorflow/run_pb.py @@ -16,7 +16,7 @@ def convert_name_tf2ak(tf_name, perfix='record_'): return perfix + ak_name -# ak_work_space='/your/anakin/workspace' +ak_work_space='/home/ljj/docker_mount_dev2/anakin2_developing/build' output_compare_op = None # graph_path='./vgg_model/frozen_vgg_16_i.pb' diff 
--git a/tools/external_converter_v2/parser/tensorflow/tf_trans_util.py b/tools/external_converter_v2/parser/tensorflow/tf_trans_util.py index 581a313bc..f068a4d84 100644 --- a/tools/external_converter_v2/parser/tensorflow/tf_trans_util.py +++ b/tools/external_converter_v2/parser/tensorflow/tf_trans_util.py @@ -141,6 +141,17 @@ def load_graph(graph_path): return graph +def find_layout_in(node, graph): + if node['ak_type'] in ('Dense'): + return None + if 'data_format' in node['tf_attr']: + return node['tf_attr']['data_format'] + elif len(node['input']) > 0: + return find_layout_in(graph[node['input'][0]['name']], graph) + else: + return None + + NCHW_TO_NHWC = [0, 2, 3, 1] NHWC_TO_NCHW = [0, 3, 1, 2] HWCN_TO_NCHW = [3, 2, 0, 1] @@ -207,6 +218,55 @@ def add_special_pad(padding, tf_node, graph): graph[padding_node['name']] = padding_node +def parse_slim_flatten(tf_node, graph): + ''' + parse shape for tensorflow graph + :param tf_node: + :param graph: + :return: + ''' + # try: + + assert len(tf_node['output']) == 1 + get_shape_node = graph[tf_node['input'][0]['name']] + pack_node = graph[tf_node['output'][0]['name']] + assert get_shape_node['type'] == 'Shape' + assert pack_node['type'] == 'Pack' + assert len(pack_node['output']) == 1 + reshape_node = graph[pack_node['output'][0]['name']] + assert reshape_node['type'] == 'Reshape' + assert reshape_node['input'][0]['name'] == get_shape_node['input'][0]['name'] + + tf_node['visted'] = True + get_shape_node['visted'] = True + pack_node['visted'] = True + reshape_node['visted'] = True + + the_node = MedNodeUtil.new_med_node(name=tf_node['name'] + '_flatten') + graph[the_node['name']] = the_node + + the_node['type'] = 'Flatten' + the_node['ak_type'] = 'Flatten' + the_node['input'] = get_shape_node['input'] + the_node['output'] = reshape_node['output'] + the_node['visted'] = True + MedNodeUtil.redirecto_outputs_input_to_this_any( + the_node, graph, reshape_node['name'], the_node['name'], the_node['output'][0]['shape']) + MedNodeUtil.redirecto_inputs_output_to_this_any( + the_node, graph, get_shape_node['name'], the_node['name'], the_node['input'][0]['shape']) + pre_out = graph[the_node['input'][0]['name']]['output'] + for index, out in enumerate(pre_out): + if out['name'] == reshape_node['name']: + del pre_out[index] + + # print(the_node['output']) + # print(graph[the_node['output'][0]['name']]['input']) + # exit() + + # except Exception,e: + # raise e + + def parse_Identity(tf_node, graph): ''' remove identity in tensorflow graph @@ -223,7 +283,8 @@ def parse_Identity(tf_node, graph): next_name = next['name'] next_node = graph[next_name] next_node['input'] = [input_0 if i['name'] == tf_node['name'] else i for i in next_node['input']] - in_node['output'] = MedNodeUtil.replace_name_with_list(in_node['output'], tf_node['name'], outputs) + in_node['output'] = MedNodeUtil.replace_name_with_list( + in_node['output'], tf_node['name'], outputs) def parse_Shape(tf_node, graph): @@ -261,7 +322,10 @@ def parse_Placeholder(tf_node, graph): ''' tf_node['visted'] = True tf_node['ak_type'] = 'Input' - tf_node['ak_attr']['shape'] = spatial_map(tf_node['output'][0]['shape'], NHWC_TO_NCHW) + if len(tf_node['output'][0]['shape']) == 4: + tf_node['ak_attr']['shape'] = spatial_map(tf_node['output'][0]['shape'], NHWC_TO_NCHW) + else: + tf_node['ak_attr']['shape'] = tf_node['output'][0]['shape'] def parse_Pad(tf_node, graph): @@ -282,6 +346,23 @@ def parse_Pad(tf_node, graph): ak_attr['pad_w'] = pad_shape[2].flatten().tolist() +def parse_Transpose(tf_node, graph): + ''' 
+ :param tf_node: + :param graph: + :return: + ''' + tf_node['visted'] = True + tf_node['ak_type'] = 'Permute' + assert len(tf_node['input']) == 2 + arg_node = graph[tf_node['input'][1]['name']] + assert arg_node['type'] == 'Const' + tf_node['ak_attr']['dims'] = arg_node['tf_attr']['value'].flatten().tolist() + print(tf_node['ak_attr']['dims'], type(tf_node['ak_attr']['dims'])) + # exit() + pass + + def parse_Softmax(tf_node, graph): ''' convert softmax op, default axis is 3 @@ -321,7 +402,7 @@ def parse_Act(tf_node, graph): tf_node['visted'] = True tf_node['ak_type'] = 'Activation' if tf_node['type'] == 'Relu': - tf_node['ak_type'] = 'Relu' + tf_node['ak_type'] = 'ReLU' tf_node['ak_attr']['type'] = 'Relu' elif tf_node['type'] == 'Relu6': tf_node['ak_type'] = 'Activation' @@ -355,6 +436,7 @@ def parse_Add(tf_node, graph): :return: ''' tf_node['visted'] = True + print(tf_node) assert len(tf_node['input']) == 2 input_0 = graph[tf_node['input'][0]['name']] input_1 = graph[tf_node['input'][1]['name']] @@ -392,7 +474,8 @@ def parse_Mean(tf_node, graph): reduction_shape = reduction_shape_node['tf_attr']['value'].flatten().tolist() assert reduction_shape is not None assert keep_dims is True - assert reduction_shape == [1, 2] + # print('reduction ',reduction_shape,tf_node['name']) + # assert reduction_shape == [1, 2] ak_attr['strides'] = [1, 1] ak_attr['window'] = [tf_node['input'][0]['shape'][reduction_shape[0]], tf_node['input'][0]['shape'][reduction_shape[1]]] @@ -518,6 +601,22 @@ def get_bias(tf_node, graph): return bias_weight +def fix_Dense(tf_node, graph): + input_node = graph[tf_node['input'][0]['name']] + layout = find_layout_in(input_node, graph) + print(tf_node['name'], tf_node['input'], layout, type(layout)) + if layout == 'NHWC': + if input_node['ak_type'] in ('Flatten'): + input_node = graph[input_node['input'][0]['name']] + shape = input_node['output'][0]['shape'] + weights = tf_node['ak_attr']['weights'] + full_shape = [i for i in shape if i is not None] + full_shape.append(weights.shape[1]) + weights = weights.reshape(full_shape) + weights = weights.transpose((2, 0, 1, 3)) + tf_node['ak_attr']['weights'] = weights.reshape(tf_node['ak_attr']['weights'].shape) + + def parse_Conv2D(tf_node, graph): ''' convert conv2D to convolution @@ -583,14 +682,12 @@ def parse_MatMul(tf_node, graph): raise Exception('Whate hannpend both const') elif in_type_1 == 'Const' and tf_node['tf_attr']['transpose_a'] != True: weights = graph[in_name_1]['tf_attr']['value'] - if tf_node['tf_attr']['transpose_b']: - weights = weights.T + tf_node['ak_attr']['trans_weights'] = not tf_node['tf_attr']['transpose_b'] tf_node['ak_attr']['weights'] = weights MedNodeUtil.retain_input(tf_node, [tf_node['input'][0]]) elif in_type_0 == 'Const' and tf_node['tf_attr']['transpose_b'] != True: - weights = graph[in_name_1]['tf_attr']['value'].T - if tf_node['tf_attr']['transpose_a']: - weights = weights.T + weights = graph[in_name_1]['tf_attr']['value'] + tf_node['ak_attr']['trans_weights'] = tf_node['tf_attr']['transpose_a'] tf_node['ak_attr']['weights'] = weights MedNodeUtil.retain_input(tf_node, [tf_node['input'][1]]) else: diff --git a/tools/external_converter_v2/parser/tensorflow/tf_util.py b/tools/external_converter_v2/parser/tensorflow/tf_util.py index b8dfe78d9..95291a4d7 100644 --- a/tools/external_converter_v2/parser/tensorflow/tf_util.py +++ b/tools/external_converter_v2/parser/tensorflow/tf_util.py @@ -33,7 +33,8 @@ def tf_run_model(graph_path, inputs, output_tensor_list): tf.train.import_meta_graph(graph_path, 
clear_devices=True) tf.import_graph_def(graph_def, name='graph') - + for op in graph.get_operations(): + print(op.name, [i for i in op.inputs]) inputs_dict = {graph.get_tensor_by_name(i): inputs[i] for i in inputs} output_list = [graph.get_tensor_by_name(i) for i in output_tensor_list] print(output_list) diff --git a/tools/external_converter_v2/requirement.txt b/tools/external_converter_v2/requirement.txt new file mode 100644 index 000000000..4397eacff --- /dev/null +++ b/tools/external_converter_v2/requirement.txt @@ -0,0 +1,6 @@ +pyyaml +protobuf==3.1.0 +enum34 +numpy +flask +prettytable \ No newline at end of file diff --git a/tools/external_converter_v2/utils/__init__.py b/tools/external_converter_v2/utils/__init__.py new file mode 100644 index 000000000..0d070addb --- /dev/null +++ b/tools/external_converter_v2/utils/__init__.py @@ -0,0 +1,4 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +import utils.net diff --git a/tools/external_converter_v2/utils/net/__init__.py b/tools/external_converter_v2/utils/net/__init__.py new file mode 100644 index 000000000..f2890b019 --- /dev/null +++ b/tools/external_converter_v2/utils/net/__init__.py @@ -0,0 +1,5 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +import utils.net.net_parser +import utils.net.net_io diff --git a/tools/external_converter_v2/utils/net/net_io.py b/tools/external_converter_v2/utils/net/net_io.py new file mode 100644 index 000000000..6d8af4847 --- /dev/null +++ b/tools/external_converter_v2/utils/net/net_io.py @@ -0,0 +1,170 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys + +from parser.proto import net_pb2 +from parser.graph_io import GraphProtoIO +from google.protobuf import text_format + + +class FuncProtoIO(object): + """ + Func io class of FuncProto. + """ + + def __init__(self, proto=None): + """ + Initial the FuncProtoIO object. + """ + self.func_proto = None + if proto is None: + self.func_proto = net_pb2.FuncProto() + else: + self.func_proto = proto + + def get_name(self): + """ + Get the name of func_proto. + """ + return self.func_proto.name + + def set_name(self, name): + """ + Set the name of func_proto. + """ + self.func_proto.name = name + + def get_type(self): + """ + Get the type of func_proto. + """ + return self.func_proto.type + + def set_type(self, type_value): + """ + Set the type of func_proto. + """ + self.func_proto.type = type_value + + def get_node_io(self): + """ + Get the node io of this object. + """ + node_io = NodeProtoIO(self.func_proto.node_info) + return node_io + + def reset_node_io(self, node_io): + """ + Reset the node io of this object. + """ + node_proto = node_io() + self.func_proto.node_info.CopyFrom(node_proto) + + def __call__(self): + """ + Return func_proto. + """ + return self.func_proto + + +class NetProtoIO(object): + """ + Net io class of NetProto. + """ + + def __init__(self, proto=None): + """ + Init the NetProtoIO object. + """ + self.net_proto = None + if proto is None: + self.net_proto = net_pb2.NetProto() + else: + self.net_proto = proto + + def graph_io(self): + """ + Generate the graph io. + """ + graph_io = GraphProtoIO(self.net_proto.graph) + return graph_io + + def clear_graph(self): + """ + Clear the graph of net proto. + """ + self.net_proto.graph.Clear() + + def get_name(self): + """ + Get the name of net_proto. + """ + return self.net_proto.name + + def set_name(self, net_name): + """ + Set the name of net_proto. + """ + self.net_proto.name = net_name + + def add_func(self, func=None): + """ + Add a func proto. 
+ """ + if func is None: + func = net_pb2.FuncProto() + self.net_proto.funcs.extend([func]) + + def func_io_list(self): + """ + Add func io list. + """ + func_io_list = list() + for func in self.net_proto.funcs: + func_io = FuncProtoIO(func) + func_io_list.append(func_io) + return func_io_list + + def save(self, file_path, use_txt=True, use_net_name=True): + """ + Save the Net proto. + """ + if use_net_name is True: + assert self.net_proto.name is not None + file_path = os.path.join(file_path, self.net_proto.name) + with open(file_path, "wb") as f: + if use_txt is True: + f.write(text_format.MessageToString(self.net_proto)) + else: + f.write(self.net_proto.SerializeToString()) + f.close() + + def parse_from_string(self, file_path): + """ + parser from optimized graph model + """ + with open(file_path, "rb") as f: + contents = f.read() + self.net_proto.ParseFromString(contents) + + def merge_from_io(self, net_io): + """ + Merge proto from io. + """ + self.net_proto.MergeFrom(net_io.net_proto) + + def merge_from_string(self, file_path): + """ + parser from optimized graph model + """ + with open(file_path, "rb") as f: + contents = f.read() + self.net_proto.MergeFromString(contents) + + def __call__(self): + """ + Return the net_proto. + """ + return self.net_proto diff --git a/tools/external_converter_v2/utils/net/net_parser.py b/tools/external_converter_v2/utils/net/net_parser.py new file mode 100644 index 000000000..061d32b35 --- /dev/null +++ b/tools/external_converter_v2/utils/net/net_parser.py @@ -0,0 +1,121 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +from utils.net.net_io import NetProtoIO + +class NetHolder(object): + """ + Net holder. + """ + def __init__(self, config): + """ + Init the net holder. + """ + assert 'NET' in config.DebugConfig.keys() + self.config = config.DebugConfig['NET'] + self.load_list = self.config['LoadPaths'] + self.save_format = self.config['SaveFormat'] + self.net_merged = NetProtoIO() + self.net_ins = dict() + self.load() + + def __str__(self): + """ + Help you by printing the object. + """ + return self.net_merged.net_proto.__str__() + + def parse(self): + """ + Parse the net. + """ + for path in self.net_ins.keys(): + net_io = self.net_ins[path] + node_parser = NetParser(net_io, self.config) + node_parser.net_reset_nodes() + self.net_merged.merge_from_io(net_io) + parser = NetParser(self.net_merged, self.config) + parser.nets_slice() + parser.save_funcs() + + def load(self): + """ + Load the net. + """ + for path in self.load_list: + assert path not in self.net_ins.keys() + net_io = NetProtoIO() + net_io.parse_from_string(path) + self.net_ins[path] = net_io + + def __call__(self): + """ + Return the net. + """ + return self.net_merged + + +class NetParser(object): + """ + Net parser object. + """ + def __init__(self, net_io, config): + # reset node in funcs + self.config = config + self.net_io_in = net_io + self.graph_io = self.net_io_in.graph_io() + self.func_io_list = self.net_io_in.func_io_list() + # funcs slice + self.nets_io_out = list() + self.funcs = dict() + self.save_path = self.config['SavePath'] + + def _clear_graph(self): + """ + Clear the graph. + """ + self.net_io_in.clear_graph() + + def _funcs_dict(self): + """ + The dict of funcs. + """ + for func_io in self.func_io_list: + func_type = func_io.get_type() + if func_type not in self.funcs.keys(): + self.funcs[func_type] = list() + self.funcs[func_type].append(func_io) + + def net_reset_nodes(self): + """ + Reset the nodes of net. 
+ """ + for func_io in self.func_io_list: + func_name = func_io.get_name() + node_io = self.graph_io.get_node_io(func_name) + func_io.reset_node_io(node_io) + self._clear_graph() + return self.net_io_in + + def nets_slice(self): + """ + Slice the nets. + """ + self.nets_io_out = list() + self._funcs_dict() + for func_type in self.funcs.keys(): + net = NetProtoIO() + net.set_name(func_type) + funcs_list = self.funcs[func_type] + for func in funcs_list: + net.add_func(func()) + self.nets_io_out.append(net) + return self.nets_io_out + + def save_funcs(self): + """ + Save funcs. + """ + for net_io_out in self.nets_io_out: + net_io_out.save(self.save_path) + diff --git a/tools/mlu_build.sh b/tools/mlu_build.sh new file mode 100755 index 000000000..a392425a7 --- /dev/null +++ b/tools/mlu_build.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# This script shows how one can build a anakin for the platform +ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" +echo "-- Anakin root dir is: $ANAKIN_ROOT" + +# build the target into mlu_build. +BUILD_ROOT=$ANAKIN_ROOT/mlu_build + +#export PATH=/usr/local/protobuf-3.4.0/bin:$PATH +#export PATH=/usr/lib/ccache:$PATH +#export CNML_ROOT=$ANAKIN_ROOT/third-party/mlu +#export CNRT_ROOT=$ANAKIN_ROOT/third-party/mlu +# +#export LD_LIBRARY_PATH=$CNML_ROOT/lib:$CNRT_ROOT/lib:ANAKIN_ROOT/mlu_build:$LD_LIBRARY_PATH +#export LD_LIBRARY_PATH=$CNML_ROOT/lib:$CNRT_ROOT/lib:ANAKIN_ROOT/mlu_build:$PWD/third-party/mklml/lib:$LD_LIBRARY_PATH + + +if [ ! -d "$BUILD_ROOT" ]; then + mkdir "$BUILD_ROOT" +fi +echo "-- Build anakin mlu into: $BUILD_ROOT" + +# Now, actually build the mlu target. +echo "-- Building anakin ..." +cd $BUILD_ROOT + + cmake .. \ + -DENABLE_DEBUG=NO \ + -DUSE_MLU_PLACE=YES \ + -DUSE_BANG=NO \ + -DUSE_OPENCV=NO \ + -DUSE_ARM_PLACE=NO \ + -DUSE_GPU_PLACE=NO \ + -DUSE_NV_GPU=NO \ + -DUSE_AMD_GPU=NO \ + -DUSE_X86_PLACE=YES \ + -DUSE_BM_PLACE=NO \ + -DBUILD_WITH_UNIT_TEST=YES \ + -DUSE_PYTHON=OFF \ + -DENABLE_VERBOSE_MSG=NO \ + -DDISABLE_ALL_WARNINGS=YES \ + -DENABLE_NOISY_WARNINGS=NO \ + -DUSE_OPENMP=YES\ + -DBUILD_SHARED=YES\ + -DBUILD_WITH_FRAMEWORK=YES\ + -DUSE_GFLAGS=NO\ + -DUSE_BOOST=NO\ + -DBUILD_EXAMPLES=NO + +# build target lib or unit test. + +if [ "$(uname)" = 'Darwin' ]; then + make "-j$(sysctl -n hw.ncpu)" && make install +else + make "-j$(nproc)" install +fi + diff --git a/tools/release_build/release_unitest_build_nv.sh b/tools/release_build/release_unitest_build_nv.sh index c21cee9e3..b447fc1eb 100755 --- a/tools/release_build/release_unitest_build_nv.sh +++ b/tools/release_build/release_unitest_build_nv.sh @@ -41,8 +41,13 @@ cmake .. \ # build target lib or unit test. if [ "$(uname)" = 'Darwin' ]; then - make "-j$(sysctl -n hw.ncpu)" && make install + make "-j$(sysctl -n hw.ncpu)" install else - make "-j$(nproc)" && make install + #num1=$(nproc) + #num2=2 + #num=`expr $num1 / $num2` + #make "-j$num" + make "-j5" install + #make "-j$(nproc)" && make install fi diff --git a/tools/release_build/release_unitest_build_x86.sh b/tools/release_build/release_unitest_build_x86.sh index b8dccabbf..997d7bd3d 100644 --- a/tools/release_build/release_unitest_build_x86.sh +++ b/tools/release_build/release_unitest_build_x86.sh @@ -36,12 +36,13 @@ cmake .. \ -DBUILD_SHARED=YES\ -DBAIDU_RPC_ROOT=/opt/brpc \ -DPROTOBUF_ROOT=/opt \ + -DX86_COMPILE_482=YES\ -DBUILD_WITH_FRAMEWORK=YES # build target lib or unit test. 
if [ "$(uname)" = 'Darwin' ]; then - make "-j$(sysctl -n hw.ncpu)" && make install + make "-j$(sysctl -n hw.ncpu)" install else - make "-j$(nproc)" && make install + make "-j$(nproc)" install fi diff --git a/tools/release_build/release_unitest_build_x86_v4.sh b/tools/release_build/release_unitest_build_x86_v4.sh new file mode 100644 index 000000000..2c5abb79a --- /dev/null +++ b/tools/release_build/release_unitest_build_x86_v4.sh @@ -0,0 +1,63 @@ +#!/bin/bash +set -ex +#bash -c "$( curl http://jumbo.baidu.com/install_jumbo.sh )" && source ~/.bashrc +#jumbo install git +export LANG="zh_CN.UTF-8" +##export PATH=/home/public/git-2.17.1/:$PATH +#export PATH=~/.jumbo/bin/git:$PATH +export PATH=/home/public/cmake-3.3.0-Linux-x86_64/bin/:$PATH +export PATH=/home/scmtools/buildkit/cmake/cmake-3.12.3/bin:$PATH +export PATH=/usr/local/bin/:$PATH +export LD_LIBRARY_PATH=//home/scmtools/buildkit/protobuf/protobuf_2.6.1/:$LD_LIBRARY_PATH +export GIT_SSL_NO_VERIFY=1 +echo $PATH +echo "git install path" +which git +#git config core.filemode false +echo "git version:" +git --version +# This script shows how one can build a anakin for the x86 platform +ANAKIN_ROOT="$( cd "$(dirname "$0")"/../.. ; pwd -P)" +echo "-- Anakin root dir is: $ANAKIN_ROOT" + +# build the target into gpu_build. +BUILD_ROOT=$ANAKIN_ROOT/x86_native_build + +mkdir -p $BUILD_ROOT +echo "-- Build anakin x86_native into: $BUILD_ROOT" + +# Now, actually build the x86 target. +echo "-- Building anakin ..." +cd $BUILD_ROOT + + +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_ARM_PLACE=NO \ + -DUSE_GPU_PLACE=NO \ + -DNVIDIA_GPU=NO \ + -DAMD_GPU=NO \ + -DUSE_X86_PLACE=YES \ + -DUSE_BM_PLACE=NO \ + -DBUILD_WITH_UNIT_TEST=YES \ + -DBUILD_RPC=OFF \ + -DUSE_PYTHON=OFF \ + -DUSE_GFLAGS=OFF \ + -DENABLE_DEBUG=OFF \ + -DENABLE_VERBOSE_MSG=NO \ + -DENABLE_MIN_DEPENDENCY=YES \ + -DDISABLE_ALL_WARNINGS=YES \ + -DENABLE_NOISY_WARNINGS=NO \ + -DUSE_OPENMP=YES\ + -DBUILD_SHARED=YES\ + -DBAIDU_RPC_ROOT=/opt/brpc \ + -DX86_COMPILE_482=YES\ + -DBUILD_WITH_FRAMEWORK=YES + +# build target lib or unit test. +if [ "$(uname)" = 'Darwin' ]; then + make "-j$(sysctl -n hw.ncpu)" install +else + make "-j$(nproc)" install +fi + diff --git a/tools/sgx_build.sh b/tools/sgx_build.sh new file mode 100755 index 000000000..d59fc0d4b --- /dev/null +++ b/tools/sgx_build.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# This script shows how one can build a anakin for the platform, +# with sepcial support for running in SGX mode +ANAKIN_ROOT="$( cd "$(dirname "$0")"/.. ; pwd -P)" +echo "-- Anakin root dir is: $ANAKIN_ROOT" + +# build the target into sgx_build. +BUILD_ROOT=$ANAKIN_ROOT/sgx_build + +mkdir -p $BUILD_ROOT +echo "-- Build anakin sgx into: $BUILD_ROOT" + +# Now, actually build the gpu target. +echo "-- Building anakin ..." +cd $BUILD_ROOT + +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_ARM_PLACE=NO \ + -DUSE_GPU_PLACE=NO \ + -DUSE_X86_PLACE=YES \ + -DUSE_SGX=YES \ + -DBUILD_WITH_UNIT_TEST=NO \ + -DUSE_PYTHON=OFF \ + -DENABLE_DEBUG=NO \ + -DENABLE_VERBOSE_MSG=NO \ + -DDISABLE_ALL_WARNINGS=YES \ + -DENABLE_NOISY_WARNINGS=NO + +# build target lib or unit test. 
+if [ "$(uname)" = 'Darwin' ]; then + make "-j$(sysctl -n hw.ncpu)" && make install +else + make "-j$(nproc)" && make install +fi + diff --git a/utils/logger/log_utils.h b/utils/logger/log_utils.h index 00116e6e9..2ebc0158f 100644 --- a/utils/logger/log_utils.h +++ b/utils/logger/log_utils.h @@ -36,6 +36,9 @@ #include // mkdir #include // STDERR_FILENO #include "anakin_config.h" +#ifdef USE_SGX +#include +#endif // Disable all warnings from gcc/clang: #if defined(__clang__) @@ -54,7 +57,7 @@ #define SUPPORT_PTHREADS 1 // support for pthreads -#if defined(ANDROID) || defined(__ANDROID__) +#if defined(ANDROID) || defined(__ANDROID__) || defined(LINUX_ARM_OS) //#ifdef TARGET_ANDROID #define STACKTRACES 0 #else diff --git a/utils/logger/logger.h b/utils/logger/logger.h index b357ee7b7..ee390a820 100644 --- a/utils/logger/logger.h +++ b/utils/logger/logger.h @@ -19,6 +19,8 @@ #define LOGGER_SHUTDOWN 0 #include "anakin_config.h" + +#ifndef USE_SGX #include "logger_core.h" #define SCOPE_LOGGER_CORE_FUNC logger::core::funcRegister @@ -204,6 +206,43 @@ CHECK_SYMBOL_WARP(CHECK_GT_IMPL, >) #define VLOG_IS_ON(verbose) ((verbose) <= SCOPE_LOGGER_CORE_CONFIG::current_verbosity_cutoff()) #endif +#else // USE_SGX +// define a nop logger for SGX build +namespace logger { + inline void init(const char*){} + + struct NopLogger { + template + constexpr const NopLogger &operator<<(const T &) const { + return *this; + } + + template + T *operator&() { + static_assert(sizeof(T) == 0, "Taking the address of NopLogger is disallowed."); + return nullptr; + } + }; + + static constexpr NopLogger __NOP; +} +// namespace logger + +#define NOPLOG(X) logger::__NOP +#define LOG NOPLOG +#define VLOG NOPLOG +#define DLOG NOPLOG +#define CHECK(X) (((X) == true ? void(nullptr) : abort()), logger::__NOP) +#define CHECK_NOTNULL(X) CHECK((X) != nullptr) +#define CHECK_EQ(X, Y) CHECK(((X) == (Y))) +#define CHECK_NE(X, Y) CHECK(((X) != (Y))) +#define CHECK_LT(X, Y) CHECK(((X) < (Y))) +#define CHECK_LE(X, Y) CHECK(((X) <= (Y))) +#define CHECK_GT(X, Y) CHECK(((X) > (Y))) +#define CHECK_GE(X, Y) CHECK(((X) >= (Y))) +#define ABORT_S() CHECK(false) + +#endif // USE_SGX #endif // LOGGER_H diff --git a/utils/logger/logger_core.h b/utils/logger/logger_core.h index 28edf6a41..c6fbb2994 100644 --- a/utils/logger/logger_core.h +++ b/utils/logger/logger_core.h @@ -759,7 +759,7 @@ inline void get_thread_name(char* buffer, unsigned long long length, bool right_ uint64_t thread_id = thread; #endif if (right_align_hext_id) { - snprintf(buffer, length, "%*X", length - 1, static_cast(thread_id)); + snprintf(buffer, length, "%*X", static_cast(length - 1), static_cast(thread_id)); } else { snprintf(buffer, length, "%X", static_cast(thread_id)); }